;;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: System; Base: 10 -*-
;;;;
;;;; evcl - 12 - Number - Kernel rem pi/2
;;; arch/generic/lisp/math/gen-math-f64-krem-pio2.lisp
;;;
;;; This file is part of Evita Common Lisp.
;;;
;;; Copyright (C) 1996-2007 by Project Vogue.
;;; Written by Yoshifumi "VOGUE" INOUE. (yosi@msn.com)
;;;
;;; @(#)$Id: //proj/evcl3/mainline/arch/generic/lisp/libm/float64/gen-float64-krem-pio2.lisp#1 $
;;;
;;; Description:
;;;  This file contains implementation of following macros:
;;;     float64-krem-pio2
;
(in-package #:strict-math)

#|
/* @(#)k_rem_pio2.c 5.1 93/09/24
/*
 * ====================================================
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
 *
 * Developed at SunPro, a Sun Microsystems, Inc. business.
 * Permission to use, copy, modify, and distribute this
 * software is freely granted, provided that this notice
 * is preserved.
 * ====================================================


/*
 * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
 * double x[],y[]; int e0,nx,prec; int ipio2[];
 *
 * __kernel_rem_pio2 return the last three digits of N with
 *              y = x - N*pi/2
 * so that |y| < pi/2.
 *
 * The method is to compute the integer (mod 8) and fraction parts of
 * (2/pi)*x without doing the full multiplication. In general we
 * skip the part of the product that are known to be a huge integer (
 * more accurately, = 0 mod 8 ). Thus the number of operations are
 * independent of the exponent of the input.
 *
 * (2/pi) is represented by an array of 24-bit integers in ipio2[].
 *
 * Input parameters:
 *      x[]        The input value (must be positive) is broken into nx
 *              pieces of 24-bit integers in double precision format.
 *              x[i] will be the i-th 24 bit of x. The scaled exponent
 *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
 *              match x's up to 24 bits.
 *
 *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
 *                      e0 = ilogb(z)-23
 *                      z  = scalbn(z,-e0)
 *              for i = 0,1,2
 *                      x[i] = floor(z)
 *                      z    = (z-x[i])*2**24
 *
 *
 *      y[]        ouput result in an array of double precision numbers.
 *              The dimension of y[] is:
 *                      24-bit  precision        1
 *                      53-bit  precision        2
 *                      64-bit  precision        2
 *                      113-bit precision        3
 *              The actual value is the sum of them. Thus for 113-bit
 *              precison, one may have to do something like:
 *
 *              long double t,w,r_head, r_tail;
 *              t = (long double)y[2] + (long double)y[1];
 *              w = (long double)y[0];
 *              r_head = t+w;
 *              r_tail = w - (r_head - t);
 *
 *      e0        The exponent of x[0]
 *
 *      nx        dimension of x[]
 *
 *      prec        an integer indicating the precision:
 *                      0        24  bits (single)
 *                      1        53  bits (double)
 *                      2        64  bits (extended)
 *                      3        113 bits (quad)
 *
 *      ipio2[]
 *              integer array, contains the (24*i)-th to (24*i+23)-th
 *              bit of 2/pi after binary point. The corresponding
 *              floating value is
 *
 *                      ipio2[i] * 2^(-24(i+1)).
 *
 * External function:
 *      double scalbn(), floor();
 *
 *
 * Here is the description of some local variables:
 *
 *      jk        jk+1 is the initial number of terms of ipio2[] needed
 *              in the computation. The recommended value is 2,3,4,
 *              6 for single, double, extended,and quad.
 *
 *      jz        local integer variable indicating the number of
 *              terms of ipio2[] used.
 *
 *      jx        nx - 1
 *
 *      jv        index for pointing to the suitable ipio2[] for the
 *              computation. In general, we want
 *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
 *              is an integer. Thus
 *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
 *              Hence jv = max(0,(e0-3)/24).
 *
 *      jp        jp+1 is the number of terms in PIo2[] needed, jp = jk.
 *
 *      q[]        double array with integral value, representing the
 *              24-bits chunk of the product of x and 2/pi.
 *
 *      q0        the corresponding exponent of q[0]. Note that the
 *              exponent for q[i] would be q0-24*i.
 *
 *      PIo2[]        double precision array, obtained by cutting pi/2
 *              into 24 bits chunks.
 *
 *      f[]        ipio2[] in floating point
 *
 *      iq[]        integer array by breaking up q[] in 24-bits chunk.
 *
 *      fq[]        final product of x*(2/pi) in fq[0],..,fq[jk]
 *
 *      ih        integer. If >0 it indicates q[] is >= 0.5, hence
 *              it also indicates the *sign* of the result.
 *
|#


;;; Constants:
;;; The hexadecimal values are the intended ones for the following
;;; constants. The decimal values may be used, provided that the
;;; compiler will convert from decimal to binary accurately enough
;;; to produce the hexadecimal values shown.

;#ifdef __STDC__
;static const double PIo2[] = {
;#else
;static double PIo2[] = {
;#endif
;};
(defparameter +pi-over-two+ (make-array 8 :element-type 'double-float
    :initial-contents #(
     1.57079625129699707031d+00
     7.54978941586159635335d-08
     5.39030252995776476554d-15
     3.28200341580791294123d-22
     1.27065575308067607349d-29
     1.22933308981111328932d-36
     2.73370053816464559624d-44
     2.16741683877804819444d-51
 ) ) )

(defun float64-kernel-rem-pio2 (x e0 nx prec ipio2)
    ;(declare (values (member -1 0 1) double-float double-float))
    (declare (type (simple-array double-float (*)) x))
    (declare (type fixnum e0 nx))
    (declare (type (integer 0 3) prec))
  (prog* (
    (zero   0.0d0)
    (one    1.0d0)
    (two24  #+nil 1.67772160000000000000e+07
            #.(encode-float64 #x41700000 #x00000000) )
    (twon24 #+nil 5.96046447753906250000e-08
            #.(encode-float64 #x3E700000 #x00000000) )
    ;; initialize jk
    (jk (svref #(2 3 4 6) prec))
    (jp jk)
    ;; determine jx,jv,q0, note that 3>q0
    (jx (- nx 1))
    (jv (truncate (- e0 3) 24)) ; if(jv<0) jv=0;
    (q0 (- e0 (* 24 (+ jv 1))))
    ;; set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk]
    (f
      (loop
        with f = (make-array 20 :element-type 'double-float)
        for i to (+ jx jk)
        for j = (- jv jx) then (1+ j) do
        (setf (elt f i)
              (if (minusp j) zero (float (elt ipio2 j) 0d0)) )
        finally (return f) ) )
    ;;compute q[0],q[1],...q[jk]
    (q
      (loop
        with q = (make-array 20 :element-type 'double-float)
        for i to jk do
          (loop
            with fw of-type double-float = 0d0
            for j to jx do
              (incf fw (* (elt x j) (elt f (- (+ jx i) j))))
              (setf (elt q i) fw) )
        finally (return q) ) )
    (jz jk)
    (iq (make-array 20 :element-type '(signed-byte 32)))
    (fq (make-array jk :element-type 'double-float))
    (z (elt q jz))
    (n 0)
    )
  recompute
    ;; distill q[] into iq[] reversingly
    (loop
      for i = 0 then (1+ i)
      for j = jz then (1- j)
      while (> j 0) do
        (let ((fw (float (truncate (* twon24 z)) 0d0)))
          (setf (elt iq i) (truncate (- z (* two24 fw))))
          (setq z (+ (elt q (1- j)) fw)) ))

    ;; compute n
    (setq z  (scale-float64 z q0))           ; actual value of z
    (decf z  (* 8.0d0 (floor (* z 0.125d0))))  ; trim off integer >= 8
    (setq n (truncate z))
    (decf z (float n 0d0))
    (let ((ih
            (cond
              ((> q0 0)    ; need iq[jz-1] to determine n
                (let ((i  (ash (elt iq (- jz 1)) (- (- 24 q0)))))
                  (incf n i)
                  (decf (elt iq (- jz 1)) (ash i (- 24 q0)))
                  (ash (elt iq (- jz 1)) (- (- 23 q0))) ) )
              ((eql q0 0) (ash (elt iq (- jz 1)) -23) )
              ((>= z 0.5d0) 2)
              (t 0) ) ))
      (when (> ih 0)    ; q > 0.5
        (incf n)
        (let ((carry 0))
          ;; compute 1-q
          (loop for i below jz do
            (let ((j (elt iq i)))
              (cond
                ((not (eql carry 0))
                  (setf (elt iq i) (- #xffffff j)) )
                ((not (eql j 0))
                  (setq carry 1)
                  (setf (elt iq i) (- #x1000000 j)) )) ))

            ;; rare case: chance is 1 in 12
            (when (> q0 0)
              (case q0
                (1 (setf (elt iq (- jz 1))
                        (logand (elt iq (- jz 1)) #x7fffff) ) )
                (2 (setf (elt iq (- jz 1))
                        (logand (elt iq (- jz 1)) #x3fffff) ) )))

            (when (eql ih 2)
              (setq z (- one z))
              (unless (eql carry 0)
                (decf z (scale-float64 one (truncate q0))) )) ))

    ;; check if recomputation is needed
       (when (= z zero)
        (let
         ((j 0))
         (loop for i = (- jz 1) then (1- i)
               while (>= i jk)
               do (setq j (logior j (elt iq i))))
         (when (= j 0)
          (let
           ((k 1))
           (loop while (= (elt iq (- jk k)) 0) do (incf k))
           (loop for i = (1+ jz) then (1+ i)
                 while (zerop (elt iq (- jk k)))
                 do (setf (elt f (+ jx i)) (float (elt ipio2 (+ jv i)) 0d0))
                 do (setf (elt q i)
                     (loop for j = 0 then (1+ j)
                           with fw of-type double-float = 0d0
                           while (<= j jx)
                           do (setq fw (+ fw (* (elt x j) (elt f (- (+ jx i) j)))))
                           finally (return fw))))
           (decf jz k)
           (go recompute)))))


;        if(z==zero) {
;            j = 0;
;            for (i=jz-1;i>=jk;i--) j |= iq[i];
;            if(j==0) { /* need recomputation
;                for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed
;
;                for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k]
;                    f[jx+i] = (double) ipio2[jv+i];
;                    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
;                    q[i] = fw;
;                }
;                jz += k;
;                goto recompute;
;            }
;        }

    ;; chop off zero terms
    (if (= z 0.0d0)
     (progn
      (decf jz)
      (decf q0 24)
      (loop while (zerop (elt iq jz))
            do (decf jz)
            do (decf q0 24)))
     (progn
      (setq z (scale-float64 z (- (truncate q0))))
      (if (>= z two24)
       (let
        ((fw (float (truncate (* twon24 z)) 0d0)))
        (setf (elt iq jz) (truncate (- z (* two24 fw))))
        (incf jz)
        (incf q0 24)
        (setf (elt iq jz) (truncate fw)))
       (setf (elt iq jz) (truncate z)))))

;        if(z==0.0) {
;            jz -= 1; q0 -= 24;
;            while(iq[jz]==0) { jz--; q0-=24;}
;        } else { /* break z into 24-bit if necessary
;            z = scalbn(z,-(int)q0);
;            if(z>=two24) {
;                fw = (double)((__int32_t)(twon24*z));
;                iq[jz] = (__int32_t)(z-two24*fw);
;                jz += 1; q0 += 24;
;                iq[jz] = (__int32_t) fw;
;            } else iq[jz] = (__int32_t) z ;
;        }
;

    ;; convert integer "bit" chunk to floating-point value
    (let
     ((fw (scale-float64 one (truncate q0))))
     (loop for i = jz then (1- i)
           while (>= i 0)
           do (setf (elt q i) (* fw (float (elt iq i) 0d0)))
           do (setq fw (* fw twon24))))

;        fw = scalbn(one,(int)q0);
;        for(i=jz;i>=0;i--) {
;            q[i] = fw*(double)iq[i]; fw*=twon24;
;        }

    ;; compute PIo2[0,...,jp]*q[jz,...,0]

    (let
     ((fw 0.0d0))
     (loop for i = jz then (1- i)
           while (>= i 0)
           do (loop for k = 0 then (1+ k)
                    while (and (<= k jp) (<= k (- jz i)))
                    do (setq fw (+ fw (* (float (elt +pi-over-two+ k) 0d0) (elt q (+ i k))))))
           do (setf (elt fq (- jz i)) fw)))

;        for(i=jz;i>=0;i--) {
;            for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
;            fq[jz-i] = fw;
;        }
;

    ;; compress fq[] into y[]
    (let
     ((y0 0)
      (y1 0)
      (fw 0.0d0))
     (case prec ; of course this will be 2
      (0
       (loop for i = jz then (1- i)
             while (>= i 0)
             do (setq fw (+ fw (elt fq i))))
       (setq y0 (if (= ih 0) fw (- fw))))
      ((1 2)
       (loop for i = jz then (1- i)
             while (>= i 0)
             do (setq fw (+ fw (elt fq i))))
       (setq y0 (if (= ih 0) fw (- fw)))
       (setq fw (- (elt fq 0) fw))
       (loop for i = 0 then (1+ i)
             while (<= i jz)
             do (setq fw (+ fw (elt fq i))))
       (setq y1 (if (= ih 0) fw (- fw))))
      (3 ; anytime the source comment says painful, and I don't have to do it....
       (error "Don't support 113 bit precision")))

;        switch(prec) {
;            case 0:
;                fw = 0.0;
;                for (i=jz;i>=0;i--) fw += fq[i];
;                y[0] = (ih==0)? fw: -fw;
;                break;
;            case 1:
;            case 2:
;                fw = 0.0;
;                for (i=jz;i>=0;i--) fw += fq[i];
;                y[0] = (ih==0)? fw: -fw;
;                fw = fq[0]-fw;
;                for (i=1;i<=jz;i++) fw += fq[i];
;                y[1] = (ih==0)? fw: -fw;
;                break;
;            case 3:        /* painful
;                for (i=jz;i>0;i--) {
;                    fw      = fq[i-1]+fq[i];
;                    fq[i]  += fq[i-1]-fw;
;                    fq[i-1] = fw;
;                }
;                for (i=jz;i>1;i--) {
;                    fw      = fq[i-1]+fq[i];
;                    fq[i]  += fq[i-1]-fw;
;                    fq[i-1] = fw;
;                }
;                for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
;                if(ih==0) {
;                    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
;                } else {
;                    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
;                }
;        }
      (values (logand n 7) y0 y1)))))
;        return n&7;
;}