#if defined HAVE_CONFIG_H #include "config.h" #endif ! 1d fft routine based on Clive Temperton's GPFA package ! Explanation of the module structure: ! ! First come two modules implementing the sp and dp versions of the ! 'gpfa' main routine in Clive Temperton's package. Each exports a ! 'gpfa' interface. ! Then, a wrapper module 'gridxc_fft_gpfa' imports both interfaces and ! re-exports 'gpfa', and also contains the precision-independent parts ! of the GPFA package (setgpfa) and two wrapper routines: ! ! fft_gpfa ! fft_gpfa_ez, for packed arrays ! ! These wrapper routines avoid the explicit use of the 'trigs' ! twiddle factors argument, which is instead generated as needed. ! ! NOTES for building-system developers: ! ! While there is only one source file, three* modules need to be accounted for: ! ! gridxc_fft_gpfa.mod ! gridxc_gpfa_core_sp.mod ! gridxc_gpfa_core_dp.mod ! ! If this is uwieldly, the auxiliary modules could be eliminated, and the code inserted ! in one big module. In that case, however, all the routines would need to be disambiguated. ! (For example, gpfa2f --> gpfa2f_sp / gpfa2f_dp) module gridxc_gpfa_core_sp ! ! 1d fft routine based on Clive Temperton's GPFA package ! ! Vectors to be transformed are in SINGLE precision (wp=sp) implicit none ! In this module, trigs twiddle factors and intermediate ! variables are kept in double precision. ! (This was the standard approach in Siesta. In other codes the Temperton code ! is simply converted wholesale to sp or dp) integer, parameter :: dp = selected_real_kind(10,100) integer, parameter :: sp = selected_real_kind(6,10) integer, parameter :: wp = sp interface gpfa module procedure gpfa_ end interface gpfa public :: gpfa private CONTAINS !-------------------------------------------------------------- ! ! What follows is the original code by Temperton (save setgpfa) ! with explicit declarations of variables ! ! In this module, trigs twiddle factors and intermediate ! variables are kept in double precision. ! For arbitrary-precision support the parameter constants might ! be computed instead of inserted explicitly. ! ! For example: sin60 = sin(2.0_wp*pi/3.0_wp), where wp is the working ! precision, and ! pi = 4.0_wp * atan(1.0_wp), or ! twopi = 4.0_wp * asin(1.0_wp) ! !********************************************************************* ! SUBROUTINE 'GPFA' ! SELF-SORTING IN-PLACE GENERALIZED PRIME FACTOR (COMPLEX) FFT ! ! *** THIS IS THE ALL-FORTRAN VERSION *** ! ------------------------------- ! ! CALL GPFA(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN) ! ! A IS FIRST REAL INPUT/OUTPUT VECTOR ! B IS FIRST IMAGINARY INPUT/OUTPUT VECTOR ! TRIGS IS A TABLE OF TWIDDLE FACTORS, PRECALCULATED ! BY CALLING SUBROUTINE 'SETGPFA' ! INC IS THE INCREMENT WITHIN EACH DATA VECTOR ! JUMP IS THE INCREMENT BETWEEN DATA VECTORS ! N IS THE LENGTH OF THE TRANSFORMS: ! ----------------------------------- ! N = (2**IP) * (3**IQ) * (5**IR) ! ----------------------------------- ! LOT IS THE NUMBER OF TRANSFORMS ! ISIGN = +1 FOR FORWARD TRANSFORM ! = -1 FOR INVERSE TRANSFORM ! ! WRITTEN BY CLIVE TEMPERTON ! RECHERCHE EN PREVISION NUMERIQUE ! ATMOSPHERIC ENVIRONMENT SERVICE, CANADA ! !---------------------------------------------------------------------- ! ! DEFINITION OF TRANSFORM ! ----------------------- ! ! X(J) = SUM(K=0,...,N-1)(C(K)*EXP(ISIGN*2*I*J*K*PI/N)) ! !--------------------------------------------------------------------- ! ! FOR A MATHEMATICAL DEVELOPMENT OF THE ALGORITHM USED, ! SEE: ! ! C TEMPERTON : "A GENERALIZED PRIME FACTOR FFT ALGORITHM ! FOR ANY N = (2**P)(3**Q)(5**R)", ! SIAM J. SCI. STAT. COMP., MAY 1992. ! SUBROUTINE GPFA_(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN) IMPLICIT NONE ! ! NOTE: The original declaration of these arguments (here and below) ! was: A(*), B(*) REAL(wp) :: A(:), B(:) REAL(dp) TRIGS(*) INTEGER INC, JUMP, N, LOT, ISIGN ! INTEGER & NJ(3), NN, IFAC, LL, KK, IP, IQ, IR, I character(len=20) :: msg ! ! DECOMPOSE N INTO FACTORS 2,3,5 ! ------------------------------ NN = N IFAC = 2 ! DO 30 LL = 1 , 3 KK = 0 10 CONTINUE IF (MOD(NN,IFAC).NE.0) GO TO 20 KK = KK + 1 NN = NN / IFAC GO TO 10 20 CONTINUE NJ(LL) = KK IFAC = IFAC + LL 30 CONTINUE ! IF (NN.NE.1) THEN write(msg,"(i0)") N call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N') ENDIF ! IP = NJ(1) IQ = NJ(2) IR = NJ(3) ! ! COMPUTE THE TRANSFORM ! --------------------- I = 1 IF (IP.GT.0) THEN CALL GPFA2F(A,B,TRIGS,INC,JUMP,N,IP,LOT,ISIGN) I = I + 2 * ( 2**IP) ENDIF IF (IQ.GT.0) THEN CALL GPFA3F(A,B,TRIGS(I),INC,JUMP,N,IQ,LOT,ISIGN) I = I + 2 * (3**IQ) ENDIF IF (IR.GT.0) THEN CALL GPFA5F(A,B,TRIGS(I),INC,JUMP,N,IR,LOT,ISIGN) ENDIF ! RETURN END SUBROUTINE GPFA_ !********************************************************************* ! fortran version of *gpfa2* - ! radix-2 section of self-sorting, in-place, generalized pfa ! central radix-2 and radix-8 passes included ! so that transform length can be any power of 2 ! subroutine gpfa2f(a,b,trigs,inc,jump,n,mm,lot,isign) real(wp) :: a(*), b(*) real(dp) :: trigs(*) integer inc, jump, n, mm, lot, isign integer n2, inq, jstepx, ninc, ink, m2, m8, & m, mh, nblox, left, istart, nb, nvex, & la, mu, ipass, jstep, jstepl, jjj, ja, & nu, jb, jc, jd, j, l, kk, k, je, jf, & jg, jh, laincl, ll, ji, jj, jk, jl, jm, & jn, jo, jp real(dp) :: s, aja, ajc, t0, t2, ajb, ajd, & t1, t3, bja, bjc, u0, u2, bjb, & bjd, u1, u3, co1, si1, co2, si2, & co3, si3, ss, c1, c2, c3, aje, & ajf, ajg, ajh, bje, bjf, bjg, & bjh, co4, si4, co5, si5, co6, si6, & co7, si7, aji, bjm, ajj, bjj, ajk, & ajl, bji, bjk, ajo, bjl, bjo, ajm, & ajn, ajp, bjn, bjp #ifdef OLD_CRAY integer, parameter :: lvr = CRAY_LVR_PARAMETER #else integer, parameter :: lvr = 1024 #endif ! ! *************************************************************** ! * * ! * N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. * ! * RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE * ! * (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER. * ! * * ! *************************************************************** ! n2 = 2**mm inq = n/n2 jstepx = (n2-n) * inc ninc = n * inc ink = inc * inq ! m2 = 0 m8 = 0 if (mod(mm,2).eq.0) then m = mm/2 else if (mod(mm,4).eq.1) then m = (mm-1)/2 m2 = 1 else if (mod(mm,4).eq.3) then m = (mm-3)/2 m8 = 1 endif mh = (m+1)/2 ! nblox = 1 + (lot-1)/lvr left = lot s = real(isign,kind=dp) istart = 1 ! ! loop on blocks of lvr transforms ! -------------------------------- do 500 nb = 1 , nblox ! if (left.le.lvr) then nvex = left else if (left.lt.(2*lvr)) then nvex = left/2 nvex = nvex + mod(nvex,2) else nvex = lvr endif left = left - nvex ! la = 1 ! ! loop on type I radix-4 passes ! ----------------------------- mu = mod(inq,4) if (isign.eq.-1) mu = 4 - mu ss = 1.0_dp if (mu.eq.3) ss = -1.0_dp ! if (mh.eq.0) go to 200 ! do 160 ipass = 1 , mh jstep = (n*inc) / (4*la) jstepl = jstep - ninc ! ! k = 0 loop (no twiddle factors) ! ------------------------------- do 120 jjj = 0 , (n-1)*inc , 4*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 115 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 110 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) a(ja+j) = t0 + t1 a(jc+j) = t0 - t1 b(ja+j) = u0 + u1 b(jc+j) = u0 - u1 a(jb+j) = t2 - u3 a(jd+j) = t2 + u3 b(jb+j) = u2 + t3 b(jd+j) = u2 - t3 j = j + jump 110 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 115 continue 120 continue ! ! finished if n2 = 4 ! ------------------ if (n2.eq.4) go to 490 kk = 2 * la ! ! loop on nonzero k ! ----------------- do 150 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) ! ! loop along transform ! -------------------- do 140 jjj = k , (n-1)*inc , 4*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 135 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 130 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) a(ja+j) = t0 + t1 b(ja+j) = u0 + u1 a(jb+j) = co1*(t2-u3) - si1*(u2+t3) b(jb+j) = si1*(t2-u3) + co1*(u2+t3) a(jc+j) = co2*(t0-t1) - si2*(u0-u1) b(jc+j) = si2*(t0-t1) + co2*(u0-u1) a(jd+j) = co3*(t2+u3) - si3*(u2-t3) b(jd+j) = si3*(t2+u3) + co3*(u2-t3) j = j + jump 130 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 135 continue 140 continue !-----( end of loop along transforms ) kk = kk + 2*la 150 continue !-----( end of loop on nonzero k ) la = 4*la 160 continue !-----( end of loop on type I radix-4 passes) ! ! central radix-2 pass ! -------------------- 200 continue if (m2.eq.0) go to 300 ! jstep = (n*inc) / (2*la) jstepl = jstep - ninc ! ! k=0 loop (no twiddle factors) ! ----------------------------- do 220 jjj = 0 , (n-1)*inc , 2*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 215 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 210 l = 1 , nvex aja = a(ja+j) ajb = a(jb+j) t0 = aja - ajb a(ja+j) = aja + ajb a(jb+j) = t0 bja = b(ja+j) bjb = b(jb+j) u0 = bja - bjb b(ja+j) = bja + bjb b(jb+j) = u0 j = j + jump 210 continue !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 215 continue 220 continue ! ! finished if n2=2 ! ---------------- if (n2.eq.2) go to 490 ! kk = 2 * la ! ! loop on nonzero k ! ----------------- do 260 k = ink , jstep - ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) ! ! loop along transforms ! --------------------- do 250 jjj = k , (n-1)*inc , 2*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 245 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc j = 0 ! ! loop across transforms ! ---------------------- if (kk.eq.n2/2) then #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 230 l = 1 , nvex aja = a(ja+j) ajb = a(jb+j) t0 = ss * ( aja - ajb ) a(ja+j) = aja + ajb bjb = b(jb+j) bja = b(ja+j) a(jb+j) = ss * ( bjb - bja ) b(ja+j) = bja + bjb b(jb+j) = t0 j = j + jump 230 continue ! else ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 240 l = 1 , nvex aja = a(ja+j) ajb = a(jb+j) t0 = aja - ajb a(ja+j) = aja + ajb bja = b(ja+j) bjb = b(jb+j) u0 = bja - bjb b(ja+j) = bja + bjb a(jb+j) = co1*t0 - si1*u0 b(jb+j) = si1*t0 + co1*u0 j = j + jump 240 continue ! endif ! !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 245 continue 250 continue !-----(end of loop along transforms) kk = kk + 2 * la 260 continue !-----(end of loop on nonzero k) !-----(end of radix-2 pass) ! la = 2 * la go to 400 ! ! central radix-8 pass ! -------------------- 300 continue if (m8.eq.0) go to 400 jstep = (n*inc) / (8*la) jstepl = jstep - ninc mu = mod(inq,8) if (isign.eq.-1) mu = 8 - mu c1 = 1.0_dp if (mu.eq.3.or.mu.eq.7) c1 = -1.0_dp c2 = sqrt(0.5_dp) if (mu.eq.3.or.mu.eq.5) c2 = -c2 c3 = c1 * c2 ! ! stage 1 ! ------- do 320 k = 0 , jstep - ink , ink do 315 jjj = k , (n-1)*inc , 8*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 312 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc j = 0 #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 310 l = 1 , nvex aja = a(ja+j) aje = a(je+j) t0 = aja - aje a(ja+j) = aja + aje ajc = a(jc+j) ajg = a(jg+j) t1 = c1 * ( ajc - ajg ) a(je+j) = ajc + ajg ajb = a(jb+j) ajf = a(jf+j) t2 = ajb - ajf a(jc+j) = ajb + ajf ajd = a(jd+j) ajh = a(jh+j) t3 = ajd - ajh a(jg+j) = ajd + ajh a(jb+j) = t0 a(jf+j) = t1 a(jd+j) = c2 * ( t2 - t3 ) a(jh+j) = c3 * ( t2 + t3 ) bja = b(ja+j) bje = b(je+j) u0 = bja - bje b(ja+j) = bja + bje bjc = b(jc+j) bjg = b(jg+j) u1 = c1 * ( bjc - bjg ) b(je+j) = bjc + bjg bjb = b(jb+j) bjf = b(jf+j) u2 = bjb - bjf b(jc+j) = bjb + bjf bjd = b(jd+j) bjh = b(jh+j) u3 = bjd - bjh b(jg+j) = bjd + bjh b(jb+j) = u0 b(jf+j) = u1 b(jd+j) = c2 * ( u2 - u3 ) b(jh+j) = c3 * ( u2 + u3 ) j = j + jump 310 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 312 continue 315 continue 320 continue ! ! stage 2 ! ------- ! ! k=0 (no twiddle factors) ! ------------------------ do 330 jjj = 0 , (n-1)*inc , 8*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 328 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc j = 0 #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 325 l = 1 , nvex aja = a(ja+j) aje = a(je+j) t0 = aja + aje t2 = aja - aje ajc = a(jc+j) ajg = a(jg+j) t1 = ajc + ajg t3 = c1 * ( ajc - ajg ) bja = b(ja+j) bje = b(je+j) u0 = bja + bje u2 = bja - bje bjc = b(jc+j) bjg = b(jg+j) u1 = bjc + bjg u3 = c1 * ( bjc - bjg ) a(ja+j) = t0 + t1 a(je+j) = t0 - t1 b(ja+j) = u0 + u1 b(je+j) = u0 - u1 a(jc+j) = t2 - u3 a(jg+j) = t2 + u3 b(jc+j) = u2 + t3 b(jg+j) = u2 - t3 ajb = a(jb+j) ajd = a(jd+j) t0 = ajb + ajd t2 = ajb - ajd ajf = a(jf+j) ajh = a(jh+j) t1 = ajf - ajh t3 = ajf + ajh bjb = b(jb+j) bjd = b(jd+j) u0 = bjb + bjd u2 = bjb - bjd bjf = b(jf+j) bjh = b(jh+j) u1 = bjf - bjh u3 = bjf + bjh a(jb+j) = t0 - u3 a(jh+j) = t0 + u3 b(jb+j) = u0 + t3 b(jh+j) = u0 - t3 a(jd+j) = t2 + u1 a(jf+j) = t2 - u1 b(jd+j) = u2 - t1 b(jf+j) = u2 + t1 j = j + jump 325 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 328 continue 330 continue ! if (n2.eq.8) go to 490 ! ! loop on nonzero k ! ----------------- kk = 2 * la ! do 350 k = ink , jstep - ink , ink ! co1 = trigs(kk+1) si1 = s * trigs(kk+2) co2 = trigs(2*kk+1) si2 = s * trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s * trigs(3*kk+2) co4 = trigs(4*kk+1) si4 = s * trigs(4*kk+2) co5 = trigs(5*kk+1) si5 = s * trigs(5*kk+2) co6 = trigs(6*kk+1) si6 = s * trigs(6*kk+2) co7 = trigs(7*kk+1) si7 = s * trigs(7*kk+2) ! do 345 jjj = k , (n-1)*inc , 8*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 342 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc j = 0 #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 340 l = 1 , nvex aja = a(ja+j) aje = a(je+j) t0 = aja + aje t2 = aja - aje ajc = a(jc+j) ajg = a(jg+j) t1 = ajc + ajg t3 = c1 * ( ajc - ajg ) bja = b(ja+j) bje = b(je+j) u0 = bja + bje u2 = bja - bje bjc = b(jc+j) bjg = b(jg+j) u1 = bjc + bjg u3 = c1 * ( bjc - bjg ) a(ja+j) = t0 + t1 b(ja+j) = u0 + u1 a(je+j) = co4*(t0-t1) - si4*(u0-u1) b(je+j) = si4*(t0-t1) + co4*(u0-u1) a(jc+j) = co2*(t2-u3) - si2*(u2+t3) b(jc+j) = si2*(t2-u3) + co2*(u2+t3) a(jg+j) = co6*(t2+u3) - si6*(u2-t3) b(jg+j) = si6*(t2+u3) + co6*(u2-t3) ajb = a(jb+j) ajd = a(jd+j) t0 = ajb + ajd t2 = ajb - ajd ajf = a(jf+j) ajh = a(jh+j) t1 = ajf - ajh t3 = ajf + ajh bjb = b(jb+j) bjd = b(jd+j) u0 = bjb + bjd u2 = bjb - bjd bjf = b(jf+j) bjh = b(jh+j) u1 = bjf - bjh u3 = bjf + bjh a(jb+j) = co1*(t0-u3) - si1*(u0+t3) b(jb+j) = si1*(t0-u3) + co1*(u0+t3) a(jh+j) = co7*(t0+u3) - si7*(u0-t3) b(jh+j) = si7*(t0+u3) + co7*(u0-t3) a(jd+j) = co3*(t2+u1) - si3*(u2-t1) b(jd+j) = si3*(t2+u1) + co3*(u2-t1) a(jf+j) = co5*(t2-u1) - si5*(u2+t1) b(jf+j) = si5*(t2-u1) + co5*(u2+t1) j = j + jump 340 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 342 continue 345 continue kk = kk + 2 * la 350 continue ! la = 8 * la ! ! loop on type II radix-4 passes ! ------------------------------ 400 continue mu = mod(inq,4) if (isign.eq.-1) mu = 4 - mu ss = 1.0_dp if (mu.eq.3) ss = -1.0_dp ! do 480 ipass = mh+1 , m jstep = (n*inc) / (4*la) jstepl = jstep - ninc laincl = la * ink - ninc ! ! k=0 loop (no twiddle factors) ! ----------------------------- do 430 ll = 0 , (la-1)*ink , 4*jstep ! do 420 jjj = ll , (n-1)*inc , 4*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 415 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = ja + laincl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = je + laincl if (ji.lt.istart) ji = ji + ninc jj = ji + jstepl if (jj.lt.istart) jj = jj + ninc jk = jj + jstepl if (jk.lt.istart) jk = jk + ninc jl = jk + jstepl if (jl.lt.istart) jl = jl + ninc jm = ji + laincl if (jm.lt.istart) jm = jm + ninc jn = jm + jstepl if (jn.lt.istart) jn = jn + ninc jo = jn + jstepl if (jo.lt.istart) jo = jo + ninc jp = jo + jstepl if (jp.lt.istart) jp = jp + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 410 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) aji = a(ji+j) ajc = aji bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) aje = a(je+j) ajb = aje a(ja+j) = t0 + t1 a(ji+j) = t0 - t1 b(ja+j) = u0 + u1 bjc = u0 - u1 bjm = b(jm+j) bjd = bjm a(je+j) = t2 - u3 ajd = t2 + u3 bjb = u2 + t3 b(jm+j) = u2 - t3 !---------------------- ajg = a(jg+j) t0 = ajb + ajg t2 = ajb - ajg ajf = a(jf+j) ajh = a(jh+j) t1 = ajf + ajh t3 = ss * ( ajf - ajh ) ajj = a(jj+j) ajg = ajj bje = b(je+j) bjg = b(jg+j) u0 = bje + bjg u2 = bje - bjg bjf = b(jf+j) bjh = b(jh+j) u1 = bjf + bjh u3 = ss * ( bjf - bjh ) b(je+j) = bjb a(jb+j) = t0 + t1 a(jj+j) = t0 - t1 bjj = b(jj+j) bjg = bjj b(jb+j) = u0 + u1 b(jj+j) = u0 - u1 a(jf+j) = t2 - u3 ajh = t2 + u3 b(jf+j) = u2 + t3 bjh = u2 - t3 !---------------------- ajk = a(jk+j) t0 = ajc + ajk t2 = ajc - ajk ajl = a(jl+j) t1 = ajg + ajl t3 = ss * ( ajg - ajl ) bji = b(ji+j) bjk = b(jk+j) u0 = bji + bjk u2 = bji - bjk ajo = a(jo+j) ajl = ajo bjl = b(jl+j) u1 = bjg + bjl u3 = ss * ( bjg - bjl ) b(ji+j) = bjc a(jc+j) = t0 + t1 a(jk+j) = t0 - t1 bjo = b(jo+j) bjl = bjo b(jc+j) = u0 + u1 b(jk+j) = u0 - u1 a(jg+j) = t2 - u3 a(jo+j) = t2 + u3 b(jg+j) = u2 + t3 b(jo+j) = u2 - t3 !---------------------- ajm = a(jm+j) t0 = ajm + ajl t2 = ajm - ajl ajn = a(jn+j) ajp = a(jp+j) t1 = ajn + ajp t3 = ss * ( ajn - ajp ) a(jm+j) = ajd u0 = bjd + bjl u2 = bjd - bjl bjn = b(jn+j) bjp = b(jp+j) u1 = bjn + bjp u3 = ss * ( bjn - bjp ) a(jn+j) = ajh a(jd+j) = t0 + t1 a(jl+j) = t0 - t1 b(jd+j) = u0 + u1 b(jl+j) = u0 - u1 b(jn+j) = bjh a(jh+j) = t2 - u3 a(jp+j) = t2 + u3 b(jh+j) = u2 + t3 b(jp+j) = u2 - t3 j = j + jump 410 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 415 continue 420 continue 430 continue !-----( end of double loop for k=0 ) ! ! finished if last pass ! --------------------- if (ipass.eq.m) go to 490 ! kk = 2*la ! ! loop on nonzero k ! ----------------- do 470 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) ! ! double loop along first transform in block ! ------------------------------------------ do 460 ll = k , (la-1)*ink , 4*jstep ! do 450 jjj = ll , (n-1)*inc , 4*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 445 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = ja + laincl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = je + laincl if (ji.lt.istart) ji = ji + ninc jj = ji + jstepl if (jj.lt.istart) jj = jj + ninc jk = jj + jstepl if (jk.lt.istart) jk = jk + ninc jl = jk + jstepl if (jl.lt.istart) jl = jl + ninc jm = ji + laincl if (jm.lt.istart) jm = jm + ninc jn = jm + jstepl if (jn.lt.istart) jn = jn + ninc jo = jn + jstepl if (jo.lt.istart) jo = jo + ninc jp = jo + jstepl if (jp.lt.istart) jp = jp + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 440 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) aji = a(ji+j) ajc = aji bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) aje = a(je+j) ajb = aje a(ja+j) = t0 + t1 b(ja+j) = u0 + u1 a(je+j) = co1*(t2-u3) - si1*(u2+t3) bjb = si1*(t2-u3) + co1*(u2+t3) bjm = b(jm+j) bjd = bjm a(ji+j) = co2*(t0-t1) - si2*(u0-u1) bjc = si2*(t0-t1) + co2*(u0-u1) ajd = co3*(t2+u3) - si3*(u2-t3) b(jm+j) = si3*(t2+u3) + co3*(u2-t3) !---------------------------------------- ajg = a(jg+j) t0 = ajb + ajg t2 = ajb - ajg ajf = a(jf+j) ajh = a(jh+j) t1 = ajf + ajh t3 = ss * ( ajf - ajh ) ajj = a(jj+j) ajg = ajj bje = b(je+j) bjg = b(jg+j) u0 = bje + bjg u2 = bje - bjg bjf = b(jf+j) bjh = b(jh+j) u1 = bjf + bjh u3 = ss * ( bjf - bjh ) b(je+j) = bjb a(jb+j) = t0 + t1 b(jb+j) = u0 + u1 bjj = b(jj+j) bjg = bjj a(jf+j) = co1*(t2-u3) - si1*(u2+t3) b(jf+j) = si1*(t2-u3) + co1*(u2+t3) a(jj+j) = co2*(t0-t1) - si2*(u0-u1) b(jj+j) = si2*(t0-t1) + co2*(u0-u1) ajh = co3*(t2+u3) - si3*(u2-t3) bjh = si3*(t2+u3) + co3*(u2-t3) !---------------------------------------- ajk = a(jk+j) t0 = ajc + ajk t2 = ajc - ajk ajl = a(jl+j) t1 = ajg + ajl t3 = ss * ( ajg - ajl ) bji = b(ji+j) bjk = b(jk+j) u0 = bji + bjk u2 = bji - bjk ajo = a(jo+j) ajl = ajo bjl = b(jl+j) u1 = bjg + bjl u3 = ss * ( bjg - bjl ) b(ji+j) = bjc a(jc+j) = t0 + t1 b(jc+j) = u0 + u1 bjo = b(jo+j) bjl = bjo a(jg+j) = co1*(t2-u3) - si1*(u2+t3) b(jg+j) = si1*(t2-u3) + co1*(u2+t3) a(jk+j) = co2*(t0-t1) - si2*(u0-u1) b(jk+j) = si2*(t0-t1) + co2*(u0-u1) a(jo+j) = co3*(t2+u3) - si3*(u2-t3) b(jo+j) = si3*(t2+u3) + co3*(u2-t3) !---------------------------------------- ajm = a(jm+j) t0 = ajm + ajl t2 = ajm - ajl ajn = a(jn+j) ajp = a(jp+j) t1 = ajn + ajp t3 = ss * ( ajn - ajp ) a(jm+j) = ajd u0 = bjd + bjl u2 = bjd - bjl a(jn+j) = ajh bjn = b(jn+j) bjp = b(jp+j) u1 = bjn + bjp u3 = ss * ( bjn - bjp ) b(jn+j) = bjh a(jd+j) = t0 + t1 b(jd+j) = u0 + u1 a(jh+j) = co1*(t2-u3) - si1*(u2+t3) b(jh+j) = si1*(t2-u3) + co1*(u2+t3) a(jl+j) = co2*(t0-t1) - si2*(u0-u1) b(jl+j) = si2*(t0-t1) + co2*(u0-u1) a(jp+j) = co3*(t2+u3) - si3*(u2-t3) b(jp+j) = si3*(t2+u3) + co3*(u2-t3) j = j + jump 440 continue !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 445 continue 450 continue 460 continue !-----( end of double loop for this k ) kk = kk + 2*la 470 continue !-----( end of loop over values of k ) la = 4*la 480 continue !-----( end of loop on type II radix-4 passes ) !-----( nvex transforms completed) 490 continue istart = istart + nvex * jump 500 continue !-----( end of loop on blocks of transforms ) ! return end subroutine gpfa2f !****************************************************************** ! fortran version of *gpfa3* - ! radix-3 section of self-sorting, in-place ! generalized PFA ! !------------------------------------------------------------------- ! subroutine gpfa3f(a,b,trigs,inc,jump,n,mm,lot,isign) real(wp) :: a(*), b(*) real(dp) :: trigs(*) integer inc, jump, n, mm, lot, isign real(dp), parameter :: sin60 = 0.866025403784437_dp integer :: je, jf, jg, jh, laincl, ll, ji integer :: n3, inq, jstepx, ninc, ink, mu, m, mh, & nblox, left, istart, nb, nvex, la, & ipass, jstep, jstepl, jjj, ja, nu, jb, & jc, j, l, kk, k, jd real(dp) :: s, ajb, ajc, t1, aja, t2, t3, bjb, bjc, & u1, bja, u2, u3, co1, si1, co2, si2, ajd, & bjd real(dp) :: c1, aje, ajg, ajf, ajh, bje, & bjf, bjg, bjh, aji, bji integer, parameter :: lvr = 128 ! ! *************************************************************** ! * * ! * N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. * ! * RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE * ! * (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER. * ! * * ! *************************************************************** ! n3 = 3**mm inq = n/n3 jstepx = (n3-n) * inc ninc = n * inc ink = inc * inq mu = mod(inq,3) if (isign.eq.-1) mu = 3-mu m = mm mh = (m+1)/2 s = real(isign,kind=dp) c1 = sin60 if (mu.eq.2) c1 = -c1 ! nblox = 1 + (lot-1)/lvr left = lot s = real(isign,kind=dp) istart = 1 ! ! loop on blocks of lvr transforms ! -------------------------------- do 500 nb = 1 , nblox ! if (left.le.lvr) then nvex = left else if (left.lt.(2*lvr)) then nvex = left/2 nvex = nvex + mod(nvex,2) else nvex = lvr endif left = left - nvex ! la = 1 ! ! loop on type I radix-3 passes ! ----------------------------- do 160 ipass = 1 , mh jstep = (n*inc) / (3*la) jstepl = jstep - ninc ! ! k = 0 loop (no twiddle factors) ! ------------------------------- do 120 jjj = 0 , (n-1)*inc , 3*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 115 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 110 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jb+j) = t2 - u3 b(jb+j) = u2 + t3 a(jc+j) = t2 + u3 b(jc+j) = u2 - t3 j = j + jump 110 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 115 continue 120 continue ! ! finished if n3 = 3 ! ------------------ if (n3.eq.3) go to 490 kk = 2 * la ! ! loop on nonzero k ! ----------------- do 150 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) ! ! loop along transform ! -------------------- do 140 jjj = k , (n-1)*inc , 3*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 135 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 130 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jb+j) = co1*(t2-u3) - si1*(u2+t3) b(jb+j) = si1*(t2-u3) + co1*(u2+t3) a(jc+j) = co2*(t2+u3) - si2*(u2-t3) b(jc+j) = si2*(t2+u3) + co2*(u2-t3) j = j + jump 130 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 135 continue 140 continue !-----( end of loop along transforms ) kk = kk + 2*la 150 continue !-----( end of loop on nonzero k ) la = 3*la 160 continue !-----( end of loop on type I radix-3 passes) ! ! loop on type II radix-3 passes ! ------------------------------ 400 continue ! do 480 ipass = mh+1 , m jstep = (n*inc) / (3*la) jstepl = jstep - ninc laincl = la*ink - ninc ! ! k=0 loop (no twiddle factors) ! ----------------------------- do 430 ll = 0 , (la-1)*ink , 3*jstep ! do 420 jjj = ll , (n-1)*inc , 3*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 415 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = ja + laincl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jd + laincl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = jh + jstepl if (ji.lt.istart) ji = ji + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 410 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) ajd = a(jd+j) ajb = ajd bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) bjd = b(jd+j) bjb = bjd a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jd+j) = t2 - u3 b(jd+j) = u2 + t3 ajc = t2 + u3 bjc = u2 - t3 !---------------------- aje = a(je+j) ajf = a(jf+j) t1 = aje + ajf t2 = ajb - 0.5_dp * t1 t3 = c1 * ( aje - ajf ) ajh = a(jh+j) ajf = ajh bje = b(je+j) bjf = b(jf+j) u1 = bje + bjf u2 = bjb - 0.5_dp * u1 u3 = c1 * ( bje - bjf ) bjh = b(jh+j) bjf = bjh a(jb+j) = ajb + t1 b(jb+j) = bjb + u1 a(je+j) = t2 - u3 b(je+j) = u2 + t3 a(jh+j) = t2 + u3 b(jh+j) = u2 - t3 !---------------------- aji = a(ji+j) t1 = ajf + aji ajg = a(jg+j) t2 = ajg - 0.5_dp * t1 t3 = c1 * ( ajf - aji ) t1 = ajg + t1 a(jg+j) = ajc bji = b(ji+j) u1 = bjf + bji bjg = b(jg+j) u2 = bjg - 0.5_dp * u1 u3 = c1 * ( bjf - bji ) u1 = bjg + u1 b(jg+j) = bjc a(jc+j) = t1 b(jc+j) = u1 a(jf+j) = t2 - u3 b(jf+j) = u2 + t3 a(ji+j) = t2 + u3 b(ji+j) = u2 - t3 j = j + jump 410 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 415 continue 420 continue 430 continue !-----( end of double loop for k=0 ) ! ! finished if last pass ! --------------------- if (ipass.eq.m) go to 490 ! kk = 2*la ! ! loop on nonzero k ! ----------------- do 470 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) ! ! double loop along first transform in block ! ------------------------------------------ do 460 ll = k , (la-1)*ink , 3*jstep ! do 450 jjj = ll , (n-1)*inc , 3*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 445 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = ja + laincl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jd + laincl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = jh + jstepl if (ji.lt.istart) ji = ji + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 440 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) ajd = a(jd+j) ajb = ajd bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) bjd = b(jd+j) bjb = bjd a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jd+j) = co1*(t2-u3) - si1*(u2+t3) b(jd+j) = si1*(t2-u3) + co1*(u2+t3) ajc = co2*(t2+u3) - si2*(u2-t3) bjc = si2*(t2+u3) + co2*(u2-t3) !---------------------- aje = a(je+j) ajf = a(jf+j) t1 = aje + ajf t2 = ajb - 0.5_dp * t1 t3 = c1 * ( aje - ajf ) ajh = a(jh+j) ajf = ajh bje = b(je+j) bjf = b(jf+j) u1 = bje + bjf u2 = bjb - 0.5_dp * u1 u3 = c1 * ( bje - bjf ) bjh = b(jh+j) bjf = bjh a(jb+j) = ajb + t1 b(jb+j) = bjb + u1 a(je+j) = co1*(t2-u3) - si1*(u2+t3) b(je+j) = si1*(t2-u3) + co1*(u2+t3) a(jh+j) = co2*(t2+u3) - si2*(u2-t3) b(jh+j) = si2*(t2+u3) + co2*(u2-t3) !---------------------- aji = a(ji+j) t1 = ajf + aji ajg = a(jg+j) t2 = ajg - 0.5_dp * t1 t3 = c1 * ( ajf - aji ) t1 = ajg + t1 a(jg+j) = ajc bji = b(ji+j) u1 = bjf + bji bjg = b(jg+j) u2 = bjg - 0.5_dp * u1 u3 = c1 * ( bjf - bji ) u1 = bjg + u1 b(jg+j) = bjc a(jc+j) = t1 b(jc+j) = u1 a(jf+j) = co1*(t2-u3) - si1*(u2+t3) b(jf+j) = si1*(t2-u3) + co1*(u2+t3) a(ji+j) = co2*(t2+u3) - si2*(u2-t3) b(ji+j) = si2*(t2+u3) + co2*(u2-t3) j = j + jump 440 continue !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 445 continue 450 continue 460 continue !-----( end of double loop for this k ) kk = kk + 2*la 470 continue !-----( end of loop over values of k ) la = 3*la 480 continue !-----( end of loop on type II radix-3 passes ) !-----( nvex transforms completed) 490 continue istart = istart + nvex * jump 500 continue !-----( end of loop on blocks of transforms ) ! return end subroutine gpfa3f !******************************************************************* ! fortran version of *gpfa5* - ! radix-5 section of self-sorting, in-place, ! generalized pfa ! !------------------------------------------------------------------- ! subroutine gpfa5f(a,b,trigs,inc,jump,n,mm,lot,isign) real(wp) :: a(*), b(*) real(dp) :: trigs(*) integer inc, jump, n, mm, lot, isign real(dp), parameter :: sin36 = 0.587785252292473_dp, & sin72 = 0.951056516295154_dp, & qrt5 = 0.559016994374947_dp integer, parameter :: lvr = 128 integer :: inq, jstepx, ninc, ink, mu, m, mh, & nblox, n5, left, istart, nb, nvex, la, & ipass, jstep, jstepl, kk, k, jjj, ja, & nu, jb, jc, jd, je, j, laincl, ll, l, & jf, jg, jh, ji, jj, jk, jl, jm, jn, jo, & jp, jq, jr, js, jt, ju, jv, jw, jx, jy real(dp) :: s, c1, c2, c3, co1, si1, co2, si2, co3, & si3, co4, si4, ajb, aje, t1, ajc, ajd, & t2, t3, t4, t5, t6, aja, t7, t8, t9, t10, & t11, bjb, bje, u1, bjc, bjd, u2, u3, u4, & u5, u6, bja, u7, u8, u9, u10, u11, ajf, & ajk, bjf, bjk, ajg, ajj, ajh, aji, ajl, & ajq, bjg, bjj, bjh, bji, bjl, bjq, ajo, & ajm, ajn, ajr, ajw, bjo, bjm, bjn, bjr, & bjw, ajt, ajs, ajx, ajp, ax, bjt, bjs, & bjx, bjp, bx, ajv, ajy, aju, bjv, bjy, & bju ! ! *************************************************************** ! * * ! * N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. * ! * RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE * ! * (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER. * ! * * ! *************************************************************** ! n5 = 5 ** mm inq = n / n5 jstepx = (n5-n) * inc ninc = n * inc ink = inc * inq mu = mod(inq,5) if (isign.eq.-1) mu = 5 - mu ! m = mm mh = (m+1)/2 s = real(isign,kind=dp) c1 = qrt5 c2 = sin72 c3 = sin36 if (mu.eq.2.or.mu.eq.3) then c1 = -c1 c2 = sin36 c3 = sin72 endif if (mu.eq.3.or.mu.eq.4) c2 = -c2 if (mu.eq.2.or.mu.eq.4) c3 = -c3 ! nblox = 1 + (lot-1)/lvr left = lot s = real(isign,kind=dp) istart = 1 ! ! loop on blocks of lvr transforms ! -------------------------------- do 500 nb = 1 , nblox ! if (left.le.lvr) then nvex = left else if (left.lt.(2*lvr)) then nvex = left/2 nvex = nvex + mod(nvex,2) else nvex = lvr endif left = left - nvex ! la = 1 ! ! loop on type I radix-5 passes ! ----------------------------- do 160 ipass = 1 , mh jstep = (n*inc) / (5*la) jstepl = jstep - ninc kk = 0 ! ! loop on k ! --------- do 150 k = 0 , jstep-ink , ink ! if (k.gt.0) then co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) co4 = trigs(4*kk+1) si4 = s*trigs(4*kk+2) endif ! ! loop along transform ! -------------------- do 140 jjj = k , (n-1)*inc , 5*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 135 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc j = 0 ! ! loop across transforms ! ---------------------- if (k.eq.0) then ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 110 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jb+j) = t8 - u11 b(jb+j) = u8 + t11 a(je+j) = t8 + u11 b(je+j) = u8 - t11 a(jc+j) = t9 - u10 b(jc+j) = u9 + t10 a(jd+j) = t9 + u10 b(jd+j) = u9 - t10 j = j + jump 110 continue ! else ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 130 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jb+j) = co1*(t8-u11) - si1*(u8+t11) b(jb+j) = si1*(t8-u11) + co1*(u8+t11) a(je+j) = co4*(t8+u11) - si4*(u8-t11) b(je+j) = si4*(t8+u11) + co4*(u8-t11) a(jc+j) = co2*(t9-u10) - si2*(u9+t10) b(jc+j) = si2*(t9-u10) + co2*(u9+t10) a(jd+j) = co3*(t9+u10) - si3*(u9-t10) b(jd+j) = si3*(t9+u10) + co3*(u9-t10) j = j + jump 130 continue ! endif ! !-----( end of loop across transforms ) ! ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 135 continue 140 continue !-----( end of loop along transforms ) kk = kk + 2*la 150 continue !-----( end of loop on nonzero k ) la = 5*la 160 continue !-----( end of loop on type I radix-5 passes) ! if (n.eq.5) go to 490 ! ! loop on type II radix-5 passes ! ------------------------------ 400 continue ! do 480 ipass = mh+1 , m jstep = (n*inc) / (5*la) jstepl = jstep - ninc laincl = la * ink - ninc kk = 0 ! ! loop on k ! --------- do 470 k = 0 , jstep-ink , ink ! if (k.gt.0) then co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) co4 = trigs(4*kk+1) si4 = s*trigs(4*kk+2) endif ! ! double loop along first transform in block ! ------------------------------------------ do 460 ll = k , (la-1)*ink , 5*jstep ! do 450 jjj = ll , (n-1)*inc , 5*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 445 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = ja + laincl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = jh + jstepl if (ji.lt.istart) ji = ji + ninc jj = ji + jstepl if (jj.lt.istart) jj = jj + ninc jk = jf + laincl if (jk.lt.istart) jk = jk + ninc jl = jk + jstepl if (jl.lt.istart) jl = jl + ninc jm = jl + jstepl if (jm.lt.istart) jm = jm + ninc jn = jm + jstepl if (jn.lt.istart) jn = jn + ninc jo = jn + jstepl if (jo.lt.istart) jo = jo + ninc jp = jk + laincl if (jp.lt.istart) jp = jp + ninc jq = jp + jstepl if (jq.lt.istart) jq = jq + ninc jr = jq + jstepl if (jr.lt.istart) jr = jr + ninc js = jr + jstepl if (js.lt.istart) js = js + ninc jt = js + jstepl if (jt.lt.istart) jt = jt + ninc ju = jp + laincl if (ju.lt.istart) ju = ju + ninc jv = ju + jstepl if (jv.lt.istart) jv = jv + ninc jw = jv + jstepl if (jw.lt.istart) jw = jw + ninc jx = jw + jstepl if (jx.lt.istart) jx = jx + ninc jy = jx + jstepl if (jy.lt.istart) jy = jy + ninc j = 0 ! ! loop across transforms ! ---------------------- if (k.eq.0) then ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 410 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd ajf = a(jf+j) ajb = ajf t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 ajk = a(jk+j) ajc = ajk t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd bjf = b(jf+j) bjb = bjf u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 bjk = b(jk+j) bjc = bjk u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jf+j) = t8 - u11 b(jf+j) = u8 + t11 aje = t8 + u11 bje = u8 - t11 a(jk+j) = t9 - u10 b(jk+j) = u9 + t10 ajd = t9 + u10 bjd = u9 - t10 !---------------------- ajg = a(jg+j) ajj = a(jj+j) t1 = ajg + ajj ajh = a(jh+j) aji = a(ji+j) t2 = ajh + aji t3 = ajg - ajj t4 = ajh - aji ajl = a(jl+j) ajh = ajl t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajb - 0.25_dp * t5 a(jb+j) = ajb + t5 t8 = t7 + t6 t9 = t7 - t6 ajq = a(jq+j) aji = ajq t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjg = b(jg+j) bjj = b(jj+j) u1 = bjg + bjj bjh = b(jh+j) bji = b(ji+j) u2 = bjh + bji u3 = bjg - bjj u4 = bjh - bji bjl = b(jl+j) bjh = bjl u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjb - 0.25_dp * u5 b(jb+j) = bjb + u5 u8 = u7 + u6 u9 = u7 - u6 bjq = b(jq+j) bji = bjq u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jg+j) = t8 - u11 b(jg+j) = u8 + t11 ajj = t8 + u11 bjj = u8 - t11 a(jl+j) = t9 - u10 b(jl+j) = u9 + t10 a(jq+j) = t9 + u10 b(jq+j) = u9 - t10 !---------------------- ajo = a(jo+j) t1 = ajh + ajo ajm = a(jm+j) ajn = a(jn+j) t2 = ajm + ajn t3 = ajh - ajo t4 = ajm - ajn ajr = a(jr+j) ajn = ajr t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajc - 0.25_dp * t5 a(jc+j) = ajc + t5 t8 = t7 + t6 t9 = t7 - t6 ajw = a(jw+j) ajo = ajw t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjo = b(jo+j) u1 = bjh + bjo bjm = b(jm+j) bjn = b(jn+j) u2 = bjm + bjn u3 = bjh - bjo u4 = bjm - bjn bjr = b(jr+j) bjn = bjr u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjc - 0.25_dp * u5 b(jc+j) = bjc + u5 u8 = u7 + u6 u9 = u7 - u6 bjw = b(jw+j) bjo = bjw u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jh+j) = t8 - u11 b(jh+j) = u8 + t11 a(jw+j) = t8 + u11 b(jw+j) = u8 - t11 a(jm+j) = t9 - u10 b(jm+j) = u9 + t10 a(jr+j) = t9 + u10 b(jr+j) = u9 - t10 !---------------------- ajt = a(jt+j) t1 = aji + ajt ajs = a(js+j) t2 = ajn + ajs t3 = aji - ajt t4 = ajn - ajs ajx = a(jx+j) ajt = ajx t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) ajp = a(jp+j) t7 = ajp - 0.25_dp * t5 ax = ajp + t5 t8 = t7 + t6 t9 = t7 - t6 a(jp+j) = ajd t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(jd+j) = ax bjt = b(jt+j) u1 = bji + bjt bjs = b(js+j) u2 = bjn + bjs u3 = bji - bjt u4 = bjn - bjs bjx = b(jx+j) bjt = bjx u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bjp = b(jp+j) u7 = bjp - 0.25_dp * u5 bx = bjp + u5 u8 = u7 + u6 u9 = u7 - u6 b(jp+j) = bjd u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(jd+j) = bx a(ji+j) = t8 - u11 b(ji+j) = u8 + t11 a(jx+j) = t8 + u11 b(jx+j) = u8 - t11 a(jn+j) = t9 - u10 b(jn+j) = u9 + t10 a(js+j) = t9 + u10 b(js+j) = u9 - t10 !---------------------- ajv = a(jv+j) ajy = a(jy+j) t1 = ajv + ajy t2 = ajo + ajt t3 = ajv - ajy t4 = ajo - ajt a(jv+j) = ajj t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aju = a(ju+j) t7 = aju - 0.25_dp * t5 ax = aju + t5 t8 = t7 + t6 t9 = t7 - t6 a(ju+j) = aje t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(je+j) = ax bjv = b(jv+j) bjy = b(jy+j) u1 = bjv + bjy u2 = bjo + bjt u3 = bjv - bjy u4 = bjo - bjt b(jv+j) = bjj u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bju = b(ju+j) u7 = bju - 0.25_dp * u5 bx = bju + u5 u8 = u7 + u6 u9 = u7 - u6 b(ju+j) = bje u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(je+j) = bx a(jj+j) = t8 - u11 b(jj+j) = u8 + t11 a(jy+j) = t8 + u11 b(jy+j) = u8 - t11 a(jo+j) = t9 - u10 b(jo+j) = u9 + t10 a(jt+j) = t9 + u10 b(jt+j) = u9 - t10 j = j + jump 410 continue ! else ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 440 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd ajf = a(jf+j) ajb = ajf t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 ajk = a(jk+j) ajc = ajk t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd bjf = b(jf+j) bjb = bjf u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 bjk = b(jk+j) bjc = bjk u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jf+j) = co1*(t8-u11) - si1*(u8+t11) b(jf+j) = si1*(t8-u11) + co1*(u8+t11) aje = co4*(t8+u11) - si4*(u8-t11) bje = si4*(t8+u11) + co4*(u8-t11) a(jk+j) = co2*(t9-u10) - si2*(u9+t10) b(jk+j) = si2*(t9-u10) + co2*(u9+t10) ajd = co3*(t9+u10) - si3*(u9-t10) bjd = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajg = a(jg+j) ajj = a(jj+j) t1 = ajg + ajj ajh = a(jh+j) aji = a(ji+j) t2 = ajh + aji t3 = ajg - ajj t4 = ajh - aji ajl = a(jl+j) ajh = ajl t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajb - 0.25_dp * t5 a(jb+j) = ajb + t5 t8 = t7 + t6 t9 = t7 - t6 ajq = a(jq+j) aji = ajq t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjg = b(jg+j) bjj = b(jj+j) u1 = bjg + bjj bjh = b(jh+j) bji = b(ji+j) u2 = bjh + bji u3 = bjg - bjj u4 = bjh - bji bjl = b(jl+j) bjh = bjl u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjb - 0.25_dp * u5 b(jb+j) = bjb + u5 u8 = u7 + u6 u9 = u7 - u6 bjq = b(jq+j) bji = bjq u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jg+j) = co1*(t8-u11) - si1*(u8+t11) b(jg+j) = si1*(t8-u11) + co1*(u8+t11) ajj = co4*(t8+u11) - si4*(u8-t11) bjj = si4*(t8+u11) + co4*(u8-t11) a(jl+j) = co2*(t9-u10) - si2*(u9+t10) b(jl+j) = si2*(t9-u10) + co2*(u9+t10) a(jq+j) = co3*(t9+u10) - si3*(u9-t10) b(jq+j) = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajo = a(jo+j) t1 = ajh + ajo ajm = a(jm+j) ajn = a(jn+j) t2 = ajm + ajn t3 = ajh - ajo t4 = ajm - ajn ajr = a(jr+j) ajn = ajr t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajc - 0.25_dp * t5 a(jc+j) = ajc + t5 t8 = t7 + t6 t9 = t7 - t6 ajw = a(jw+j) ajo = ajw t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjo = b(jo+j) u1 = bjh + bjo bjm = b(jm+j) bjn = b(jn+j) u2 = bjm + bjn u3 = bjh - bjo u4 = bjm - bjn bjr = b(jr+j) bjn = bjr u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjc - 0.25_dp * u5 b(jc+j) = bjc + u5 u8 = u7 + u6 u9 = u7 - u6 bjw = b(jw+j) bjo = bjw u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jh+j) = co1*(t8-u11) - si1*(u8+t11) b(jh+j) = si1*(t8-u11) + co1*(u8+t11) a(jw+j) = co4*(t8+u11) - si4*(u8-t11) b(jw+j) = si4*(t8+u11) + co4*(u8-t11) a(jm+j) = co2*(t9-u10) - si2*(u9+t10) b(jm+j) = si2*(t9-u10) + co2*(u9+t10) a(jr+j) = co3*(t9+u10) - si3*(u9-t10) b(jr+j) = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajt = a(jt+j) t1 = aji + ajt ajs = a(js+j) t2 = ajn + ajs t3 = aji - ajt t4 = ajn - ajs ajx = a(jx+j) ajt = ajx t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) ajp = a(jp+j) t7 = ajp - 0.25_dp * t5 ax = ajp + t5 t8 = t7 + t6 t9 = t7 - t6 a(jp+j) = ajd t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(jd+j) = ax bjt = b(jt+j) u1 = bji + bjt bjs = b(js+j) u2 = bjn + bjs u3 = bji - bjt u4 = bjn - bjs bjx = b(jx+j) bjt = bjx u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bjp = b(jp+j) u7 = bjp - 0.25_dp * u5 bx = bjp + u5 u8 = u7 + u6 u9 = u7 - u6 b(jp+j) = bjd u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(jd+j) = bx a(ji+j) = co1*(t8-u11) - si1*(u8+t11) b(ji+j) = si1*(t8-u11) + co1*(u8+t11) a(jx+j) = co4*(t8+u11) - si4*(u8-t11) b(jx+j) = si4*(t8+u11) + co4*(u8-t11) a(jn+j) = co2*(t9-u10) - si2*(u9+t10) b(jn+j) = si2*(t9-u10) + co2*(u9+t10) a(js+j) = co3*(t9+u10) - si3*(u9-t10) b(js+j) = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajv = a(jv+j) ajy = a(jy+j) t1 = ajv + ajy t2 = ajo + ajt t3 = ajv - ajy t4 = ajo - ajt a(jv+j) = ajj t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aju = a(ju+j) t7 = aju - 0.25_dp * t5 ax = aju + t5 t8 = t7 + t6 t9 = t7 - t6 a(ju+j) = aje t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(je+j) = ax bjv = b(jv+j) bjy = b(jy+j) u1 = bjv + bjy u2 = bjo + bjt u3 = bjv - bjy u4 = bjo - bjt b(jv+j) = bjj u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bju = b(ju+j) u7 = bju - 0.25_dp * u5 bx = bju + u5 u8 = u7 + u6 u9 = u7 - u6 b(ju+j) = bje u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(je+j) = bx a(jj+j) = co1*(t8-u11) - si1*(u8+t11) b(jj+j) = si1*(t8-u11) + co1*(u8+t11) a(jy+j) = co4*(t8+u11) - si4*(u8-t11) b(jy+j) = si4*(t8+u11) + co4*(u8-t11) a(jo+j) = co2*(t9-u10) - si2*(u9+t10) b(jo+j) = si2*(t9-u10) + co2*(u9+t10) a(jt+j) = co3*(t9+u10) - si3*(u9-t10) b(jt+j) = si3*(t9+u10) + co3*(u9-t10) j = j + jump 440 continue ! endif ! !-----(end of loop across transforms) ! ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 445 continue 450 continue 460 continue !-----( end of double loop for this k ) kk = kk + 2*la 470 continue !-----( end of loop over values of k ) la = 5*la 480 continue !-----( end of loop on type II radix-5 passes ) !-----( nvex transforms completed) 490 continue istart = istart + nvex * jump 500 continue !-----( end of loop on blocks of transforms ) ! return end subroutine gpfa5f end module gridxc_gpfa_core_sp module gridxc_gpfa_core_dp ! ! 1d fft routine based on Clive Temperton's GPFA package ! ! Vectors to be transformed are in DOUBLE precision (wp=dp) implicit none ! In this module, trigs twiddle factors and intermediate ! variables are kept in double precision. integer, parameter :: dp = selected_real_kind(10,100) integer, parameter :: sp = selected_real_kind(6,10) integer, parameter :: wp = dp interface gpfa module procedure gpfa_ end interface gpfa public :: gpfa private CONTAINS !-------------------------------------------------------------- ! ! What follows is the original code by Temperton (save setgpfa) ! with explicit declarations of variables ! ! In this module, trigs twiddle factors and intermediate ! variables are kept in double precision. ! For arbitrary-precision support the parameter constants might ! be computed instead of inserted explicitly. ! ! For example: sin60 = sin(2.0_wp*pi/3.0_wp), where wp is the working ! precision, and ! pi = 4.0_wp * atan(1.0_wp), or ! twopi = 4.0_wp * asin(1.0_wp) ! !********************************************************************* ! SUBROUTINE 'GPFA' ! SELF-SORTING IN-PLACE GENERALIZED PRIME FACTOR (COMPLEX) FFT ! ! *** THIS IS THE ALL-FORTRAN VERSION *** ! ------------------------------- ! ! CALL GPFA(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN) ! ! A IS FIRST REAL INPUT/OUTPUT VECTOR ! B IS FIRST IMAGINARY INPUT/OUTPUT VECTOR ! TRIGS IS A TABLE OF TWIDDLE FACTORS, PRECALCULATED ! BY CALLING SUBROUTINE 'SETGPFA' ! INC IS THE INCREMENT WITHIN EACH DATA VECTOR ! JUMP IS THE INCREMENT BETWEEN DATA VECTORS ! N IS THE LENGTH OF THE TRANSFORMS: ! ----------------------------------- ! N = (2**IP) * (3**IQ) * (5**IR) ! ----------------------------------- ! LOT IS THE NUMBER OF TRANSFORMS ! ISIGN = +1 FOR FORWARD TRANSFORM ! = -1 FOR INVERSE TRANSFORM ! ! WRITTEN BY CLIVE TEMPERTON ! RECHERCHE EN PREVISION NUMERIQUE ! ATMOSPHERIC ENVIRONMENT SERVICE, CANADA ! !---------------------------------------------------------------------- ! ! DEFINITION OF TRANSFORM ! ----------------------- ! ! X(J) = SUM(K=0,...,N-1)(C(K)*EXP(ISIGN*2*I*J*K*PI/N)) ! !--------------------------------------------------------------------- ! ! FOR A MATHEMATICAL DEVELOPMENT OF THE ALGORITHM USED, ! SEE: ! ! C TEMPERTON : "A GENERALIZED PRIME FACTOR FFT ALGORITHM ! FOR ANY N = (2**P)(3**Q)(5**R)", ! SIAM J. SCI. STAT. COMP., MAY 1992. ! SUBROUTINE GPFA_(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN) IMPLICIT NONE ! REAL(wp) :: A(:), B(:) REAL(dp) TRIGS(*) INTEGER INC, JUMP, N, LOT, ISIGN ! INTEGER & NJ(3), NN, IFAC, LL, KK, IP, IQ, IR, I character(len=20) :: msg ! ! DECOMPOSE N INTO FACTORS 2,3,5 ! ------------------------------ NN = N IFAC = 2 ! DO 30 LL = 1 , 3 KK = 0 10 CONTINUE IF (MOD(NN,IFAC).NE.0) GO TO 20 KK = KK + 1 NN = NN / IFAC GO TO 10 20 CONTINUE NJ(LL) = KK IFAC = IFAC + LL 30 CONTINUE ! IF (NN.NE.1) THEN write(msg,"(i0)") N call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N') ENDIF ! IP = NJ(1) IQ = NJ(2) IR = NJ(3) ! ! COMPUTE THE TRANSFORM ! --------------------- I = 1 IF (IP.GT.0) THEN CALL GPFA2F(A,B,TRIGS,INC,JUMP,N,IP,LOT,ISIGN) I = I + 2 * ( 2**IP) ENDIF IF (IQ.GT.0) THEN CALL GPFA3F(A,B,TRIGS(I),INC,JUMP,N,IQ,LOT,ISIGN) I = I + 2 * (3**IQ) ENDIF IF (IR.GT.0) THEN CALL GPFA5F(A,B,TRIGS(I),INC,JUMP,N,IR,LOT,ISIGN) ENDIF ! RETURN END SUBROUTINE GPFA_ !********************************************************************* ! fortran version of *gpfa2* - ! radix-2 section of self-sorting, in-place, generalized pfa ! central radix-2 and radix-8 passes included ! so that transform length can be any power of 2 ! subroutine gpfa2f(a,b,trigs,inc,jump,n,mm,lot,isign) real(wp) :: a(*), b(*) real(dp) :: trigs(*) integer inc, jump, n, mm, lot, isign integer n2, inq, jstepx, ninc, ink, m2, m8, & m, mh, nblox, left, istart, nb, nvex, & la, mu, ipass, jstep, jstepl, jjj, ja, & nu, jb, jc, jd, j, l, kk, k, je, jf, & jg, jh, laincl, ll, ji, jj, jk, jl, jm, & jn, jo, jp real(dp) :: s, aja, ajc, t0, t2, ajb, ajd, & t1, t3, bja, bjc, u0, u2, bjb, & bjd, u1, u3, co1, si1, co2, si2, & co3, si3, ss, c1, c2, c3, aje, & ajf, ajg, ajh, bje, bjf, bjg, & bjh, co4, si4, co5, si5, co6, si6, & co7, si7, aji, bjm, ajj, bjj, ajk, & ajl, bji, bjk, ajo, bjl, bjo, ajm, & ajn, ajp, bjn, bjp #ifdef OLD_CRAY integer, parameter :: lvr = CRAY_LVR_PARAMETER #else integer, parameter :: lvr = 1024 #endif ! ! *************************************************************** ! * * ! * N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. * ! * RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE * ! * (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER. * ! * * ! *************************************************************** ! n2 = 2**mm inq = n/n2 jstepx = (n2-n) * inc ninc = n * inc ink = inc * inq ! m2 = 0 m8 = 0 if (mod(mm,2).eq.0) then m = mm/2 else if (mod(mm,4).eq.1) then m = (mm-1)/2 m2 = 1 else if (mod(mm,4).eq.3) then m = (mm-3)/2 m8 = 1 endif mh = (m+1)/2 ! nblox = 1 + (lot-1)/lvr left = lot s = real(isign,kind=dp) istart = 1 ! ! loop on blocks of lvr transforms ! -------------------------------- do 500 nb = 1 , nblox ! if (left.le.lvr) then nvex = left else if (left.lt.(2*lvr)) then nvex = left/2 nvex = nvex + mod(nvex,2) else nvex = lvr endif left = left - nvex ! la = 1 ! ! loop on type I radix-4 passes ! ----------------------------- mu = mod(inq,4) if (isign.eq.-1) mu = 4 - mu ss = 1.0_dp if (mu.eq.3) ss = -1.0_dp ! if (mh.eq.0) go to 200 ! do 160 ipass = 1 , mh jstep = (n*inc) / (4*la) jstepl = jstep - ninc ! ! k = 0 loop (no twiddle factors) ! ------------------------------- do 120 jjj = 0 , (n-1)*inc , 4*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 115 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 110 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) a(ja+j) = t0 + t1 a(jc+j) = t0 - t1 b(ja+j) = u0 + u1 b(jc+j) = u0 - u1 a(jb+j) = t2 - u3 a(jd+j) = t2 + u3 b(jb+j) = u2 + t3 b(jd+j) = u2 - t3 j = j + jump 110 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 115 continue 120 continue ! ! finished if n2 = 4 ! ------------------ if (n2.eq.4) go to 490 kk = 2 * la ! ! loop on nonzero k ! ----------------- do 150 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) ! ! loop along transform ! -------------------- do 140 jjj = k , (n-1)*inc , 4*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 135 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 130 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) a(ja+j) = t0 + t1 b(ja+j) = u0 + u1 a(jb+j) = co1*(t2-u3) - si1*(u2+t3) b(jb+j) = si1*(t2-u3) + co1*(u2+t3) a(jc+j) = co2*(t0-t1) - si2*(u0-u1) b(jc+j) = si2*(t0-t1) + co2*(u0-u1) a(jd+j) = co3*(t2+u3) - si3*(u2-t3) b(jd+j) = si3*(t2+u3) + co3*(u2-t3) j = j + jump 130 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 135 continue 140 continue !-----( end of loop along transforms ) kk = kk + 2*la 150 continue !-----( end of loop on nonzero k ) la = 4*la 160 continue !-----( end of loop on type I radix-4 passes) ! ! central radix-2 pass ! -------------------- 200 continue if (m2.eq.0) go to 300 ! jstep = (n*inc) / (2*la) jstepl = jstep - ninc ! ! k=0 loop (no twiddle factors) ! ----------------------------- do 220 jjj = 0 , (n-1)*inc , 2*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 215 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 210 l = 1 , nvex aja = a(ja+j) ajb = a(jb+j) t0 = aja - ajb a(ja+j) = aja + ajb a(jb+j) = t0 bja = b(ja+j) bjb = b(jb+j) u0 = bja - bjb b(ja+j) = bja + bjb b(jb+j) = u0 j = j + jump 210 continue !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 215 continue 220 continue ! ! finished if n2=2 ! ---------------- if (n2.eq.2) go to 490 ! kk = 2 * la ! ! loop on nonzero k ! ----------------- do 260 k = ink , jstep - ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) ! ! loop along transforms ! --------------------- do 250 jjj = k , (n-1)*inc , 2*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 245 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc j = 0 ! ! loop across transforms ! ---------------------- if (kk.eq.n2/2) then #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 230 l = 1 , nvex aja = a(ja+j) ajb = a(jb+j) t0 = ss * ( aja - ajb ) a(ja+j) = aja + ajb bjb = b(jb+j) bja = b(ja+j) a(jb+j) = ss * ( bjb - bja ) b(ja+j) = bja + bjb b(jb+j) = t0 j = j + jump 230 continue ! else ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 240 l = 1 , nvex aja = a(ja+j) ajb = a(jb+j) t0 = aja - ajb a(ja+j) = aja + ajb bja = b(ja+j) bjb = b(jb+j) u0 = bja - bjb b(ja+j) = bja + bjb a(jb+j) = co1*t0 - si1*u0 b(jb+j) = si1*t0 + co1*u0 j = j + jump 240 continue ! endif ! !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 245 continue 250 continue !-----(end of loop along transforms) kk = kk + 2 * la 260 continue !-----(end of loop on nonzero k) !-----(end of radix-2 pass) ! la = 2 * la go to 400 ! ! central radix-8 pass ! -------------------- 300 continue if (m8.eq.0) go to 400 jstep = (n*inc) / (8*la) jstepl = jstep - ninc mu = mod(inq,8) if (isign.eq.-1) mu = 8 - mu c1 = 1.0_dp if (mu.eq.3.or.mu.eq.7) c1 = -1.0_dp c2 = sqrt(0.5_dp) if (mu.eq.3.or.mu.eq.5) c2 = -c2 c3 = c1 * c2 ! ! stage 1 ! ------- do 320 k = 0 , jstep - ink , ink do 315 jjj = k , (n-1)*inc , 8*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 312 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc j = 0 #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 310 l = 1 , nvex aja = a(ja+j) aje = a(je+j) t0 = aja - aje a(ja+j) = aja + aje ajc = a(jc+j) ajg = a(jg+j) t1 = c1 * ( ajc - ajg ) a(je+j) = ajc + ajg ajb = a(jb+j) ajf = a(jf+j) t2 = ajb - ajf a(jc+j) = ajb + ajf ajd = a(jd+j) ajh = a(jh+j) t3 = ajd - ajh a(jg+j) = ajd + ajh a(jb+j) = t0 a(jf+j) = t1 a(jd+j) = c2 * ( t2 - t3 ) a(jh+j) = c3 * ( t2 + t3 ) bja = b(ja+j) bje = b(je+j) u0 = bja - bje b(ja+j) = bja + bje bjc = b(jc+j) bjg = b(jg+j) u1 = c1 * ( bjc - bjg ) b(je+j) = bjc + bjg bjb = b(jb+j) bjf = b(jf+j) u2 = bjb - bjf b(jc+j) = bjb + bjf bjd = b(jd+j) bjh = b(jh+j) u3 = bjd - bjh b(jg+j) = bjd + bjh b(jb+j) = u0 b(jf+j) = u1 b(jd+j) = c2 * ( u2 - u3 ) b(jh+j) = c3 * ( u2 + u3 ) j = j + jump 310 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 312 continue 315 continue 320 continue ! ! stage 2 ! ------- ! ! k=0 (no twiddle factors) ! ------------------------ do 330 jjj = 0 , (n-1)*inc , 8*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 328 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc j = 0 #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 325 l = 1 , nvex aja = a(ja+j) aje = a(je+j) t0 = aja + aje t2 = aja - aje ajc = a(jc+j) ajg = a(jg+j) t1 = ajc + ajg t3 = c1 * ( ajc - ajg ) bja = b(ja+j) bje = b(je+j) u0 = bja + bje u2 = bja - bje bjc = b(jc+j) bjg = b(jg+j) u1 = bjc + bjg u3 = c1 * ( bjc - bjg ) a(ja+j) = t0 + t1 a(je+j) = t0 - t1 b(ja+j) = u0 + u1 b(je+j) = u0 - u1 a(jc+j) = t2 - u3 a(jg+j) = t2 + u3 b(jc+j) = u2 + t3 b(jg+j) = u2 - t3 ajb = a(jb+j) ajd = a(jd+j) t0 = ajb + ajd t2 = ajb - ajd ajf = a(jf+j) ajh = a(jh+j) t1 = ajf - ajh t3 = ajf + ajh bjb = b(jb+j) bjd = b(jd+j) u0 = bjb + bjd u2 = bjb - bjd bjf = b(jf+j) bjh = b(jh+j) u1 = bjf - bjh u3 = bjf + bjh a(jb+j) = t0 - u3 a(jh+j) = t0 + u3 b(jb+j) = u0 + t3 b(jh+j) = u0 - t3 a(jd+j) = t2 + u1 a(jf+j) = t2 - u1 b(jd+j) = u2 - t1 b(jf+j) = u2 + t1 j = j + jump 325 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 328 continue 330 continue ! if (n2.eq.8) go to 490 ! ! loop on nonzero k ! ----------------- kk = 2 * la ! do 350 k = ink , jstep - ink , ink ! co1 = trigs(kk+1) si1 = s * trigs(kk+2) co2 = trigs(2*kk+1) si2 = s * trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s * trigs(3*kk+2) co4 = trigs(4*kk+1) si4 = s * trigs(4*kk+2) co5 = trigs(5*kk+1) si5 = s * trigs(5*kk+2) co6 = trigs(6*kk+1) si6 = s * trigs(6*kk+2) co7 = trigs(7*kk+1) si7 = s * trigs(7*kk+2) ! do 345 jjj = k , (n-1)*inc , 8*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 342 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc j = 0 #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 340 l = 1 , nvex aja = a(ja+j) aje = a(je+j) t0 = aja + aje t2 = aja - aje ajc = a(jc+j) ajg = a(jg+j) t1 = ajc + ajg t3 = c1 * ( ajc - ajg ) bja = b(ja+j) bje = b(je+j) u0 = bja + bje u2 = bja - bje bjc = b(jc+j) bjg = b(jg+j) u1 = bjc + bjg u3 = c1 * ( bjc - bjg ) a(ja+j) = t0 + t1 b(ja+j) = u0 + u1 a(je+j) = co4*(t0-t1) - si4*(u0-u1) b(je+j) = si4*(t0-t1) + co4*(u0-u1) a(jc+j) = co2*(t2-u3) - si2*(u2+t3) b(jc+j) = si2*(t2-u3) + co2*(u2+t3) a(jg+j) = co6*(t2+u3) - si6*(u2-t3) b(jg+j) = si6*(t2+u3) + co6*(u2-t3) ajb = a(jb+j) ajd = a(jd+j) t0 = ajb + ajd t2 = ajb - ajd ajf = a(jf+j) ajh = a(jh+j) t1 = ajf - ajh t3 = ajf + ajh bjb = b(jb+j) bjd = b(jd+j) u0 = bjb + bjd u2 = bjb - bjd bjf = b(jf+j) bjh = b(jh+j) u1 = bjf - bjh u3 = bjf + bjh a(jb+j) = co1*(t0-u3) - si1*(u0+t3) b(jb+j) = si1*(t0-u3) + co1*(u0+t3) a(jh+j) = co7*(t0+u3) - si7*(u0-t3) b(jh+j) = si7*(t0+u3) + co7*(u0-t3) a(jd+j) = co3*(t2+u1) - si3*(u2-t1) b(jd+j) = si3*(t2+u1) + co3*(u2-t1) a(jf+j) = co5*(t2-u1) - si5*(u2+t1) b(jf+j) = si5*(t2-u1) + co5*(u2+t1) j = j + jump 340 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 342 continue 345 continue kk = kk + 2 * la 350 continue ! la = 8 * la ! ! loop on type II radix-4 passes ! ------------------------------ 400 continue mu = mod(inq,4) if (isign.eq.-1) mu = 4 - mu ss = 1.0_dp if (mu.eq.3) ss = -1.0_dp ! do 480 ipass = mh+1 , m jstep = (n*inc) / (4*la) jstepl = jstep - ninc laincl = la * ink - ninc ! ! k=0 loop (no twiddle factors) ! ----------------------------- do 430 ll = 0 , (la-1)*ink , 4*jstep ! do 420 jjj = ll , (n-1)*inc , 4*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 415 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = ja + laincl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = je + laincl if (ji.lt.istart) ji = ji + ninc jj = ji + jstepl if (jj.lt.istart) jj = jj + ninc jk = jj + jstepl if (jk.lt.istart) jk = jk + ninc jl = jk + jstepl if (jl.lt.istart) jl = jl + ninc jm = ji + laincl if (jm.lt.istart) jm = jm + ninc jn = jm + jstepl if (jn.lt.istart) jn = jn + ninc jo = jn + jstepl if (jo.lt.istart) jo = jo + ninc jp = jo + jstepl if (jp.lt.istart) jp = jp + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 410 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) aji = a(ji+j) ajc = aji bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) aje = a(je+j) ajb = aje a(ja+j) = t0 + t1 a(ji+j) = t0 - t1 b(ja+j) = u0 + u1 bjc = u0 - u1 bjm = b(jm+j) bjd = bjm a(je+j) = t2 - u3 ajd = t2 + u3 bjb = u2 + t3 b(jm+j) = u2 - t3 !---------------------- ajg = a(jg+j) t0 = ajb + ajg t2 = ajb - ajg ajf = a(jf+j) ajh = a(jh+j) t1 = ajf + ajh t3 = ss * ( ajf - ajh ) ajj = a(jj+j) ajg = ajj bje = b(je+j) bjg = b(jg+j) u0 = bje + bjg u2 = bje - bjg bjf = b(jf+j) bjh = b(jh+j) u1 = bjf + bjh u3 = ss * ( bjf - bjh ) b(je+j) = bjb a(jb+j) = t0 + t1 a(jj+j) = t0 - t1 bjj = b(jj+j) bjg = bjj b(jb+j) = u0 + u1 b(jj+j) = u0 - u1 a(jf+j) = t2 - u3 ajh = t2 + u3 b(jf+j) = u2 + t3 bjh = u2 - t3 !---------------------- ajk = a(jk+j) t0 = ajc + ajk t2 = ajc - ajk ajl = a(jl+j) t1 = ajg + ajl t3 = ss * ( ajg - ajl ) bji = b(ji+j) bjk = b(jk+j) u0 = bji + bjk u2 = bji - bjk ajo = a(jo+j) ajl = ajo bjl = b(jl+j) u1 = bjg + bjl u3 = ss * ( bjg - bjl ) b(ji+j) = bjc a(jc+j) = t0 + t1 a(jk+j) = t0 - t1 bjo = b(jo+j) bjl = bjo b(jc+j) = u0 + u1 b(jk+j) = u0 - u1 a(jg+j) = t2 - u3 a(jo+j) = t2 + u3 b(jg+j) = u2 + t3 b(jo+j) = u2 - t3 !---------------------- ajm = a(jm+j) t0 = ajm + ajl t2 = ajm - ajl ajn = a(jn+j) ajp = a(jp+j) t1 = ajn + ajp t3 = ss * ( ajn - ajp ) a(jm+j) = ajd u0 = bjd + bjl u2 = bjd - bjl bjn = b(jn+j) bjp = b(jp+j) u1 = bjn + bjp u3 = ss * ( bjn - bjp ) a(jn+j) = ajh a(jd+j) = t0 + t1 a(jl+j) = t0 - t1 b(jd+j) = u0 + u1 b(jl+j) = u0 - u1 b(jn+j) = bjh a(jh+j) = t2 - u3 a(jp+j) = t2 + u3 b(jh+j) = u2 + t3 b(jp+j) = u2 - t3 j = j + jump 410 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 415 continue 420 continue 430 continue !-----( end of double loop for k=0 ) ! ! finished if last pass ! --------------------- if (ipass.eq.m) go to 490 ! kk = 2*la ! ! loop on nonzero k ! ----------------- do 470 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) ! ! double loop along first transform in block ! ------------------------------------------ do 460 ll = k , (la-1)*ink , 4*jstep ! do 450 jjj = ll , (n-1)*inc , 4*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 445 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = ja + laincl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = je + laincl if (ji.lt.istart) ji = ji + ninc jj = ji + jstepl if (jj.lt.istart) jj = jj + ninc jk = jj + jstepl if (jk.lt.istart) jk = jk + ninc jl = jk + jstepl if (jl.lt.istart) jl = jl + ninc jm = ji + laincl if (jm.lt.istart) jm = jm + ninc jn = jm + jstepl if (jn.lt.istart) jn = jn + ninc jo = jn + jstepl if (jo.lt.istart) jo = jo + ninc jp = jo + jstepl if (jp.lt.istart) jp = jp + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 440 l = 1 , nvex aja = a(ja+j) ajc = a(jc+j) t0 = aja + ajc t2 = aja - ajc ajb = a(jb+j) ajd = a(jd+j) t1 = ajb + ajd t3 = ss * ( ajb - ajd ) aji = a(ji+j) ajc = aji bja = b(ja+j) bjc = b(jc+j) u0 = bja + bjc u2 = bja - bjc bjb = b(jb+j) bjd = b(jd+j) u1 = bjb + bjd u3 = ss * ( bjb - bjd ) aje = a(je+j) ajb = aje a(ja+j) = t0 + t1 b(ja+j) = u0 + u1 a(je+j) = co1*(t2-u3) - si1*(u2+t3) bjb = si1*(t2-u3) + co1*(u2+t3) bjm = b(jm+j) bjd = bjm a(ji+j) = co2*(t0-t1) - si2*(u0-u1) bjc = si2*(t0-t1) + co2*(u0-u1) ajd = co3*(t2+u3) - si3*(u2-t3) b(jm+j) = si3*(t2+u3) + co3*(u2-t3) !---------------------------------------- ajg = a(jg+j) t0 = ajb + ajg t2 = ajb - ajg ajf = a(jf+j) ajh = a(jh+j) t1 = ajf + ajh t3 = ss * ( ajf - ajh ) ajj = a(jj+j) ajg = ajj bje = b(je+j) bjg = b(jg+j) u0 = bje + bjg u2 = bje - bjg bjf = b(jf+j) bjh = b(jh+j) u1 = bjf + bjh u3 = ss * ( bjf - bjh ) b(je+j) = bjb a(jb+j) = t0 + t1 b(jb+j) = u0 + u1 bjj = b(jj+j) bjg = bjj a(jf+j) = co1*(t2-u3) - si1*(u2+t3) b(jf+j) = si1*(t2-u3) + co1*(u2+t3) a(jj+j) = co2*(t0-t1) - si2*(u0-u1) b(jj+j) = si2*(t0-t1) + co2*(u0-u1) ajh = co3*(t2+u3) - si3*(u2-t3) bjh = si3*(t2+u3) + co3*(u2-t3) !---------------------------------------- ajk = a(jk+j) t0 = ajc + ajk t2 = ajc - ajk ajl = a(jl+j) t1 = ajg + ajl t3 = ss * ( ajg - ajl ) bji = b(ji+j) bjk = b(jk+j) u0 = bji + bjk u2 = bji - bjk ajo = a(jo+j) ajl = ajo bjl = b(jl+j) u1 = bjg + bjl u3 = ss * ( bjg - bjl ) b(ji+j) = bjc a(jc+j) = t0 + t1 b(jc+j) = u0 + u1 bjo = b(jo+j) bjl = bjo a(jg+j) = co1*(t2-u3) - si1*(u2+t3) b(jg+j) = si1*(t2-u3) + co1*(u2+t3) a(jk+j) = co2*(t0-t1) - si2*(u0-u1) b(jk+j) = si2*(t0-t1) + co2*(u0-u1) a(jo+j) = co3*(t2+u3) - si3*(u2-t3) b(jo+j) = si3*(t2+u3) + co3*(u2-t3) !---------------------------------------- ajm = a(jm+j) t0 = ajm + ajl t2 = ajm - ajl ajn = a(jn+j) ajp = a(jp+j) t1 = ajn + ajp t3 = ss * ( ajn - ajp ) a(jm+j) = ajd u0 = bjd + bjl u2 = bjd - bjl a(jn+j) = ajh bjn = b(jn+j) bjp = b(jp+j) u1 = bjn + bjp u3 = ss * ( bjn - bjp ) b(jn+j) = bjh a(jd+j) = t0 + t1 b(jd+j) = u0 + u1 a(jh+j) = co1*(t2-u3) - si1*(u2+t3) b(jh+j) = si1*(t2-u3) + co1*(u2+t3) a(jl+j) = co2*(t0-t1) - si2*(u0-u1) b(jl+j) = si2*(t0-t1) + co2*(u0-u1) a(jp+j) = co3*(t2+u3) - si3*(u2-t3) b(jp+j) = si3*(t2+u3) + co3*(u2-t3) j = j + jump 440 continue !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 445 continue 450 continue 460 continue !-----( end of double loop for this k ) kk = kk + 2*la 470 continue !-----( end of loop over values of k ) la = 4*la 480 continue !-----( end of loop on type II radix-4 passes ) !-----( nvex transforms completed) 490 continue istart = istart + nvex * jump 500 continue !-----( end of loop on blocks of transforms ) ! return end subroutine gpfa2f !****************************************************************** ! fortran version of *gpfa3* - ! radix-3 section of self-sorting, in-place ! generalized PFA ! !------------------------------------------------------------------- ! subroutine gpfa3f(a,b,trigs,inc,jump,n,mm,lot,isign) real(wp) :: a(*), b(*) real(dp) :: trigs(*) integer inc, jump, n, mm, lot, isign real(dp), parameter :: sin60 = 0.866025403784437_dp integer :: je, jf, jg, jh, laincl, ll, ji integer :: n3, inq, jstepx, ninc, ink, mu, m, mh, & nblox, left, istart, nb, nvex, la, & ipass, jstep, jstepl, jjj, ja, nu, jb, & jc, j, l, kk, k, jd real(dp) :: s, ajb, ajc, t1, aja, t2, t3, bjb, bjc, & u1, bja, u2, u3, co1, si1, co2, si2, ajd, & bjd real(dp) :: c1, aje, ajg, ajf, ajh, bje, & bjf, bjg, bjh, aji, bji integer, parameter :: lvr = 128 ! ! *************************************************************** ! * * ! * N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. * ! * RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE * ! * (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER. * ! * * ! *************************************************************** ! n3 = 3**mm inq = n/n3 jstepx = (n3-n) * inc ninc = n * inc ink = inc * inq mu = mod(inq,3) if (isign.eq.-1) mu = 3-mu m = mm mh = (m+1)/2 s = real(isign,kind=dp) c1 = sin60 if (mu.eq.2) c1 = -c1 ! nblox = 1 + (lot-1)/lvr left = lot s = real(isign,kind=dp) istart = 1 ! ! loop on blocks of lvr transforms ! -------------------------------- do 500 nb = 1 , nblox ! if (left.le.lvr) then nvex = left else if (left.lt.(2*lvr)) then nvex = left/2 nvex = nvex + mod(nvex,2) else nvex = lvr endif left = left - nvex ! la = 1 ! ! loop on type I radix-3 passes ! ----------------------------- do 160 ipass = 1 , mh jstep = (n*inc) / (3*la) jstepl = jstep - ninc ! ! k = 0 loop (no twiddle factors) ! ------------------------------- do 120 jjj = 0 , (n-1)*inc , 3*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 115 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 110 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jb+j) = t2 - u3 b(jb+j) = u2 + t3 a(jc+j) = t2 + u3 b(jc+j) = u2 - t3 j = j + jump 110 continue ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 115 continue 120 continue ! ! finished if n3 = 3 ! ------------------ if (n3.eq.3) go to 490 kk = 2 * la ! ! loop on nonzero k ! ----------------- do 150 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) ! ! loop along transform ! -------------------- do 140 jjj = k , (n-1)*inc , 3*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 135 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 130 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jb+j) = co1*(t2-u3) - si1*(u2+t3) b(jb+j) = si1*(t2-u3) + co1*(u2+t3) a(jc+j) = co2*(t2+u3) - si2*(u2-t3) b(jc+j) = si2*(t2+u3) + co2*(u2-t3) j = j + jump 130 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 135 continue 140 continue !-----( end of loop along transforms ) kk = kk + 2*la 150 continue !-----( end of loop on nonzero k ) la = 3*la 160 continue !-----( end of loop on type I radix-3 passes) ! ! loop on type II radix-3 passes ! ------------------------------ 400 continue ! do 480 ipass = mh+1 , m jstep = (n*inc) / (3*la) jstepl = jstep - ninc laincl = la*ink - ninc ! ! k=0 loop (no twiddle factors) ! ----------------------------- do 430 ll = 0 , (la-1)*ink , 3*jstep ! do 420 jjj = ll , (n-1)*inc , 3*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 415 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = ja + laincl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jd + laincl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = jh + jstepl if (ji.lt.istart) ji = ji + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 410 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) ajd = a(jd+j) ajb = ajd bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) bjd = b(jd+j) bjb = bjd a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jd+j) = t2 - u3 b(jd+j) = u2 + t3 ajc = t2 + u3 bjc = u2 - t3 !---------------------- aje = a(je+j) ajf = a(jf+j) t1 = aje + ajf t2 = ajb - 0.5_dp * t1 t3 = c1 * ( aje - ajf ) ajh = a(jh+j) ajf = ajh bje = b(je+j) bjf = b(jf+j) u1 = bje + bjf u2 = bjb - 0.5_dp * u1 u3 = c1 * ( bje - bjf ) bjh = b(jh+j) bjf = bjh a(jb+j) = ajb + t1 b(jb+j) = bjb + u1 a(je+j) = t2 - u3 b(je+j) = u2 + t3 a(jh+j) = t2 + u3 b(jh+j) = u2 - t3 !---------------------- aji = a(ji+j) t1 = ajf + aji ajg = a(jg+j) t2 = ajg - 0.5_dp * t1 t3 = c1 * ( ajf - aji ) t1 = ajg + t1 a(jg+j) = ajc bji = b(ji+j) u1 = bjf + bji bjg = b(jg+j) u2 = bjg - 0.5_dp * u1 u3 = c1 * ( bjf - bji ) u1 = bjg + u1 b(jg+j) = bjc a(jc+j) = t1 b(jc+j) = u1 a(jf+j) = t2 - u3 b(jf+j) = u2 + t3 a(ji+j) = t2 + u3 b(ji+j) = u2 - t3 j = j + jump 410 continue !-----( end of loop across transforms ) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 415 continue 420 continue 430 continue !-----( end of double loop for k=0 ) ! ! finished if last pass ! --------------------- if (ipass.eq.m) go to 490 ! kk = 2*la ! ! loop on nonzero k ! ----------------- do 470 k = ink , jstep-ink , ink co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) ! ! double loop along first transform in block ! ------------------------------------------ do 460 ll = k , (la-1)*ink , 3*jstep ! do 450 jjj = ll , (n-1)*inc , 3*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 445 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = ja + laincl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = je + jstepl if (jf.lt.istart) jf = jf + ninc jg = jd + laincl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = jh + jstepl if (ji.lt.istart) ji = ji + ninc j = 0 ! ! loop across transforms ! ---------------------- #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 440 l = 1 , nvex ajb = a(jb+j) ajc = a(jc+j) t1 = ajb + ajc aja = a(ja+j) t2 = aja - 0.5_dp * t1 t3 = c1 * ( ajb - ajc ) ajd = a(jd+j) ajb = ajd bjb = b(jb+j) bjc = b(jc+j) u1 = bjb + bjc bja = b(ja+j) u2 = bja - 0.5_dp * u1 u3 = c1 * ( bjb - bjc ) bjd = b(jd+j) bjb = bjd a(ja+j) = aja + t1 b(ja+j) = bja + u1 a(jd+j) = co1*(t2-u3) - si1*(u2+t3) b(jd+j) = si1*(t2-u3) + co1*(u2+t3) ajc = co2*(t2+u3) - si2*(u2-t3) bjc = si2*(t2+u3) + co2*(u2-t3) !---------------------- aje = a(je+j) ajf = a(jf+j) t1 = aje + ajf t2 = ajb - 0.5_dp * t1 t3 = c1 * ( aje - ajf ) ajh = a(jh+j) ajf = ajh bje = b(je+j) bjf = b(jf+j) u1 = bje + bjf u2 = bjb - 0.5_dp * u1 u3 = c1 * ( bje - bjf ) bjh = b(jh+j) bjf = bjh a(jb+j) = ajb + t1 b(jb+j) = bjb + u1 a(je+j) = co1*(t2-u3) - si1*(u2+t3) b(je+j) = si1*(t2-u3) + co1*(u2+t3) a(jh+j) = co2*(t2+u3) - si2*(u2-t3) b(jh+j) = si2*(t2+u3) + co2*(u2-t3) !---------------------- aji = a(ji+j) t1 = ajf + aji ajg = a(jg+j) t2 = ajg - 0.5_dp * t1 t3 = c1 * ( ajf - aji ) t1 = ajg + t1 a(jg+j) = ajc bji = b(ji+j) u1 = bjf + bji bjg = b(jg+j) u2 = bjg - 0.5_dp * u1 u3 = c1 * ( bjf - bji ) u1 = bjg + u1 b(jg+j) = bjc a(jc+j) = t1 b(jc+j) = u1 a(jf+j) = co1*(t2-u3) - si1*(u2+t3) b(jf+j) = si1*(t2-u3) + co1*(u2+t3) a(ji+j) = co2*(t2+u3) - si2*(u2-t3) b(ji+j) = si2*(t2+u3) + co2*(u2-t3) j = j + jump 440 continue !-----(end of loop across transforms) ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 445 continue 450 continue 460 continue !-----( end of double loop for this k ) kk = kk + 2*la 470 continue !-----( end of loop over values of k ) la = 3*la 480 continue !-----( end of loop on type II radix-3 passes ) !-----( nvex transforms completed) 490 continue istart = istart + nvex * jump 500 continue !-----( end of loop on blocks of transforms ) ! return end subroutine gpfa3f !******************************************************************* ! fortran version of *gpfa5* - ! radix-5 section of self-sorting, in-place, ! generalized pfa ! !------------------------------------------------------------------- ! subroutine gpfa5f(a,b,trigs,inc,jump,n,mm,lot,isign) real(wp) :: a(*), b(*) real(dp) :: trigs(*) integer inc, jump, n, mm, lot, isign real(dp), parameter :: sin36 = 0.587785252292473_dp, & sin72 = 0.951056516295154_dp, & qrt5 = 0.559016994374947_dp integer, parameter :: lvr = 128 integer :: inq, jstepx, ninc, ink, mu, m, mh, & nblox, n5, left, istart, nb, nvex, la, & ipass, jstep, jstepl, kk, k, jjj, ja, & nu, jb, jc, jd, je, j, laincl, ll, l, & jf, jg, jh, ji, jj, jk, jl, jm, jn, jo, & jp, jq, jr, js, jt, ju, jv, jw, jx, jy real(dp) :: s, c1, c2, c3, co1, si1, co2, si2, co3, & si3, co4, si4, ajb, aje, t1, ajc, ajd, & t2, t3, t4, t5, t6, aja, t7, t8, t9, t10, & t11, bjb, bje, u1, bjc, bjd, u2, u3, u4, & u5, u6, bja, u7, u8, u9, u10, u11, ajf, & ajk, bjf, bjk, ajg, ajj, ajh, aji, ajl, & ajq, bjg, bjj, bjh, bji, bjl, bjq, ajo, & ajm, ajn, ajr, ajw, bjo, bjm, bjn, bjr, & bjw, ajt, ajs, ajx, ajp, ax, bjt, bjs, & bjx, bjp, bx, ajv, ajy, aju, bjv, bjy, & bju ! ! *************************************************************** ! * * ! * N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. * ! * RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE * ! * (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER. * ! * * ! *************************************************************** ! n5 = 5 ** mm inq = n / n5 jstepx = (n5-n) * inc ninc = n * inc ink = inc * inq mu = mod(inq,5) if (isign.eq.-1) mu = 5 - mu ! m = mm mh = (m+1)/2 s = real(isign,kind=dp) c1 = qrt5 c2 = sin72 c3 = sin36 if (mu.eq.2.or.mu.eq.3) then c1 = -c1 c2 = sin36 c3 = sin72 endif if (mu.eq.3.or.mu.eq.4) c2 = -c2 if (mu.eq.2.or.mu.eq.4) c3 = -c3 ! nblox = 1 + (lot-1)/lvr left = lot s = real(isign,kind=dp) istart = 1 ! ! loop on blocks of lvr transforms ! -------------------------------- do 500 nb = 1 , nblox ! if (left.le.lvr) then nvex = left else if (left.lt.(2*lvr)) then nvex = left/2 nvex = nvex + mod(nvex,2) else nvex = lvr endif left = left - nvex ! la = 1 ! ! loop on type I radix-5 passes ! ----------------------------- do 160 ipass = 1 , mh jstep = (n*inc) / (5*la) jstepl = jstep - ninc kk = 0 ! ! loop on k ! --------- do 150 k = 0 , jstep-ink , ink ! if (k.gt.0) then co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) co4 = trigs(4*kk+1) si4 = s*trigs(4*kk+2) endif ! ! loop along transform ! -------------------- do 140 jjj = k , (n-1)*inc , 5*jstep ja = istart + jjj ! ! "transverse" loop ! ----------------- do 135 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc j = 0 ! ! loop across transforms ! ---------------------- if (k.eq.0) then ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 110 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jb+j) = t8 - u11 b(jb+j) = u8 + t11 a(je+j) = t8 + u11 b(je+j) = u8 - t11 a(jc+j) = t9 - u10 b(jc+j) = u9 + t10 a(jd+j) = t9 + u10 b(jd+j) = u9 - t10 j = j + jump 110 continue ! else ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 130 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jb+j) = co1*(t8-u11) - si1*(u8+t11) b(jb+j) = si1*(t8-u11) + co1*(u8+t11) a(je+j) = co4*(t8+u11) - si4*(u8-t11) b(je+j) = si4*(t8+u11) + co4*(u8-t11) a(jc+j) = co2*(t9-u10) - si2*(u9+t10) b(jc+j) = si2*(t9-u10) + co2*(u9+t10) a(jd+j) = co3*(t9+u10) - si3*(u9-t10) b(jd+j) = si3*(t9+u10) + co3*(u9-t10) j = j + jump 130 continue ! endif ! !-----( end of loop across transforms ) ! ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 135 continue 140 continue !-----( end of loop along transforms ) kk = kk + 2*la 150 continue !-----( end of loop on nonzero k ) la = 5*la 160 continue !-----( end of loop on type I radix-5 passes) ! if (n.eq.5) go to 490 ! ! loop on type II radix-5 passes ! ------------------------------ 400 continue ! do 480 ipass = mh+1 , m jstep = (n*inc) / (5*la) jstepl = jstep - ninc laincl = la * ink - ninc kk = 0 ! ! loop on k ! --------- do 470 k = 0 , jstep-ink , ink ! if (k.gt.0) then co1 = trigs(kk+1) si1 = s*trigs(kk+2) co2 = trigs(2*kk+1) si2 = s*trigs(2*kk+2) co3 = trigs(3*kk+1) si3 = s*trigs(3*kk+2) co4 = trigs(4*kk+1) si4 = s*trigs(4*kk+2) endif ! ! double loop along first transform in block ! ------------------------------------------ do 460 ll = k , (la-1)*ink , 5*jstep ! do 450 jjj = ll , (n-1)*inc , 5*la*ink ja = istart + jjj ! ! "transverse" loop ! ----------------- do 445 nu = 1 , inq jb = ja + jstepl if (jb.lt.istart) jb = jb + ninc jc = jb + jstepl if (jc.lt.istart) jc = jc + ninc jd = jc + jstepl if (jd.lt.istart) jd = jd + ninc je = jd + jstepl if (je.lt.istart) je = je + ninc jf = ja + laincl if (jf.lt.istart) jf = jf + ninc jg = jf + jstepl if (jg.lt.istart) jg = jg + ninc jh = jg + jstepl if (jh.lt.istart) jh = jh + ninc ji = jh + jstepl if (ji.lt.istart) ji = ji + ninc jj = ji + jstepl if (jj.lt.istart) jj = jj + ninc jk = jf + laincl if (jk.lt.istart) jk = jk + ninc jl = jk + jstepl if (jl.lt.istart) jl = jl + ninc jm = jl + jstepl if (jm.lt.istart) jm = jm + ninc jn = jm + jstepl if (jn.lt.istart) jn = jn + ninc jo = jn + jstepl if (jo.lt.istart) jo = jo + ninc jp = jk + laincl if (jp.lt.istart) jp = jp + ninc jq = jp + jstepl if (jq.lt.istart) jq = jq + ninc jr = jq + jstepl if (jr.lt.istart) jr = jr + ninc js = jr + jstepl if (js.lt.istart) js = js + ninc jt = js + jstepl if (jt.lt.istart) jt = jt + ninc ju = jp + laincl if (ju.lt.istart) ju = ju + ninc jv = ju + jstepl if (jv.lt.istart) jv = jv + ninc jw = jv + jstepl if (jw.lt.istart) jw = jw + ninc jx = jw + jstepl if (jx.lt.istart) jx = jx + ninc jy = jx + jstepl if (jy.lt.istart) jy = jy + ninc j = 0 ! ! loop across transforms ! ---------------------- if (k.eq.0) then ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 410 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd ajf = a(jf+j) ajb = ajf t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 ajk = a(jk+j) ajc = ajk t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd bjf = b(jf+j) bjb = bjf u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 bjk = b(jk+j) bjc = bjk u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jf+j) = t8 - u11 b(jf+j) = u8 + t11 aje = t8 + u11 bje = u8 - t11 a(jk+j) = t9 - u10 b(jk+j) = u9 + t10 ajd = t9 + u10 bjd = u9 - t10 !---------------------- ajg = a(jg+j) ajj = a(jj+j) t1 = ajg + ajj ajh = a(jh+j) aji = a(ji+j) t2 = ajh + aji t3 = ajg - ajj t4 = ajh - aji ajl = a(jl+j) ajh = ajl t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajb - 0.25_dp * t5 a(jb+j) = ajb + t5 t8 = t7 + t6 t9 = t7 - t6 ajq = a(jq+j) aji = ajq t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjg = b(jg+j) bjj = b(jj+j) u1 = bjg + bjj bjh = b(jh+j) bji = b(ji+j) u2 = bjh + bji u3 = bjg - bjj u4 = bjh - bji bjl = b(jl+j) bjh = bjl u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjb - 0.25_dp * u5 b(jb+j) = bjb + u5 u8 = u7 + u6 u9 = u7 - u6 bjq = b(jq+j) bji = bjq u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jg+j) = t8 - u11 b(jg+j) = u8 + t11 ajj = t8 + u11 bjj = u8 - t11 a(jl+j) = t9 - u10 b(jl+j) = u9 + t10 a(jq+j) = t9 + u10 b(jq+j) = u9 - t10 !---------------------- ajo = a(jo+j) t1 = ajh + ajo ajm = a(jm+j) ajn = a(jn+j) t2 = ajm + ajn t3 = ajh - ajo t4 = ajm - ajn ajr = a(jr+j) ajn = ajr t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajc - 0.25_dp * t5 a(jc+j) = ajc + t5 t8 = t7 + t6 t9 = t7 - t6 ajw = a(jw+j) ajo = ajw t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjo = b(jo+j) u1 = bjh + bjo bjm = b(jm+j) bjn = b(jn+j) u2 = bjm + bjn u3 = bjh - bjo u4 = bjm - bjn bjr = b(jr+j) bjn = bjr u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjc - 0.25_dp * u5 b(jc+j) = bjc + u5 u8 = u7 + u6 u9 = u7 - u6 bjw = b(jw+j) bjo = bjw u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jh+j) = t8 - u11 b(jh+j) = u8 + t11 a(jw+j) = t8 + u11 b(jw+j) = u8 - t11 a(jm+j) = t9 - u10 b(jm+j) = u9 + t10 a(jr+j) = t9 + u10 b(jr+j) = u9 - t10 !---------------------- ajt = a(jt+j) t1 = aji + ajt ajs = a(js+j) t2 = ajn + ajs t3 = aji - ajt t4 = ajn - ajs ajx = a(jx+j) ajt = ajx t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) ajp = a(jp+j) t7 = ajp - 0.25_dp * t5 ax = ajp + t5 t8 = t7 + t6 t9 = t7 - t6 a(jp+j) = ajd t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(jd+j) = ax bjt = b(jt+j) u1 = bji + bjt bjs = b(js+j) u2 = bjn + bjs u3 = bji - bjt u4 = bjn - bjs bjx = b(jx+j) bjt = bjx u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bjp = b(jp+j) u7 = bjp - 0.25_dp * u5 bx = bjp + u5 u8 = u7 + u6 u9 = u7 - u6 b(jp+j) = bjd u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(jd+j) = bx a(ji+j) = t8 - u11 b(ji+j) = u8 + t11 a(jx+j) = t8 + u11 b(jx+j) = u8 - t11 a(jn+j) = t9 - u10 b(jn+j) = u9 + t10 a(js+j) = t9 + u10 b(js+j) = u9 - t10 !---------------------- ajv = a(jv+j) ajy = a(jy+j) t1 = ajv + ajy t2 = ajo + ajt t3 = ajv - ajy t4 = ajo - ajt a(jv+j) = ajj t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aju = a(ju+j) t7 = aju - 0.25_dp * t5 ax = aju + t5 t8 = t7 + t6 t9 = t7 - t6 a(ju+j) = aje t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(je+j) = ax bjv = b(jv+j) bjy = b(jy+j) u1 = bjv + bjy u2 = bjo + bjt u3 = bjv - bjy u4 = bjo - bjt b(jv+j) = bjj u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bju = b(ju+j) u7 = bju - 0.25_dp * u5 bx = bju + u5 u8 = u7 + u6 u9 = u7 - u6 b(ju+j) = bje u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(je+j) = bx a(jj+j) = t8 - u11 b(jj+j) = u8 + t11 a(jy+j) = t8 + u11 b(jy+j) = u8 - t11 a(jo+j) = t9 - u10 b(jo+j) = u9 + t10 a(jt+j) = t9 + u10 b(jt+j) = u9 - t10 j = j + jump 410 continue ! else ! #ifdef OLD_CRAY !dir$ ivdep, shortloop #endif do 440 l = 1 , nvex ajb = a(jb+j) aje = a(je+j) t1 = ajb + aje ajc = a(jc+j) ajd = a(jd+j) t2 = ajc + ajd t3 = ajb - aje t4 = ajc - ajd ajf = a(jf+j) ajb = ajf t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aja = a(ja+j) t7 = aja - 0.25_dp * t5 a(ja+j) = aja + t5 t8 = t7 + t6 t9 = t7 - t6 ajk = a(jk+j) ajc = ajk t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjb = b(jb+j) bje = b(je+j) u1 = bjb + bje bjc = b(jc+j) bjd = b(jd+j) u2 = bjc + bjd u3 = bjb - bje u4 = bjc - bjd bjf = b(jf+j) bjb = bjf u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bja = b(ja+j) u7 = bja - 0.25_dp * u5 b(ja+j) = bja + u5 u8 = u7 + u6 u9 = u7 - u6 bjk = b(jk+j) bjc = bjk u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jf+j) = co1*(t8-u11) - si1*(u8+t11) b(jf+j) = si1*(t8-u11) + co1*(u8+t11) aje = co4*(t8+u11) - si4*(u8-t11) bje = si4*(t8+u11) + co4*(u8-t11) a(jk+j) = co2*(t9-u10) - si2*(u9+t10) b(jk+j) = si2*(t9-u10) + co2*(u9+t10) ajd = co3*(t9+u10) - si3*(u9-t10) bjd = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajg = a(jg+j) ajj = a(jj+j) t1 = ajg + ajj ajh = a(jh+j) aji = a(ji+j) t2 = ajh + aji t3 = ajg - ajj t4 = ajh - aji ajl = a(jl+j) ajh = ajl t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajb - 0.25_dp * t5 a(jb+j) = ajb + t5 t8 = t7 + t6 t9 = t7 - t6 ajq = a(jq+j) aji = ajq t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjg = b(jg+j) bjj = b(jj+j) u1 = bjg + bjj bjh = b(jh+j) bji = b(ji+j) u2 = bjh + bji u3 = bjg - bjj u4 = bjh - bji bjl = b(jl+j) bjh = bjl u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjb - 0.25_dp * u5 b(jb+j) = bjb + u5 u8 = u7 + u6 u9 = u7 - u6 bjq = b(jq+j) bji = bjq u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jg+j) = co1*(t8-u11) - si1*(u8+t11) b(jg+j) = si1*(t8-u11) + co1*(u8+t11) ajj = co4*(t8+u11) - si4*(u8-t11) bjj = si4*(t8+u11) + co4*(u8-t11) a(jl+j) = co2*(t9-u10) - si2*(u9+t10) b(jl+j) = si2*(t9-u10) + co2*(u9+t10) a(jq+j) = co3*(t9+u10) - si3*(u9-t10) b(jq+j) = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajo = a(jo+j) t1 = ajh + ajo ajm = a(jm+j) ajn = a(jn+j) t2 = ajm + ajn t3 = ajh - ajo t4 = ajm - ajn ajr = a(jr+j) ajn = ajr t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) t7 = ajc - 0.25_dp * t5 a(jc+j) = ajc + t5 t8 = t7 + t6 t9 = t7 - t6 ajw = a(jw+j) ajo = ajw t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 bjo = b(jo+j) u1 = bjh + bjo bjm = b(jm+j) bjn = b(jn+j) u2 = bjm + bjn u3 = bjh - bjo u4 = bjm - bjn bjr = b(jr+j) bjn = bjr u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) u7 = bjc - 0.25_dp * u5 b(jc+j) = bjc + u5 u8 = u7 + u6 u9 = u7 - u6 bjw = b(jw+j) bjo = bjw u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 a(jh+j) = co1*(t8-u11) - si1*(u8+t11) b(jh+j) = si1*(t8-u11) + co1*(u8+t11) a(jw+j) = co4*(t8+u11) - si4*(u8-t11) b(jw+j) = si4*(t8+u11) + co4*(u8-t11) a(jm+j) = co2*(t9-u10) - si2*(u9+t10) b(jm+j) = si2*(t9-u10) + co2*(u9+t10) a(jr+j) = co3*(t9+u10) - si3*(u9-t10) b(jr+j) = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajt = a(jt+j) t1 = aji + ajt ajs = a(js+j) t2 = ajn + ajs t3 = aji - ajt t4 = ajn - ajs ajx = a(jx+j) ajt = ajx t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) ajp = a(jp+j) t7 = ajp - 0.25_dp * t5 ax = ajp + t5 t8 = t7 + t6 t9 = t7 - t6 a(jp+j) = ajd t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(jd+j) = ax bjt = b(jt+j) u1 = bji + bjt bjs = b(js+j) u2 = bjn + bjs u3 = bji - bjt u4 = bjn - bjs bjx = b(jx+j) bjt = bjx u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bjp = b(jp+j) u7 = bjp - 0.25_dp * u5 bx = bjp + u5 u8 = u7 + u6 u9 = u7 - u6 b(jp+j) = bjd u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(jd+j) = bx a(ji+j) = co1*(t8-u11) - si1*(u8+t11) b(ji+j) = si1*(t8-u11) + co1*(u8+t11) a(jx+j) = co4*(t8+u11) - si4*(u8-t11) b(jx+j) = si4*(t8+u11) + co4*(u8-t11) a(jn+j) = co2*(t9-u10) - si2*(u9+t10) b(jn+j) = si2*(t9-u10) + co2*(u9+t10) a(js+j) = co3*(t9+u10) - si3*(u9-t10) b(js+j) = si3*(t9+u10) + co3*(u9-t10) !---------------------- ajv = a(jv+j) ajy = a(jy+j) t1 = ajv + ajy t2 = ajo + ajt t3 = ajv - ajy t4 = ajo - ajt a(jv+j) = ajj t5 = t1 + t2 t6 = c1 * ( t1 - t2 ) aju = a(ju+j) t7 = aju - 0.25_dp * t5 ax = aju + t5 t8 = t7 + t6 t9 = t7 - t6 a(ju+j) = aje t10 = c3 * t3 - c2 * t4 t11 = c2 * t3 + c3 * t4 a(je+j) = ax bjv = b(jv+j) bjy = b(jy+j) u1 = bjv + bjy u2 = bjo + bjt u3 = bjv - bjy u4 = bjo - bjt b(jv+j) = bjj u5 = u1 + u2 u6 = c1 * ( u1 - u2 ) bju = b(ju+j) u7 = bju - 0.25_dp * u5 bx = bju + u5 u8 = u7 + u6 u9 = u7 - u6 b(ju+j) = bje u10 = c3 * u3 - c2 * u4 u11 = c2 * u3 + c3 * u4 b(je+j) = bx a(jj+j) = co1*(t8-u11) - si1*(u8+t11) b(jj+j) = si1*(t8-u11) + co1*(u8+t11) a(jy+j) = co4*(t8+u11) - si4*(u8-t11) b(jy+j) = si4*(t8+u11) + co4*(u8-t11) a(jo+j) = co2*(t9-u10) - si2*(u9+t10) b(jo+j) = si2*(t9-u10) + co2*(u9+t10) a(jt+j) = co3*(t9+u10) - si3*(u9-t10) b(jt+j) = si3*(t9+u10) + co3*(u9-t10) j = j + jump 440 continue ! endif ! !-----(end of loop across transforms) ! ja = ja + jstepx if (ja.lt.istart) ja = ja + ninc 445 continue 450 continue 460 continue !-----( end of double loop for this k ) kk = kk + 2*la 470 continue !-----( end of loop over values of k ) la = 5*la 480 continue !-----( end of loop on type II radix-5 passes ) !-----( nvex transforms completed) 490 continue istart = istart + nvex * jump 500 continue !-----( end of loop on blocks of transforms ) ! return end subroutine gpfa5f end module gridxc_gpfa_core_dp module gridxc_fft_gpfa ! ! 1d fft routine based on Clive Temperton's GPFA package ! use gridxc_gpfa_core_sp, only: gpfa use gridxc_gpfa_core_dp, only: gpfa implicit none integer, parameter :: dp = selected_real_kind(10,100) integer, parameter :: sp = selected_real_kind(6,10) ! This module is not thread safe due to the use of ! these global variables. ! To make it thread-safe, and to have more control over ! the re-use of the auxiliary twiddle factors, you might ! define a derived type to hold them and call the wrapper routines ! with it as one of the arguments. real(dp), allocatable, save :: trigs(:) integer, save :: nold = -1 !---------------------------------------------------------- ! auxiliary variable character(len=20) :: msg public :: nfft ! Generates N compatible with the fft interface fft_gpfa_ez module procedure fft_gpfa_ez_dp module procedure fft_gpfa_ez_sp end interface fft_gpfa_ez interface fft_gpfa module procedure fft_gpfa_dp module procedure fft_gpfa_sp end interface fft_gpfa public :: fft_gpfa_ez ! Simple version with complex array public :: fft_gpfa ! General version with strides, ! multiple vectors, etc ! Wrapper for setgpfa which returns proper length of work array public :: setgpfa_check ! Access to the standard legacy routines, if needed public :: gpfa, setgpfa private CONTAINS SUBROUTINE nfft( N ) integer, intent(inout) :: N ! CHANGES N INTO THE NEXT INTEGER ALLOWED BY THE FFT ROUTINE ! WRITTEN BY J.M.SOLER. MAY/95. ! Modified by A. Garcia, May 2015 integer :: np, nmax, nmin, nrem, ip ! no limits with huge, but subtract 2 to avoid 'overflow' warning parameter (NP = 3, NMAX = huge(N)-2) integer :: IPRIME(NP) data IPRIME / 2, 3, 5 / NMIN = N DO N = NMIN, NMAX NREM = N DO IP = 1,NP 10 CONTINUE IF ( MODULO( NREM, IPRIME(IP) ) .EQ. 0 ) THEN NREM = NREM / IPRIME(IP) GOTO 10 ENDIF ENDDO IF (NREM .EQ. 1) RETURN ENDDO write(msg,"(i0)") NMIN call die('NFFT: NO SUITABLE INTEGER FOUND FOR N =' // trim(msg)) END SUBROUTINE nfft !---------------------------------------------------- subroutine fft_gpfa_ez_sp(data,n,isign) ! packed complex array in a real array twice as long integer, intent(in) :: n real(sp), intent(inout) :: data(2*n) integer, intent(in) :: isign ! ISIGN = +1 FOR FORWARD TRANSFORM ! = -1 FOR INVERSE TRANSFORM ! If n has changed since the last use, ! set trigs array, used by 1D-FFT routine gpfa ! The routine setgpfa_alloc is a special version ! of setgpfa which (re)allocates the trigs argument ! if needed, and re-fills it with new twiddle factors if (n /= nold) then call setgpfa_alloc( trigs, n ) end if ! FFT. Arguments of gpfa are: ! gpfa( realData(*), imagData(*), trigs(*), ! dataSpan, vectorSpan, vectorSize, numVectors, fftDir ) ! ! Note dataSpan=2 since we are packing the real and imaginary ! parts in the same vector. call gpfa( data(1:), data(2:), trigs, & 2, 1, n, 1, isign ) nold = n end subroutine fft_gpfa_ez_sp !---------------------------------------------------- subroutine fft_gpfa_ez_dp(data,n,isign) ! packed complex array in a real array twice as long integer, intent(in) :: n real(dp), intent(inout) :: data(2*n) integer, intent(in) :: isign ! ISIGN = +1 FOR FORWARD TRANSFORM ! = -1 FOR INVERSE TRANSFORM ! If n has changed since the last use, ! set trigs array, used by 1D-FFT routine gpfa ! The routine setgpfa_alloc is a special version ! of setgpfa which (re)allocates the trigs argument ! if needed, and re-fills it with new twiddle factors if (n /= nold) then call setgpfa_alloc( trigs, n ) end if ! FFT. Arguments of gpfa are: ! gpfa( realData(*), imagData(*), trigs(*), ! dataSpan, vectorSpan, vectorSize, numVectors, fftDir ) ! ! Note dataSpan=2 since we are packing the real and imaginary ! parts in the same vector. call gpfa( data(1:), data(2:), trigs, & 2, 1, n, 1, isign ) nold = n end subroutine fft_gpfa_ez_dp !---------------------------------------------------------------------- subroutine fft_gpfa_sp(realData, imagData, & dataSpan, vectorSpan, N, numVectors, & isign ) ! A wrapper for the GPFA routine which handles the ! work array automatically real(sp), intent(inout) :: realData(:), imagData(:) integer, intent(in) :: dataSpan, vectorSpan, N, numVectors integer, intent(in) :: isign ! ISIGN = +1 FOR FORWARD TRANSFORM ! = -1 FOR INVERSE TRANSFORM ! Set trigs array, used by 1D-FFT routine gpfa ! The routine setgpfa_alloc is a special version ! of setgpfa which (re)allocates the trigs argument ! if needed, and re-fills it with new twiddle factors ! if n has changed since the last use. if (n /= nold) then call setgpfa_alloc( trigs, n ) end if call gpfa(realData, imagData, trigs, & dataSpan, vectorSpan, N, numVectors, isign) nold = n end subroutine fft_gpfa_sp !---------------------------------------------------------------------- subroutine fft_gpfa_dp(realData, imagData, & dataSpan, vectorSpan, N, numVectors, & isign ) ! A wrapper for the GPFA routine which handles the ! work array automatically real(dp), intent(inout) :: realData(:), imagData(:) integer, intent(in) :: dataSpan, vectorSpan, N, numVectors integer, intent(in) :: isign ! ISIGN = +1 FOR FORWARD TRANSFORM ! = -1 FOR INVERSE TRANSFORM ! Set trigs array, used by 1D-FFT routine gpfa ! The routine setgpfa_alloc is a special version ! of setgpfa which (re)allocates the trigs argument ! if needed, and re-fills it with new twiddle factors ! if n has changed since the last use. if (n /= nold) then call setgpfa_alloc( trigs, n ) end if call gpfa(realData, imagData, trigs, & dataSpan, vectorSpan, N, numVectors, isign) nold = n end subroutine fft_gpfa_dp !------------------------------------------------------------------ subroutine setgpfa_alloc( trigs, n ) real(dp), allocatable, intent(inout) :: trigs(:) integer, intent(in) :: n integer :: ntrigs if (.not. allocated(trigs)) then allocate(trigs(100)) endif call setgpfa_check(trigs,size(trigs),ntrigs,n) if (size(trigs) < ntrigs) then deallocate(trigs) allocate(trigs(ntrigs)) call setgpfa_check(trigs,size(trigs),ntrigs,n) if (size(trigs) < ntrigs) STOP "ntrigs error" endif end subroutine setgpfa_alloc ! SUBROUTINE SETGPFA_check(TRIGS,maxtrigs,ntrigs,N) ! integer, intent(in) :: maxtrigs ! The current size of trigs REAL(dp) TRIGS(maxtrigs) integer, intent(out) :: ntrigs ! The needed size for trigs integer, intent(in) :: N INTEGER NJ(3) integer :: nn, ifac, ll, kk, ip, iq, ir, & i, ni, irot, kink, k real(dp) :: angle, twopi, del ! ! DECOMPOSE N INTO FACTORS 2,3,5 ! ------------------------------ NN = N IFAC = 2 ! DO 30 LL = 1 , 3 KK = 0 10 CONTINUE IF (MOD(NN,IFAC).NE.0) GO TO 20 KK = KK + 1 NN = NN / IFAC GO TO 10 20 CONTINUE NJ(LL) = KK IFAC = IFAC + LL 30 CONTINUE ! IF (NN.NE.1) THEN write(msg,"(i0)") N call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N') ENDIF ! IP = NJ(1) IQ = NJ(2) IR = NJ(3) ! ! COMPUTE LIST OF ROTATED TWIDDLE FACTORS ! --------------------------------------- NJ(1) = 2**IP NJ(2) = 3**IQ NJ(3) = 5**IR ! TWOPI = 4.0_dp * ASIN(1.0_dp) I = 1 ! DO 60 LL = 1 , 3 NI = NJ(LL) IF (NI.EQ.1) GO TO 60 ! DEL = TWOPI / real(NI,kind=dp) IROT = N / NI KINK = MOD(IROT,NI) KK = 0 ! DO 50 K = 1 , NI ANGLE = real(KK,kind=dp) * DEL if (i.lt.maxtrigs) then TRIGS(I) = COS(ANGLE) TRIGS(I+1) = SIN(ANGLE) endif I = I + 2 KK = KK + KINK IF (KK.GT.NI) KK = KK - NI 50 CONTINUE 60 CONTINUE ntrigs = i - 1 ! note boundary. i is even ! RETURN END SUBROUTINE SETGPFA_check ! ! SUBROUTINE 'SETGPFA' ! SETUP ROUTINE FOR SELF-SORTING IN-PLACE ! GENERALIZED PRIME FACTOR (COMPLEX) FFT [GPFA] ! ! CALL SETGPFA(TRIGS,N) ! ! INPUT : ! ----- ! N IS THE LENGTH OF THE TRANSFORMS. N MUST BE OF THE FORM: ! ----------------------------------- ! N = (2**IP) * (3**IQ) * (5**IR) ! ----------------------------------- ! ! OUTPUT: ! ------ ! TRIGS IS A TABLE OF TWIDDLE FACTORS, ! OF LENGTH 2*IPQR (REAL) WORDS, WHERE: ! -------------------------------------- ! IPQR = (2**IP) + (3**IQ) + (5**IR) ! -------------------------------------- ! ! WRITTEN BY CLIVE TEMPERTON 1990 ! !********************************************************************* ! SUBROUTINE SETGPFA(TRIGS,N) ! INTEGER NJ(3) REAL(dp) TRIGS(*) integer :: nn, n, ifac, ll, kk, ip, iq, ir, & i, ni, irot, kink, k real(dp) :: angle, twopi, del ! ! DECOMPOSE N INTO FACTORS 2,3,5 ! ------------------------------ NN = N IFAC = 2 ! DO 30 LL = 1 , 3 KK = 0 10 CONTINUE IF (MOD(NN,IFAC).NE.0) GO TO 20 KK = KK + 1 NN = NN / IFAC GO TO 10 20 CONTINUE NJ(LL) = KK IFAC = IFAC + LL 30 CONTINUE ! IF (NN.NE.1) THEN write(msg,"(i0)") N call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N') ENDIF ! IP = NJ(1) IQ = NJ(2) IR = NJ(3) ! ! COMPUTE LIST OF ROTATED TWIDDLE FACTORS ! --------------------------------------- NJ(1) = 2**IP NJ(2) = 3**IQ NJ(3) = 5**IR ! TWOPI = 4.0_dp * ASIN(1.0_dp) I = 1 ! DO 60 LL = 1 , 3 NI = NJ(LL) IF (NI.EQ.1) GO TO 60 ! DEL = TWOPI / real(NI,kind=dp) IROT = N / NI KINK = MOD(IROT,NI) KK = 0 ! DO 50 K = 1 , NI ANGLE = real(KK,kind=dp) * DEL TRIGS(I) = COS(ANGLE) TRIGS(I+1) = SIN(ANGLE) I = I + 2 KK = KK + KINK IF (KK.GT.NI) KK = KK - NI 50 CONTINUE 60 CONTINUE ! RETURN END SUBROUTINE SETGPFA end module gridxc_fft_gpfa