gridxc_fft_gpfa.F90 Source File


Contents

Source Code


Source Code

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

!     1d fft routine based on Clive Temperton's GPFA package

! Explanation of the module structure:
!
! First come two modules implementing the sp and dp versions of the
! 'gpfa' main routine in Clive Temperton's package. Each exports a
! 'gpfa' interface.

! Then, a wrapper module 'gridxc_fft_gpfa' imports both interfaces and
! re-exports 'gpfa', and also contains the precision-independent parts
! of the GPFA package (setgpfa) and two wrapper routines:
!
!     fft_gpfa
!     fft_gpfa_ez, for packed arrays
!
!     These wrapper routines avoid the explicit use of the 'trigs'
!     twiddle factors argument, which is instead generated as needed.
!
! NOTES for building-system developers:
!
!   While there is only one source file, three* modules need to be accounted for:
!
!      gridxc_fft_gpfa.mod
!      gridxc_gpfa_core_sp.mod
!      gridxc_gpfa_core_dp.mod
!
!   If this is uwieldly, the auxiliary modules could be eliminated, and the code inserted
!   in one big module. In that case, however, all the routines would need to be disambiguated.
!   (For example, gpfa2f --> gpfa2f_sp / gpfa2f_dp)

module gridxc_gpfa_core_sp
!
!     1d fft routine based on Clive Temperton's GPFA package
!
! Vectors to be transformed are in SINGLE precision (wp=sp)

  implicit none

! In this module, trigs twiddle factors and intermediate
! variables are kept in double precision.
! (This was the standard approach in Siesta. In other codes the Temperton code
!  is simply converted wholesale to sp or dp)

integer, parameter :: dp = selected_real_kind(10,100)
integer, parameter :: sp = selected_real_kind(6,10)
integer, parameter :: wp = sp

interface gpfa
   module procedure gpfa_
end interface gpfa

public :: gpfa
private

CONTAINS

!--------------------------------------------------------------
!
!  What follows is the original code by Temperton (save setgpfa)
!  with explicit declarations of variables
!
! In this module, trigs twiddle factors and intermediate
! variables are kept in double precision.

!  For arbitrary-precision support the parameter constants might
!  be computed instead of inserted explicitly.
!
!  For example: sin60 = sin(2.0_wp*pi/3.0_wp), where wp is the working
!                       precision, and
!               pi = 4.0_wp * atan(1.0_wp), or
!               twopi = 4.0_wp * asin(1.0_wp)
!
!*********************************************************************

!        SUBROUTINE 'GPFA'
!        SELF-SORTING IN-PLACE GENERALIZED PRIME FACTOR (COMPLEX) FFT
!
!        *** THIS IS THE ALL-FORTRAN VERSION ***
!            -------------------------------
!
!        CALL GPFA(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN)
!
!        A IS FIRST REAL INPUT/OUTPUT VECTOR
!        B IS FIRST IMAGINARY INPUT/OUTPUT VECTOR
!        TRIGS IS A TABLE OF TWIDDLE FACTORS, PRECALCULATED
!              BY CALLING SUBROUTINE 'SETGPFA'
!        INC IS THE INCREMENT WITHIN EACH DATA VECTOR
!        JUMP IS THE INCREMENT BETWEEN DATA VECTORS
!        N IS THE LENGTH OF THE TRANSFORMS:
!          -----------------------------------
!            N = (2**IP) * (3**IQ) * (5**IR)
!          -----------------------------------
!        LOT IS THE NUMBER OF TRANSFORMS
!        ISIGN = +1 FOR FORWARD TRANSFORM
!              = -1 FOR INVERSE TRANSFORM
!
!        WRITTEN BY CLIVE TEMPERTON
!        RECHERCHE EN PREVISION NUMERIQUE
!        ATMOSPHERIC ENVIRONMENT SERVICE, CANADA
!
!----------------------------------------------------------------------
!
!        DEFINITION OF TRANSFORM
!        -----------------------
!
!        X(J) = SUM(K=0,...,N-1)(C(K)*EXP(ISIGN*2*I*J*K*PI/N))
!
!---------------------------------------------------------------------
!
!        FOR A MATHEMATICAL DEVELOPMENT OF THE ALGORITHM USED,
!        SEE:
!
!        C TEMPERTON : "A GENERALIZED PRIME FACTOR FFT ALGORITHM
!          FOR ANY N = (2**P)(3**Q)(5**R)",
!          SIAM J. SCI. STAT. COMP., MAY 1992.
!

SUBROUTINE GPFA_(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN)

IMPLICIT NONE
!
! NOTE: The original declaration of these arguments (here and below)
! was: A(*), B(*)
REAL(wp) ::     A(:), B(:) 
REAL(dp)  TRIGS(*)
INTEGER INC, JUMP, N, LOT, ISIGN
!
INTEGER &
  NJ(3), NN, IFAC, LL, KK, IP, IQ, IR, I

character(len=20) :: msg
!     
!     DECOMPOSE N INTO FACTORS 2,3,5
!     ------------------------------
NN = N
IFAC = 2
!
DO 30 LL = 1 , 3
KK = 0
10 CONTINUE
IF (MOD(NN,IFAC).NE.0) GO TO 20
KK = KK + 1
NN = NN / IFAC
GO TO 10
20 CONTINUE
NJ(LL) = KK
IFAC = IFAC + LL
30 CONTINUE
!
IF (NN.NE.1) THEN
   write(msg,"(i0)") N
   call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N')
ENDIF
!
IP = NJ(1)
IQ = NJ(2)
IR = NJ(3)
!
!     COMPUTE THE TRANSFORM
!     ---------------------
I = 1
IF (IP.GT.0) THEN
   CALL GPFA2F(A,B,TRIGS,INC,JUMP,N,IP,LOT,ISIGN)
   I = I + 2 * ( 2**IP)
ENDIF
IF (IQ.GT.0) THEN
   CALL GPFA3F(A,B,TRIGS(I),INC,JUMP,N,IQ,LOT,ISIGN)
   I = I + 2 * (3**IQ)
ENDIF
IF (IR.GT.0) THEN
   CALL GPFA5F(A,B,TRIGS(I),INC,JUMP,N,IR,LOT,ISIGN)
ENDIF
!
RETURN
END SUBROUTINE GPFA_


!*********************************************************************

!     fortran version of *gpfa2* -
!     radix-2 section of self-sorting, in-place, generalized pfa
!     central radix-2 and radix-8 passes included
!      so that transform length can be any power of 2
!

subroutine gpfa2f(a,b,trigs,inc,jump,n,mm,lot,isign)

real(wp) ::     a(*), b(*)
real(dp)     ::     trigs(*)
integer inc, jump, n, mm, lot, isign

integer  n2, inq, jstepx, ninc, ink, m2, m8, &
         m, mh, nblox, left, istart, nb, nvex, &
         la, mu, ipass, jstep, jstepl, jjj, ja, &
         nu, jb, jc, jd, j, l, kk, k, je, jf, &
         jg, jh, laincl, ll, ji, jj, jk, jl, jm, &
         jn, jo, jp

real(dp) :: s, aja, ajc, t0, t2, ajb, ajd, &
                t1, t3, bja, bjc, u0, u2, bjb, &
                bjd, u1, u3, co1, si1, co2, si2, &
                co3, si3, ss, c1, c2, c3, aje, &
                ajf, ajg, ajh, bje, bjf, bjg, &
                bjh, co4, si4, co5, si5, co6, si6, &
                co7, si7, aji, bjm, ajj, bjj, ajk, &
                ajl, bji, bjk, ajo, bjl, bjo, ajm, &
                ajn, ajp, bjn, bjp

#ifdef OLD_CRAY
integer, parameter :: lvr = CRAY_LVR_PARAMETER
#else
integer, parameter :: lvr = 1024
#endif
!
!     ***************************************************************
!     *                                                             *
!     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
!     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
!     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
!     *                                                             *
!     ***************************************************************
!
n2 = 2**mm
inq = n/n2
jstepx = (n2-n) * inc
ninc = n * inc
ink = inc * inq
!
m2 = 0
m8 = 0
if (mod(mm,2).eq.0) then
   m = mm/2
else if (mod(mm,4).eq.1) then
   m = (mm-1)/2
   m2 = 1
else if (mod(mm,4).eq.3) then
   m = (mm-3)/2
   m8 = 1
endif
mh = (m+1)/2
!
nblox = 1 + (lot-1)/lvr
left = lot
s = real(isign,kind=dp)
istart = 1
!
!  loop on blocks of lvr transforms
!  --------------------------------
do 500 nb = 1 , nblox
!
if (left.le.lvr) then
   nvex = left
else if (left.lt.(2*lvr)) then
   nvex = left/2
   nvex = nvex + mod(nvex,2)
else
   nvex = lvr
endif
left = left - nvex
!
la = 1
!
!  loop on type I radix-4 passes
!  -----------------------------
mu = mod(inq,4)
if (isign.eq.-1) mu = 4 - mu
ss = 1.0_dp
if (mu.eq.3) ss = -1.0_dp
!
if (mh.eq.0) go to 200
!
do 160 ipass = 1 , mh
jstep = (n*inc) / (4*la)
jstepl = jstep - ninc
!
!  k = 0 loop (no twiddle factors)
!  -------------------------------
do 120 jjj = 0 , (n-1)*inc , 4*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 115 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 110 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
a(ja+j) = t0 + t1
a(jc+j) = t0 - t1
b(ja+j) = u0 + u1
b(jc+j) = u0 - u1
a(jb+j) = t2 - u3
a(jd+j) = t2 + u3
b(jb+j) = u2 + t3
b(jd+j) = u2 - t3
j = j + jump
110 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
115 continue
120 continue
!
!  finished if n2 = 4
!  ------------------
if (n2.eq.4) go to 490
kk = 2 * la
!
!  loop on nonzero k
!  -----------------
do 150 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
!
!  loop along transform
!  --------------------
do 140 jjj = k , (n-1)*inc , 4*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 135 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 130 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
a(ja+j) = t0 + t1
b(ja+j) = u0 + u1
a(jb+j) = co1*(t2-u3) - si1*(u2+t3)
b(jb+j) = si1*(t2-u3) + co1*(u2+t3)
a(jc+j) = co2*(t0-t1) - si2*(u0-u1)
b(jc+j) = si2*(t0-t1) + co2*(u0-u1)
a(jd+j) = co3*(t2+u3) - si3*(u2-t3)
b(jd+j) = si3*(t2+u3) + co3*(u2-t3)
j = j + jump
130 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
135 continue
140 continue
!-----( end of loop along transforms )
kk = kk + 2*la
150 continue
!-----( end of loop on nonzero k )
la = 4*la
160 continue
!-----( end of loop on type I radix-4 passes)
!
!  central radix-2 pass
!  --------------------
200 continue
if (m2.eq.0) go to 300
!
jstep = (n*inc) / (2*la)
jstepl = jstep - ninc
!
!  k=0 loop (no twiddle factors)
!  -----------------------------
do 220 jjj = 0 , (n-1)*inc , 2*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 215 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 210 l = 1 , nvex
aja = a(ja+j)
ajb = a(jb+j)
t0 = aja - ajb
a(ja+j) = aja + ajb
a(jb+j) = t0
bja = b(ja+j)
bjb = b(jb+j)
u0 = bja - bjb
b(ja+j) = bja + bjb
b(jb+j) = u0
j = j + jump
210 continue
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
215 continue
220 continue
!
!  finished if n2=2
!  ----------------
if (n2.eq.2) go to 490
!
kk = 2 * la
!
!  loop on nonzero k
!  -----------------
do 260 k = ink , jstep - ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
!
!  loop along transforms
!  ---------------------
do 250 jjj = k , (n-1)*inc , 2*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 245 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
j = 0
!
!  loop across transforms
!  ----------------------
if (kk.eq.n2/2) then
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 230 l = 1 , nvex
aja = a(ja+j)
ajb = a(jb+j)
t0 = ss * ( aja - ajb )
a(ja+j) = aja + ajb
bjb = b(jb+j)
bja = b(ja+j)
a(jb+j) = ss * ( bjb - bja )
b(ja+j) = bja + bjb
b(jb+j) = t0
j = j + jump
230 continue
!
else
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 240 l = 1 , nvex
aja = a(ja+j)
ajb = a(jb+j)
t0 = aja - ajb
a(ja+j) = aja + ajb
bja = b(ja+j)
bjb = b(jb+j)
u0 = bja - bjb
b(ja+j) = bja + bjb
a(jb+j) = co1*t0 - si1*u0
b(jb+j) = si1*t0 + co1*u0
j = j + jump
240 continue
!
endif
!
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
245 continue
250 continue
!-----(end of loop along transforms)
kk = kk + 2 * la
260 continue
!-----(end of loop on nonzero k)
!-----(end of radix-2 pass)
!
la = 2 * la
go to 400
!
!  central radix-8 pass
!  --------------------
300 continue
if (m8.eq.0) go to 400
jstep = (n*inc) / (8*la)
jstepl = jstep - ninc
mu = mod(inq,8)
if (isign.eq.-1) mu = 8 - mu
c1 = 1.0_dp
if (mu.eq.3.or.mu.eq.7) c1 = -1.0_dp
c2 = sqrt(0.5_dp)
if (mu.eq.3.or.mu.eq.5) c2 = -c2
c3 = c1 * c2
!
!  stage 1
!  -------
do 320 k = 0 , jstep - ink , ink
do 315 jjj = k , (n-1)*inc , 8*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 312 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
j = 0
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 310 l = 1 , nvex
aja = a(ja+j)
aje = a(je+j)
t0 = aja - aje
a(ja+j) = aja + aje
ajc = a(jc+j)
ajg = a(jg+j)
t1 = c1 * ( ajc - ajg )
a(je+j) = ajc + ajg
ajb = a(jb+j)
ajf = a(jf+j)
t2 = ajb - ajf
a(jc+j) = ajb + ajf
ajd = a(jd+j)
ajh = a(jh+j)
t3 = ajd - ajh
a(jg+j) = ajd + ajh
a(jb+j) = t0
a(jf+j) = t1
a(jd+j) = c2 * ( t2 - t3 )
a(jh+j) = c3 * ( t2 + t3 )
bja = b(ja+j)
bje = b(je+j)
u0 = bja - bje
b(ja+j) = bja + bje
bjc = b(jc+j)
bjg = b(jg+j)
u1 = c1 * ( bjc - bjg )
b(je+j) = bjc + bjg
bjb = b(jb+j)
bjf = b(jf+j)
u2 = bjb - bjf
b(jc+j) = bjb + bjf
bjd = b(jd+j)
bjh = b(jh+j)
u3 = bjd - bjh
b(jg+j) = bjd + bjh
b(jb+j) = u0
b(jf+j) = u1
b(jd+j) = c2 * ( u2 - u3 )
b(jh+j) = c3 * ( u2 + u3 )
j = j + jump
310 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
312 continue
315 continue
320 continue
!
!  stage 2
!  -------
!
!  k=0 (no twiddle factors)
!  ------------------------
do 330 jjj = 0 , (n-1)*inc , 8*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 328 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
j = 0
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 325 l = 1 , nvex
aja = a(ja+j)
aje = a(je+j)
t0 = aja + aje
t2 = aja - aje
ajc = a(jc+j)
ajg = a(jg+j)
t1 = ajc + ajg
t3 = c1 * ( ajc - ajg )
bja = b(ja+j)
bje = b(je+j)
u0 = bja + bje
u2 = bja - bje
bjc = b(jc+j)
bjg = b(jg+j)
u1 = bjc + bjg
u3 = c1 * ( bjc - bjg )
a(ja+j) = t0 + t1
a(je+j) = t0 - t1
b(ja+j) = u0 + u1
b(je+j) = u0 - u1
a(jc+j) = t2 - u3
a(jg+j) = t2 + u3
b(jc+j) = u2 + t3
b(jg+j) = u2 - t3
ajb = a(jb+j)
ajd = a(jd+j)
t0 = ajb + ajd
t2 = ajb - ajd
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf - ajh
t3 = ajf + ajh
bjb = b(jb+j)
bjd = b(jd+j)
u0 = bjb + bjd
u2 = bjb - bjd
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf - bjh
u3 = bjf + bjh
a(jb+j) = t0 - u3
a(jh+j) = t0 + u3
b(jb+j) = u0 + t3
b(jh+j) = u0 - t3
a(jd+j) = t2 + u1
a(jf+j) = t2 - u1
b(jd+j) = u2 - t1
b(jf+j) = u2 + t1
j = j + jump
325 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
328 continue
330 continue
!
if (n2.eq.8) go to 490
!
!  loop on nonzero k
!  -----------------
kk = 2 * la
!
do 350 k = ink , jstep - ink , ink
!
co1 = trigs(kk+1)
si1 = s * trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s * trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s * trigs(3*kk+2)
co4 = trigs(4*kk+1)
si4 = s * trigs(4*kk+2)
co5 = trigs(5*kk+1)
si5 = s * trigs(5*kk+2)
co6 = trigs(6*kk+1)
si6 = s * trigs(6*kk+2)
co7 = trigs(7*kk+1)
si7 = s * trigs(7*kk+2)
!
do 345 jjj = k , (n-1)*inc , 8*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 342 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
j = 0
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 340 l = 1 , nvex
aja = a(ja+j)
aje = a(je+j)
t0 = aja + aje
t2 = aja - aje
ajc = a(jc+j)
ajg = a(jg+j)
t1 = ajc + ajg
t3 = c1 * ( ajc - ajg )
bja = b(ja+j)
bje = b(je+j)
u0 = bja + bje
u2 = bja - bje
bjc = b(jc+j)
bjg = b(jg+j)
u1 = bjc + bjg
u3 = c1 * ( bjc - bjg )
a(ja+j) = t0 + t1
b(ja+j) = u0 + u1
a(je+j) = co4*(t0-t1) - si4*(u0-u1)
b(je+j) = si4*(t0-t1) + co4*(u0-u1)
a(jc+j) = co2*(t2-u3) - si2*(u2+t3)
b(jc+j) = si2*(t2-u3) + co2*(u2+t3)
a(jg+j) = co6*(t2+u3) - si6*(u2-t3)
b(jg+j) = si6*(t2+u3) + co6*(u2-t3)
ajb = a(jb+j)
ajd = a(jd+j)
t0 = ajb + ajd
t2 = ajb - ajd
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf - ajh
t3 = ajf + ajh
bjb = b(jb+j)
bjd = b(jd+j)
u0 = bjb + bjd
u2 = bjb - bjd
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf - bjh
u3 = bjf + bjh
a(jb+j) = co1*(t0-u3) - si1*(u0+t3)
b(jb+j) = si1*(t0-u3) + co1*(u0+t3)
a(jh+j) = co7*(t0+u3) - si7*(u0-t3)
b(jh+j) = si7*(t0+u3) + co7*(u0-t3)
a(jd+j) = co3*(t2+u1) - si3*(u2-t1)
b(jd+j) = si3*(t2+u1) + co3*(u2-t1)
a(jf+j) = co5*(t2-u1) - si5*(u2+t1)
b(jf+j) = si5*(t2-u1) + co5*(u2+t1)
j = j + jump
340 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
342 continue
345 continue
kk = kk + 2 * la
350 continue
!
la = 8 * la
!
!  loop on type II radix-4 passes
!  ------------------------------
400 continue
mu = mod(inq,4)
if (isign.eq.-1) mu = 4 - mu
ss = 1.0_dp
if (mu.eq.3) ss = -1.0_dp
!
do 480 ipass = mh+1 , m
jstep = (n*inc) / (4*la)
jstepl = jstep - ninc
laincl = la * ink - ninc
!
!  k=0 loop (no twiddle factors)
!  -----------------------------
do 430 ll = 0 , (la-1)*ink , 4*jstep
!
do 420 jjj = ll , (n-1)*inc , 4*la*ink
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 415 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = ja + laincl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = je + laincl
if (ji.lt.istart) ji = ji + ninc
jj = ji + jstepl
if (jj.lt.istart) jj = jj + ninc
jk = jj + jstepl
if (jk.lt.istart) jk = jk + ninc
jl = jk + jstepl
if (jl.lt.istart) jl = jl + ninc
jm = ji + laincl
if (jm.lt.istart) jm = jm + ninc
jn = jm + jstepl
if (jn.lt.istart) jn = jn + ninc
jo = jn + jstepl
if (jo.lt.istart) jo = jo + ninc
jp = jo + jstepl
if (jp.lt.istart) jp = jp + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 410 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
aji = a(ji+j)
ajc =  aji
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
aje = a(je+j)
ajb =  aje
a(ja+j) = t0 + t1
a(ji+j) = t0 - t1
b(ja+j) = u0 + u1
bjc =  u0 - u1
bjm = b(jm+j)
bjd =  bjm
a(je+j) = t2 - u3
ajd =  t2 + u3
bjb =  u2 + t3
b(jm+j) = u2 - t3
!----------------------
ajg = a(jg+j)
t0 = ajb + ajg
t2 = ajb - ajg
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf + ajh
t3 = ss * ( ajf - ajh )
ajj = a(jj+j)
ajg =  ajj
bje = b(je+j)
bjg = b(jg+j)
u0 = bje + bjg
u2 = bje - bjg
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf + bjh
u3 = ss * ( bjf - bjh )
b(je+j) = bjb
a(jb+j) = t0 + t1
a(jj+j) = t0 - t1
bjj = b(jj+j)
bjg =  bjj
b(jb+j) = u0 + u1
b(jj+j) = u0 - u1
a(jf+j) = t2 - u3
ajh =  t2 + u3
b(jf+j) = u2 + t3
bjh =  u2 - t3
!----------------------
ajk = a(jk+j)
t0 = ajc + ajk
t2 = ajc - ajk
ajl = a(jl+j)
t1 = ajg + ajl
t3 = ss * ( ajg - ajl )
bji = b(ji+j)
bjk = b(jk+j)
u0 = bji + bjk
u2 = bji - bjk
ajo = a(jo+j)
ajl =  ajo
bjl = b(jl+j)
u1 = bjg + bjl
u3 = ss * ( bjg - bjl )
b(ji+j) = bjc
a(jc+j) = t0 + t1
a(jk+j) = t0 - t1
bjo = b(jo+j)
bjl =  bjo
b(jc+j) = u0 + u1
b(jk+j) = u0 - u1
a(jg+j) = t2 - u3
a(jo+j) = t2 + u3
b(jg+j) = u2 + t3
b(jo+j) = u2 - t3
!----------------------
ajm = a(jm+j)
t0 = ajm + ajl
t2 = ajm - ajl
ajn = a(jn+j)
ajp = a(jp+j)
t1 = ajn + ajp
t3 = ss * ( ajn - ajp )
a(jm+j) = ajd
u0 = bjd + bjl
u2 = bjd - bjl
bjn = b(jn+j)
bjp = b(jp+j)
u1 = bjn + bjp
u3 = ss * ( bjn - bjp )
a(jn+j) = ajh
a(jd+j) = t0 + t1
a(jl+j) = t0 - t1
b(jd+j) = u0 + u1
b(jl+j) = u0 - u1
b(jn+j) = bjh
a(jh+j) = t2 - u3
a(jp+j) = t2 + u3
b(jh+j) = u2 + t3
b(jp+j) = u2 - t3
j = j + jump
410 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
415 continue
420 continue
430 continue
!-----( end of double loop for k=0 )
!
!  finished if last pass
!  ---------------------
if (ipass.eq.m) go to 490
!
kk = 2*la
!
!     loop on nonzero k
!     -----------------
do 470 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
!
!  double loop along first transform in block
!  ------------------------------------------
do 460 ll = k , (la-1)*ink , 4*jstep
!
do 450 jjj = ll , (n-1)*inc , 4*la*ink
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 445 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = ja + laincl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = je + laincl
if (ji.lt.istart) ji = ji + ninc
jj = ji + jstepl
if (jj.lt.istart) jj = jj + ninc
jk = jj + jstepl
if (jk.lt.istart) jk = jk + ninc
jl = jk + jstepl
if (jl.lt.istart) jl = jl + ninc
jm = ji + laincl
if (jm.lt.istart) jm = jm + ninc
jn = jm + jstepl
if (jn.lt.istart) jn = jn + ninc
jo = jn + jstepl
if (jo.lt.istart) jo = jo + ninc
jp = jo + jstepl
if (jp.lt.istart) jp = jp + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 440 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
aji = a(ji+j)
ajc =  aji
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
aje = a(je+j)
ajb =  aje
a(ja+j) = t0 + t1
b(ja+j) = u0 + u1
a(je+j) = co1*(t2-u3) - si1*(u2+t3)
bjb =  si1*(t2-u3) + co1*(u2+t3)
bjm = b(jm+j)
bjd =  bjm
a(ji+j) = co2*(t0-t1) - si2*(u0-u1)
bjc =  si2*(t0-t1) + co2*(u0-u1)
ajd =  co3*(t2+u3) - si3*(u2-t3)
b(jm+j) = si3*(t2+u3) + co3*(u2-t3)
!----------------------------------------
ajg = a(jg+j)
t0 = ajb + ajg
t2 = ajb - ajg
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf + ajh
t3 = ss * ( ajf - ajh )
ajj = a(jj+j)
ajg =  ajj
bje = b(je+j)
bjg = b(jg+j)
u0 = bje + bjg
u2 = bje - bjg
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf + bjh
u3 = ss * ( bjf - bjh )
b(je+j) = bjb
a(jb+j) = t0 + t1
b(jb+j) = u0 + u1
bjj = b(jj+j)
bjg =  bjj
a(jf+j) = co1*(t2-u3) - si1*(u2+t3)
b(jf+j) = si1*(t2-u3) + co1*(u2+t3)
a(jj+j) = co2*(t0-t1) - si2*(u0-u1)
b(jj+j) = si2*(t0-t1) + co2*(u0-u1)
ajh =  co3*(t2+u3) - si3*(u2-t3)
bjh =  si3*(t2+u3) + co3*(u2-t3)
!----------------------------------------
ajk = a(jk+j)
t0 = ajc + ajk
t2 = ajc - ajk
ajl = a(jl+j)
t1 = ajg + ajl
t3 = ss * ( ajg - ajl )
bji = b(ji+j)
bjk = b(jk+j)
u0 = bji + bjk
u2 = bji - bjk
ajo = a(jo+j)
ajl =  ajo
bjl = b(jl+j)
u1 = bjg + bjl
u3 = ss * ( bjg - bjl )
b(ji+j) = bjc
a(jc+j) = t0 + t1
b(jc+j) = u0 + u1
bjo = b(jo+j)
bjl =  bjo
a(jg+j) = co1*(t2-u3) - si1*(u2+t3)
b(jg+j) = si1*(t2-u3) + co1*(u2+t3)
a(jk+j) = co2*(t0-t1) - si2*(u0-u1)
b(jk+j) = si2*(t0-t1) + co2*(u0-u1)
a(jo+j) = co3*(t2+u3) - si3*(u2-t3)
b(jo+j) = si3*(t2+u3) + co3*(u2-t3)
!----------------------------------------
ajm = a(jm+j)
t0 = ajm + ajl
t2 = ajm - ajl
ajn = a(jn+j)
ajp = a(jp+j)
t1 = ajn + ajp
t3 = ss * ( ajn - ajp )
a(jm+j) = ajd
u0 = bjd + bjl
u2 = bjd - bjl
a(jn+j) = ajh
bjn = b(jn+j)
bjp = b(jp+j)
u1 = bjn + bjp
u3 = ss * ( bjn - bjp )
b(jn+j) = bjh
a(jd+j) = t0 + t1
b(jd+j) = u0 + u1
a(jh+j) = co1*(t2-u3) - si1*(u2+t3)
b(jh+j) = si1*(t2-u3) + co1*(u2+t3)
a(jl+j) = co2*(t0-t1) - si2*(u0-u1)
b(jl+j) = si2*(t0-t1) + co2*(u0-u1)
a(jp+j) = co3*(t2+u3) - si3*(u2-t3)
b(jp+j) = si3*(t2+u3) + co3*(u2-t3)
j = j + jump
440 continue
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
445 continue
450 continue
460 continue
!-----( end of double loop for this k )
kk = kk + 2*la
470 continue
!-----( end of loop over values of k )
la = 4*la
480 continue
!-----( end of loop on type II radix-4 passes )
!-----( nvex transforms completed)
490 continue
istart = istart + nvex * jump
500 continue
!-----( end of loop on blocks of transforms )
!
return
end subroutine gpfa2f

!******************************************************************

!     fortran version of *gpfa3* -
!     radix-3 section of self-sorting, in-place
!        generalized PFA
!
!-------------------------------------------------------------------
!
subroutine gpfa3f(a,b,trigs,inc,jump,n,mm,lot,isign)

real(wp) ::     a(*), b(*)
real(dp)     ::     trigs(*)
integer inc, jump, n, mm, lot, isign

real(dp), parameter :: sin60 = 0.866025403784437_dp

integer      :: je, jf, jg, jh, laincl, ll, ji
integer      :: n3, inq, jstepx, ninc, ink, mu, m, mh, &
                nblox, left, istart, nb, nvex, la, &
                ipass, jstep, jstepl, jjj, ja, nu, jb, &
                jc, j, l, kk, k, jd

real(dp) :: s, ajb, ajc, t1, aja, t2, t3, bjb, bjc, &
                u1, bja, u2, u3, co1, si1, co2, si2, ajd, &
                bjd
real(dp) :: c1, aje, ajg, ajf, ajh, bje, &
                bjf, bjg, bjh, aji, bji


integer, parameter :: lvr = 128
!
!     ***************************************************************
!     *                                                             *
!     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
!     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
!     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
!     *                                                             *
!     ***************************************************************
!
n3 = 3**mm
inq = n/n3
jstepx = (n3-n) * inc
ninc = n * inc
ink = inc * inq
mu = mod(inq,3)
if (isign.eq.-1) mu = 3-mu
m = mm
mh = (m+1)/2
s = real(isign,kind=dp)
c1 = sin60
if (mu.eq.2) c1 = -c1
!
nblox = 1 + (lot-1)/lvr
left = lot
s = real(isign,kind=dp)
istart = 1
!
!  loop on blocks of lvr transforms
!  --------------------------------
do 500 nb = 1 , nblox
!
if (left.le.lvr) then
   nvex = left
else if (left.lt.(2*lvr)) then
   nvex = left/2
   nvex = nvex + mod(nvex,2)
else
   nvex = lvr
endif
left = left - nvex
!
la = 1
!
!  loop on type I radix-3 passes
!  -----------------------------
do 160 ipass = 1 , mh
jstep = (n*inc) / (3*la)
jstepl = jstep - ninc
!
!  k = 0 loop (no twiddle factors)
!  -------------------------------
do 120 jjj = 0 , (n-1)*inc , 3*jstep
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 115 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 110 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jb+j) = t2 - u3
b(jb+j) = u2 + t3
a(jc+j) = t2 + u3
b(jc+j) = u2 - t3
j = j + jump
110 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
115 continue
120 continue
!
!  finished if n3 = 3
!  ------------------
if (n3.eq.3) go to 490
kk = 2 * la
!
!  loop on nonzero k
!  -----------------
do 150 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
!
!  loop along transform
!  --------------------
do 140 jjj = k , (n-1)*inc , 3*jstep
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 135 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 130 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jb+j) = co1*(t2-u3) - si1*(u2+t3)
b(jb+j) = si1*(t2-u3) + co1*(u2+t3)
a(jc+j) = co2*(t2+u3) - si2*(u2-t3)
b(jc+j) = si2*(t2+u3) + co2*(u2-t3)
j = j + jump
130 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
135 continue
140 continue
!-----( end of loop along transforms )
kk = kk + 2*la
150 continue
!-----( end of loop on nonzero k )
la = 3*la
160 continue
!-----( end of loop on type I radix-3 passes)
!
!  loop on type II radix-3 passes
!  ------------------------------
400 continue
!
do 480 ipass = mh+1 , m
jstep = (n*inc) / (3*la)
jstepl = jstep - ninc
laincl = la*ink - ninc
!
!  k=0 loop (no twiddle factors)
!  -----------------------------
do 430 ll = 0 , (la-1)*ink , 3*jstep
!
do 420 jjj = ll , (n-1)*inc , 3*la*ink
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 415 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = ja + laincl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jd + laincl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = jh + jstepl
if (ji.lt.istart) ji = ji + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 410 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
ajd = a(jd+j)
ajb =  ajd
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
bjd = b(jd+j)
bjb =  bjd
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jd+j) = t2 - u3
b(jd+j) = u2 + t3
ajc =  t2 + u3
bjc =  u2 - t3
!----------------------
aje = a(je+j)
ajf = a(jf+j)
t1 = aje + ajf
t2 = ajb - 0.5_dp * t1
t3 = c1 * ( aje - ajf )
ajh = a(jh+j)
ajf =  ajh
bje = b(je+j)
bjf = b(jf+j)
u1 = bje + bjf
u2 = bjb - 0.5_dp * u1
u3 = c1 * ( bje - bjf )
bjh = b(jh+j)
bjf =  bjh
a(jb+j) = ajb + t1
b(jb+j) = bjb + u1
a(je+j) = t2 - u3
b(je+j) = u2 + t3
a(jh+j) = t2 + u3
b(jh+j) = u2 - t3
!----------------------
aji = a(ji+j)
t1 = ajf + aji
ajg = a(jg+j)
t2 = ajg - 0.5_dp * t1
t3 = c1 * ( ajf - aji )
t1 = ajg + t1
a(jg+j) = ajc
bji = b(ji+j)
u1 = bjf + bji
bjg = b(jg+j)
u2 = bjg - 0.5_dp * u1
u3 = c1 * ( bjf - bji )
u1 = bjg + u1
b(jg+j) = bjc
a(jc+j) = t1
b(jc+j) = u1
a(jf+j) = t2 - u3
b(jf+j) = u2 + t3
a(ji+j) = t2 + u3
b(ji+j) = u2 - t3
j = j + jump
410 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
415 continue
420 continue
430 continue
!-----( end of double loop for k=0 )
!
!  finished if last pass
!  ---------------------
if (ipass.eq.m) go to 490
!
kk = 2*la
!
!     loop on nonzero k
!     -----------------
do 470 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
!
!  double loop along first transform in block
!  ------------------------------------------
do 460 ll = k , (la-1)*ink , 3*jstep
!
do 450 jjj = ll , (n-1)*inc , 3*la*ink
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 445 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = ja + laincl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jd + laincl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = jh + jstepl
if (ji.lt.istart) ji = ji + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 440 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
ajd = a(jd+j)
ajb =  ajd
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
bjd = b(jd+j)
bjb =  bjd
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jd+j) = co1*(t2-u3) - si1*(u2+t3)
b(jd+j) = si1*(t2-u3) + co1*(u2+t3)
ajc =  co2*(t2+u3) - si2*(u2-t3)
bjc =  si2*(t2+u3) + co2*(u2-t3)
!----------------------
aje = a(je+j)
ajf = a(jf+j)
t1 = aje + ajf
t2 = ajb - 0.5_dp * t1
t3 = c1 * ( aje - ajf )
ajh = a(jh+j)
ajf =  ajh
bje = b(je+j)
bjf = b(jf+j)
u1 = bje + bjf
u2 = bjb - 0.5_dp * u1
u3 = c1 * ( bje - bjf )
bjh = b(jh+j)
bjf =  bjh
a(jb+j) = ajb + t1
b(jb+j) = bjb + u1
a(je+j) = co1*(t2-u3) - si1*(u2+t3)
b(je+j) = si1*(t2-u3) + co1*(u2+t3)
a(jh+j) = co2*(t2+u3) - si2*(u2-t3)
b(jh+j) = si2*(t2+u3) + co2*(u2-t3)
!----------------------
aji = a(ji+j)
t1 = ajf + aji
ajg = a(jg+j)
t2 = ajg - 0.5_dp * t1
t3 = c1 * ( ajf - aji )
t1 = ajg + t1
a(jg+j) = ajc
bji = b(ji+j)
u1 = bjf + bji
bjg = b(jg+j)
u2 = bjg - 0.5_dp * u1
u3 = c1 * ( bjf - bji )
u1 = bjg + u1
b(jg+j) = bjc
a(jc+j) = t1
b(jc+j) = u1
a(jf+j) = co1*(t2-u3) - si1*(u2+t3)
b(jf+j) = si1*(t2-u3) + co1*(u2+t3)
a(ji+j) = co2*(t2+u3) - si2*(u2-t3)
b(ji+j) = si2*(t2+u3) + co2*(u2-t3)
j = j + jump
440 continue
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
445 continue
450 continue
460 continue
!-----( end of double loop for this k )
kk = kk + 2*la
470 continue
!-----( end of loop over values of k )
la = 3*la
480 continue
!-----( end of loop on type II radix-3 passes )
!-----( nvex transforms completed)
490 continue
istart = istart + nvex * jump
500 continue
!-----( end of loop on blocks of transforms )
!
return
end subroutine gpfa3f

!*******************************************************************

!     fortran version of *gpfa5* -
!     radix-5 section of self-sorting, in-place,
!        generalized pfa
!
!-------------------------------------------------------------------
!
subroutine gpfa5f(a,b,trigs,inc,jump,n,mm,lot,isign)

real(wp)     ::     a(*), b(*)
real(dp)     ::     trigs(*)
integer inc, jump, n, mm, lot, isign

real(dp), parameter :: sin36 = 0.587785252292473_dp, &
                       sin72 = 0.951056516295154_dp, &
                       qrt5  = 0.559016994374947_dp
integer, parameter  :: lvr = 128

integer      ::  inq, jstepx, ninc, ink, mu, m, mh, &
                 nblox, n5, left, istart, nb, nvex, la, &
                 ipass, jstep, jstepl, kk, k, jjj, ja, &
                 nu, jb, jc, jd, je, j, laincl, ll, l, &
                 jf, jg, jh, ji, jj, jk, jl, jm, jn, jo, &
                 jp, jq, jr, js, jt, ju, jv, jw, jx, jy

real(dp) ::  s, c1, c2, c3, co1, si1, co2, si2, co3, &
                 si3, co4, si4, ajb, aje, t1, ajc, ajd, &
                 t2, t3, t4, t5, t6, aja, t7, t8, t9, t10, &
                 t11, bjb, bje, u1, bjc, bjd, u2, u3, u4, &
                 u5, u6, bja, u7, u8, u9, u10, u11, ajf, &
                 ajk, bjf, bjk, ajg, ajj, ajh, aji, ajl, &
                 ajq, bjg, bjj, bjh, bji, bjl, bjq, ajo, &
                 ajm, ajn, ajr, ajw, bjo, bjm, bjn, bjr, &
                 bjw, ajt, ajs, ajx, ajp, ax, bjt, bjs, &
                 bjx, bjp, bx, ajv, ajy, aju, bjv, bjy, &
                 bju
!
!     ***************************************************************
!     *                                                             *
!     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
!     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
!     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
!     *                                                             *
!     ***************************************************************
!
n5 = 5 ** mm
inq = n / n5
jstepx = (n5-n) * inc
ninc = n * inc
ink = inc * inq
mu = mod(inq,5)
if (isign.eq.-1) mu = 5 - mu
!
m = mm
mh = (m+1)/2
s = real(isign,kind=dp)
c1 = qrt5
c2 = sin72
c3 = sin36
if (mu.eq.2.or.mu.eq.3) then
   c1 = -c1
   c2 = sin36
   c3 = sin72
endif
if (mu.eq.3.or.mu.eq.4) c2 = -c2
if (mu.eq.2.or.mu.eq.4) c3 = -c3
!
nblox = 1 + (lot-1)/lvr
left = lot
s = real(isign,kind=dp)
istart = 1
!
!  loop on blocks of lvr transforms
!  --------------------------------
do 500 nb = 1 , nblox
!
if (left.le.lvr) then
   nvex = left
else if (left.lt.(2*lvr)) then
   nvex = left/2
   nvex = nvex + mod(nvex,2)
else
   nvex = lvr
endif
left = left - nvex
!
la = 1
!
!  loop on type I radix-5 passes
!  -----------------------------
do 160 ipass = 1 , mh
jstep = (n*inc) / (5*la)
jstepl = jstep - ninc
kk = 0
!
!  loop on k
!  ---------
do 150 k = 0 , jstep-ink , ink
!
if (k.gt.0) then
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
co4 = trigs(4*kk+1)
si4 = s*trigs(4*kk+2)
endif
!
!  loop along transform
!  --------------------
do 140 jjj = k , (n-1)*inc , 5*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 135 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
j = 0
!
!  loop across transforms
!  ----------------------
if (k.eq.0) then
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 110 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jb+j) = t8 - u11
b(jb+j) = u8 + t11
a(je+j) = t8 + u11
b(je+j) = u8 - t11
a(jc+j) = t9 - u10
b(jc+j) = u9 + t10
a(jd+j) = t9 + u10
b(jd+j) = u9 - t10
j = j + jump
110 continue
!
else
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 130 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jb+j) = co1*(t8-u11) - si1*(u8+t11)
b(jb+j) = si1*(t8-u11) + co1*(u8+t11)
a(je+j) = co4*(t8+u11) - si4*(u8-t11)
b(je+j) = si4*(t8+u11) + co4*(u8-t11)
a(jc+j) = co2*(t9-u10) - si2*(u9+t10)
b(jc+j) = si2*(t9-u10) + co2*(u9+t10)
a(jd+j) = co3*(t9+u10) - si3*(u9-t10)
b(jd+j) = si3*(t9+u10) + co3*(u9-t10)
j = j + jump
130 continue
!
endif
!
!-----( end of loop across transforms )
!
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
135 continue
140 continue
!-----( end of loop along transforms )
kk = kk + 2*la
150 continue
!-----( end of loop on nonzero k )
la = 5*la
160 continue
!-----( end of loop on type I radix-5 passes)
!
if (n.eq.5) go to 490
!
!  loop on type II radix-5 passes
!  ------------------------------
400 continue
!
do 480 ipass = mh+1 , m
jstep = (n*inc) / (5*la)
jstepl = jstep - ninc
laincl = la * ink - ninc
kk = 0
!
!     loop on k
!     ---------
do 470 k = 0 , jstep-ink , ink
!
if (k.gt.0) then
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
co4 = trigs(4*kk+1)
si4 = s*trigs(4*kk+2)
endif
!
!  double loop along first transform in block
!  ------------------------------------------
do 460 ll = k , (la-1)*ink , 5*jstep
!
do 450 jjj = ll , (n-1)*inc , 5*la*ink
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 445 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = ja + laincl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = jh + jstepl
if (ji.lt.istart) ji = ji + ninc
jj = ji + jstepl
if (jj.lt.istart) jj = jj + ninc
jk = jf + laincl
if (jk.lt.istart) jk = jk + ninc
jl = jk + jstepl
if (jl.lt.istart) jl = jl + ninc
jm = jl + jstepl
if (jm.lt.istart) jm = jm + ninc
jn = jm + jstepl
if (jn.lt.istart) jn = jn + ninc
jo = jn + jstepl
if (jo.lt.istart) jo = jo + ninc
jp = jk + laincl
if (jp.lt.istart) jp = jp + ninc
jq = jp + jstepl
if (jq.lt.istart) jq = jq + ninc
jr = jq + jstepl
if (jr.lt.istart) jr = jr + ninc
js = jr + jstepl
if (js.lt.istart) js = js + ninc
jt = js + jstepl
if (jt.lt.istart) jt = jt + ninc
ju = jp + laincl
if (ju.lt.istart) ju = ju + ninc
jv = ju + jstepl
if (jv.lt.istart) jv = jv + ninc
jw = jv + jstepl
if (jw.lt.istart) jw = jw + ninc
jx = jw + jstepl
if (jx.lt.istart) jx = jx + ninc
jy = jx + jstepl
if (jy.lt.istart) jy = jy + ninc
j = 0
!
!  loop across transforms
!  ----------------------
if (k.eq.0) then
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 410 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
ajf = a(jf+j)
ajb =  ajf
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
ajk = a(jk+j)
ajc =  ajk
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
bjf = b(jf+j)
bjb =  bjf
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
bjk = b(jk+j)
bjc =  bjk
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jf+j) = t8 - u11
b(jf+j) = u8 + t11
aje =  t8 + u11
bje =  u8 - t11
a(jk+j) = t9 - u10
b(jk+j) = u9 + t10
ajd =  t9 + u10
bjd =  u9 - t10
!----------------------
ajg = a(jg+j)
ajj = a(jj+j)
t1 = ajg + ajj
ajh = a(jh+j)
aji = a(ji+j)
t2 = ajh + aji
t3 = ajg - ajj
t4 = ajh - aji
ajl = a(jl+j)
ajh =  ajl
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajb - 0.25_dp * t5
a(jb+j) = ajb + t5
t8 = t7 + t6
t9 = t7 - t6
ajq = a(jq+j)
aji =  ajq
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjg = b(jg+j)
bjj = b(jj+j)
u1 = bjg + bjj
bjh = b(jh+j)
bji = b(ji+j)
u2 = bjh + bji
u3 = bjg - bjj
u4 = bjh - bji
bjl = b(jl+j)
bjh =  bjl
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjb - 0.25_dp * u5
b(jb+j) = bjb + u5
u8 = u7 + u6
u9 = u7 - u6
bjq = b(jq+j)
bji =  bjq
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jg+j) = t8 - u11
b(jg+j) = u8 + t11
ajj =  t8 + u11
bjj =  u8 - t11
a(jl+j) = t9 - u10
b(jl+j) = u9 + t10
a(jq+j) = t9 + u10
b(jq+j) = u9 - t10
!----------------------
ajo = a(jo+j)
t1 = ajh + ajo
ajm = a(jm+j)
ajn = a(jn+j)
t2 = ajm + ajn
t3 = ajh - ajo
t4 = ajm - ajn
ajr = a(jr+j)
ajn =  ajr
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajc - 0.25_dp * t5
a(jc+j) = ajc + t5
t8 = t7 + t6
t9 = t7 - t6
ajw = a(jw+j)
ajo =  ajw
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjo = b(jo+j)
u1 = bjh + bjo
bjm = b(jm+j)
bjn = b(jn+j)
u2 = bjm + bjn
u3 = bjh - bjo
u4 = bjm - bjn
bjr = b(jr+j)
bjn =  bjr
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjc - 0.25_dp * u5
b(jc+j) = bjc + u5
u8 = u7 + u6
u9 = u7 - u6
bjw = b(jw+j)
bjo =  bjw
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jh+j) = t8 - u11
b(jh+j) = u8 + t11
a(jw+j) = t8 + u11
b(jw+j) = u8 - t11
a(jm+j) = t9 - u10
b(jm+j) = u9 + t10
a(jr+j) = t9 + u10
b(jr+j) = u9 - t10
!----------------------
ajt = a(jt+j)
t1 = aji + ajt
ajs = a(js+j)
t2 = ajn + ajs
t3 = aji - ajt
t4 = ajn - ajs
ajx = a(jx+j)
ajt =  ajx
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
ajp = a(jp+j)
t7 = ajp - 0.25_dp * t5
ax = ajp + t5
t8 = t7 + t6
t9 = t7 - t6
a(jp+j) = ajd
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(jd+j) = ax
bjt = b(jt+j)
u1 = bji + bjt
bjs = b(js+j)
u2 = bjn + bjs
u3 = bji - bjt
u4 = bjn - bjs
bjx = b(jx+j)
bjt =  bjx
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bjp = b(jp+j)
u7 = bjp - 0.25_dp * u5
bx = bjp + u5
u8 = u7 + u6
u9 = u7 - u6
b(jp+j) = bjd
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(jd+j) = bx
a(ji+j) = t8 - u11
b(ji+j) = u8 + t11
a(jx+j) = t8 + u11
b(jx+j) = u8 - t11
a(jn+j) = t9 - u10
b(jn+j) = u9 + t10
a(js+j) = t9 + u10
b(js+j) = u9 - t10
!----------------------
ajv = a(jv+j)
ajy = a(jy+j)
t1 = ajv + ajy
t2 = ajo + ajt
t3 = ajv - ajy
t4 = ajo - ajt
a(jv+j) = ajj
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aju = a(ju+j)
t7 = aju - 0.25_dp * t5
ax = aju + t5
t8 = t7 + t6
t9 = t7 - t6
a(ju+j) = aje
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(je+j) = ax
bjv = b(jv+j)
bjy = b(jy+j)
u1 = bjv + bjy
u2 = bjo + bjt
u3 = bjv - bjy
u4 = bjo - bjt
b(jv+j) = bjj
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bju = b(ju+j)
u7 = bju - 0.25_dp * u5
bx = bju + u5
u8 = u7 + u6
u9 = u7 - u6
b(ju+j) = bje
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(je+j) = bx
a(jj+j) = t8 - u11
b(jj+j) = u8 + t11
a(jy+j) = t8 + u11
b(jy+j) = u8 - t11
a(jo+j) = t9 - u10
b(jo+j) = u9 + t10
a(jt+j) = t9 + u10
b(jt+j) = u9 - t10
j = j + jump
410 continue
!
else
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 440 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
ajf = a(jf+j)
ajb =  ajf
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
ajk = a(jk+j)
ajc =  ajk
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
bjf = b(jf+j)
bjb =  bjf
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
bjk = b(jk+j)
bjc =  bjk
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jf+j) = co1*(t8-u11) - si1*(u8+t11)
b(jf+j) = si1*(t8-u11) + co1*(u8+t11)
aje =  co4*(t8+u11) - si4*(u8-t11)
bje =  si4*(t8+u11) + co4*(u8-t11)
a(jk+j) = co2*(t9-u10) - si2*(u9+t10)
b(jk+j) = si2*(t9-u10) + co2*(u9+t10)
ajd =  co3*(t9+u10) - si3*(u9-t10)
bjd =  si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajg = a(jg+j)
ajj = a(jj+j)
t1 = ajg + ajj
ajh = a(jh+j)
aji = a(ji+j)
t2 = ajh + aji
t3 = ajg - ajj
t4 = ajh - aji
ajl = a(jl+j)
ajh =  ajl
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajb - 0.25_dp * t5
a(jb+j) = ajb + t5
t8 = t7 + t6
t9 = t7 - t6
ajq = a(jq+j)
aji =  ajq
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjg = b(jg+j)
bjj = b(jj+j)
u1 = bjg + bjj
bjh = b(jh+j)
bji = b(ji+j)
u2 = bjh + bji
u3 = bjg - bjj
u4 = bjh - bji
bjl = b(jl+j)
bjh =  bjl
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjb - 0.25_dp * u5
b(jb+j) = bjb + u5
u8 = u7 + u6
u9 = u7 - u6
bjq = b(jq+j)
bji =  bjq
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jg+j) = co1*(t8-u11) - si1*(u8+t11)
b(jg+j) = si1*(t8-u11) + co1*(u8+t11)
ajj =  co4*(t8+u11) - si4*(u8-t11)
bjj =  si4*(t8+u11) + co4*(u8-t11)
a(jl+j) = co2*(t9-u10) - si2*(u9+t10)
b(jl+j) = si2*(t9-u10) + co2*(u9+t10)
a(jq+j) = co3*(t9+u10) - si3*(u9-t10)
b(jq+j) = si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajo = a(jo+j)
t1 = ajh + ajo
ajm = a(jm+j)
ajn = a(jn+j)
t2 = ajm + ajn
t3 = ajh - ajo
t4 = ajm - ajn
ajr = a(jr+j)
ajn =  ajr
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajc - 0.25_dp * t5
a(jc+j) = ajc + t5
t8 = t7 + t6
t9 = t7 - t6
ajw = a(jw+j)
ajo =  ajw
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjo = b(jo+j)
u1 = bjh + bjo
bjm = b(jm+j)
bjn = b(jn+j)
u2 = bjm + bjn
u3 = bjh - bjo
u4 = bjm - bjn
bjr = b(jr+j)
bjn =  bjr
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjc - 0.25_dp * u5
b(jc+j) = bjc + u5
u8 = u7 + u6
u9 = u7 - u6
bjw = b(jw+j)
bjo =  bjw
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jh+j) = co1*(t8-u11) - si1*(u8+t11)
b(jh+j) = si1*(t8-u11) + co1*(u8+t11)
a(jw+j) = co4*(t8+u11) - si4*(u8-t11)
b(jw+j) = si4*(t8+u11) + co4*(u8-t11)
a(jm+j) = co2*(t9-u10) - si2*(u9+t10)
b(jm+j) = si2*(t9-u10) + co2*(u9+t10)
a(jr+j) = co3*(t9+u10) - si3*(u9-t10)
b(jr+j) = si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajt = a(jt+j)
t1 = aji + ajt
ajs = a(js+j)
t2 = ajn + ajs
t3 = aji - ajt
t4 = ajn - ajs
ajx = a(jx+j)
ajt =  ajx
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
ajp = a(jp+j)
t7 = ajp - 0.25_dp * t5
ax = ajp + t5
t8 = t7 + t6
t9 = t7 - t6
a(jp+j) = ajd
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(jd+j) = ax
bjt = b(jt+j)
u1 = bji + bjt
bjs = b(js+j)
u2 = bjn + bjs
u3 = bji - bjt
u4 = bjn - bjs
bjx = b(jx+j)
bjt =  bjx
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bjp = b(jp+j)
u7 = bjp - 0.25_dp * u5
bx = bjp + u5
u8 = u7 + u6
u9 = u7 - u6
b(jp+j) = bjd
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(jd+j) = bx
a(ji+j) = co1*(t8-u11) - si1*(u8+t11)
b(ji+j) = si1*(t8-u11) + co1*(u8+t11)
a(jx+j) = co4*(t8+u11) - si4*(u8-t11)
b(jx+j) = si4*(t8+u11) + co4*(u8-t11)
a(jn+j) = co2*(t9-u10) - si2*(u9+t10)
b(jn+j) = si2*(t9-u10) + co2*(u9+t10)
a(js+j) = co3*(t9+u10) - si3*(u9-t10)
b(js+j) = si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajv = a(jv+j)
ajy = a(jy+j)
t1 = ajv + ajy
t2 = ajo + ajt
t3 = ajv - ajy
t4 = ajo - ajt
a(jv+j) = ajj
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aju = a(ju+j)
t7 = aju - 0.25_dp * t5
ax = aju + t5
t8 = t7 + t6
t9 = t7 - t6
a(ju+j) = aje
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(je+j) = ax
bjv = b(jv+j)
bjy = b(jy+j)
u1 = bjv + bjy
u2 = bjo + bjt
u3 = bjv - bjy
u4 = bjo - bjt
b(jv+j) = bjj
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bju = b(ju+j)
u7 = bju - 0.25_dp * u5
bx = bju + u5
u8 = u7 + u6
u9 = u7 - u6
b(ju+j) = bje
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(je+j) = bx
a(jj+j) = co1*(t8-u11) - si1*(u8+t11)
b(jj+j) = si1*(t8-u11) + co1*(u8+t11)
a(jy+j) = co4*(t8+u11) - si4*(u8-t11)
b(jy+j) = si4*(t8+u11) + co4*(u8-t11)
a(jo+j) = co2*(t9-u10) - si2*(u9+t10)
b(jo+j) = si2*(t9-u10) + co2*(u9+t10)
a(jt+j) = co3*(t9+u10) - si3*(u9-t10)
b(jt+j) = si3*(t9+u10) + co3*(u9-t10)
j = j + jump
440 continue
!
endif
!
!-----(end of loop across transforms)
!
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
445 continue
450 continue
460 continue
!-----( end of double loop for this k )
kk = kk + 2*la
470 continue
!-----( end of loop over values of k )
la = 5*la
480 continue
!-----( end of loop on type II radix-5 passes )
!-----( nvex transforms completed)
490 continue
istart = istart + nvex * jump
500 continue
!-----( end of loop on blocks of transforms )
!
return
end subroutine gpfa5f

end module gridxc_gpfa_core_sp


module gridxc_gpfa_core_dp
!
!     1d fft routine based on Clive Temperton's GPFA package
!
! Vectors to be transformed are in DOUBLE precision (wp=dp)

implicit none

! In this module, trigs twiddle factors and intermediate
! variables are kept in double precision.

integer, parameter :: dp = selected_real_kind(10,100)
integer, parameter :: sp = selected_real_kind(6,10)
integer, parameter :: wp = dp

interface gpfa
   module procedure gpfa_
end interface gpfa

public :: gpfa
private

CONTAINS

!--------------------------------------------------------------
!
!  What follows is the original code by Temperton (save setgpfa)
!  with explicit declarations of variables
!
! In this module, trigs twiddle factors and intermediate
! variables are kept in double precision.

!  For arbitrary-precision support the parameter constants might
!  be computed instead of inserted explicitly.
!
!  For example: sin60 = sin(2.0_wp*pi/3.0_wp), where wp is the working
!                       precision, and
!               pi = 4.0_wp * atan(1.0_wp), or
!               twopi = 4.0_wp * asin(1.0_wp)
!
!*********************************************************************

!        SUBROUTINE 'GPFA'
!        SELF-SORTING IN-PLACE GENERALIZED PRIME FACTOR (COMPLEX) FFT
!
!        *** THIS IS THE ALL-FORTRAN VERSION ***
!            -------------------------------
!
!        CALL GPFA(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN)
!
!        A IS FIRST REAL INPUT/OUTPUT VECTOR
!        B IS FIRST IMAGINARY INPUT/OUTPUT VECTOR
!        TRIGS IS A TABLE OF TWIDDLE FACTORS, PRECALCULATED
!              BY CALLING SUBROUTINE 'SETGPFA'
!        INC IS THE INCREMENT WITHIN EACH DATA VECTOR
!        JUMP IS THE INCREMENT BETWEEN DATA VECTORS
!        N IS THE LENGTH OF THE TRANSFORMS:
!          -----------------------------------
!            N = (2**IP) * (3**IQ) * (5**IR)
!          -----------------------------------
!        LOT IS THE NUMBER OF TRANSFORMS
!        ISIGN = +1 FOR FORWARD TRANSFORM
!              = -1 FOR INVERSE TRANSFORM
!
!        WRITTEN BY CLIVE TEMPERTON
!        RECHERCHE EN PREVISION NUMERIQUE
!        ATMOSPHERIC ENVIRONMENT SERVICE, CANADA
!
!----------------------------------------------------------------------
!
!        DEFINITION OF TRANSFORM
!        -----------------------
!
!        X(J) = SUM(K=0,...,N-1)(C(K)*EXP(ISIGN*2*I*J*K*PI/N))
!
!---------------------------------------------------------------------
!
!        FOR A MATHEMATICAL DEVELOPMENT OF THE ALGORITHM USED,
!        SEE:
!
!        C TEMPERTON : "A GENERALIZED PRIME FACTOR FFT ALGORITHM
!          FOR ANY N = (2**P)(3**Q)(5**R)",
!          SIAM J. SCI. STAT. COMP., MAY 1992.
!

SUBROUTINE GPFA_(A,B,TRIGS,INC,JUMP,N,LOT,ISIGN)

IMPLICIT NONE
!
REAL(wp) ::     A(:), B(:) 
REAL(dp)  TRIGS(*)
INTEGER INC, JUMP, N, LOT, ISIGN
!
INTEGER &
  NJ(3), NN, IFAC, LL, KK, IP, IQ, IR, I

character(len=20) :: msg
!
!     DECOMPOSE N INTO FACTORS 2,3,5
!     ------------------------------
NN = N
IFAC = 2
!
DO 30 LL = 1 , 3
KK = 0
10 CONTINUE
IF (MOD(NN,IFAC).NE.0) GO TO 20
KK = KK + 1
NN = NN / IFAC
GO TO 10
20 CONTINUE
NJ(LL) = KK
IFAC = IFAC + LL
30 CONTINUE
!
IF (NN.NE.1) THEN
   write(msg,"(i0)") N
   call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N')
ENDIF
!
IP = NJ(1)
IQ = NJ(2)
IR = NJ(3)
!
!     COMPUTE THE TRANSFORM
!     ---------------------
I = 1
IF (IP.GT.0) THEN
   CALL GPFA2F(A,B,TRIGS,INC,JUMP,N,IP,LOT,ISIGN)
   I = I + 2 * ( 2**IP)
ENDIF
IF (IQ.GT.0) THEN
   CALL GPFA3F(A,B,TRIGS(I),INC,JUMP,N,IQ,LOT,ISIGN)
   I = I + 2 * (3**IQ)
ENDIF
IF (IR.GT.0) THEN
   CALL GPFA5F(A,B,TRIGS(I),INC,JUMP,N,IR,LOT,ISIGN)
ENDIF
!
RETURN
END SUBROUTINE GPFA_


!*********************************************************************

!     fortran version of *gpfa2* -
!     radix-2 section of self-sorting, in-place, generalized pfa
!     central radix-2 and radix-8 passes included
!      so that transform length can be any power of 2
!

subroutine gpfa2f(a,b,trigs,inc,jump,n,mm,lot,isign)

real(wp) ::     a(*), b(*)
real(dp)     ::     trigs(*)
integer inc, jump, n, mm, lot, isign

integer  n2, inq, jstepx, ninc, ink, m2, m8, &
         m, mh, nblox, left, istart, nb, nvex, &
         la, mu, ipass, jstep, jstepl, jjj, ja, &
         nu, jb, jc, jd, j, l, kk, k, je, jf, &
         jg, jh, laincl, ll, ji, jj, jk, jl, jm, &
         jn, jo, jp

real(dp) :: s, aja, ajc, t0, t2, ajb, ajd, &
                t1, t3, bja, bjc, u0, u2, bjb, &
                bjd, u1, u3, co1, si1, co2, si2, &
                co3, si3, ss, c1, c2, c3, aje, &
                ajf, ajg, ajh, bje, bjf, bjg, &
                bjh, co4, si4, co5, si5, co6, si6, &
                co7, si7, aji, bjm, ajj, bjj, ajk, &
                ajl, bji, bjk, ajo, bjl, bjo, ajm, &
                ajn, ajp, bjn, bjp

#ifdef OLD_CRAY
integer, parameter :: lvr = CRAY_LVR_PARAMETER
#else
integer, parameter :: lvr = 1024
#endif
!
!     ***************************************************************
!     *                                                             *
!     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
!     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
!     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
!     *                                                             *
!     ***************************************************************
!
n2 = 2**mm
inq = n/n2
jstepx = (n2-n) * inc
ninc = n * inc
ink = inc * inq
!
m2 = 0
m8 = 0
if (mod(mm,2).eq.0) then
   m = mm/2
else if (mod(mm,4).eq.1) then
   m = (mm-1)/2
   m2 = 1
else if (mod(mm,4).eq.3) then
   m = (mm-3)/2
   m8 = 1
endif
mh = (m+1)/2
!
nblox = 1 + (lot-1)/lvr
left = lot
s = real(isign,kind=dp)
istart = 1
!
!  loop on blocks of lvr transforms
!  --------------------------------
do 500 nb = 1 , nblox
!
if (left.le.lvr) then
   nvex = left
else if (left.lt.(2*lvr)) then
   nvex = left/2
   nvex = nvex + mod(nvex,2)
else
   nvex = lvr
endif
left = left - nvex
!
la = 1
!
!  loop on type I radix-4 passes
!  -----------------------------
mu = mod(inq,4)
if (isign.eq.-1) mu = 4 - mu
ss = 1.0_dp
if (mu.eq.3) ss = -1.0_dp
!
if (mh.eq.0) go to 200
!
do 160 ipass = 1 , mh
jstep = (n*inc) / (4*la)
jstepl = jstep - ninc
!
!  k = 0 loop (no twiddle factors)
!  -------------------------------
do 120 jjj = 0 , (n-1)*inc , 4*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 115 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 110 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
a(ja+j) = t0 + t1
a(jc+j) = t0 - t1
b(ja+j) = u0 + u1
b(jc+j) = u0 - u1
a(jb+j) = t2 - u3
a(jd+j) = t2 + u3
b(jb+j) = u2 + t3
b(jd+j) = u2 - t3
j = j + jump
110 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
115 continue
120 continue
!
!  finished if n2 = 4
!  ------------------
if (n2.eq.4) go to 490
kk = 2 * la
!
!  loop on nonzero k
!  -----------------
do 150 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
!
!  loop along transform
!  --------------------
do 140 jjj = k , (n-1)*inc , 4*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 135 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 130 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
a(ja+j) = t0 + t1
b(ja+j) = u0 + u1
a(jb+j) = co1*(t2-u3) - si1*(u2+t3)
b(jb+j) = si1*(t2-u3) + co1*(u2+t3)
a(jc+j) = co2*(t0-t1) - si2*(u0-u1)
b(jc+j) = si2*(t0-t1) + co2*(u0-u1)
a(jd+j) = co3*(t2+u3) - si3*(u2-t3)
b(jd+j) = si3*(t2+u3) + co3*(u2-t3)
j = j + jump
130 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
135 continue
140 continue
!-----( end of loop along transforms )
kk = kk + 2*la
150 continue
!-----( end of loop on nonzero k )
la = 4*la
160 continue
!-----( end of loop on type I radix-4 passes)
!
!  central radix-2 pass
!  --------------------
200 continue
if (m2.eq.0) go to 300
!
jstep = (n*inc) / (2*la)
jstepl = jstep - ninc
!
!  k=0 loop (no twiddle factors)
!  -----------------------------
do 220 jjj = 0 , (n-1)*inc , 2*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 215 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 210 l = 1 , nvex
aja = a(ja+j)
ajb = a(jb+j)
t0 = aja - ajb
a(ja+j) = aja + ajb
a(jb+j) = t0
bja = b(ja+j)
bjb = b(jb+j)
u0 = bja - bjb
b(ja+j) = bja + bjb
b(jb+j) = u0
j = j + jump
210 continue
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
215 continue
220 continue
!
!  finished if n2=2
!  ----------------
if (n2.eq.2) go to 490
!
kk = 2 * la
!
!  loop on nonzero k
!  -----------------
do 260 k = ink , jstep - ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
!
!  loop along transforms
!  ---------------------
do 250 jjj = k , (n-1)*inc , 2*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 245 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
j = 0
!
!  loop across transforms
!  ----------------------
if (kk.eq.n2/2) then
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 230 l = 1 , nvex
aja = a(ja+j)
ajb = a(jb+j)
t0 = ss * ( aja - ajb )
a(ja+j) = aja + ajb
bjb = b(jb+j)
bja = b(ja+j)
a(jb+j) = ss * ( bjb - bja )
b(ja+j) = bja + bjb
b(jb+j) = t0
j = j + jump
230 continue
!
else
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 240 l = 1 , nvex
aja = a(ja+j)
ajb = a(jb+j)
t0 = aja - ajb
a(ja+j) = aja + ajb
bja = b(ja+j)
bjb = b(jb+j)
u0 = bja - bjb
b(ja+j) = bja + bjb
a(jb+j) = co1*t0 - si1*u0
b(jb+j) = si1*t0 + co1*u0
j = j + jump
240 continue
!
endif
!
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
245 continue
250 continue
!-----(end of loop along transforms)
kk = kk + 2 * la
260 continue
!-----(end of loop on nonzero k)
!-----(end of radix-2 pass)
!
la = 2 * la
go to 400
!
!  central radix-8 pass
!  --------------------
300 continue
if (m8.eq.0) go to 400
jstep = (n*inc) / (8*la)
jstepl = jstep - ninc
mu = mod(inq,8)
if (isign.eq.-1) mu = 8 - mu
c1 = 1.0_dp
if (mu.eq.3.or.mu.eq.7) c1 = -1.0_dp
c2 = sqrt(0.5_dp)
if (mu.eq.3.or.mu.eq.5) c2 = -c2
c3 = c1 * c2
!
!  stage 1
!  -------
do 320 k = 0 , jstep - ink , ink
do 315 jjj = k , (n-1)*inc , 8*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 312 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
j = 0
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 310 l = 1 , nvex
aja = a(ja+j)
aje = a(je+j)
t0 = aja - aje
a(ja+j) = aja + aje
ajc = a(jc+j)
ajg = a(jg+j)
t1 = c1 * ( ajc - ajg )
a(je+j) = ajc + ajg
ajb = a(jb+j)
ajf = a(jf+j)
t2 = ajb - ajf
a(jc+j) = ajb + ajf
ajd = a(jd+j)
ajh = a(jh+j)
t3 = ajd - ajh
a(jg+j) = ajd + ajh
a(jb+j) = t0
a(jf+j) = t1
a(jd+j) = c2 * ( t2 - t3 )
a(jh+j) = c3 * ( t2 + t3 )
bja = b(ja+j)
bje = b(je+j)
u0 = bja - bje
b(ja+j) = bja + bje
bjc = b(jc+j)
bjg = b(jg+j)
u1 = c1 * ( bjc - bjg )
b(je+j) = bjc + bjg
bjb = b(jb+j)
bjf = b(jf+j)
u2 = bjb - bjf
b(jc+j) = bjb + bjf
bjd = b(jd+j)
bjh = b(jh+j)
u3 = bjd - bjh
b(jg+j) = bjd + bjh
b(jb+j) = u0
b(jf+j) = u1
b(jd+j) = c2 * ( u2 - u3 )
b(jh+j) = c3 * ( u2 + u3 )
j = j + jump
310 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
312 continue
315 continue
320 continue
!
!  stage 2
!  -------
!
!  k=0 (no twiddle factors)
!  ------------------------
do 330 jjj = 0 , (n-1)*inc , 8*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 328 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
j = 0
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 325 l = 1 , nvex
aja = a(ja+j)
aje = a(je+j)
t0 = aja + aje
t2 = aja - aje
ajc = a(jc+j)
ajg = a(jg+j)
t1 = ajc + ajg
t3 = c1 * ( ajc - ajg )
bja = b(ja+j)
bje = b(je+j)
u0 = bja + bje
u2 = bja - bje
bjc = b(jc+j)
bjg = b(jg+j)
u1 = bjc + bjg
u3 = c1 * ( bjc - bjg )
a(ja+j) = t0 + t1
a(je+j) = t0 - t1
b(ja+j) = u0 + u1
b(je+j) = u0 - u1
a(jc+j) = t2 - u3
a(jg+j) = t2 + u3
b(jc+j) = u2 + t3
b(jg+j) = u2 - t3
ajb = a(jb+j)
ajd = a(jd+j)
t0 = ajb + ajd
t2 = ajb - ajd
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf - ajh
t3 = ajf + ajh
bjb = b(jb+j)
bjd = b(jd+j)
u0 = bjb + bjd
u2 = bjb - bjd
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf - bjh
u3 = bjf + bjh
a(jb+j) = t0 - u3
a(jh+j) = t0 + u3
b(jb+j) = u0 + t3
b(jh+j) = u0 - t3
a(jd+j) = t2 + u1
a(jf+j) = t2 - u1
b(jd+j) = u2 - t1
b(jf+j) = u2 + t1
j = j + jump
325 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
328 continue
330 continue
!
if (n2.eq.8) go to 490
!
!  loop on nonzero k
!  -----------------
kk = 2 * la
!
do 350 k = ink , jstep - ink , ink
!
co1 = trigs(kk+1)
si1 = s * trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s * trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s * trigs(3*kk+2)
co4 = trigs(4*kk+1)
si4 = s * trigs(4*kk+2)
co5 = trigs(5*kk+1)
si5 = s * trigs(5*kk+2)
co6 = trigs(6*kk+1)
si6 = s * trigs(6*kk+2)
co7 = trigs(7*kk+1)
si7 = s * trigs(7*kk+2)
!
do 345 jjj = k , (n-1)*inc , 8*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 342 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
j = 0
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 340 l = 1 , nvex
aja = a(ja+j)
aje = a(je+j)
t0 = aja + aje
t2 = aja - aje
ajc = a(jc+j)
ajg = a(jg+j)
t1 = ajc + ajg
t3 = c1 * ( ajc - ajg )
bja = b(ja+j)
bje = b(je+j)
u0 = bja + bje
u2 = bja - bje
bjc = b(jc+j)
bjg = b(jg+j)
u1 = bjc + bjg
u3 = c1 * ( bjc - bjg )
a(ja+j) = t0 + t1
b(ja+j) = u0 + u1
a(je+j) = co4*(t0-t1) - si4*(u0-u1)
b(je+j) = si4*(t0-t1) + co4*(u0-u1)
a(jc+j) = co2*(t2-u3) - si2*(u2+t3)
b(jc+j) = si2*(t2-u3) + co2*(u2+t3)
a(jg+j) = co6*(t2+u3) - si6*(u2-t3)
b(jg+j) = si6*(t2+u3) + co6*(u2-t3)
ajb = a(jb+j)
ajd = a(jd+j)
t0 = ajb + ajd
t2 = ajb - ajd
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf - ajh
t3 = ajf + ajh
bjb = b(jb+j)
bjd = b(jd+j)
u0 = bjb + bjd
u2 = bjb - bjd
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf - bjh
u3 = bjf + bjh
a(jb+j) = co1*(t0-u3) - si1*(u0+t3)
b(jb+j) = si1*(t0-u3) + co1*(u0+t3)
a(jh+j) = co7*(t0+u3) - si7*(u0-t3)
b(jh+j) = si7*(t0+u3) + co7*(u0-t3)
a(jd+j) = co3*(t2+u1) - si3*(u2-t1)
b(jd+j) = si3*(t2+u1) + co3*(u2-t1)
a(jf+j) = co5*(t2-u1) - si5*(u2+t1)
b(jf+j) = si5*(t2-u1) + co5*(u2+t1)
j = j + jump
340 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
342 continue
345 continue
kk = kk + 2 * la
350 continue
!
la = 8 * la
!
!  loop on type II radix-4 passes
!  ------------------------------
400 continue
mu = mod(inq,4)
if (isign.eq.-1) mu = 4 - mu
ss = 1.0_dp
if (mu.eq.3) ss = -1.0_dp
!
do 480 ipass = mh+1 , m
jstep = (n*inc) / (4*la)
jstepl = jstep - ninc
laincl = la * ink - ninc
!
!  k=0 loop (no twiddle factors)
!  -----------------------------
do 430 ll = 0 , (la-1)*ink , 4*jstep
!
do 420 jjj = ll , (n-1)*inc , 4*la*ink
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 415 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = ja + laincl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = je + laincl
if (ji.lt.istart) ji = ji + ninc
jj = ji + jstepl
if (jj.lt.istart) jj = jj + ninc
jk = jj + jstepl
if (jk.lt.istart) jk = jk + ninc
jl = jk + jstepl
if (jl.lt.istart) jl = jl + ninc
jm = ji + laincl
if (jm.lt.istart) jm = jm + ninc
jn = jm + jstepl
if (jn.lt.istart) jn = jn + ninc
jo = jn + jstepl
if (jo.lt.istart) jo = jo + ninc
jp = jo + jstepl
if (jp.lt.istart) jp = jp + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 410 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
aji = a(ji+j)
ajc =  aji
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
aje = a(je+j)
ajb =  aje
a(ja+j) = t0 + t1
a(ji+j) = t0 - t1
b(ja+j) = u0 + u1
bjc =  u0 - u1
bjm = b(jm+j)
bjd =  bjm
a(je+j) = t2 - u3
ajd =  t2 + u3
bjb =  u2 + t3
b(jm+j) = u2 - t3
!----------------------
ajg = a(jg+j)
t0 = ajb + ajg
t2 = ajb - ajg
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf + ajh
t3 = ss * ( ajf - ajh )
ajj = a(jj+j)
ajg =  ajj
bje = b(je+j)
bjg = b(jg+j)
u0 = bje + bjg
u2 = bje - bjg
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf + bjh
u3 = ss * ( bjf - bjh )
b(je+j) = bjb
a(jb+j) = t0 + t1
a(jj+j) = t0 - t1
bjj = b(jj+j)
bjg =  bjj
b(jb+j) = u0 + u1
b(jj+j) = u0 - u1
a(jf+j) = t2 - u3
ajh =  t2 + u3
b(jf+j) = u2 + t3
bjh =  u2 - t3
!----------------------
ajk = a(jk+j)
t0 = ajc + ajk
t2 = ajc - ajk
ajl = a(jl+j)
t1 = ajg + ajl
t3 = ss * ( ajg - ajl )
bji = b(ji+j)
bjk = b(jk+j)
u0 = bji + bjk
u2 = bji - bjk
ajo = a(jo+j)
ajl =  ajo
bjl = b(jl+j)
u1 = bjg + bjl
u3 = ss * ( bjg - bjl )
b(ji+j) = bjc
a(jc+j) = t0 + t1
a(jk+j) = t0 - t1
bjo = b(jo+j)
bjl =  bjo
b(jc+j) = u0 + u1
b(jk+j) = u0 - u1
a(jg+j) = t2 - u3
a(jo+j) = t2 + u3
b(jg+j) = u2 + t3
b(jo+j) = u2 - t3
!----------------------
ajm = a(jm+j)
t0 = ajm + ajl
t2 = ajm - ajl
ajn = a(jn+j)
ajp = a(jp+j)
t1 = ajn + ajp
t3 = ss * ( ajn - ajp )
a(jm+j) = ajd
u0 = bjd + bjl
u2 = bjd - bjl
bjn = b(jn+j)
bjp = b(jp+j)
u1 = bjn + bjp
u3 = ss * ( bjn - bjp )
a(jn+j) = ajh
a(jd+j) = t0 + t1
a(jl+j) = t0 - t1
b(jd+j) = u0 + u1
b(jl+j) = u0 - u1
b(jn+j) = bjh
a(jh+j) = t2 - u3
a(jp+j) = t2 + u3
b(jh+j) = u2 + t3
b(jp+j) = u2 - t3
j = j + jump
410 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
415 continue
420 continue
430 continue
!-----( end of double loop for k=0 )
!
!  finished if last pass
!  ---------------------
if (ipass.eq.m) go to 490
!
kk = 2*la
!
!     loop on nonzero k
!     -----------------
do 470 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
!
!  double loop along first transform in block
!  ------------------------------------------
do 460 ll = k , (la-1)*ink , 4*jstep
!
do 450 jjj = ll , (n-1)*inc , 4*la*ink
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 445 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = ja + laincl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = je + laincl
if (ji.lt.istart) ji = ji + ninc
jj = ji + jstepl
if (jj.lt.istart) jj = jj + ninc
jk = jj + jstepl
if (jk.lt.istart) jk = jk + ninc
jl = jk + jstepl
if (jl.lt.istart) jl = jl + ninc
jm = ji + laincl
if (jm.lt.istart) jm = jm + ninc
jn = jm + jstepl
if (jn.lt.istart) jn = jn + ninc
jo = jn + jstepl
if (jo.lt.istart) jo = jo + ninc
jp = jo + jstepl
if (jp.lt.istart) jp = jp + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 440 l = 1 , nvex
aja = a(ja+j)
ajc = a(jc+j)
t0 = aja + ajc
t2 = aja - ajc
ajb = a(jb+j)
ajd = a(jd+j)
t1 = ajb + ajd
t3 = ss * ( ajb - ajd )
aji = a(ji+j)
ajc =  aji
bja = b(ja+j)
bjc = b(jc+j)
u0 = bja + bjc
u2 = bja - bjc
bjb = b(jb+j)
bjd = b(jd+j)
u1 = bjb + bjd
u3 = ss * ( bjb - bjd )
aje = a(je+j)
ajb =  aje
a(ja+j) = t0 + t1
b(ja+j) = u0 + u1
a(je+j) = co1*(t2-u3) - si1*(u2+t3)
bjb =  si1*(t2-u3) + co1*(u2+t3)
bjm = b(jm+j)
bjd =  bjm
a(ji+j) = co2*(t0-t1) - si2*(u0-u1)
bjc =  si2*(t0-t1) + co2*(u0-u1)
ajd =  co3*(t2+u3) - si3*(u2-t3)
b(jm+j) = si3*(t2+u3) + co3*(u2-t3)
!----------------------------------------
ajg = a(jg+j)
t0 = ajb + ajg
t2 = ajb - ajg
ajf = a(jf+j)
ajh = a(jh+j)
t1 = ajf + ajh
t3 = ss * ( ajf - ajh )
ajj = a(jj+j)
ajg =  ajj
bje = b(je+j)
bjg = b(jg+j)
u0 = bje + bjg
u2 = bje - bjg
bjf = b(jf+j)
bjh = b(jh+j)
u1 = bjf + bjh
u3 = ss * ( bjf - bjh )
b(je+j) = bjb
a(jb+j) = t0 + t1
b(jb+j) = u0 + u1
bjj = b(jj+j)
bjg =  bjj
a(jf+j) = co1*(t2-u3) - si1*(u2+t3)
b(jf+j) = si1*(t2-u3) + co1*(u2+t3)
a(jj+j) = co2*(t0-t1) - si2*(u0-u1)
b(jj+j) = si2*(t0-t1) + co2*(u0-u1)
ajh =  co3*(t2+u3) - si3*(u2-t3)
bjh =  si3*(t2+u3) + co3*(u2-t3)
!----------------------------------------
ajk = a(jk+j)
t0 = ajc + ajk
t2 = ajc - ajk
ajl = a(jl+j)
t1 = ajg + ajl
t3 = ss * ( ajg - ajl )
bji = b(ji+j)
bjk = b(jk+j)
u0 = bji + bjk
u2 = bji - bjk
ajo = a(jo+j)
ajl =  ajo
bjl = b(jl+j)
u1 = bjg + bjl
u3 = ss * ( bjg - bjl )
b(ji+j) = bjc
a(jc+j) = t0 + t1
b(jc+j) = u0 + u1
bjo = b(jo+j)
bjl =  bjo
a(jg+j) = co1*(t2-u3) - si1*(u2+t3)
b(jg+j) = si1*(t2-u3) + co1*(u2+t3)
a(jk+j) = co2*(t0-t1) - si2*(u0-u1)
b(jk+j) = si2*(t0-t1) + co2*(u0-u1)
a(jo+j) = co3*(t2+u3) - si3*(u2-t3)
b(jo+j) = si3*(t2+u3) + co3*(u2-t3)
!----------------------------------------
ajm = a(jm+j)
t0 = ajm + ajl
t2 = ajm - ajl
ajn = a(jn+j)
ajp = a(jp+j)
t1 = ajn + ajp
t3 = ss * ( ajn - ajp )
a(jm+j) = ajd
u0 = bjd + bjl
u2 = bjd - bjl
a(jn+j) = ajh
bjn = b(jn+j)
bjp = b(jp+j)
u1 = bjn + bjp
u3 = ss * ( bjn - bjp )
b(jn+j) = bjh
a(jd+j) = t0 + t1
b(jd+j) = u0 + u1
a(jh+j) = co1*(t2-u3) - si1*(u2+t3)
b(jh+j) = si1*(t2-u3) + co1*(u2+t3)
a(jl+j) = co2*(t0-t1) - si2*(u0-u1)
b(jl+j) = si2*(t0-t1) + co2*(u0-u1)
a(jp+j) = co3*(t2+u3) - si3*(u2-t3)
b(jp+j) = si3*(t2+u3) + co3*(u2-t3)
j = j + jump
440 continue
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
445 continue
450 continue
460 continue
!-----( end of double loop for this k )
kk = kk + 2*la
470 continue
!-----( end of loop over values of k )
la = 4*la
480 continue
!-----( end of loop on type II radix-4 passes )
!-----( nvex transforms completed)
490 continue
istart = istart + nvex * jump
500 continue
!-----( end of loop on blocks of transforms )
!
return
end subroutine gpfa2f

!******************************************************************

!     fortran version of *gpfa3* -
!     radix-3 section of self-sorting, in-place
!        generalized PFA
!
!-------------------------------------------------------------------
!
subroutine gpfa3f(a,b,trigs,inc,jump,n,mm,lot,isign)

real(wp) ::     a(*), b(*)
real(dp)     ::     trigs(*)
integer inc, jump, n, mm, lot, isign

real(dp), parameter :: sin60 = 0.866025403784437_dp

integer      :: je, jf, jg, jh, laincl, ll, ji
integer      :: n3, inq, jstepx, ninc, ink, mu, m, mh, &
                nblox, left, istart, nb, nvex, la, &
                ipass, jstep, jstepl, jjj, ja, nu, jb, &
                jc, j, l, kk, k, jd

real(dp) :: s, ajb, ajc, t1, aja, t2, t3, bjb, bjc, &
                u1, bja, u2, u3, co1, si1, co2, si2, ajd, &
                bjd
real(dp) :: c1, aje, ajg, ajf, ajh, bje, &
                bjf, bjg, bjh, aji, bji


integer, parameter :: lvr = 128
!
!     ***************************************************************
!     *                                                             *
!     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
!     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
!     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
!     *                                                             *
!     ***************************************************************
!
n3 = 3**mm
inq = n/n3
jstepx = (n3-n) * inc
ninc = n * inc
ink = inc * inq
mu = mod(inq,3)
if (isign.eq.-1) mu = 3-mu
m = mm
mh = (m+1)/2
s = real(isign,kind=dp)
c1 = sin60
if (mu.eq.2) c1 = -c1
!
nblox = 1 + (lot-1)/lvr
left = lot
s = real(isign,kind=dp)
istart = 1
!
!  loop on blocks of lvr transforms
!  --------------------------------
do 500 nb = 1 , nblox
!
if (left.le.lvr) then
   nvex = left
else if (left.lt.(2*lvr)) then
   nvex = left/2
   nvex = nvex + mod(nvex,2)
else
   nvex = lvr
endif
left = left - nvex
!
la = 1
!
!  loop on type I radix-3 passes
!  -----------------------------
do 160 ipass = 1 , mh
jstep = (n*inc) / (3*la)
jstepl = jstep - ninc
!
!  k = 0 loop (no twiddle factors)
!  -------------------------------
do 120 jjj = 0 , (n-1)*inc , 3*jstep
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 115 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 110 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jb+j) = t2 - u3
b(jb+j) = u2 + t3
a(jc+j) = t2 + u3
b(jc+j) = u2 - t3
j = j + jump
110 continue
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
115 continue
120 continue
!
!  finished if n3 = 3
!  ------------------
if (n3.eq.3) go to 490
kk = 2 * la
!
!  loop on nonzero k
!  -----------------
do 150 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
!
!  loop along transform
!  --------------------
do 140 jjj = k , (n-1)*inc , 3*jstep
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 135 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 130 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jb+j) = co1*(t2-u3) - si1*(u2+t3)
b(jb+j) = si1*(t2-u3) + co1*(u2+t3)
a(jc+j) = co2*(t2+u3) - si2*(u2-t3)
b(jc+j) = si2*(t2+u3) + co2*(u2-t3)
j = j + jump
130 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
135 continue
140 continue
!-----( end of loop along transforms )
kk = kk + 2*la
150 continue
!-----( end of loop on nonzero k )
la = 3*la
160 continue
!-----( end of loop on type I radix-3 passes)
!
!  loop on type II radix-3 passes
!  ------------------------------
400 continue
!
do 480 ipass = mh+1 , m
jstep = (n*inc) / (3*la)
jstepl = jstep - ninc
laincl = la*ink - ninc
!
!  k=0 loop (no twiddle factors)
!  -----------------------------
do 430 ll = 0 , (la-1)*ink , 3*jstep
!
do 420 jjj = ll , (n-1)*inc , 3*la*ink
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 415 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = ja + laincl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jd + laincl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = jh + jstepl
if (ji.lt.istart) ji = ji + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 410 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
ajd = a(jd+j)
ajb =  ajd
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
bjd = b(jd+j)
bjb =  bjd
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jd+j) = t2 - u3
b(jd+j) = u2 + t3
ajc =  t2 + u3
bjc =  u2 - t3
!----------------------
aje = a(je+j)
ajf = a(jf+j)
t1 = aje + ajf
t2 = ajb - 0.5_dp * t1
t3 = c1 * ( aje - ajf )
ajh = a(jh+j)
ajf =  ajh
bje = b(je+j)
bjf = b(jf+j)
u1 = bje + bjf
u2 = bjb - 0.5_dp * u1
u3 = c1 * ( bje - bjf )
bjh = b(jh+j)
bjf =  bjh
a(jb+j) = ajb + t1
b(jb+j) = bjb + u1
a(je+j) = t2 - u3
b(je+j) = u2 + t3
a(jh+j) = t2 + u3
b(jh+j) = u2 - t3
!----------------------
aji = a(ji+j)
t1 = ajf + aji
ajg = a(jg+j)
t2 = ajg - 0.5_dp * t1
t3 = c1 * ( ajf - aji )
t1 = ajg + t1
a(jg+j) = ajc
bji = b(ji+j)
u1 = bjf + bji
bjg = b(jg+j)
u2 = bjg - 0.5_dp * u1
u3 = c1 * ( bjf - bji )
u1 = bjg + u1
b(jg+j) = bjc
a(jc+j) = t1
b(jc+j) = u1
a(jf+j) = t2 - u3
b(jf+j) = u2 + t3
a(ji+j) = t2 + u3
b(ji+j) = u2 - t3
j = j + jump
410 continue
!-----( end of loop across transforms )
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
415 continue
420 continue
430 continue
!-----( end of double loop for k=0 )
!
!  finished if last pass
!  ---------------------
if (ipass.eq.m) go to 490
!
kk = 2*la
!
!     loop on nonzero k
!     -----------------
do 470 k = ink , jstep-ink , ink
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
!
!  double loop along first transform in block
!  ------------------------------------------
do 460 ll = k , (la-1)*ink , 3*jstep
!
do 450 jjj = ll , (n-1)*inc , 3*la*ink
ja = istart + jjj
!
!  "transverse" loop
!  -----------------
do 445 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = ja + laincl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = je + jstepl
if (jf.lt.istart) jf = jf + ninc
jg = jd + laincl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = jh + jstepl
if (ji.lt.istart) ji = ji + ninc
j = 0
!
!  loop across transforms
!  ----------------------
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 440 l = 1 , nvex
ajb = a(jb+j)
ajc = a(jc+j)
t1 = ajb + ajc
aja = a(ja+j)
t2 = aja - 0.5_dp * t1
t3 = c1 * ( ajb - ajc )
ajd = a(jd+j)
ajb =  ajd
bjb = b(jb+j)
bjc = b(jc+j)
u1 = bjb + bjc
bja = b(ja+j)
u2 = bja - 0.5_dp * u1
u3 = c1 * ( bjb - bjc )
bjd = b(jd+j)
bjb =  bjd
a(ja+j) = aja + t1
b(ja+j) = bja + u1
a(jd+j) = co1*(t2-u3) - si1*(u2+t3)
b(jd+j) = si1*(t2-u3) + co1*(u2+t3)
ajc =  co2*(t2+u3) - si2*(u2-t3)
bjc =  si2*(t2+u3) + co2*(u2-t3)
!----------------------
aje = a(je+j)
ajf = a(jf+j)
t1 = aje + ajf
t2 = ajb - 0.5_dp * t1
t3 = c1 * ( aje - ajf )
ajh = a(jh+j)
ajf =  ajh
bje = b(je+j)
bjf = b(jf+j)
u1 = bje + bjf
u2 = bjb - 0.5_dp * u1
u3 = c1 * ( bje - bjf )
bjh = b(jh+j)
bjf =  bjh
a(jb+j) = ajb + t1
b(jb+j) = bjb + u1
a(je+j) = co1*(t2-u3) - si1*(u2+t3)
b(je+j) = si1*(t2-u3) + co1*(u2+t3)
a(jh+j) = co2*(t2+u3) - si2*(u2-t3)
b(jh+j) = si2*(t2+u3) + co2*(u2-t3)
!----------------------
aji = a(ji+j)
t1 = ajf + aji
ajg = a(jg+j)
t2 = ajg - 0.5_dp * t1
t3 = c1 * ( ajf - aji )
t1 = ajg + t1
a(jg+j) = ajc
bji = b(ji+j)
u1 = bjf + bji
bjg = b(jg+j)
u2 = bjg - 0.5_dp * u1
u3 = c1 * ( bjf - bji )
u1 = bjg + u1
b(jg+j) = bjc
a(jc+j) = t1
b(jc+j) = u1
a(jf+j) = co1*(t2-u3) - si1*(u2+t3)
b(jf+j) = si1*(t2-u3) + co1*(u2+t3)
a(ji+j) = co2*(t2+u3) - si2*(u2-t3)
b(ji+j) = si2*(t2+u3) + co2*(u2-t3)
j = j + jump
440 continue
!-----(end of loop across transforms)
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
445 continue
450 continue
460 continue
!-----( end of double loop for this k )
kk = kk + 2*la
470 continue
!-----( end of loop over values of k )
la = 3*la
480 continue
!-----( end of loop on type II radix-3 passes )
!-----( nvex transforms completed)
490 continue
istart = istart + nvex * jump
500 continue
!-----( end of loop on blocks of transforms )
!
return
end subroutine gpfa3f

!*******************************************************************

!     fortran version of *gpfa5* -
!     radix-5 section of self-sorting, in-place,
!        generalized pfa
!
!-------------------------------------------------------------------
!
subroutine gpfa5f(a,b,trigs,inc,jump,n,mm,lot,isign)

real(wp)     ::     a(*), b(*)
real(dp)     ::     trigs(*)
integer inc, jump, n, mm, lot, isign

real(dp), parameter :: sin36 = 0.587785252292473_dp, &
                       sin72 = 0.951056516295154_dp, &
                       qrt5  = 0.559016994374947_dp
integer, parameter  :: lvr = 128

integer      ::  inq, jstepx, ninc, ink, mu, m, mh, &
                 nblox, n5, left, istart, nb, nvex, la, &
                 ipass, jstep, jstepl, kk, k, jjj, ja, &
                 nu, jb, jc, jd, je, j, laincl, ll, l, &
                 jf, jg, jh, ji, jj, jk, jl, jm, jn, jo, &
                 jp, jq, jr, js, jt, ju, jv, jw, jx, jy

real(dp) ::  s, c1, c2, c3, co1, si1, co2, si2, co3, &
                 si3, co4, si4, ajb, aje, t1, ajc, ajd, &
                 t2, t3, t4, t5, t6, aja, t7, t8, t9, t10, &
                 t11, bjb, bje, u1, bjc, bjd, u2, u3, u4, &
                 u5, u6, bja, u7, u8, u9, u10, u11, ajf, &
                 ajk, bjf, bjk, ajg, ajj, ajh, aji, ajl, &
                 ajq, bjg, bjj, bjh, bji, bjl, bjq, ajo, &
                 ajm, ajn, ajr, ajw, bjo, bjm, bjn, bjr, &
                 bjw, ajt, ajs, ajx, ajp, ax, bjt, bjs, &
                 bjx, bjp, bx, ajv, ajy, aju, bjv, bjy, &
                 bju
!
!     ***************************************************************
!     *                                                             *
!     *  N.B. LVR = LENGTH OF VECTOR REGISTERS, SET TO 128 FOR C90. *
!     *  RESET TO 64 FOR OTHER CRAY MACHINES, OR TO ANY LARGE VALUE *
!     *  (GREATER THAN OR EQUAL TO LOT) FOR A SCALAR COMPUTER.      *
!     *                                                             *
!     ***************************************************************
!
n5 = 5 ** mm
inq = n / n5
jstepx = (n5-n) * inc
ninc = n * inc
ink = inc * inq
mu = mod(inq,5)
if (isign.eq.-1) mu = 5 - mu
!
m = mm
mh = (m+1)/2
s = real(isign,kind=dp)
c1 = qrt5
c2 = sin72
c3 = sin36
if (mu.eq.2.or.mu.eq.3) then
   c1 = -c1
   c2 = sin36
   c3 = sin72
endif
if (mu.eq.3.or.mu.eq.4) c2 = -c2
if (mu.eq.2.or.mu.eq.4) c3 = -c3
!
nblox = 1 + (lot-1)/lvr
left = lot
s = real(isign,kind=dp)
istart = 1
!
!  loop on blocks of lvr transforms
!  --------------------------------
do 500 nb = 1 , nblox
!
if (left.le.lvr) then
   nvex = left
else if (left.lt.(2*lvr)) then
   nvex = left/2
   nvex = nvex + mod(nvex,2)
else
   nvex = lvr
endif
left = left - nvex
!
la = 1
!
!  loop on type I radix-5 passes
!  -----------------------------
do 160 ipass = 1 , mh
jstep = (n*inc) / (5*la)
jstepl = jstep - ninc
kk = 0
!
!  loop on k
!  ---------
do 150 k = 0 , jstep-ink , ink
!
if (k.gt.0) then
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
co4 = trigs(4*kk+1)
si4 = s*trigs(4*kk+2)
endif
!
!  loop along transform
!  --------------------
do 140 jjj = k , (n-1)*inc , 5*jstep
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 135 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
j = 0
!
!  loop across transforms
!  ----------------------
if (k.eq.0) then
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 110 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jb+j) = t8 - u11
b(jb+j) = u8 + t11
a(je+j) = t8 + u11
b(je+j) = u8 - t11
a(jc+j) = t9 - u10
b(jc+j) = u9 + t10
a(jd+j) = t9 + u10
b(jd+j) = u9 - t10
j = j + jump
110 continue
!
else
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 130 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jb+j) = co1*(t8-u11) - si1*(u8+t11)
b(jb+j) = si1*(t8-u11) + co1*(u8+t11)
a(je+j) = co4*(t8+u11) - si4*(u8-t11)
b(je+j) = si4*(t8+u11) + co4*(u8-t11)
a(jc+j) = co2*(t9-u10) - si2*(u9+t10)
b(jc+j) = si2*(t9-u10) + co2*(u9+t10)
a(jd+j) = co3*(t9+u10) - si3*(u9-t10)
b(jd+j) = si3*(t9+u10) + co3*(u9-t10)
j = j + jump
130 continue
!
endif
!
!-----( end of loop across transforms )
!
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
135 continue
140 continue
!-----( end of loop along transforms )
kk = kk + 2*la
150 continue
!-----( end of loop on nonzero k )
la = 5*la
160 continue
!-----( end of loop on type I radix-5 passes)
!
if (n.eq.5) go to 490
!
!  loop on type II radix-5 passes
!  ------------------------------
400 continue
!
do 480 ipass = mh+1 , m
jstep = (n*inc) / (5*la)
jstepl = jstep - ninc
laincl = la * ink - ninc
kk = 0
!
!     loop on k
!     ---------
do 470 k = 0 , jstep-ink , ink
!
if (k.gt.0) then
co1 = trigs(kk+1)
si1 = s*trigs(kk+2)
co2 = trigs(2*kk+1)
si2 = s*trigs(2*kk+2)
co3 = trigs(3*kk+1)
si3 = s*trigs(3*kk+2)
co4 = trigs(4*kk+1)
si4 = s*trigs(4*kk+2)
endif
!
!  double loop along first transform in block
!  ------------------------------------------
do 460 ll = k , (la-1)*ink , 5*jstep
!
do 450 jjj = ll , (n-1)*inc , 5*la*ink
ja = istart + jjj
!
!     "transverse" loop
!     -----------------
do 445 nu = 1 , inq
jb = ja + jstepl
if (jb.lt.istart) jb = jb + ninc
jc = jb + jstepl
if (jc.lt.istart) jc = jc + ninc
jd = jc + jstepl
if (jd.lt.istart) jd = jd + ninc
je = jd + jstepl
if (je.lt.istart) je = je + ninc
jf = ja + laincl
if (jf.lt.istart) jf = jf + ninc
jg = jf + jstepl
if (jg.lt.istart) jg = jg + ninc
jh = jg + jstepl
if (jh.lt.istart) jh = jh + ninc
ji = jh + jstepl
if (ji.lt.istart) ji = ji + ninc
jj = ji + jstepl
if (jj.lt.istart) jj = jj + ninc
jk = jf + laincl
if (jk.lt.istart) jk = jk + ninc
jl = jk + jstepl
if (jl.lt.istart) jl = jl + ninc
jm = jl + jstepl
if (jm.lt.istart) jm = jm + ninc
jn = jm + jstepl
if (jn.lt.istart) jn = jn + ninc
jo = jn + jstepl
if (jo.lt.istart) jo = jo + ninc
jp = jk + laincl
if (jp.lt.istart) jp = jp + ninc
jq = jp + jstepl
if (jq.lt.istart) jq = jq + ninc
jr = jq + jstepl
if (jr.lt.istart) jr = jr + ninc
js = jr + jstepl
if (js.lt.istart) js = js + ninc
jt = js + jstepl
if (jt.lt.istart) jt = jt + ninc
ju = jp + laincl
if (ju.lt.istart) ju = ju + ninc
jv = ju + jstepl
if (jv.lt.istart) jv = jv + ninc
jw = jv + jstepl
if (jw.lt.istart) jw = jw + ninc
jx = jw + jstepl
if (jx.lt.istart) jx = jx + ninc
jy = jx + jstepl
if (jy.lt.istart) jy = jy + ninc
j = 0
!
!  loop across transforms
!  ----------------------
if (k.eq.0) then
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 410 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
ajf = a(jf+j)
ajb =  ajf
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
ajk = a(jk+j)
ajc =  ajk
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
bjf = b(jf+j)
bjb =  bjf
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
bjk = b(jk+j)
bjc =  bjk
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jf+j) = t8 - u11
b(jf+j) = u8 + t11
aje =  t8 + u11
bje =  u8 - t11
a(jk+j) = t9 - u10
b(jk+j) = u9 + t10
ajd =  t9 + u10
bjd =  u9 - t10
!----------------------
ajg = a(jg+j)
ajj = a(jj+j)
t1 = ajg + ajj
ajh = a(jh+j)
aji = a(ji+j)
t2 = ajh + aji
t3 = ajg - ajj
t4 = ajh - aji
ajl = a(jl+j)
ajh =  ajl
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajb - 0.25_dp * t5
a(jb+j) = ajb + t5
t8 = t7 + t6
t9 = t7 - t6
ajq = a(jq+j)
aji =  ajq
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjg = b(jg+j)
bjj = b(jj+j)
u1 = bjg + bjj
bjh = b(jh+j)
bji = b(ji+j)
u2 = bjh + bji
u3 = bjg - bjj
u4 = bjh - bji
bjl = b(jl+j)
bjh =  bjl
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjb - 0.25_dp * u5
b(jb+j) = bjb + u5
u8 = u7 + u6
u9 = u7 - u6
bjq = b(jq+j)
bji =  bjq
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jg+j) = t8 - u11
b(jg+j) = u8 + t11
ajj =  t8 + u11
bjj =  u8 - t11
a(jl+j) = t9 - u10
b(jl+j) = u9 + t10
a(jq+j) = t9 + u10
b(jq+j) = u9 - t10
!----------------------
ajo = a(jo+j)
t1 = ajh + ajo
ajm = a(jm+j)
ajn = a(jn+j)
t2 = ajm + ajn
t3 = ajh - ajo
t4 = ajm - ajn
ajr = a(jr+j)
ajn =  ajr
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajc - 0.25_dp * t5
a(jc+j) = ajc + t5
t8 = t7 + t6
t9 = t7 - t6
ajw = a(jw+j)
ajo =  ajw
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjo = b(jo+j)
u1 = bjh + bjo
bjm = b(jm+j)
bjn = b(jn+j)
u2 = bjm + bjn
u3 = bjh - bjo
u4 = bjm - bjn
bjr = b(jr+j)
bjn =  bjr
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjc - 0.25_dp * u5
b(jc+j) = bjc + u5
u8 = u7 + u6
u9 = u7 - u6
bjw = b(jw+j)
bjo =  bjw
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jh+j) = t8 - u11
b(jh+j) = u8 + t11
a(jw+j) = t8 + u11
b(jw+j) = u8 - t11
a(jm+j) = t9 - u10
b(jm+j) = u9 + t10
a(jr+j) = t9 + u10
b(jr+j) = u9 - t10
!----------------------
ajt = a(jt+j)
t1 = aji + ajt
ajs = a(js+j)
t2 = ajn + ajs
t3 = aji - ajt
t4 = ajn - ajs
ajx = a(jx+j)
ajt =  ajx
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
ajp = a(jp+j)
t7 = ajp - 0.25_dp * t5
ax = ajp + t5
t8 = t7 + t6
t9 = t7 - t6
a(jp+j) = ajd
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(jd+j) = ax
bjt = b(jt+j)
u1 = bji + bjt
bjs = b(js+j)
u2 = bjn + bjs
u3 = bji - bjt
u4 = bjn - bjs
bjx = b(jx+j)
bjt =  bjx
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bjp = b(jp+j)
u7 = bjp - 0.25_dp * u5
bx = bjp + u5
u8 = u7 + u6
u9 = u7 - u6
b(jp+j) = bjd
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(jd+j) = bx
a(ji+j) = t8 - u11
b(ji+j) = u8 + t11
a(jx+j) = t8 + u11
b(jx+j) = u8 - t11
a(jn+j) = t9 - u10
b(jn+j) = u9 + t10
a(js+j) = t9 + u10
b(js+j) = u9 - t10
!----------------------
ajv = a(jv+j)
ajy = a(jy+j)
t1 = ajv + ajy
t2 = ajo + ajt
t3 = ajv - ajy
t4 = ajo - ajt
a(jv+j) = ajj
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aju = a(ju+j)
t7 = aju - 0.25_dp * t5
ax = aju + t5
t8 = t7 + t6
t9 = t7 - t6
a(ju+j) = aje
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(je+j) = ax
bjv = b(jv+j)
bjy = b(jy+j)
u1 = bjv + bjy
u2 = bjo + bjt
u3 = bjv - bjy
u4 = bjo - bjt
b(jv+j) = bjj
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bju = b(ju+j)
u7 = bju - 0.25_dp * u5
bx = bju + u5
u8 = u7 + u6
u9 = u7 - u6
b(ju+j) = bje
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(je+j) = bx
a(jj+j) = t8 - u11
b(jj+j) = u8 + t11
a(jy+j) = t8 + u11
b(jy+j) = u8 - t11
a(jo+j) = t9 - u10
b(jo+j) = u9 + t10
a(jt+j) = t9 + u10
b(jt+j) = u9 - t10
j = j + jump
410 continue
!
else
!
#ifdef OLD_CRAY
!dir$ ivdep, shortloop
#endif
do 440 l = 1 , nvex
ajb = a(jb+j)
aje = a(je+j)
t1 = ajb + aje
ajc = a(jc+j)
ajd = a(jd+j)
t2 = ajc + ajd
t3 = ajb - aje
t4 = ajc - ajd
ajf = a(jf+j)
ajb =  ajf
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aja = a(ja+j)
t7 = aja - 0.25_dp * t5
a(ja+j) = aja + t5
t8 = t7 + t6
t9 = t7 - t6
ajk = a(jk+j)
ajc =  ajk
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjb = b(jb+j)
bje = b(je+j)
u1 = bjb + bje
bjc = b(jc+j)
bjd = b(jd+j)
u2 = bjc + bjd
u3 = bjb - bje
u4 = bjc - bjd
bjf = b(jf+j)
bjb =  bjf
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bja = b(ja+j)
u7 = bja - 0.25_dp * u5
b(ja+j) = bja + u5
u8 = u7 + u6
u9 = u7 - u6
bjk = b(jk+j)
bjc =  bjk
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jf+j) = co1*(t8-u11) - si1*(u8+t11)
b(jf+j) = si1*(t8-u11) + co1*(u8+t11)
aje =  co4*(t8+u11) - si4*(u8-t11)
bje =  si4*(t8+u11) + co4*(u8-t11)
a(jk+j) = co2*(t9-u10) - si2*(u9+t10)
b(jk+j) = si2*(t9-u10) + co2*(u9+t10)
ajd =  co3*(t9+u10) - si3*(u9-t10)
bjd =  si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajg = a(jg+j)
ajj = a(jj+j)
t1 = ajg + ajj
ajh = a(jh+j)
aji = a(ji+j)
t2 = ajh + aji
t3 = ajg - ajj
t4 = ajh - aji
ajl = a(jl+j)
ajh =  ajl
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajb - 0.25_dp * t5
a(jb+j) = ajb + t5
t8 = t7 + t6
t9 = t7 - t6
ajq = a(jq+j)
aji =  ajq
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjg = b(jg+j)
bjj = b(jj+j)
u1 = bjg + bjj
bjh = b(jh+j)
bji = b(ji+j)
u2 = bjh + bji
u3 = bjg - bjj
u4 = bjh - bji
bjl = b(jl+j)
bjh =  bjl
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjb - 0.25_dp * u5
b(jb+j) = bjb + u5
u8 = u7 + u6
u9 = u7 - u6
bjq = b(jq+j)
bji =  bjq
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jg+j) = co1*(t8-u11) - si1*(u8+t11)
b(jg+j) = si1*(t8-u11) + co1*(u8+t11)
ajj =  co4*(t8+u11) - si4*(u8-t11)
bjj =  si4*(t8+u11) + co4*(u8-t11)
a(jl+j) = co2*(t9-u10) - si2*(u9+t10)
b(jl+j) = si2*(t9-u10) + co2*(u9+t10)
a(jq+j) = co3*(t9+u10) - si3*(u9-t10)
b(jq+j) = si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajo = a(jo+j)
t1 = ajh + ajo
ajm = a(jm+j)
ajn = a(jn+j)
t2 = ajm + ajn
t3 = ajh - ajo
t4 = ajm - ajn
ajr = a(jr+j)
ajn =  ajr
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
t7 = ajc - 0.25_dp * t5
a(jc+j) = ajc + t5
t8 = t7 + t6
t9 = t7 - t6
ajw = a(jw+j)
ajo =  ajw
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
bjo = b(jo+j)
u1 = bjh + bjo
bjm = b(jm+j)
bjn = b(jn+j)
u2 = bjm + bjn
u3 = bjh - bjo
u4 = bjm - bjn
bjr = b(jr+j)
bjn =  bjr
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
u7 = bjc - 0.25_dp * u5
b(jc+j) = bjc + u5
u8 = u7 + u6
u9 = u7 - u6
bjw = b(jw+j)
bjo =  bjw
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
a(jh+j) = co1*(t8-u11) - si1*(u8+t11)
b(jh+j) = si1*(t8-u11) + co1*(u8+t11)
a(jw+j) = co4*(t8+u11) - si4*(u8-t11)
b(jw+j) = si4*(t8+u11) + co4*(u8-t11)
a(jm+j) = co2*(t9-u10) - si2*(u9+t10)
b(jm+j) = si2*(t9-u10) + co2*(u9+t10)
a(jr+j) = co3*(t9+u10) - si3*(u9-t10)
b(jr+j) = si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajt = a(jt+j)
t1 = aji + ajt
ajs = a(js+j)
t2 = ajn + ajs
t3 = aji - ajt
t4 = ajn - ajs
ajx = a(jx+j)
ajt =  ajx
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
ajp = a(jp+j)
t7 = ajp - 0.25_dp * t5
ax = ajp + t5
t8 = t7 + t6
t9 = t7 - t6
a(jp+j) = ajd
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(jd+j) = ax
bjt = b(jt+j)
u1 = bji + bjt
bjs = b(js+j)
u2 = bjn + bjs
u3 = bji - bjt
u4 = bjn - bjs
bjx = b(jx+j)
bjt =  bjx
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bjp = b(jp+j)
u7 = bjp - 0.25_dp * u5
bx = bjp + u5
u8 = u7 + u6
u9 = u7 - u6
b(jp+j) = bjd
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(jd+j) = bx
a(ji+j) = co1*(t8-u11) - si1*(u8+t11)
b(ji+j) = si1*(t8-u11) + co1*(u8+t11)
a(jx+j) = co4*(t8+u11) - si4*(u8-t11)
b(jx+j) = si4*(t8+u11) + co4*(u8-t11)
a(jn+j) = co2*(t9-u10) - si2*(u9+t10)
b(jn+j) = si2*(t9-u10) + co2*(u9+t10)
a(js+j) = co3*(t9+u10) - si3*(u9-t10)
b(js+j) = si3*(t9+u10) + co3*(u9-t10)
!----------------------
ajv = a(jv+j)
ajy = a(jy+j)
t1 = ajv + ajy
t2 = ajo + ajt
t3 = ajv - ajy
t4 = ajo - ajt
a(jv+j) = ajj
t5 = t1 + t2
t6 = c1 * ( t1 - t2 )
aju = a(ju+j)
t7 = aju - 0.25_dp * t5
ax = aju + t5
t8 = t7 + t6
t9 = t7 - t6
a(ju+j) = aje
t10 = c3 * t3 - c2 * t4
t11 = c2 * t3 + c3 * t4
a(je+j) = ax
bjv = b(jv+j)
bjy = b(jy+j)
u1 = bjv + bjy
u2 = bjo + bjt
u3 = bjv - bjy
u4 = bjo - bjt
b(jv+j) = bjj
u5 = u1 + u2
u6 = c1 * ( u1 - u2 )
bju = b(ju+j)
u7 = bju - 0.25_dp * u5
bx = bju + u5
u8 = u7 + u6
u9 = u7 - u6
b(ju+j) = bje
u10 = c3 * u3 - c2 * u4
u11 = c2 * u3 + c3 * u4
b(je+j) = bx
a(jj+j) = co1*(t8-u11) - si1*(u8+t11)
b(jj+j) = si1*(t8-u11) + co1*(u8+t11)
a(jy+j) = co4*(t8+u11) - si4*(u8-t11)
b(jy+j) = si4*(t8+u11) + co4*(u8-t11)
a(jo+j) = co2*(t9-u10) - si2*(u9+t10)
b(jo+j) = si2*(t9-u10) + co2*(u9+t10)
a(jt+j) = co3*(t9+u10) - si3*(u9-t10)
b(jt+j) = si3*(t9+u10) + co3*(u9-t10)
j = j + jump
440 continue
!
endif
!
!-----(end of loop across transforms)
!
ja = ja + jstepx
if (ja.lt.istart) ja = ja + ninc
445 continue
450 continue
460 continue
!-----( end of double loop for this k )
kk = kk + 2*la
470 continue
!-----( end of loop over values of k )
la = 5*la
480 continue
!-----( end of loop on type II radix-5 passes )
!-----( nvex transforms completed)
490 continue
istart = istart + nvex * jump
500 continue
!-----( end of loop on blocks of transforms )
!
return
end subroutine gpfa5f

end module gridxc_gpfa_core_dp




module gridxc_fft_gpfa
!
!     1d fft routine based on Clive Temperton's GPFA package
!
use gridxc_gpfa_core_sp, only: gpfa
use gridxc_gpfa_core_dp, only: gpfa

implicit none

integer, parameter :: dp = selected_real_kind(10,100)
integer, parameter :: sp = selected_real_kind(6,10)

  ! This module is not thread safe due to the use of
  ! these global variables.
  ! To make it thread-safe, and to have more control over
  ! the re-use of the auxiliary twiddle factors, you might
  ! define a derived type to hold them and call the wrapper routines
  ! with it as one of the arguments.

real(dp), allocatable, save :: trigs(:)
integer, save :: nold = -1
  !----------------------------------------------------------
! auxiliary variable
character(len=20) :: msg

public :: nfft            ! Generates N compatible with the fft

interface fft_gpfa_ez
   module procedure fft_gpfa_ez_dp
   module procedure fft_gpfa_ez_sp
end interface fft_gpfa_ez

interface fft_gpfa
   module procedure fft_gpfa_dp
   module procedure fft_gpfa_sp
end interface fft_gpfa

public :: fft_gpfa_ez     ! Simple version with complex array
public :: fft_gpfa        ! General version with strides,
                          ! multiple vectors, etc

! Wrapper for setgpfa which returns proper length of work array
public :: setgpfa_check

! Access to the standard legacy routines, if needed
public :: gpfa, setgpfa

private 

CONTAINS

SUBROUTINE nfft( N )

integer, intent(inout) :: N

! CHANGES N INTO THE NEXT INTEGER ALLOWED BY THE FFT ROUTINE
! WRITTEN BY J.M.SOLER. MAY/95.
! Modified by A. Garcia, May 2015

integer :: np, nmax, nmin, nrem, ip
! no limits with huge, but subtract 2 to avoid 'overflow' warning
parameter (NP = 3, NMAX = huge(N)-2)  
integer :: IPRIME(NP)
data IPRIME / 2, 3, 5 /

NMIN = N
DO N = NMIN, NMAX
  NREM = N
  DO IP = 1,NP
10     CONTINUE 
    IF ( MODULO( NREM, IPRIME(IP) ) .EQ. 0 ) THEN
      NREM = NREM / IPRIME(IP)
      GOTO 10
    ENDIF
  ENDDO
  IF (NREM .EQ. 1) RETURN
ENDDO
write(msg,"(i0)") NMIN
call die('NFFT: NO SUITABLE INTEGER FOUND FOR N =' // trim(msg))
END SUBROUTINE nfft

!----------------------------------------------------
subroutine fft_gpfa_ez_sp(data,n,isign)

! packed complex array in a real array twice as long

integer, intent(in)     :: n
real(sp), intent(inout) :: data(2*n)
integer, intent(in)     :: isign

!        ISIGN = +1 FOR FORWARD TRANSFORM
!              = -1 FOR INVERSE TRANSFORM

  ! If n has changed since the last use,
  ! set trigs array, used by 1D-FFT routine gpfa

  ! The routine setgpfa_alloc is a special version
  ! of setgpfa which (re)allocates the trigs argument
  ! if needed, and re-fills it with new twiddle factors

if (n /= nold) then
   call setgpfa_alloc( trigs, n )
end if 

  ! FFT. Arguments of gpfa are:                                
  ! gpfa( realData(*), imagData(*), trigs(*),                    
  !       dataSpan, vectorSpan, vectorSize, numVectors, fftDir )              
  !
  ! Note dataSpan=2 since we are packing the real and imaginary
  ! parts in the same vector.

call gpfa( data(1:), data(2:), trigs, &
           2, 1, n, 1, isign )

nold = n

end subroutine fft_gpfa_ez_sp

!----------------------------------------------------
subroutine fft_gpfa_ez_dp(data,n,isign)

! packed complex array in a real array twice as long

integer, intent(in)     :: n
real(dp), intent(inout) :: data(2*n)
integer, intent(in)     :: isign

!        ISIGN = +1 FOR FORWARD TRANSFORM
!              = -1 FOR INVERSE TRANSFORM

  ! If n has changed since the last use,
  ! set trigs array, used by 1D-FFT routine gpfa

  ! The routine setgpfa_alloc is a special version
  ! of setgpfa which (re)allocates the trigs argument
  ! if needed, and re-fills it with new twiddle factors

if (n /= nold) then
   call setgpfa_alloc( trigs, n )
end if 

  ! FFT. Arguments of gpfa are:                                
  ! gpfa( realData(*), imagData(*), trigs(*),                    
  !       dataSpan, vectorSpan, vectorSize, numVectors, fftDir )              
  !
  ! Note dataSpan=2 since we are packing the real and imaginary
  ! parts in the same vector.

call gpfa( data(1:), data(2:), trigs, &
           2, 1, n, 1, isign )

nold = n

end subroutine fft_gpfa_ez_dp

!----------------------------------------------------------------------
subroutine fft_gpfa_sp(realData, imagData, &
                    dataSpan, vectorSpan, N, numVectors, &
                    isign ) 

! A wrapper for the GPFA routine which handles the
! work array automatically

real(sp), intent(inout) :: realData(:), imagData(:)
integer, intent(in)     :: dataSpan, vectorSpan, N, numVectors
integer, intent(in)     :: isign

!        ISIGN = +1 FOR FORWARD TRANSFORM
!              = -1 FOR INVERSE TRANSFORM

  ! Set trigs array, used by 1D-FFT routine gpfa
  ! The routine setgpfa_alloc is a special version
  ! of setgpfa which (re)allocates the trigs argument
  ! if needed, and re-fills it with new twiddle factors
  ! if n has changed since the last use.

if (n /= nold) then
   call setgpfa_alloc( trigs, n )
end if 

call gpfa(realData, imagData, trigs, &
          dataSpan, vectorSpan, N, numVectors, isign)

nold = n

end subroutine fft_gpfa_sp
!----------------------------------------------------------------------
subroutine fft_gpfa_dp(realData, imagData, &
                    dataSpan, vectorSpan, N, numVectors, &
                    isign ) 

! A wrapper for the GPFA routine which handles the
! work array automatically

real(dp), intent(inout) :: realData(:), imagData(:)
integer, intent(in)     :: dataSpan, vectorSpan, N, numVectors
integer, intent(in)     :: isign

!        ISIGN = +1 FOR FORWARD TRANSFORM
!              = -1 FOR INVERSE TRANSFORM

  ! Set trigs array, used by 1D-FFT routine gpfa
  ! The routine setgpfa_alloc is a special version
  ! of setgpfa which (re)allocates the trigs argument
  ! if needed, and re-fills it with new twiddle factors
  ! if n has changed since the last use.

if (n /= nold) then
   call setgpfa_alloc( trigs, n )
end if 

call gpfa(realData, imagData, trigs, &
          dataSpan, vectorSpan, N, numVectors, isign)

nold = n

end subroutine fft_gpfa_dp
!------------------------------------------------------------------
subroutine  setgpfa_alloc( trigs, n )
real(dp), allocatable, intent(inout) :: trigs(:)
integer, intent(in)                  :: n

integer :: ntrigs

if (.not. allocated(trigs)) then
   allocate(trigs(100))
endif
call setgpfa_check(trigs,size(trigs),ntrigs,n)

if (size(trigs) < ntrigs) then
   deallocate(trigs)
   allocate(trigs(ntrigs))
   call setgpfa_check(trigs,size(trigs),ntrigs,n)
   if (size(trigs) < ntrigs) STOP "ntrigs error"
endif

end subroutine setgpfa_alloc
!
SUBROUTINE SETGPFA_check(TRIGS,maxtrigs,ntrigs,N)
!
integer, intent(in)  :: maxtrigs  ! The current size of trigs
REAL(dp)  TRIGS(maxtrigs)
integer, intent(out) :: ntrigs    ! The needed size for trigs
integer, intent(in)  :: N

INTEGER NJ(3)
integer       :: nn, ifac, ll, kk, ip, iq, ir, &
                 i, ni, irot, kink, k
real(dp)      :: angle, twopi, del
!
!     DECOMPOSE N INTO FACTORS 2,3,5
!     ------------------------------
NN = N
IFAC = 2
!
DO 30 LL = 1 , 3
KK = 0
10 CONTINUE
IF (MOD(NN,IFAC).NE.0) GO TO 20
KK = KK + 1
NN = NN / IFAC
GO TO 10
20 CONTINUE
NJ(LL) = KK
IFAC = IFAC + LL
30 CONTINUE
!
IF (NN.NE.1) THEN
   write(msg,"(i0)") N
   call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N')
ENDIF
!
IP = NJ(1)
IQ = NJ(2)
IR = NJ(3)
!
!     COMPUTE LIST OF ROTATED TWIDDLE FACTORS
!     ---------------------------------------
NJ(1) = 2**IP
NJ(2) = 3**IQ
NJ(3) = 5**IR
!
TWOPI = 4.0_dp * ASIN(1.0_dp)
I = 1
!
DO 60 LL = 1 , 3
NI = NJ(LL)
IF (NI.EQ.1) GO TO 60
!
DEL = TWOPI / real(NI,kind=dp)
IROT = N / NI
KINK = MOD(IROT,NI)
KK = 0
!
DO 50 K = 1 , NI
ANGLE = real(KK,kind=dp) * DEL
if (i.lt.maxtrigs) then
  TRIGS(I) = COS(ANGLE)
  TRIGS(I+1) = SIN(ANGLE)
endif
I = I + 2
KK = KK + KINK
IF (KK.GT.NI) KK = KK - NI
50 CONTINUE
60 CONTINUE
ntrigs = i - 1  ! note boundary. i is even
!
RETURN
END SUBROUTINE SETGPFA_check

!
!        SUBROUTINE 'SETGPFA'
!        SETUP ROUTINE FOR SELF-SORTING IN-PLACE
!            GENERALIZED PRIME FACTOR (COMPLEX) FFT [GPFA]
!
!        CALL SETGPFA(TRIGS,N)
!
!        INPUT :
!        -----
!        N IS THE LENGTH OF THE TRANSFORMS. N MUST BE OF THE FORM:
!          -----------------------------------
!            N = (2**IP) * (3**IQ) * (5**IR)
!          -----------------------------------
!
!        OUTPUT:
!        ------
!        TRIGS IS A TABLE OF TWIDDLE FACTORS,
!          OF LENGTH 2*IPQR (REAL) WORDS, WHERE:
!          --------------------------------------
!            IPQR = (2**IP) + (3**IQ) + (5**IR)
!          --------------------------------------
!
!        WRITTEN BY CLIVE TEMPERTON 1990
!
!*********************************************************************
!
SUBROUTINE SETGPFA(TRIGS,N)
!

INTEGER NJ(3)
REAL(dp) TRIGS(*)

integer       :: nn, n, ifac, ll, kk, ip, iq, ir, &
                 i, ni, irot, kink, k
real(dp)      :: angle, twopi, del
!
!     DECOMPOSE N INTO FACTORS 2,3,5
!     ------------------------------
NN = N
IFAC = 2
!
DO 30 LL = 1 , 3
KK = 0
10 CONTINUE
IF (MOD(NN,IFAC).NE.0) GO TO 20
KK = KK + 1
NN = NN / IFAC
GO TO 10
20 CONTINUE
NJ(LL) = KK
IFAC = IFAC + LL
30 CONTINUE
!
IF (NN.NE.1) THEN
   write(msg,"(i0)") N
   call die('GPFA: '//trim(msg)//' IS NOT A LEGAL VALUE OF N')
ENDIF
!
IP = NJ(1)
IQ = NJ(2)
IR = NJ(3)
!
!     COMPUTE LIST OF ROTATED TWIDDLE FACTORS
!     ---------------------------------------
NJ(1) = 2**IP
NJ(2) = 3**IQ
NJ(3) = 5**IR
!
TWOPI = 4.0_dp * ASIN(1.0_dp)
I = 1
!
DO 60 LL = 1 , 3
NI = NJ(LL)
IF (NI.EQ.1) GO TO 60
!
DEL = TWOPI / real(NI,kind=dp)
IROT = N / NI
KINK = MOD(IROT,NI)
KK = 0
!
DO 50 K = 1 , NI
ANGLE = real(KK,kind=dp) * DEL
TRIGS(I) = COS(ANGLE)
TRIGS(I+1) = SIN(ANGLE)
I = I + 2
KK = KK + KINK
IF (KK.GT.NI) KK = KK - NI
50 CONTINUE
60 CONTINUE
!
RETURN
END SUBROUTINE SETGPFA

end module gridxc_fft_gpfa