From octave-maintainers-request at bevo dot che dot wisc dot edu Fri Jan 30 10:03:01 2004 Subject: Re: [Fwd: Re: rfftw slower than fftw for "bad" size arrays] From: David Bateman To: octave-maintainers mailing list Date: Fri, 30 Jan 2004 16:58:58 +0100 --WhfpMioaduB5tiZL Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Ok, I've port Octave to also use FFTW 3.0.1, but I'm getting some odd results. I attach a patch, and some rewritten test programs and some comparisons I've done. Basically, in many cases I'm faster than both the old octave and matlab, but there are some cases where it is slower. I'm really not sure what is the issue, so any clues would be of assistance. Cheers David -- David Bateman David dot Bateman at motorola dot com Motorola CRM +33 1 69 35 48 04 (Ph) Parc Les Algorithmes, Commune de St Aubin +33 1 69 35 77 01 (Fax) 91193 Gif-Sur-Yvette FRANCE The information contained in this communication has been classified as: [x] General Business Information [ ] Motorola Internal Use Only [ ] Motorola Confidential Proprietary --WhfpMioaduB5tiZL Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="testfft.m" % NxM is the size of the FFT to test. Note 65536 was chosen as a test % for thrashing of the cache when copying from FFTW's packed format to % a normal matrix. N = [512, 513, 514, 512, 65536]; M = [512, 513, 512, 514, 1]; rep = 10; savedata = 0; testcmplx = 0; infft = cell(1,length(N)); resfft = cell(1, length(N)); resfft2 = cell(1, length(N)); timfft = cell(1,length(N)); timfft2 = cell(1,length(N)); if (exist('OCTAVE_VERSION')) platfrm = ['Octave ' version]; else platfrm = ['Matlab ' version]; end if (testcmplx) fprintf('Testing %s for complex ffts\n\n', platfrm); else fprintf('Testing %s for real ffts\n\n', platfrm); end for i=1:length(N) if (testcmplx) a = randn(N(i),M(i)) + 1i * randn(N(i),M(i)); else a = randn(N(i),M(i)); end infft{i} = a; fprintf('Testing fft(%5d,%5d) ', N(i), M(i)); if (exist('OCTAVE_VERSION')) eval('fflush(stdout);'); end % Call fft twice to ensure all variables initialized b = fft(a); b = fft(a); timfft{i} = 0; for j=1:rep t = cputime; b = fft(a); timfft{i} = timfft{i} + cputime - t; end resfft{i} = b; timfft{i} = timfft{i} / rep; fprintf('%5.2e sec\n', timfft{i}); fprintf('Testing fft2(%5d,%5d) ', N(i), M(i)); if (exist('OCTAVE_VERSION')) eval('fflush(stdout);'); end % Call fft2 twice to ensure all variables initialized b = fft2(a); b = fft2(a); timfft2{i} = 0; for j=1:rep t = cputime; b = fft2(a); timfft2{i} = timfft2{i} + cputime - t; end resfft2{i} = b; timfft2{i} = timfft2{i} / rep; fprintf('%5.2e sec\n', timfft2{i}); end fprintf('Saving Data '); if (exist('OCTAVE_VERSION')) eval('fflush(stdout);'); end if (savedata) save testfft.mat infft resfft timfft resfft2 timfft2 N M testcmplx platfrm else save testfft.mat timfft timfft2 N M testcmplx platfrm end fprintf('done\n'); --WhfpMioaduB5tiZL Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="testfft2.m" if ~exist('resfft') fprintf('Loading Data '); if (exist('OCTAVE_VERSION')) eval('fflush(stdout);'); end if (exist('OCTAVE_VERSION')) eval('load -force testfft.mat;'); else load testfft.mat; end fprintf('done\n'); end rep = 10; xresfft = cell(1, length(N)); xresfft2 = cell(1, length(N)); xtimfft = cell(1,length(N)); xtimfft2 = cell(1,length(N)); if (exist('OCTAVE_VERSION')) xplatfrm = ['Octave ' version]; else xplatfrm = ['Matlab ' version]; end if (testcmplx) fprintf('Testing %s vs %s for complex ffts\n\n', xplatfrm, platfrm); else fprintf('Testing %s vs %s for real ffts\n\n', xplatfrm, platfrm); end for i=1:length(N) if ( ~ exist ('infft')) if (testcmplx) a = randn(N(i),M(i)) + 1i * randn(N(i),M(i)); else a = randn(N(i), M(i)); end else a = infft{i}; end fprintf('Testing fft(%5d,%5d) ', N(i), M(i)); if (exist('OCTAVE_VERSION')) eval('fflush(stdout);'); end % Call fft twice to ensure all variables initialized b = fft(a); b = fft(a); xtimfft{i} = 0; for j=1:rep t = cputime; b = fft(a); xtimfft{i} = xtimfft{i} + cputime - t; end xresfft{i} = b; xtimfft{i} = xtimfft{i} / rep; if (exist('infft')) fprintf('%5.2e sec (%5.2e) rerr %5.2e\n', xtimfft{i}, xtimfft{i}/timfft{i}, 0.5*max(max(abs(resfft{i}-xresfft{i})./(abs(resfft{i})+abs(xresfft{i}))))); else fprintf('%5.2e sec (%5.2e)\n', xtimfft{i}, xtimfft{i}/timfft{i}); end fprintf('Testing fft2(%5d,%5d) ', N(i), M(i)); if (exist('OCTAVE_VERSION')) eval('fflush(stdout);'); end % Call fft2 twice to ensure all variables initialized b = fft2(a); b = fft2(a); xtimfft2{i} = 0; for j=1:rep t = cputime; b = fft2(a); xtimfft2{i} = xtimfft2{i} + cputime - t; end xresfft2{i} = b; xtimfft2{i} = xtimfft2{i} / rep; if (exist('infft')) fprintf('%5.2e sec (%5.2e) rerr %5.2e\n', xtimfft2{i}, xtimfft2{i}/timfft2{i}, 0.5*max(max(abs(resfft2{i}-xresfft2{i})./(abs(resfft2{i})+abs(xresfft2{i}))))); else fprintf('%5.2e sec (%5.2e)\n', xtimfft2{i}, xtimfft2{i}/timfft2{i}); end end --WhfpMioaduB5tiZL Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=log Testing Octave 2.1.53 vs Matlab 6.1.0.450 (R12.1) for real ffts Testing fft( 512, 512) 5.60e-02 sec (1.19e+00) XXXXX Testing fft2( 512, 512) 8.50e-02 sec (4.15e-01) Testing fft( 513, 513) 1.34e-01 sec (1.28e+00) XXXXX Testing fft2( 513, 513) 2.03e-01 sec (8.53e-01) Testing fft( 514, 512) 9.00e-02 sec (6.34e-01) Testing fft2( 514, 512) 1.21e-01 sec (6.27e-01) Testing fft( 512, 514) 5.40e-02 sec (1.15e+00) XXXXX Testing fft2( 512, 514) 1.22e-01 sec (4.18e-01) Testing fft(65536, 1) 4.50e-02 sec (1.32e+00) XXXXX Testing fft2(65536, 1) 5.00e-02 sec (1.43e+00) XXXXX Testing Octave 2.1.53 vs Matlab 6.1.0.450 (R12.1) for complex ffts Testing fft( 512, 512) 4.80e-02 sec (7.87e-01) Testing fft2( 512, 512) 3.07e-01 sec (1.40e+00) XXXXX Testing fft( 513, 513) 1.31e-01 sec (8.97e-01) Testing fft2( 513, 513) 2.67e-01 sec (9.57e-01) Testing fft( 514, 512) 1.30e-01 sec (9.09e-01) Testing fft2( 514, 512) 1.94e-01 sec (9.95e-01) Testing fft( 512, 514) 4.90e-02 sec (8.17e-01) Testing fft2( 512, 514) 2.60e-01 sec (8.50e-01) Testing fft(65536, 1) 6.20e-02 sec (1.11e+00) XXXXX Testing fft2(65536, 1) 6.40e-02 sec (1.12e+00) XXXXX Testing Octave 2.1.53 vs Octave 2.1.50 for real ffts Testing fft( 512, 512) 5.50e-02 sec (6.40e-01) Testing fft2( 512, 512) 8.60e-02 sec (4.73e-01) Testing fft( 513, 513) 1.36e-01 sec (8.24e-01) Testing fft2( 513, 513) 2.00e-01 sec (7.52e-01) Testing fft( 514, 512) 8.70e-02 sec (5.15e-01) Testing fft2( 514, 512) 1.21e-01 sec (6.17e-01) Testing fft( 512, 514) 5.50e-02 sec (6.32e-01) Testing fft2( 512, 514) 1.20e-01 sec (4.35e-01) Testing fft(65536, 1) 4.60e-02 sec (1.15e+00) XXXXX Testing fft2(65536, 1) 4.70e-02 sec (1.02e+00) XXXXX Testing Octave 2.1.53 vs Octave 2.1.50 for complex ffts Testing fft( 512, 512) 4.90e-02 sec (9.61e-01) Testing fft2( 512, 512) 3.22e-01 sec (1.75e+00) XXXXX Testing fft( 513, 513) 1.29e-01 sec (1.00e+00) Testing fft2( 513, 513) 2.66e-01 sec (9.96e-01) Testing fft( 514, 512) 1.27e-01 sec (9.69e-01) Testing fft2( 514, 512) 1.95e-01 sec (9.95e-01) Testing fft( 512, 514) 4.80e-02 sec (9.60e-01) Testing fft2( 512, 514) 2.60e-01 sec (9.45e-01) Testing fft(65536, 1) 6.20e-02 sec (2.00e+00) XXXXX Testing fft2(65536, 1) 6.40e-02 sec (1.36e+00) XXXXX --WhfpMioaduB5tiZL Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=patch *** ./liboctave/ChangeLog.orig 2004-01-30 15:24:31.000000000 +0100 --- ./liboctave/ChangeLog 2004-01-30 15:25:30.000000000 +0100 *************** *** 1,3 **** --- 1,14 ---- + 2004-01-30 David Bateman + + * oct-fftw.cc: Add support for FFTW 3.x. Include the ability to + use the real to complex transform for fft's of real matrices + * oct-fftw.h: Decls updated for the above + + * dMatrix.cc: fft's use real to complex transforms. 1D fft of a + matrix done as single call rather than loop. Update for FFTW 3.x + * CMatrix.cc: 1D fft of a matrix done as single call rather than + loop. Update for FFTW 3.x + 2004-01-22 John W. Eaton * Array.cc (Array::assign2, Array::assignN): *** ./liboctave/dMatrix.cc.orig 2004-01-28 15:17:03.000000000 +0100 --- ./liboctave/dMatrix.cc 2004-01-30 11:04:11.000000000 +0100 *************** *** 52,58 **** #include "oct-cmplx.h" #include "quit.h" ! #ifdef HAVE_FFTW #include "oct-fftw.h" #endif --- 52,58 ---- #include "oct-cmplx.h" #include "quit.h" ! #if defined (HAVE_FFTW) || defined (HAVE_FFTW3) #include "oct-fftw.h" #endif *************** *** 766,772 **** } } ! #ifdef HAVE_FFTW ComplexMatrix Matrix::fourier (void) const --- 766,772 ---- } } ! #if defined (HAVE_FFTW) || defined (HAVE_FFTW3) ComplexMatrix Matrix::fourier (void) const *************** *** 789,804 **** nsamples = nc; } ! ComplexMatrix tmp (*this); ! Complex *in (tmp.fortran_vec ()); Complex *out (retval.fortran_vec ()); for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; octave_fftw::fft (&in[npts * i], &out[npts * i], npts); } return retval; } --- 789,809 ---- nsamples = nc; } ! const double *in (fortran_vec ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX octave_fftw::fft (in, out, npts, nsamples); + // should be faster for FFTW 2.1.x as well but isn't !!! + #ifdef HAVE_FFTW3 + octave_fftw::fft (in, out, npts, nsamples); + #else for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; octave_fftw::fft (&in[npts * i], &out[npts * i], npts); } + #endif return retval; } *************** *** 828,839 **** --- 833,850 ---- Complex *in (tmp.fortran_vec ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX octave_fftw::ifft (in, out, npts, nsamples); + // should be faster for FFTW 2.1.x as well but isn't !!! + #ifdef HAVE_FFTW3 + octave_fftw::ifft (in, out, npts, nsamples); + #else for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; octave_fftw::ifft (&in[npts * i], &out[npts * i], npts); } + #endif return retval; } *************** *** 844,853 **** int nr = rows (); int nc = cols (); ! ComplexMatrix retval (*this); // Note the order of passing the rows and columns to account for // column-major storage. ! octave_fftw::fft2d (retval.fortran_vec (), nc, nr); return retval; } --- 855,865 ---- int nr = rows (); int nc = cols (); ! const double *in = fortran_vec (); ! ComplexMatrix retval (rows (), cols ()); // Note the order of passing the rows and columns to account for // column-major storage. ! octave_fftw::fft2d (in, retval.fortran_vec (), nc, nr); return retval; } *** ./liboctave/CMatrix.cc.orig 2004-01-28 15:17:09.000000000 +0100 --- ./liboctave/CMatrix.cc 2004-01-30 14:22:53.000000000 +0100 *************** *** 56,62 **** #include "mx-inlines.cc" #include "oct-cmplx.h" ! #ifdef HAVE_FFTW #include "oct-fftw.h" #endif --- 56,62 ---- #include "mx-inlines.cc" #include "oct-cmplx.h" ! #if defined (HAVE_FFTW) || defined (HAVE_FFTW3) #include "oct-fftw.h" #endif *************** *** 1102,1108 **** return retval; } ! #ifdef HAVE_FFTW ComplexMatrix ComplexMatrix::fourier (void) const --- 1102,1108 ---- return retval; } ! #if defined (HAVE_FFTW) || defined (HAVE_FFTW3) ComplexMatrix ComplexMatrix::fourier (void) const *************** *** 1128,1139 **** --- 1128,1145 ---- const Complex *in (data ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX octave_fftw::fft (in, out, npts, nsamples); + // should be faster for FFTW 2.1.x as well but isn't !!! + #ifdef HAVE_FFTW3 + octave_fftw::fft (in, out, npts, nsamples); + #else for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; octave_fftw::fft (&in[npts * i], &out[npts * i], npts); } + #endif return retval; } *************** *** 1162,1173 **** --- 1168,1185 ---- const Complex *in (data ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX octave_fftw::ifft (in, out, npts, nsamples); + // should be faster for FFTW 2.1.x as well but isn't !!! + #ifdef HAVE_FFTW3 + octave_fftw::ifft (in, out, npts, nsamples); + #else for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; octave_fftw::ifft (&in[npts * i], &out[npts * i], npts); } + #endif return retval; } *** ./liboctave/oct-fftw.h.orig 2004-01-28 15:17:17.000000000 +0100 --- ./liboctave/oct-fftw.h 2004-01-30 10:29:25.000000000 +0100 *************** *** 23,32 **** #include ! #if defined (HAVE_DFFTW_H) ! #include #else ! #include #endif #include "oct-cmplx.h" --- 23,38 ---- #include ! #if defined (HAVE_FFTW3) ! # include #else ! # if defined (HAVE_DFFTW_H) ! # include ! # include ! # else ! # include ! # include ! # endif #endif #include "oct-cmplx.h" *************** *** 35,42 **** octave_fftw { public: ! static int fft (const Complex*, Complex *, size_t); ! static int ifft (const Complex*, Complex *, size_t); static int fft2d (Complex*, size_t, size_t); static int ifft2d (Complex*, size_t, size_t); --- 41,54 ---- octave_fftw { public: ! static int fft (const double *in, Complex *out, size_t npts, ! size_t nsamples = 1); ! static int fft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples = 1); ! static int ifft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples = 1); ! ! static int fft2d (const double*, Complex*, size_t, size_t); static int fft2d (Complex*, size_t, size_t); static int ifft2d (Complex*, size_t, size_t); *** ./liboctave/oct-fftw.cc.orig 2004-01-28 15:17:22.000000000 +0100 --- ./liboctave/oct-fftw.cc 2004-01-30 15:30:19.000000000 +0100 *************** *** 22,33 **** #include #endif ! #ifdef HAVE_FFTW #include "oct-fftw.h" #include "lo-error.h" - // Helper class to create and cache fftw plans for both 1d and 2d. This // implementation uses FFTW_ESTIMATE to create the plans, which in theory // is suboptimal, but provides quite reasonable performance. Future --- 22,32 ---- #include #endif ! #if defined (HAVE_FFTW) || defined (HAVE_FFTW3) #include "oct-fftw.h" #include "lo-error.h" // Helper class to create and cache fftw plans for both 1d and 2d. This // implementation uses FFTW_ESTIMATE to create the plans, which in theory // is suboptimal, but provides quite reasonable performance. Future *************** *** 35,40 **** --- 34,195 ---- // to manipulate fftw wisdom so that users may choose the appropriate // planner. + // Also note that if FFTW_ESTIMATE is not used the planner in FFTW3 + // destroys the input and output arrays. So with the form of the + // current code we definitely want FFTW_ESTIMATE!! + + // XXX FIXME XXX If we can ensure 16 byte alignment in Array ( *data) + // the FFTW3 can use SIMD instructions for further acceleration. + + #ifdef HAVE_FFTW3 + + #define plan_flags (FFTW_ESTIMATE) + + static inline void convert_packcomplex_1d (Complex *out, size_t nr, + size_t nc) + { + // Fill in the missing data + for (size_t i = 0; i < nr; i++) + for (size_t j = nc/2+1; j < nc; j++) + out[j + i*nc] = conj(out[nc - j + i*nc]); + } + + static inline void convert_packcomplex_2d (Complex *out, size_t nr, + size_t nc) + { + Complex *ptr1, *ptr2; + + // Create space for the missing elements + for (size_t i = 0; i < nr; i++) + { + ptr1 = out + i * (nc/2 + 1) + nr*((nc-1)/2); + ptr2 = out + i * nc; + for (size_t j = 0; j < nc/2+1; j++) + *ptr2++ = *ptr1++; + } + + // Fill in the missing data + for (size_t i = 1; i < nr; i++) + for (size_t j = nc/2+1; j < nc; j++) + out[j + i*nc] = conj(out[nc - j + (nr-i)*nc]); + for (size_t j = nc/2+1; j < nc; j++) + out[j] = conj(out[nc - j]); + } + + int + octave_fftw::fft (const double *in, Complex *out, size_t npts, size_t nsamples) + { + fftw_plan plan + = fftw_plan_many_dft_r2c (1, reinterpret_cast (&npts), nsamples, + (const_cast (in)), NULL, 1, npts, + reinterpret_cast (out), NULL, 1, npts, plan_flags); + + fftw_execute (plan); + + fftw_destroy_plan (plan); + + // Need to create other half of the transform + convert_packcomplex_1d (out, nsamples, npts); + + return 0; + } + + int + octave_fftw::fft (const Complex *in, Complex *out, size_t npts, + size_t nsamples) + { + fftw_plan plan = fftw_plan_many_dft (1, reinterpret_cast (&npts), + nsamples, + reinterpret_cast (const_cast (in)), + NULL, 1, npts, + reinterpret_cast (out), NULL, 1, npts, + FFTW_FORWARD, plan_flags); + + fftw_execute (plan); + + fftw_destroy_plan (plan); + + return 0; + } + + int + octave_fftw::ifft (const Complex *in, Complex *out, size_t npts, + size_t nsamples) + { + fftw_plan plan = fftw_plan_many_dft (1, reinterpret_cast (&npts), + nsamples, + reinterpret_cast (const_cast (in)), + NULL, 1, npts, + reinterpret_cast (out), NULL, 1, npts, + FFTW_BACKWARD, plan_flags); + + fftw_execute (plan); + + fftw_destroy_plan (plan); + + const Complex scale = npts; + for (size_t i = 0; i < npts*nsamples; i++) + out[i] /= scale; + + return 0; + } + + int + octave_fftw::fft2d (const double *in, Complex *out, size_t nr, size_t nc) + { + // Fool with the position of the start of the matrix, so that creating + // other half of the matrix won't cause cache problems + fftw_plan plan = fftw_plan_dft_r2c_2d (nr, nc, (const_cast(in)), + reinterpret_cast (out + nr*((nc-1)/2)), plan_flags); + + fftw_execute (plan); + + fftw_destroy_plan (plan); + + // Need to create other half of the transform + convert_packcomplex_2d (out, nr, nc); + + return 0; + } + + int + octave_fftw::fft2d (Complex *inout, size_t nr, size_t nc) + { + fftw_plan plan + = fftw_plan_dft_2d (nr, nc, + reinterpret_cast (const_cast (inout)), + reinterpret_cast (inout), FFTW_FORWARD, plan_flags); + + fftw_execute (plan); + + fftw_destroy_plan (plan); + + return 0; + } + + int + octave_fftw::ifft2d (Complex *inout, size_t nr, size_t nc) + { + fftw_plan plan + = fftw_plan_dft_2d (nr, nc, + reinterpret_cast (const_cast (inout)), + reinterpret_cast (inout), FFTW_BACKWARD, + plan_flags); + + fftw_execute (plan); + + fftw_destroy_plan (plan); + + const size_t npts = nr * nc; + const Complex scale = npts; + for (size_t i = 0; i < npts; i++) + inout[i] /= scale; + + return 0; + } + + #else // HAVE_FFTW3 + class octave_fftw_planner { *************** *** 42,48 **** octave_fftw_planner (); fftw_plan create_plan (fftw_direction, size_t); ! fftwnd_plan create_plan2d (fftw_direction, size_t, size_t); private: int plan_flags; --- 197,206 ---- octave_fftw_planner (); fftw_plan create_plan (fftw_direction, size_t); ! fftwnd_plan create_plan2d (fftw_direction, size_t, size_t, Complex); ! ! rfftw_plan create_rplan (fftw_direction, size_t); ! rfftwnd_plan create_rplan2d (fftw_direction, size_t, size_t); private: int plan_flags; *************** *** 50,57 **** --- 208,221 ---- fftw_plan plan[2]; fftwnd_plan plan2d[2]; + rfftw_plan rplan[2]; + rfftwnd_plan rplan2d[2]; + size_t n[2]; size_t n2d[2][2]; + + size_t rn[2]; + size_t rn2d[2][2]; }; octave_fftw_planner::octave_fftw_planner () *************** *** 63,68 **** --- 227,239 ---- n[0] = n[1] = 0; n2d[0][0] = n2d[0][1] = n2d[1][0] = n2d[1][1] = 0; + + rplan[0] = rplan[1] = 0; + rplan2d[0] = rplan2d[1] = 0; + + rn[0] = rn[1] = 0; + rn2d[0][0] = rn2d[0][1] = rn2d[1][0] = rn2d[1][1] = 0; + } fftw_plan *************** *** 92,106 **** return *cur_plan_p; } fftwnd_plan ! octave_fftw_planner::create_plan2d (fftw_direction dir, ! size_t nrows, size_t ncols) { size_t which = (dir == FFTW_FORWARD) ? 0 : 1; fftwnd_plan *cur_plan_p = &plan2d[which]; bool create_new_plan = false; ! if (plan2d[which] == 0 || n2d[which][0] != nrows || n2d[which][1] != ncols) { create_new_plan = true; --- 263,305 ---- return *cur_plan_p; } + rfftw_plan + octave_fftw_planner::create_rplan (fftw_direction dir, size_t npts) + { + size_t which = (dir == FFTW_FORWARD) ? 0 : 1; + rfftw_plan *cur_plan_p = &rplan[which]; + bool create_new_plan = false; + + if (rplan[which] == 0 || rn[which] != npts) + { + create_new_plan = true; + rn[which] = npts; + } + + if (create_new_plan) + { + if (*cur_plan_p) + rfftw_destroy_plan (*cur_plan_p); + + *cur_plan_p = rfftw_create_plan (npts, dir, plan_flags); + + if (*cur_plan_p == 0) + (*current_liboctave_error_handler) ("Error creating fftw plan"); + } + + return *cur_plan_p; + } + fftwnd_plan ! octave_fftw_planner::create_plan2d (fftw_direction dir, size_t nrows, ! size_t ncols) { size_t which = (dir == FFTW_FORWARD) ? 0 : 1; fftwnd_plan *cur_plan_p = &plan2d[which]; bool create_new_plan = false; ! if (plan2d[which] == 0 || n2d[which][0] != nrows || ! n2d[which][1] != ncols) { create_new_plan = true; *************** *** 115,121 **** --- 314,351 ---- *cur_plan_p = fftw2d_create_plan (nrows, ncols, dir, plan_flags | FFTW_IN_PLACE); + + if (*cur_plan_p == 0) + (*current_liboctave_error_handler) ("Error creating 2d fftw plan"); + } + return *cur_plan_p; + } + + rfftwnd_plan + octave_fftw_planner::create_rplan2d (fftw_direction dir, size_t nrows, + size_t ncols) + { + size_t which = (dir == FFTW_FORWARD) ? 0 : 1; + fftwnd_plan *cur_plan_p = &rplan2d[which]; + bool create_new_plan = false; + + if (rplan2d[which] == 0 || rn2d[which][0] != nrows || + rn2d[which][1] != ncols) + { + create_new_plan = true; + + rn2d[which][0] = nrows; + rn2d[which][1] = ncols; + } + + if (create_new_plan) + { + if (*cur_plan_p) + rfftwnd_destroy_plan (*cur_plan_p); + + *cur_plan_p = rfftw2d_create_plan (nrows, ncols, dir, plan_flags); + if (*cur_plan_p == 0) (*current_liboctave_error_handler) ("Error creating 2d fftw plan"); } *************** *** 125,158 **** static octave_fftw_planner fftw_planner; int ! octave_fftw::fft (const Complex *in, Complex *out, size_t npts) { ! fftw_one (fftw_planner.create_plan (FFTW_FORWARD, npts), ! reinterpret_cast (const_cast (in)), ! reinterpret_cast (out)); return 0; } int ! octave_fftw::ifft (const Complex *in, Complex *out, size_t npts) { ! fftw_one (fftw_planner.create_plan (FFTW_BACKWARD, npts), ! reinterpret_cast (const_cast (in)), ! reinterpret_cast (out)); const Complex scale = npts; ! for (size_t i = 0; i < npts; i++) out[i] /= scale; return 0; } int octave_fftw::fft2d (Complex *inout, size_t nr, size_t nc) { ! fftwnd_one (fftw_planner.create_plan2d (FFTW_FORWARD, nr, nc), reinterpret_cast (inout), 0); --- 355,481 ---- static octave_fftw_planner fftw_planner; + static inline void convert_halfcomplex (Complex *in, size_t npts, + size_t nsamples) + { + // This is likely to thrash about in the cache !!! + double *ptr1, *ptr2; + + for (size_t k = 0; k < nsamples; k++) + { + // First move the imaginary parts + ptr1 = (double *)in + 3; + ptr2 = (double *)in + 2 * (npts-1); + for (size_t i = 0; i < (npts-1)/2; i++) + { + *ptr1 = *ptr2; + *(ptr2+1) = - *ptr2; + ptr1 += 2; + ptr2 -= 2; + } + + // Now move the real parts + ptr1 = (double *)in + 2; + ptr2 = (double *)in + 2 * (npts-1); + for (size_t i = 0; i < npts/2; i++) + { + *ptr2 = *ptr1; + ptr1 += 2; + ptr2 -= 2; + } + + // Fix-up complex parts at zero and n/2+1 if n is even + ptr1 = (double *)in; + *(ptr1 + 1) = 0.; + if (!(npts & 1)) + *(ptr1 + npts + 1) = 0.; + + // Advance to the next data + in += npts; + } + } + + static inline void convert_packcomplex (Complex *out, size_t nr, size_t nc) + { + Complex *ptr1, *ptr2; + + // Create space for the missing elements + for (size_t i = 0; i < nr; i++) + { + ptr1 = out + i * (nc/2 + 1) + nr*((nc-1)/2); + ptr2 = out + i * nc; + for (size_t j = 0; j < nc/2+1; j++) + *ptr2++ = *ptr1++; + } + + // Fill in the missing data + for (size_t i = 1; i < nr; i++) + for (size_t j = nc/2+1; j < nc; j++) + out[j + i*nc] = conj(out[nc - j + (nr-i)*nc]); + for (size_t j = nc/2+1; j < nc; j++) + out[j] = conj(out[nc - j]); + } + + int + octave_fftw::fft (const double *in, Complex *out, size_t npts, size_t nsamples) + { + // Fool with the stride to make the conversion to complex easier + rfftw (fftw_planner.create_rplan (FFTW_FORWARD, npts, nsamples, in, out), + nsamples, reinterpret_cast (const_cast (in)), + 1, npts, reinterpret_cast (out), 2, 2*npts); + + // Need to create other half of the transform + convert_halfcomplex (out, npts, nsamples); + + return 0; + } + int ! octave_fftw::fft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples) { ! fftw (fftw_planner.create_plan (FFTW_FORWARD, npts, nsamples, in, out), ! nsamples, reinterpret_cast (const_cast (in)), ! 1, npts, reinterpret_cast (out), 1, npts); return 0; } int ! octave_fftw::ifft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples) { ! fftw (fftw_planner.create_plan (FFTW_BACKWARD, npts, nsamples, in, out), ! nsamples, reinterpret_cast (const_cast (in)), ! 1, npts, reinterpret_cast (out), 1, npts); const Complex scale = npts; ! for (size_t i = 0; i < npts*nsamples; i++) out[i] /= scale; return 0; } int + octave_fftw::fft2d (const double *in, Complex *out, size_t nr, size_t nc) + { + // Fool with the position of the start of the matrix, so that creating + // other half of the matrix won't cause cache problems + rfftwnd_one_real_to_complex (fftw_planner.create_rplan2d (FFTW_FORWARD, + nr, nc, in, out), + reinterpret_cast (const_cast(in)), + reinterpret_cast (out + nr*((nc-1)/2))); + + // Need to create other half of the transform + convert_packcomplex (out, nr, nc); + + return 0; + } + + int octave_fftw::fft2d (Complex *inout, size_t nr, size_t nc) { ! fftwnd_one (fftw_planner.create_plan2d (FFTW_FORWARD, nr, nc, in, out), reinterpret_cast (inout), 0); *************** *** 162,168 **** int octave_fftw::ifft2d (Complex *inout, size_t nr, size_t nc) { ! fftwnd_one (fftw_planner.create_plan2d (FFTW_BACKWARD, nr, nc), reinterpret_cast (inout), 0); --- 485,491 ---- int octave_fftw::ifft2d (Complex *inout, size_t nr, size_t nc) { ! fftwnd_one (fftw_planner.create_plan2d (FFTW_BACKWARD, nr, nc, in, out), reinterpret_cast (inout), 0); *************** *** 174,180 **** return 0; } ! #endif /* ;;; Local Variables: *** --- 497,504 ---- return 0; } ! #endif // HAVE_FFTW3 ! #endif // HAVE_FFTW or HAVE_FFTW3 /* ;;; Local Variables: *** *** ./configure.in.orig 2004-01-28 15:16:49.000000000 +0100 --- ./configure.in 2004-01-29 21:30:25.000000000 +0100 *************** *** 407,425 **** with_fftw=$withval, with_fftw=yes) if test "$with_fftw" = "yes"; then ! have_fftw_header=no ! AC_CHECK_HEADERS(dfftw.h fftw.h, [have_fftw_header=yes; break]) ! if test "$have_fftw_header" = yes; then ! AC_CHECK_LIB(dfftw, fftw_create_plan, FFTW_LIBS="-ldfftw", ! [AC_CHECK_LIB(fftw, fftw_create_plan, FFTW_LIBS="-lfftw", with_fftw=no)]) ! else ! with_fftw=no fi fi ! if test "$with_fftw" = yes; then FFT_DIR='' ! AC_DEFINE(HAVE_FFTW, 1, [Define if the FFTW library is available.]) fi WITH_MPI=true --- 407,442 ---- with_fftw=$withval, with_fftw=yes) if test "$with_fftw" = "yes"; then ! have_fftw3_header=no ! with_fftw3=no ! AC_CHECK_HEADER(fftw3.h, [have_fftw3_header=yes; break]) ! if test "$have_fftw3_header" = yes; then ! AC_CHECK_LIB(fftw3, fftw_plan_dft_1d, [FFTW_LIBS="-lfftw3"; with_fftw3=yes]) ! fi ! if test "$with_fftw3" = no; then ! have_fftw_header=no ! have_rfftw_header=no ! AC_CHECK_HEADERS(dfftw.h fftw.h, [have_fftw_header=yes; break]) ! AC_CHECK_HEADERS(drfftw.h rfftw.h, [have_rfftw_header=yes; break]) ! if test "$have_fftw_header" = yes && test "$have_rfftw_header" = yes; then ! AC_CHECK_LIB(dfftw, fftw_create_plan, FFTW_LIBS="-ldfftw", ! [AC_CHECK_LIB(fftw, fftw_create_plan, FFTW_LIBS="-lfftw", with_fftw=no)]) ! AC_CHECK_LIB(drfftw, rfftw_create_plan, FFTW_LIBS="$FFTW_LIBS -ldrfftw", ! [AC_CHECK_LIB(rfftw, rfftw_create_plan, FFTW_LIBS="$FFTW_LIBS -lrfftw", with_fftw=no, $FFTW_LIBS)], $FFTW_LIBS) ! else ! with_fftw=no ! fi fi fi ! if test "$with_fftw" = yes || test "$with_fftw3" = yes; then FFT_DIR='' ! if test "$with_fftw3" = yes; then ! AC_DEFINE(HAVE_FFTW3, 1, [Define if the FFTW3 library is used.]) ! else ! AC_DEFINE(HAVE_FFTW, 1, [Define if the FFTW library is used.]) ! warn_fftw="Old version of FFTW found. This will affect performance!!" ! fi fi WITH_MPI=true *************** *** 1608,1613 **** --- 1625,1635 ---- warn_msg_printed=true fi + if test -n "$warn_fftw"; then + AC_MSG_WARN($warn_fftw) + warn_msg_printed=true + fi + if $warn_msg_printed; then AC_MSG_RESULT([]) fi *** ./ChangeLog.orig 2004-01-30 15:25:15.000000000 +0100 --- ./ChangeLog 2004-01-30 15:26:32.000000000 +0100 *************** *** 1,3 **** --- 1,8 ---- + 2004-01-30 David Bateman + + * configure.in: Test for the presence of FFTW 3.x and use it in + preference to FFTW 2.x. Define HAVE_FFTW3 + 2004-01-22 John W. Eaton * octMakefile.in (maintainer-clean, distclean): --WhfpMioaduB5tiZL--