From octave-maintainers-request at bevo dot che dot wisc dot edu Wed Jan 28 08:49:41 2004 Subject: A faster FFT of real matrices ?? From: David Bateman To: octave-maintainers at bevo dot che dot wisc dot edu Date: Wed, 28 Jan 2004 15:45:23 +0100 --2JFBq9zoW8cOFH7v Content-Type: text/plain; charset=us-ascii Content-Disposition: inline I've implemented FFT's over real matrices in octave with FFTW 2.1.5 and I can achieve a speed up of about a factor of 2. However, in some cases I'm also getting slower code fft(randn(514)) is a case in point. Basically, I can't see why this is happening, and so this e-mail is a call for help. What I think is happening is that my libfftw.so was compiled by someone else and so isn't optimal for my machine, (does FFTW build optimize for cache size?). So what I was hoping was to see if someone might like to test the attached patch against 2.1.53, with the test programs. There are two test programs. The procedure to use the test code, is to run "testfft.m" on an unpatched version of 2.1.53, and then run "testfft2.m" on a patched version. You'll see something like Loading Data done Testing fft( 512, 512) 4.00e-02 sec (5.06e-01) rerr 2.69e-13 Testing fft2( 512, 512) 8.06e-02 sec (4.90e-01) rerr 2.06e-13 Testing fft( 513, 513) 8.94e-02 sec (5.90e-01) rerr 3.35e-13 Testing fft2( 513, 513) 1.65e-01 sec (6.77e-01) rerr 1.64e-13 Testing fft( 514, 512) 4.01e-01 sec (2.60e+00) rerr 2.88e-12 Testing fft2( 514, 512) 4.41e-01 sec (2.45e+00) rerr 1.37e-13 Testing fft( 512, 514) 4.01e-02 sec (4.97e-01) rerr 1.63e-12 Testing fft2( 512, 514) 1.18e-01 sec (4.69e-01) rerr 5.56e-13 Testing fft(65536, 1) 2.82e-02 sec (7.83e-01) rerr 4.17e-14 Testing fft2(65536, 1) 2.81e-02 sec (6.57e-01) rerr 4.17e-14 Where the first value is the run time on the current version, the second is the relative runtime to the previous version and "rerr" is the relative error. As I save in ascii (I'm using 2.1.50 as my old test veresion), you can't expect better relative errors than about 1e-13 or so. As you can see the speed of the new code is better in the cases tested except n=514, where it is 2.5 times slower!!!! It would be nice to understand what is happening here, as matlab doesn't seem to have this problem and runs at the union of the lowest speeds on the two versions of the octave fft code. Cheers David -- David Bateman David dot Bateman at motorola dot com Motorola CRM +33 1 69 35 48 04 (Ph) Parc Les Algorithmes, Commune de St Aubin +33 1 69 35 77 01 (Fax) 91193 Gif-Sur-Yvette FRANCE The information contained in this communication has been classified as: [x] General Business Information [ ] Motorola Internal Use Only [ ] Motorola Confidential Proprietary --2JFBq9zoW8cOFH7v Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="testfft.m" N = [512, 513, 514, 512, 65536]; M = [512, 513, 512, 514, 1]; rep = 40; infft = cell(1,length(N)); resfft = cell(1, length(N)); resfft2 = cell(1, length(N)); timfft = cell(1,length(N)); timfft2 = cell(1,length(N)); for i=1:length(N) a = randn(N(i),M(i)); infft{i} = a; fprintf("Testing fft(%5d,%5d) ", N(i), M(i)); fflush(stdout); # Call fft twice to ensure all variables initialized b = fft(a); b = fft(a); timfft{i} = 0; for j=1:rep t = cputime; b = fft(a); timfft{i} = timfft{i} + cputime - t; endfor resfft{i} = b; timfft{i} = timfft{i} / rep; fprintf("%5.2e sec\n", timfft{i}); fprintf("Testing fft2(%5d,%5d) ", N(i), M(i)); fflush(stdout); # Call fft2 twice to ensure all variables initialized b = fft2(a); b = fft2(a); timfft2{i} = 0; for j=1:rep t = cputime; b = fft2(a); timfft2{i} = timfft2{i} + cputime - t; endfor resfft2{i} = b; timfft2{i} = timfft2{i} / rep; fprintf("%5.2e sec\n", timfft2{i}); endfor fprintf("Saving Data "); fflush(stdout); save testfft.mat infft resfft timfft resfft2 timfft2 fprintf("done\n"); --2JFBq9zoW8cOFH7v Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="testfft2.m" fprintf("Loading Data "); fflush(stdout); load -force testfft.mat fprintf("done\n"); rep = 100; len = length(timfft); N = zeros(1,len); M = zeros(1,len); for i=1:len N(i) = size(resfft{i},1); M(i) = size(resfft{i},2); endfor xresfft = cell(1, length(N)); xresfft2 = cell(1, length(N)); xtimfft = cell(1,length(N)); xtimfft2 = cell(1,length(N)); for i=1:length(N) a = infft{i}; fprintf("Testing fft(%5d,%5d) ", N(i), M(i)); fflush(stdout); # Call fft twice to ensure all variables initialized b = fft(a); b = fft(a); xtimfft{i} = 0; for j=1:rep t = cputime; b = fft(a); xtimfft{i} = xtimfft{i} + cputime - t; endfor xresfft{i} = b; xtimfft{i} = xtimfft{i} / rep; fprintf("%5.2e sec (%5.2e) rerr %5.2e\n", xtimfft{i}, xtimfft{i}/timfft{i}, 0.5*max(max(abs(resfft{i}-xresfft{i})./(abs(resfft{i})+abs(xresfft{i}))))); fprintf("Testing fft2(%5d,%5d) ", N(i), M(i)); fflush(stdout); # Call fft2 twice to ensure all variables initialized b = fft2(a); b = fft2(a); xtimfft2{i} = 0; for j=1:rep t = cputime; b = fft2(a); xtimfft2{i} = xtimfft2{i} + cputime - t; endfor xresfft2{i} = b; xtimfft2{i} = xtimfft2{i} / rep; fprintf("%5.2e sec (%5.2e) rerr %5.2e\n", xtimfft2{i}, xtimfft2{i}/timfft2{i}, 0.5*max(max(abs(resfft2{i}-xresfft2{i})./(abs(resfft2{i})+abs(xresfft2{i}))))); endfor --2JFBq9zoW8cOFH7v Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename=patch *** ./liboctave/dMatrix.cc.orig 2004-01-28 15:17:03.000000000 +0100 --- ./liboctave/dMatrix.cc 2004-01-28 14:41:24.000000000 +0100 *************** *** 789,798 **** nsamples = nc; } ! ComplexMatrix tmp (*this); ! Complex *in (tmp.fortran_vec ()); Complex *out (retval.fortran_vec ()); for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; --- 789,800 ---- nsamples = nc; } ! const double *in (fortran_vec ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX + // octave_fftw::fft (in, out, npts, nsamples); + // should be faster but isn't !!! for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; *************** *** 828,833 **** --- 830,838 ---- Complex *in (tmp.fortran_vec ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX + // octave_fftw::ifft (in, out, npts, nsamples); + // should be faster but isn't !!! for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; *************** *** 844,853 **** int nr = rows (); int nc = cols (); ! ComplexMatrix retval (*this); // Note the order of passing the rows and columns to account for // column-major storage. ! octave_fftw::fft2d (retval.fortran_vec (), nc, nr); return retval; } --- 849,859 ---- int nr = rows (); int nc = cols (); ! const double *in = fortran_vec (); ! ComplexMatrix retval (rows (), cols ()); // Note the order of passing the rows and columns to account for // column-major storage. ! octave_fftw::fft2d (in, retval.fortran_vec (), nc, nr); return retval; } *** ./liboctave/CMatrix.cc.orig 2004-01-28 15:17:09.000000000 +0100 --- ./liboctave/CMatrix.cc 2004-01-28 10:40:00.000000000 +0100 *************** *** 1128,1133 **** --- 1128,1136 ---- const Complex *in (data ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX + // octave_fftw::fft (in, out, npts, nsamples); + // should be faster but isn't !!! for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; *************** *** 1162,1167 **** --- 1165,1173 ---- const Complex *in (data ()); Complex *out (retval.fortran_vec ()); + // XXX FIXME XXX + // octave_fftw::ifft (in, out, npts, nsamples); + // should be faster but isn't !!! for (size_t i = 0; i < nsamples; i++) { OCTAVE_QUIT; *** ./liboctave/oct-fftw.h.orig 2004-01-28 15:17:17.000000000 +0100 --- ./liboctave/oct-fftw.h 2004-01-28 09:31:53.000000000 +0100 *************** *** 25,32 **** --- 25,34 ---- #if defined (HAVE_DFFTW_H) #include + #include #else #include + #include #endif #include "oct-cmplx.h" *************** *** 35,42 **** octave_fftw { public: ! static int fft (const Complex*, Complex *, size_t); ! static int ifft (const Complex*, Complex *, size_t); static int fft2d (Complex*, size_t, size_t); static int ifft2d (Complex*, size_t, size_t); --- 37,50 ---- octave_fftw { public: ! static int fft (const double *in, Complex *out, size_t npts, ! size_t nsamples = 1); ! static int fft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples = 1); ! static int ifft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples = 1); ! ! static int fft2d (const double*, Complex*, size_t, size_t); static int fft2d (Complex*, size_t, size_t); static int ifft2d (Complex*, size_t, size_t); *** ./liboctave/oct-fftw.cc.orig 2004-01-28 15:17:22.000000000 +0100 --- ./liboctave/oct-fftw.cc 2004-01-28 15:30:42.000000000 +0100 *************** *** 26,31 **** --- 26,32 ---- #include "oct-fftw.h" #include "lo-error.h" + #include // Helper class to create and cache fftw plans for both 1d and 2d. This *************** *** 44,49 **** --- 45,53 ---- fftw_plan create_plan (fftw_direction, size_t); fftwnd_plan create_plan2d (fftw_direction, size_t, size_t); + rfftw_plan create_rplan (fftw_direction, size_t); + rfftwnd_plan create_rplan2d (fftw_direction, size_t, size_t); + private: int plan_flags; *************** *** 52,57 **** --- 56,67 ---- size_t n[2]; size_t n2d[2][2]; + + rfftw_plan rplan[2]; + rfftwnd_plan rplan2d[2]; + + size_t rn[2]; + size_t rn2d[2][2]; }; octave_fftw_planner::octave_fftw_planner () *************** *** 63,68 **** --- 73,84 ---- n[0] = n[1] = 0; n2d[0][0] = n2d[0][1] = n2d[1][0] = n2d[1][1] = 0; + + rplan[0] = rplan[1] = 0; + rplan2d[0] = rplan2d[1] = 0; + + rn[0] = rn[1] = 0; + rn2d[0][0] = rn2d[0][1] = rn2d[1][0] = rn2d[1][1] = 0; } fftw_plan *************** *** 92,100 **** return *cur_plan_p; } fftwnd_plan octave_fftw_planner::create_plan2d (fftw_direction dir, ! size_t nrows, size_t ncols) { size_t which = (dir == FFTW_FORWARD) ? 0 : 1; fftwnd_plan *cur_plan_p = &plan2d[which]; --- 108,143 ---- return *cur_plan_p; } + rfftw_plan + octave_fftw_planner::create_rplan (fftw_direction dir, size_t npts) + { + size_t which = (dir == FFTW_FORWARD) ? 0 : 1; + rfftw_plan *cur_plan_p = &rplan[which]; + bool create_new_plan = false; + + if (rplan[which] == 0 || rn[which] != npts) + { + create_new_plan = true; + rn[which] = npts; + } + + if (create_new_plan) + { + if (*cur_plan_p) + rfftw_destroy_plan (*cur_plan_p); + + *cur_plan_p = rfftw_create_plan (npts, dir, plan_flags); + + if (*cur_plan_p == 0) + (*current_liboctave_error_handler) ("Error creating fftw plan"); + } + + return *cur_plan_p; + } + fftwnd_plan octave_fftw_planner::create_plan2d (fftw_direction dir, ! size_t nrows, size_t ncols) { size_t which = (dir == FFTW_FORWARD) ? 0 : 1; fftwnd_plan *cur_plan_p = &plan2d[which]; *************** *** 112,120 **** { if (*cur_plan_p) fftwnd_destroy_plan (*cur_plan_p); ! *cur_plan_p = fftw2d_create_plan (nrows, ncols, dir, plan_flags | FFTW_IN_PLACE); if (*cur_plan_p == 0) (*current_liboctave_error_handler) ("Error creating 2d fftw plan"); --- 155,194 ---- { if (*cur_plan_p) fftwnd_destroy_plan (*cur_plan_p); ! *cur_plan_p = fftw2d_create_plan (nrows, ncols, dir, plan_flags | FFTW_IN_PLACE); + if (*cur_plan_p == 0) + (*current_liboctave_error_handler) ("Error creating 2d fftw plan"); + } + + return *cur_plan_p; + } + + rfftwnd_plan + octave_fftw_planner::create_rplan2d (fftw_direction dir, + size_t nrows, size_t ncols) + { + size_t which = (dir == FFTW_FORWARD) ? 0 : 1; + fftwnd_plan *cur_plan_p = &rplan2d[which]; + bool create_new_plan = false; + + if (rplan2d[which] == 0 || rn2d[which][0] != nrows || + rn2d[which][1] != ncols) + { + create_new_plan = true; + + rn2d[which][0] = nrows; + rn2d[which][1] = ncols; + } + + if (create_new_plan) + { + if (*cur_plan_p) + rfftwnd_destroy_plan (*cur_plan_p); + + *cur_plan_p = rfftw2d_create_plan (nrows, ncols, dir, + plan_flags); if (*cur_plan_p == 0) (*current_liboctave_error_handler) ("Error creating 2d fftw plan"); *************** *** 125,154 **** static octave_fftw_planner fftw_planner; int ! octave_fftw::fft (const Complex *in, Complex *out, size_t npts) { ! fftw_one (fftw_planner.create_plan (FFTW_FORWARD, npts), ! reinterpret_cast (const_cast (in)), ! reinterpret_cast (out)); return 0; } int ! octave_fftw::ifft (const Complex *in, Complex *out, size_t npts) { ! fftw_one (fftw_planner.create_plan (FFTW_BACKWARD, npts), ! reinterpret_cast (const_cast (in)), ! reinterpret_cast (out)); const Complex scale = npts; ! for (size_t i = 0; i < npts; i++) out[i] /= scale; return 0; } int octave_fftw::fft2d (Complex *inout, size_t nr, size_t nc) { --- 199,321 ---- static octave_fftw_planner fftw_planner; + static inline void convert_halfcomplex (Complex *in, size_t npts, + size_t nsamples) + { + // This is likely to thrash about in the cache !!! + double *ptr1, *ptr2; + + for (size_t k = 0; k < nsamples; k++) + { + // First move the imaginary parts + ptr1 = (double *)in + 3; + ptr2 = (double *)in + 2 * (npts-1); + for (size_t i = 0; i < (npts-1)/2; i++) + { + *ptr1 = *ptr2; + *(ptr2+1) = - *ptr2; + ptr1 += 2; + ptr2 -= 2; + } + + // Now move the real parts + ptr1 = (double *)in + 2; + ptr2 = (double *)in + 2 * (npts-1); + for (size_t i = 0; i < npts/2; i++) + { + *ptr2 = *ptr1; + ptr1 += 2; + ptr2 -= 2; + } + + // Fix-up complex parts at zero and n/2+1 if n is even + ptr1 = (double *)in; + *(ptr1 + 1) = 0.; + if (!(npts & 1)) + *(ptr1 + npts + 1) = 0.; + + // Advance to the next data + in += npts; + } + } + + int + octave_fftw::fft (const double *in, Complex *out, size_t npts, size_t nsamples) + { + // Fool with the stride to make the conversion to complex easier + rfftw (fftw_planner.create_rplan (FFTW_FORWARD, npts), nsamples, + reinterpret_cast (const_cast (in)), 1, npts, + reinterpret_cast (out), 2, 2*npts); + + // Need to create other half of the transform + convert_halfcomplex (out, npts, nsamples); + + return 0; + } + int ! octave_fftw::fft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples) { ! fftw (fftw_planner.create_plan (FFTW_FORWARD, npts), nsamples, ! reinterpret_cast (const_cast (in)), ! 1, npts, reinterpret_cast (out), 1, npts); return 0; } int ! octave_fftw::ifft (const Complex *in, Complex *out, size_t npts, ! size_t nsamples) { ! fftw (fftw_planner.create_plan (FFTW_BACKWARD, npts), nsamples, ! reinterpret_cast (const_cast (in)), ! 1, npts, reinterpret_cast (out), 1, npts); const Complex scale = npts; ! for (size_t i = 0; i < npts*nsamples; i++) out[i] /= scale; return 0; } + static inline void convert_packcomplex (Complex *out, size_t nr, size_t nc) + { + Complex *ptr1, *ptr2; + + // Create space for the missing elements + for (size_t i = 0; i < nr; i++) + { + ptr1 = out + i * (nc/2 + 1) + nr*((nc-1)/2); + ptr2 = out + i * nc; + for (size_t j = 0; j < nc/2+1; j++) + *ptr2++ = *ptr1++; + } + + // Fill in the missing data + for (size_t i = 1; i < nr; i++) + for (size_t j = nc/2+1; j < nc; j++) + out[j + i*nc] = conj(out[nc - j + (nr-i)*nc]); + for (size_t j = nc/2+1; j < nc; j++) + out[j] = conj(out[nc - j]); + } + + int + octave_fftw::fft2d (const double *in, Complex *out, size_t nr, size_t nc) + { + // Fool with the position of the start of the matrix, so that creating + // other half of the matrix won't cause cache problems + rfftwnd_one_real_to_complex (fftw_planner.create_rplan2d (FFTW_FORWARD, + nr, nc), + reinterpret_cast (const_cast(in)), + reinterpret_cast (out + nr*((nc-1)/2))); + + // Need to create other half of the transform + convert_packcomplex (out, nr, nc); + + return 0; + } + int octave_fftw::fft2d (Complex *inout, size_t nr, size_t nc) { *** ./configure.in.orig 2004-01-28 15:16:49.000000000 +0100 --- ./configure.in 2004-01-28 11:18:57.000000000 +0100 *************** *** 408,417 **** if test "$with_fftw" = "yes"; then have_fftw_header=no AC_CHECK_HEADERS(dfftw.h fftw.h, [have_fftw_header=yes; break]) ! if test "$have_fftw_header" = yes; then AC_CHECK_LIB(dfftw, fftw_create_plan, FFTW_LIBS="-ldfftw", [AC_CHECK_LIB(fftw, fftw_create_plan, FFTW_LIBS="-lfftw", with_fftw=no)]) else with_fftw=no fi --- 408,421 ---- if test "$with_fftw" = "yes"; then have_fftw_header=no + have_rfftw_header=no AC_CHECK_HEADERS(dfftw.h fftw.h, [have_fftw_header=yes; break]) ! AC_CHECK_HEADERS(drfftw.h rfftw.h, [have_rfftw_header=yes; break]) ! if test "$have_fftw_header" = yes && test "$have_rfftw_header" = yes; then AC_CHECK_LIB(dfftw, fftw_create_plan, FFTW_LIBS="-ldfftw", [AC_CHECK_LIB(fftw, fftw_create_plan, FFTW_LIBS="-lfftw", with_fftw=no)]) + AC_CHECK_LIB(drfftw, rfftw_create_plan, FFTW_LIBS="$FFTW_LIBS -ldrfftw", + [AC_CHECK_LIB(rfftw, rfftw_create_plan, FFTW_LIBS="$FFTW_LIBS -lrfftw", with_fftw=no, $FFTW_LIBS)], $FFTW_LIBS) else with_fftw=no fi --2JFBq9zoW8cOFH7v--