/*	This file contains demonstration code that supplements the paper
	_Construction of a High-Performance FFT_.

	See the paper and ReadMe.txt for more information.

	The code is formatted for tabs at four-column intervals.
*/


#include "common.h"


// CommonWeight
/*	Weight values for one radix-4 butterfly.  GenerateCommonWeights
	fills each member from an angle x as noted below.
*/
typedef struct {
	float w1r;	// cos(x).
	float w1i;	// tan(x).
	float w2r;	// cos(2x).
	float w2i;	// tan(2x).
	float w3r;	// 2*cos(2x) - 1.
	float w3i;	// tan(3x).
} CommonWeight;


// GenerateCommonWeights
/*	Grow the table of common weights so it supports vectors of NewLength
	elements.  The table holds one entry per sixteen elements.  Entries
	already present for the old length are kept; only the entries from
	*length/16 up to NewLength/16 are computed.

	Returns 0 on success.  Returns 1 if the allocation fails, in which
	case *weights and *length are left unchanged.
*/
static int GenerateCommonWeights(
	CommonWeight **weights,	// Pointer to array address.
	int *length,			// Pointer to supported length.
	int NewLength			// New length to support (1<<N).
)
{
	const int oldCount = *length / 16;
	const int newCount = NewLength / 16;
	CommonWeight *table;
	int k0;

	// Try to allocate space and check result.
	table = (CommonWeight *) realloc(*weights, newCount * sizeof *table);
	if (table == NULL)
		return 1;

	// Fill in only the entries the old table did not have.
	for (k0 = oldCount; k0 < newCount; ++k0)
	{
		CommonWeight *const w = &table[k0];
		const double x = TwoPi * r(4*k0);

		w->w1r = cos(x);
		w->w1i = tan(x);
		w->w2r = cos(x+x);
		w->w2i = tan(x+x);
		// 2*cos(2x)-1 equals cos(3x)/cos(x).  It is computed from the
		// single-precision value just stored in w2r.
		w->w3r = 2. * w->w2r - 1.;
		w->w3i = tan(3.*x);
	}

	// Pass address and supported length back to caller.
	*length = NewLength;
	*weights = table;

	return 0;
}


// GenerateFinalIndices
/*	Build the table of index pairs used by the final stage to combine its
	butterflies with the bit-reversal permutation.  The table has
	NewLength/16 entries.  For each kL whose (N-4)-bit reversal kLprime is
	not less than kL, it holds Construct(kL, kLprime), followed by
	Construct(kLprime, kL) when the two differ.

	Returns 0 on success.  Returns 1 if the allocation fails, in which
	case *indices is left unchanged.
*/
static int GenerateFinalIndices(
	FinalIndices **indices,	// Pointer to index array address.
	int NewLength			// New length to support (1<<N).
)
{
	// Number of bits in kL, N-4.
	const int bits = ilog2(NewLength) - 4;
	// Prepare to bit-reverse a number of N-4 bits (see below).
	const int shift = 32 - bits;
	int kL;

	// Try to allocate space and check result.
	FinalIndices *p = (FinalIndices *)
		realloc(*indices, NewLength/16 * sizeof **indices);
	if (p == NULL)
		return 1;

	// Pass address back to caller.
	*indices = p;

	// Iterate through all values of kL.
	for (kL = 0; kL < NewLength/16; ++kL)
	{
		// rw(kL) reverses kL as a 32-bit number.  To get it as
		// the reversal of an N-4 bit number, shift right to
		// remove 32-(N-4) bits.
		//
		// When N is 4, kL has no bits and the shift count would be
		// 32, which is undefined for a 32-bit operand.  The only kL
		// is then zero, and its reversal is zero.
		const int kLprime = 0 < bits ? rw(kL) >> shift : 0;

		// If kLprime < kL, then kL in a previous iteration had the
		// value kLprime has now, and we do not want to repeat it.
		if (kL <= kLprime)
		{
			// If kL == kLprime, add one table entry.
			// If kL != kLprime, add table entries in both orders.
			*(p++)     = Construct( kL, kLprime );
			if (kL < kLprime)
				*(p++) = Construct( kLprime, kL );
		}
	}
	return 0;
}


// FinalWeights
/*	Weight values for four final-stage butterflies at once.  Element
	[kHprime] of each member belongs to butterfly kHprime.
	GenerateFinalWeights fills the members from an angle x as noted below.
*/
typedef struct {
	float w1r[4];	// cos(x).
	float w1i[4];	// tan(x).
	float w2r[4];	// cos(2x).
	float w2i[4];	// tan(2x).
	float w3r[4];	// 2*cos(2x) - 1.
	float w3i[4];	// tan(3x).
} FinalWeights;


// GenerateFinalWeights
/*	Build the table of weights for the final stage.  The table has
	NewLength/16 entries, and entry q holds the weights for the four
	butterflies read through indices[q].read, so the weights are stored
	in the order the final stage visits them.

	Returns 0 on success.  Returns 1 if the allocation fails, in which
	case *weights is left unchanged.
*/
static int GenerateFinalWeights(
	FinalWeights **weights,	// Pointer to weight array address.
	int NewLength,			// New length to support (1<<N).
	FinalIndices *indices	// Index array address.
)
{
	const int count = NewLength/16;
	// Angle step, in revolutions, between consecutive kHprime.
	const double step = 1./NewLength;
	FinalWeights *table;
	int q;

	// Try to allocate space and check result.
	table = (FinalWeights *) realloc(*weights, count * sizeof *table);
	if (table == NULL)
		return 1;

	for (q = 0; q < count; ++q)
	{
		FinalWeights *const w = &table[q];
		const int kL = indices[q].read;
		// Part of the angle that is common to all four butterflies.
		const double base = r(4*kL);
		int kHprime;

		for (kHprime = 0; kHprime < 4; ++kHprime)
		{
			const double x = TwoPi * (base + kHprime*step);

			w->w1r[kHprime] = cos(x);
			w->w1i[kHprime] = tan(x);
			w->w2r[kHprime] = cos(x+x);
			w->w2i[kHprime] = tan(x+x);
			// Computed from the single-precision value just stored.
			w->w3r[kHprime] = 2. * w->w2r[kHprime] - 1.;
			w->w3i[kHprime] = tan(3.*x);
		}
	}

	// Pass address back to caller.
	*weights = table;

	return 0;
}


// FFT4_0Weights
/*	Perform radix-4 butterflies that need no weight multiplications.
	Elements 0 through c0-1 of vIn are processed as c0/4 butterflies, each
	combining four elements spaced c0/4 apart, and the results are stored
	at the same positions in vOut.
*/
static void FFT4_0Weights(
	ComplexArray vOut,	// Address of output vector.
	ComplexArray vIn,	// Address of input vector.
	int c0				// Coefficient for k0.
)
{
	// Coefficient for k1 is coefficient for k0 divided by 1<<m.
	const int c1 = c0 >> 2;
	int k2;

	for (k2 = 0; k2 < c1; ++k2)
	{
		// Positions of the four elements of this butterfly.
		const int i0 = k2, i1 = i0 + c1, i2 = i1 + c1, i3 = i2 + c1;

		// Load every input before storing any output, since callers
		// may pass the same vector for vIn and vOut.
		const float a0r = vIn.re[i0], a0i = vIn.im[i0];
		const float a1r = vIn.re[i1], a1i = vIn.im[i1];
		const float a2r = vIn.re[i2], a2i = vIn.im[i2];
		const float a3r = vIn.re[i3], a3i = vIn.im[i3];

		// c0, c2 = a0 +/- a2 and c1, c3 = a1 +/- a3.
		const float c0r = a0r + a2r, c0i = a0i + a2i;
		const float c2r = a0r - a2r, c2i = a0i - a2i;
		const float c1r = a1r + a3r, c1i = a1i + a3i;
		const float c3r = a1r - a3r, c3i = a1i - a3i;

		// d0, d1 = c0 +/- c1 and d2, d3 = c2 +/- i*c3.
		const float d0r = c0r + c1r, d0i = c0i + c1i;
		const float d1r = c0r - c1r, d1i = c0i - c1i;
		const float d2r = c2r - c3i, d2i = c2i + c3r;
		const float d3r = c2r + c3i, d3i = c2i - c3r;

		vOut.re[i0] = d0r;
		vOut.im[i0] = d0i;
		vOut.re[i1] = d1r;
		vOut.im[i1] = d1i;
		vOut.re[i2] = d2r;
		vOut.im[i2] = d2i;
		vOut.re[i3] = d3r;
		vOut.im[i3] = d3i;
	}
}


// FFT8_0Weights
/*	Perform radix-8 butterflies that need no table weights; the only
	multiplier is the constant sqrt(2)/2.  Elements 0 through c0-1 of vIn
	are processed as c0/8 butterflies, each combining eight elements
	spaced c0/8 apart, and the results are stored at the same positions
	in vOut.
*/
static void FFT8_0Weights(
	ComplexArray vOut,	// Address of output vector.
	ComplexArray vIn,	// Address of input vector.
	int c0				// Coefficient for k0.
)
{
	// Prepare a constant, sqrt(2)/2.
	const float sqrt2d2 = .7071067811865475244;
	// Coefficient for k1 is coefficient for k0 divided by 1<<m.
	const int c1 = c0 >> 3;
	int k2;

	for (k2 = 0; k2 < c1; ++k2)
	{
		// Positions of the eight elements of this butterfly.
		const int	i0 = k2,      i1 = i0 + c1, i2 = i1 + c1, i3 = i2 + c1,
					i4 = i3 + c1, i5 = i4 + c1, i6 = i5 + c1, i7 = i6 + c1;

		// Load every input before storing any output, since the two
		// vectors may be the same.
		const float a0r = vIn.re[i0], a0i = vIn.im[i0];
		const float a1r = vIn.re[i1], a1i = vIn.im[i1];
		const float a2r = vIn.re[i2], a2i = vIn.im[i2];
		const float a3r = vIn.re[i3], a3i = vIn.im[i3];
		const float a4r = vIn.re[i4], a4i = vIn.im[i4];
		const float a5r = vIn.re[i5], a5i = vIn.im[i5];
		const float a6r = vIn.re[i6], a6i = vIn.im[i6];
		const float a7r = vIn.re[i7], a7i = vIn.im[i7];

		// First level, w = 1:  b[j], b[j+4] = a[j] +/- a[j+4].
		const float b0r = a0r + a4r, b0i = a0i + a4i;
		const float b1r = a1r + a5r, b1i = a1i + a5i;
		const float b2r = a2r + a6r, b2i = a2i + a6i;
		const float b3r = a3r + a7r, b3i = a3i + a7i;
		const float b4r = a0r - a4r, b4i = a0i - a4i;
		const float b5r = a1r - a5r, b5i = a1i - a5i;
		const float b6r = a2r - a6r, b6i = a2i - a6i;
		const float b7r = a3r - a7r, b7i = a3i - a7i;

		// Second level, w = 1 for the upper half.
		const float c0r = b0r + b2r, c0i = b0i + b2i;
		const float c1r = b1r + b3r, c1i = b1i + b3i;
		const float c2r = b0r - b2r, c2i = b0i - b2i;
		const float c3r = b1r - b3r, c3i = b1i - b3i;

		// Second level, w = i for the lower half.
		const float c4r = b4r - b6i, c4i = b4i + b6r;
		const float c5r = b5r - b7i, c5i = b5i + b7r;
		const float c6r = b4r + b6i, c6i = b4i - b6r;
		const float c7r = b5r + b7i, c7i = b5i - b7r;

		// Sums and differences of the parts of c5 and c7, so that the
		// multiplications by (+1+i) and (-1+i) below need only the
		// scale factor sqrt(2)/2.
		const float t5r = c5r - c5i, t5i = c5r + c5i;
		const float t7r = c7r + c7i, t7i = c7r - c7i;

		// Third level, w = 1.
		const float d0r = c0r + c1r, d0i = c0i + c1i;
		const float d1r = c0r - c1r, d1i = c0i - c1i;

		// Third level, w = i.
		const float d2r = c2r - c3i, d2i = c2i + c3r;
		const float d3r = c2r + c3i, d3i = c2i - c3r;

		// Third level, w = sqrt(2)/2 * (+1+i).
		const float d4r = c4r + t5r * sqrt2d2, d4i = c4i + t5i * sqrt2d2;
		const float d5r = c4r - t5r * sqrt2d2, d5i = c4i - t5i * sqrt2d2;

		// Third level, w = sqrt(2)/2 * (-1+i).
		const float d6r = c6r - t7r * sqrt2d2, d6i = c6i + t7i * sqrt2d2;
		const float d7r = c6r + t7r * sqrt2d2, d7i = c6i - t7i * sqrt2d2;

		vOut.re[i0] = d0r;
		vOut.im[i0] = d0i;
		vOut.re[i1] = d1r;
		vOut.im[i1] = d1i;
		vOut.re[i2] = d2r;
		vOut.im[i2] = d2i;
		vOut.re[i3] = d3r;
		vOut.im[i3] = d3i;
		vOut.re[i4] = d4r;
		vOut.im[i4] = d4i;
		vOut.re[i5] = d5r;
		vOut.im[i5] = d5i;
		vOut.re[i6] = d6r;
		vOut.im[i6] = d6i;
		vOut.re[i7] = d7r;
		vOut.im[i7] = d7i;
	}
}


// FFT4_1WeightPerCall
/*	Perform, in place, the radix-4 butterflies of block k0, which is
	elements c0*k0 through c0*k0 + c0 - 1 of vOut.  All c0/4 butterflies
	use the one set of weight values passed in weight.

	The weights are applied in two parts: each input is first multiplied
	by (1 + i*wNi), and the wNr factors are applied as the terms are
	combined.
*/
static void FFT4_1WeightPerCall(
	ComplexArray vOut,	// Address of output vector.
	int k0,				// k0 from equation.
	int c0,				// Coefficient for k0.
	CommonWeight weight	// Values for weight calculations.
)
{
	// Coefficient for k1 is coefficient for k0 divided by 1<<m.
	const int c1 = c0 >> 2;
	// Position of the first element of block k0.
	const int base = c0*k0;
	// Weight values, held in locals for the whole call.
	const float w1r = weight.w1r, w1i = weight.w1i;
	const float w2r = weight.w2r, w2i = weight.w2i;
	const float w3r = weight.w3r, w3i = weight.w3i;
	int k2;

	for (k2 = 0; k2 < c1; ++k2)
	{
		// Positions of the four elements of this butterfly.
		const int i0 = base + k2, i1 = i0 + c1, i2 = i1 + c1, i3 = i2 + c1;

		const float a0r = vOut.re[i0], a0i = vOut.im[i0];
		const float a1r = vOut.re[i1], a1i = vOut.im[i1];
		const float a2r = vOut.re[i2], a2i = vOut.im[i2];
		const float a3r = vOut.re[i3], a3i = vOut.im[i3];

		// b[j] = a[j] * (1 + i*wji).
		const float b1r = a1r - a1i * w1i, b1i = a1i + a1r * w1i;
		const float b2r = a2r - a2i * w2i, b2i = a2i + a2r * w2i;
		const float b3r = a3r - a3i * w3i, b3i = a3i + a3r * w3i;

		// c0, c2 = a0 +/- w2r*b2 and c1, c3 = b1 +/- w3r*b3.
		const float c0r = a0r + b2r * w2r, c0i = a0i + b2i * w2r;
		const float c2r = a0r - b2r * w2r, c2i = a0i - b2i * w2r;
		const float c1r = b1r + b3r * w3r, c1i = b1i + b3i * w3r;
		const float c3r = b1r - b3r * w3r, c3i = b1i - b3i * w3r;

		// d0, d1 = c0 +/- w1r*c1 and d2, d3 = c2 +/- i*w1r*c3.
		const float d0r = c0r + c1r * w1r, d0i = c0i + c1i * w1r;
		const float d1r = c0r - c1r * w1r, d1i = c0i - c1i * w1r;
		const float d2r = c2r - c3i * w1r, d2i = c2i + c3r * w1r;
		const float d3r = c2r + c3i * w1r, d3i = c2i - c3r * w1r;

		vOut.re[i0] = d0r;
		vOut.im[i0] = d0i;
		vOut.re[i1] = d1r;
		vOut.im[i1] = d1i;
		vOut.re[i2] = d2r;
		vOut.im[i2] = d2i;
		vOut.re[i3] = d3r;
		vOut.im[i3] = d3i;
	}
}


// FFT4_1WeightPerIteration
/*	Perform, in place, radix-4 butterflies on u0 consecutive blocks of
	sixteen elements of vOut.  Block k0 is elements 16*k0 through
	16*k0 + 15, and its four butterflies all use weights[k0].

	The arithmetic is the same as in FFT4_1WeightPerCall; here the weight
	changes on every iteration of the outer loop instead of once per call.
*/
static void FFT4_1WeightPerIteration(
	ComplexArray vOut,				// Address of output vector.
	int u0,							// Upper bound on k0.
	const CommonWeight weights[]	// Array of weight values.
)
{
	int k0, k2;

	for (k0 = 0; k0 < u0; ++k0)
	{
		// Position of the first element of block k0.
		const int base = 16*k0;
		// Load values for current weight.
		const float w1r = weights[k0].w1r, w1i = weights[k0].w1i;
		const float w2r = weights[k0].w2r, w2i = weights[k0].w2i;
		const float w3r = weights[k0].w3r, w3i = weights[k0].w3i;

		for (k2 = 0; k2 < 4 ; ++k2)
		{
			// Positions of the four elements of this butterfly.
			const int i0 = base + k2, i1 = i0 + 4, i2 = i1 + 4, i3 = i2 + 4;

			const float a0r = vOut.re[i0], a0i = vOut.im[i0];
			const float a1r = vOut.re[i1], a1i = vOut.im[i1];
			const float a2r = vOut.re[i2], a2i = vOut.im[i2];
			const float a3r = vOut.re[i3], a3i = vOut.im[i3];

			// b[j] = a[j] * (1 + i*wji).
			const float b1r = a1r - a1i * w1i, b1i = a1i + a1r * w1i;
			const float b2r = a2r - a2i * w2i, b2i = a2i + a2r * w2i;
			const float b3r = a3r - a3i * w3i, b3i = a3i + a3r * w3i;

			// c0, c2 = a0 +/- w2r*b2 and c1, c3 = b1 +/- w3r*b3.
			const float c0r = a0r + b2r * w2r, c0i = a0i + b2i * w2r;
			const float c2r = a0r - b2r * w2r, c2i = a0i - b2i * w2r;
			const float c1r = b1r + b3r * w3r, c1i = b1i + b3i * w3r;
			const float c3r = b1r - b3r * w3r, c3i = b1i - b3i * w3r;

			// d0, d1 = c0 +/- w1r*c1 and d2, d3 = c2 +/- i*w1r*c3.
			const float d0r = c0r + c1r * w1r, d0i = c0i + c1i * w1r;
			const float d1r = c0r - c1r * w1r, d1i = c0i - c1i * w1r;
			const float d2r = c2r - c3i * w1r, d2i = c2i + c3r * w1r;
			const float d3r = c2r + c3i * w1r, d3i = c2i - c3r * w1r;

			vOut.re[i0] = d0r;
			vOut.im[i0] = d0i;
			vOut.re[i1] = d1r;
			vOut.im[i1] = d1i;
			vOut.re[i2] = d2r;
			vOut.im[i2] = d2i;
			vOut.re[i3] = d3r;
			vOut.im[i3] = d3i;
		}
	}
}


// ReadElements
/*	Load the sixteen elements of vOut that the final stage processes
	together for index kL: elements 4*kL through 4*kL+3 in each quarter
	of the vector, the quarters being u0 elements long.  Element X of a
	quarter goes to arrays aXr and aXi.  The array index selects the
	quarter with its two bits swapped, so indices 0, 1, 2, 3 read quarters
	0, 2, 1, 3.

	The macro expects vOut, u0, and the arrays a0r through a3i to be in
	scope.  The argument is evaluated once.  Use it as a statement,
	followed by a semicolon.
*/
#define ReadElements(kL)											\
do {																\
	/* Evaluate the argument once, as a complete expression. */		\
	const int base_ = 4*(kL);										\
	int j_;															\
	for (j_ = 0; j_ < 4; ++j_)										\
	{																\
		/* Swap the two bits of j_ to choose the quarter. */		\
		const int from_ = u0*(((j_&1)<<1) | (j_>>1)) + base_;		\
		a0r[j_] = vOut.re[from_ + 0];								\
		a1r[j_] = vOut.re[from_ + 1];								\
		a2r[j_] = vOut.re[from_ + 2];								\
		a3r[j_] = vOut.re[from_ + 3];								\
		a0i[j_] = vOut.im[from_ + 0];								\
		a1i[j_] = vOut.im[from_ + 1];								\
		a2i[j_] = vOut.im[from_ + 2];								\
		a3i[j_] = vOut.im[from_ + 3];								\
	}																\
} while (0)


// WriteReversedElements
/*	Store the sixteen butterfly results held in arrays d0r through d3i
	into vOut at index kLprime.  Results dXr[kHprime] and dXi[kHprime] go
	to element 4*kLprime + kHprime of a quarter of the vector, the
	quarters being u0 elements long.  X selects the quarter with its two
	bits swapped, so d0, d1, d2, d3 go to quarters 0, 2, 1, 3.

	The macro expects vOut, u0, and the arrays d0r through d3i to be in
	scope.  The argument is evaluated once.  Use it as a statement,
	followed by a semicolon.
*/
#define WriteReversedElements(kLprime)						\
do {														\
	/* Evaluate the argument once, as a complete expression. */	\
	const int base_ = 4*(kLprime);							\
	int kHprime;											\
	for (kHprime = 0; kHprime < 4; ++kHprime)				\
	{														\
		const int to_ = base_ + kHprime;					\
		vOut.re[u0*0 + to_] = d0r[kHprime];					\
		vOut.re[u0*2 + to_] = d1r[kHprime];					\
		vOut.re[u0*1 + to_] = d2r[kHprime];					\
		vOut.re[u0*3 + to_] = d3r[kHprime];					\
		vOut.im[u0*0 + to_] = d0i[kHprime];					\
		vOut.im[u0*2 + to_] = d1i[kHprime];					\
		vOut.im[u0*1 + to_] = d2i[kHprime];					\
		vOut.im[u0*3 + to_] = d3i[kHprime];					\
	}														\
} while (0)


// PerformButterflies
/*	Perform four radix-4 butterflies, one per array index, taking inputs
	from arrays a0r through a3i and leaving results in arrays d0r through
	d3i.  Arrays b1r through b3i and c0r through c3i hold intermediate
	values.  weight is an expression for a structure whose members w1r,
	w1i, w2r, w2i, w3r, and w3i are arrays of four weight values.

	The macro expects all of those arrays to be in scope.  The argument
	is expanded more than once, so it should have no side effects.  Use
	the macro as a statement, followed by a semicolon.
*/
#define PerformButterflies(weight)									\
do {																\
	int lane_;														\
	for (lane_ = 0; lane_ < 4; ++lane_)								\
	{																\
		/* b[j] = a[j] * (1 + i*wji). */							\
		b1r[lane_] = - a1i[lane_] * (weight).w1i[lane_] + a1r[lane_];	\
		b1i[lane_] = + a1r[lane_] * (weight).w1i[lane_] + a1i[lane_];	\
		b2r[lane_] = - a2i[lane_] * (weight).w2i[lane_] + a2r[lane_];	\
		b2i[lane_] = + a2r[lane_] * (weight).w2i[lane_] + a2i[lane_];	\
		b3r[lane_] = - a3i[lane_] * (weight).w3i[lane_] + a3r[lane_];	\
		b3i[lane_] = + a3r[lane_] * (weight).w3i[lane_] + a3i[lane_];	\
		/* c0, c2 = a0 +/- w2r*b2 and c1, c3 = b1 +/- w3r*b3. */	\
		c0r[lane_] = + b2r[lane_] * (weight).w2r[lane_] + a0r[lane_];	\
		c0i[lane_] = + b2i[lane_] * (weight).w2r[lane_] + a0i[lane_];	\
		c2r[lane_] = - b2r[lane_] * (weight).w2r[lane_] + a0r[lane_];	\
		c2i[lane_] = - b2i[lane_] * (weight).w2r[lane_] + a0i[lane_];	\
		c1r[lane_] = + b3r[lane_] * (weight).w3r[lane_] + b1r[lane_];	\
		c1i[lane_] = + b3i[lane_] * (weight).w3r[lane_] + b1i[lane_];	\
		c3r[lane_] = - b3r[lane_] * (weight).w3r[lane_] + b1r[lane_];	\
		c3i[lane_] = - b3i[lane_] * (weight).w3r[lane_] + b1i[lane_];	\
		/* d0, d1 = c0 +/- w1r*c1 and d2, d3 = c2 +/- i*w1r*c3. */	\
		d0r[lane_] = + c1r[lane_] * (weight).w1r[lane_] + c0r[lane_];	\
		d0i[lane_] = + c1i[lane_] * (weight).w1r[lane_] + c0i[lane_];	\
		d1r[lane_] = - c1r[lane_] * (weight).w1r[lane_] + c0r[lane_];	\
		d1i[lane_] = - c1i[lane_] * (weight).w1r[lane_] + c0i[lane_];	\
		d2r[lane_] = - c3i[lane_] * (weight).w1r[lane_] + c2r[lane_];	\
		d2i[lane_] = + c3r[lane_] * (weight).w1r[lane_] + c2i[lane_];	\
		d3r[lane_] = + c3i[lane_] * (weight).w1r[lane_] + c2r[lane_];	\
		d3i[lane_] = - c3r[lane_] * (weight).w1r[lane_] + c2i[lane_];	\
	}																\
} while (0)


// FFT4_Final With Bit-Reversal Permutation
/*	Perform the final radix-4 stage in place and store the results in
	permuted positions.  There are u0/4 passes.  Pass q reads the
	elements selected by IndexTable[q].read, performs butterflies with
	weights[q], and writes the results to the position given by
	IndexTable[q].write.

	The write for each pass is delayed until after the read for the next
	pass.  NOTE(review): GenerateFinalIndices emits swapped pairs as
	adjacent entries, so this delay presumably keeps a pass from
	overwriting elements the next pass still has to read -- confirm
	against the definition of Construct.
*/
static void FFT4_Final(
	ComplexArray vOut,					// Address of output vector.
	int u0,								// Upper bound on k0.
	const FinalIndices IndexTable[],	// Array of index pairs.
	const FinalWeights weights[]		// Array of weight values.
)
{
	typedef float FloatBlock[4];
	// Butterfly inputs.
	FloatBlock	a0r, a0i, a1r, a1i, a2r, a2i, a3r, a3i;
	// Intermediate values.
	FloatBlock	b1r, b1i, b2r, b2i, b3r, b3i;
	FloatBlock	c0r, c0i, c1r, c1i, c2r, c2i, c3r, c3i;
	// Butterfly results.
	FloatBlock	d0r, d0i, d1r, d1i, d2r, d2i, d3r, d3i;
	// Number of passes.
	const int count = u0 >> 2;
	int q;

	// The first pass has no earlier results to write.
	ReadElements(IndexTable[0].read);
	PerformButterflies(weights[0]);

	q = 1;
	while (q < count)
	{
		// Read for this pass, then write the results of the previous
		// pass, then compute.
		ReadElements(IndexTable[q].read);
		WriteReversedElements(IndexTable[q-1].write);
		PerformButterflies(weights[q]);
		++q;
	}

	// Write the results of the last pass.
	WriteReversedElements(IndexTable[q-1].write);
}


// ConstantsSet
/*	The tables of constants one transform needs, as filled in by
	GetConstants.  The tables are only read through these pointers.
*/
typedef struct {
	CommonWeight const	*commonWeights;	// Weights for stages before the last.
	FinalWeights const	*finalWeights;	// Weights for the final stage.
	FinalIndices const	*finalIndices;	// Index pairs for the final stage.
} ConstantsSet;


// GetConstants
/*	Fill *set with pointers to the tables of constants for a transform
	of the given length, generating any table that does not exist yet.

	The tables are kept in static storage and reused by later calls.  One
	table of common weights serves all lengths and is extended when a
	longer length is requested.  The final weights and final indices are
	kept per length, in arrays indexed by ilog2(length).  No locking is
	done here.

	The direction d is not used by this routine.

	Returns 0 on success, or 1 if a table could not be generated.
*/
static int GetConstants(
	ConstantsSet *set,	// Structure in which to return pointers.
	int length,			// Length of vector to be transformed.
	int d				// Direction of transform.
)
{
	static CommonWeight	*commonTable = NULL;
	static int			commonLength = 0;
	static FinalWeights	*weightTables[32] = { NULL };
	static FinalIndices	*indexTables[32] = { NULL };

	// Position of this length's tables in the per-length arrays.
	const int slot = ilog2(length);

	// Extend the common weights if they do not reach this length.
	if (commonLength < length
			&& GenerateCommonWeights(&commonTable, &commonLength,
				length) != 0)
		return 1;
	set->commonWeights = commonTable;

	// The indices must exist before the final weights, which are
	// generated from them.
	if (indexTables[slot] == NULL
			&& GenerateFinalIndices(&indexTables[slot], length) != 0)
		return 1;
	set->finalIndices = indexTables[slot];

	if (weightTables[slot] == NULL
			&& GenerateFinalWeights(&weightTables[slot], length,
				indexTables[slot]) != 0)
		return 1;
	set->finalWeights = weightTables[slot];

	return 0;
}


// FFT Kernel with Reordered Loops and Separated Loop for k0=0
/*	Perform all stages of a transform of 1<<N elements.

	The first stage goes from vIn to vOut, with radix 8 if N is odd and
	radix 4 if N is even.  Every later stage is radix 4 and works in
	place in vOut.  Stages that start at n < N-4 handle block k0 = 0 with
	FFT4_0Weights and each block k0 >= 1 with FFT4_1WeightPerCall.  A
	stage at n = N-4, if there is one, is done by
	FFT4_1WeightPerIteration.  FFT4_Final does the last stage.
*/
static void FFT_Kernel(
	ComplexArray vOut,					// Address of output vector.
	ComplexArray vIn,					// Address of input vector.
	int N,								// N from mathematics.
	const CommonWeight *weights,		// Address of common weight values.
	const FinalIndices *finalIndices,	// Address of index pairs.
	const FinalWeights *finalWeights	// Address of final weight values.
)
{
	// Value of n after the first stage.
	const int nFirst = N&1 ? 3 : 2;
	// Stages starting below this n use the per-call routines.
	const int nBound = N-4;
	int n, nLower, k0;

	// First stage.
	if (N & 1)
		FFT8_0Weights(vOut, vIn, 1<<N);
	else
		FFT4_0Weights(vOut, vIn, 1<<N);

	// Block k0 = 0 of each middle stage needs no weights.
	for (n = nFirst; n < nBound; n += 2)
	{
		FFT4_0Weights(vOut, vOut, 1 << (N-n));
	}

	// Remaining blocks of the middle stages.  Each k0 is visited once
	// and carried through every stage in which it exists, which are the
	// stages with k0 < 1<<n.  k0 is not reset between values of nLower,
	// so each pass of the outer loop picks up the values of k0 that
	// first appear at stage nLower.
	k0 = 1;
	for (nLower = nFirst; nLower < nBound; nLower += 2)
	{
		const int k0End = 1 << nLower;

		for (; k0 < k0End; ++k0)
		{
			for (n = nLower; n < nBound; n += 2)
			{
				FFT4_1WeightPerCall(vOut, k0, 1 << (N-n), weights[k0]);
			}
		}
	}

	// n now holds the stage reached by the loops above.  If a stage
	// remains before the final one, do it with a new weight for each
	// block.
	if (n < N-2)
	{
		FFT4_1WeightPerIteration(vOut, 1 << (N-4), weights);
	}

	// Final stage.
	FFT4_Final(vOut, 1 << (N-2), finalIndices, finalWeights);
}


// FFT_FirstStage Prototype
/*	Perform the first stage of the multiple-stage FFT by passing the
	whole vector to FFT_Butterflies as the single block k0 = 0.  The
	weights parameter is not used in this prototype.
*/
static void FFT_FirstStage(
	int m,							// log2 of butterfly radix.
	ComplexArray vOut,				// Address of output vector.
	ComplexArray vIn,				// Address of input vector.
	int c0,							// Coefficient for c0.
	const CommonWeight weights[]	// Array of weight values.
)
{
	// The first stage has only one block.
	const int k0 = 0;

	FFT_Butterflies(m, vOut, vIn, k0, c0);
}


// FFT_PenultimateStage Prototype
/*	Perform the stage between the first and final stages of the
	multiple-stage FFT, in place in vOut.  It calls FFT_Butterflies once
	for each of the 1<<nStage blocks, with N-2-nStage as the log2 of the
	radix.  The weights parameter is not used in this prototype.
*/
static void FFT_PenultimateStage(
	ComplexArray vOut,				// Address of output vector.
	int nStage,						// n at start of stage.
	int N,							// N from mathematics.
	const CommonWeight weights[]	// Array of weight values.
)
{
	// log2 of butterfly radix for this stage.
	const int m = N-2-nStage;
	// Number of blocks, the upper bound on k0.
	const int blocks = 1 << nStage;
	// Coefficient for k0.
	const int c0 = 1 << (N-nStage);
	int k0;

	for (k0 = 0; k0 < blocks; ++k0)
	{
		FFT_Butterflies(m, vOut, vOut, k0, c0);
	}
}


// FFT_FinalStage Prototype
/*	Perform the final stage of the multiple-stage FFT, in place in vOut,
	by calling FFT_Butterflies for each of u0 blocks of four elements.
	The IndexTable and weights parameters are not used in this prototype.
*/
static void FFT_FinalStage(
	ComplexArray vOut,					// Address of output vector.
	int u0,								// Upper bound on k0.
	const FinalIndices IndexTable[],	// Array of index pairs.
	const FinalWeights weights[]		// Array of weight values.
)
{
	int k0 = 0;

	while (k0 < u0)
	{
		FFT_Butterflies(2, vOut, vOut, k0, 4);
		++k0;
	}

	// The bit-reversal permutation is not included in this implementation.
}


// Multiple-Stage Kernel
/*	Perform a transform of 1<<N elements as up to three stages.  The
	first stage covers m0 levels, with m0 being 7 if N is odd and 6 if N
	is even.  The final stage covers the last two levels.  A penultimate
	stage covers whatever levels remain between them, if any.

	This kernel leaves out the bit-reversal permutation.
*/
static void FFT_MultipleStages(
	ComplexArray vOut,					// Address of output vector.
	ComplexArray vIn,					// Address of input vector.
	int N,								// N from mathematics.
	const CommonWeight *weights,		// Address of common weight values.
	const FinalIndices *finalIndices,	// Address of index pairs.
	const FinalWeights *finalWeights	// Address of final weight values.
)
{
	// log2 of the radix of the first stage.
	const int m0 = N&1 ? 7 : 6;
	// Levels left for the penultimate stage.
	const int remaining = N-2-m0;

	FFT_FirstStage(m0, vOut, vIn, 1<<N, weights);

	if (remaining > 0)
	{
		FFT_PenultimateStage(vOut, m0, N, weights);
	}

	FFT_FinalStage(vOut, 1 << (N-2), finalIndices, finalWeights);
}


/*	This routine, FFT, provides the public interface for the FFT.  It
	obtains the tables of constants (allocating them if necessary),
	chooses a structure for the FFT, calls the kernel, and provides the
	bit-reversal permutation for versions of the kernel that do not have
	it.  The transform is done in place in re and im.

	It returns 0 on success.  It returns 1 if the request is not
	supported or the constants could not be obtained, in which case the
	vector has not been changed.
*/
extern "C" {
int FFT(
	float *re,	// Address of real components.
	float *im,	// Address of imaginary components.
	int N,		// Base-two logarithm of length of vector.
	int d		// Direction of transform.
)
{
	extern void BitReversalPermute(float *re, float *im, int TwoToTheN);

	ConstantsSet constants;
	ComplexArray v(re, im);
	int length;

	/*	This FFT does not support the reverse transform (d other than
		1), N < 4, or long vectors that overflow the field size in the
		indices.
	*/
	if (d != 1
			|| N < 4
			|| CHAR_BIT * sizeof constants.finalIndices->read + 4 < N)
		return 1;

	// N has been checked, so the length can be formed.
	length = 1<<N;

	// Get the constants.
	if (GetConstants(&constants, length, d) != 0)
		return 1;

	/*	Choose the structure by vector size.
		NOTE(review): 32768 is presumably a byte budget for the vector,
		such as a cache size -- confirm against the paper.
	*/
	if (length < 32768 / (sizeof *re + sizeof *im))
	{
		// If n is small, do the single-stage FFT.
		FFT_Kernel(v, v, N, constants.commonWeights,
			constants.finalIndices, constants.finalWeights);
	}
	else
	{
		// If n is large, do the multiple-stage FFT.
		FFT_MultipleStages(v, v, N, constants.commonWeights,
			constants.finalIndices, constants.finalWeights);

		// Supply the bit-reversal that is missing in this implementation.
		BitReversalPermute(v.re, v.im, length);
	}

	return 0;
}
}
