/*	This file contains demonstration code that supplements the paper
	_Construction of a High-Performance FFT_.

	See the paper and ReadMe.txt for more information.

	The code is formatted for tabs at four-column intervals.
*/


#include "common.h"


/*	CommonWeight

	One set of factors used by the weighted radix-4 butterflies in this
	file.  GenerateCommonWeights fills an entry for an angle x as noted on
	each member below.  The butterflies first multiply by (1 + i*tangent)
	using the "i" members and then scale by the "r" members.

	The member order is kept as w1r, w1i, w2r, w2i, w3r, w3i so that any
	positional initialization keeps its meaning.
*/
typedef struct {
	float w1r;	// cos(x).
	float w1i;	// tan(x).
	float w2r;	// cos(2x).
	float w2i;	// tan(2x).
	float w3r;	// 2*w2r - 1.
	float w3i;	// tan(3x).
} CommonWeight;


/*	Note:  This version of GenerateCommonWeights differs from the final
	version in that it generates NewLength/4 weights instead of only
	NewLength/16.  This is because this demonstration version uses
	common weights in FFT4_Final, which needs NewLength/4 weights.
	Later versions use separate weights in FFT4_Final and need only
	NewLength/16 weights for other routines.
*/

/*	GenerateCommonWeights

	Make sure the weight table at *weights supports a transform of length
	NewLength, growing the table and computing only the entries that are
	not already present.

	If the table already supports NewLength (NewLength <= *length), nothing
	is changed: the table is never shrunk, so weights computed for a longer
	transform are kept for later calls instead of being discarded and
	recomputed.

	Returns 0 on success.  Returns 1 if memory cannot be allocated, in
	which case *weights and *length are left unchanged and the existing
	table remains valid.
*/
static int GenerateCommonWeights(
	CommonWeight **weights,	// Pointer to array address.
	int *length,			// Pointer to supported length.
	int NewLength			// New length to support (1<<N).
)
{
	int k0;
	CommonWeight *p;

	// If the table already covers the new length, we are done.
	if (NewLength <= *length)
		return 0;

	// Try to allocate space and check result.  The result goes into a
	// temporary so the old table is not lost if realloc fails.
	p = (CommonWeight *) realloc(*weights, NewLength/4 * sizeof **weights);
	if (p == NULL)
		return 1;

	// Entries below *length/4 were computed by earlier calls; add the rest.
	for (k0 = *length/4; k0 < NewLength/4; ++k0)
	{
		const double x = TwoPi * r(4*k0);
		p[k0].w1r = cos(x);
		p[k0].w1i = tan(x);
		p[k0].w2r = cos(x+x);
		p[k0].w2i = tan(x+x);
		p[k0].w3r = 2. * p[k0].w2r - 1.;
		p[k0].w3i = tan(3.*x);
	}

	// Pass address and supported length back to caller.
	*weights = p;
	*length = NewLength;

	return 0;
}


/*	FFT4_0Weights

	Radix-4 butterfly pass that uses no weight values.  For each column
	below c0/4, the four elements spaced c0/4 apart are read from vIn,
	combined with additions and subtractions only, and written to the same
	positions in vOut.
*/
static void FFT4_0Weights(
	ComplexArray vOut,	// Address of output vector.
	ComplexArray vIn,	// Address of input vector.
	int c0				// Coefficient for k0.
)
{
	// Spacing between the four elements of one butterfly (c0 / 4).
	const int stride = c0 >> 2;
	int col;

	for (col = 0; col < stride; ++col)
	{
		// Positions of the four elements of this butterfly.
		const int i0 = col;
		const int i1 = stride + col;
		const int i2 = stride*2 + col;
		const int i3 = stride*3 + col;

		// Read every input before writing any output; the caller in this
		// file passes the same vector for vIn and vOut.
		const float x0r = vIn.re[i0], x0i = vIn.im[i0];
		const float x1r = vIn.re[i1], x1i = vIn.im[i1];
		const float x2r = vIn.re[i2], x2i = vIn.im[i2];
		const float x3r = vIn.re[i3], x3i = vIn.im[i3];

		// First stage: sums and differences of elements two apart.
		const float s02r = x0r + x2r, s02i = x0i + x2i;
		const float m02r = x0r - x2r, m02i = x0i - x2i;
		const float s13r = x1r + x3r, s13i = x1i + x3i;
		const float m13r = x1r - x3r, m13i = x1i - x3i;

		// Second stage.  The last two outputs combine m02 with +i*m13
		// and -i*m13.
		vOut.re[i0] = s02r + s13r;
		vOut.im[i0] = s02i + s13i;
		vOut.re[i1] = s02r - s13r;
		vOut.im[i1] = s02i - s13i;
		vOut.re[i2] = m02r - m13i;
		vOut.im[i2] = m02i + m13r;
		vOut.re[i3] = m02r + m13i;
		vOut.im[i3] = m02i - m13r;
	}
}


/*	FFT8_0Weights

	Radix-8 butterfly pass that uses no table of weights; the only
	multiplier is the constant sqrt(2)/2.  For each k2 below c0/8, the
	eight elements spaced c0/8 apart are read from vIn, combined in three
	stages (a -> b -> c -> d), and written to the same positions in vOut.

	All eight inputs are loaded before any output is stored.
*/
static void FFT8_0Weights(
	ComplexArray vOut,	// Address of output vector.
	ComplexArray vIn,	// Address of input vector.
	int c0				// Coefficient for k0.
)
{
	// Prepare a constant, sqrt(2)/2.
	const float sqrt2d2 = .7071067811865475244;
	// Coefficient for k1 is coefficient for k0 divided by 1<<m.
	const int c1 = c0 >> 3;
	int k2;
	float	a0r, a0i, a1r, a1i, a2r, a2i, a3r, a3i,
			a4r, a4i, a5r, a5i, a6r, a6i, a7r, a7i,
			b0r, b0i, b1r, b1i, b2r, b2i, b3r, b3i,
			b4r, b4i, b5r, b5i, b6r, b6i, b7r, b7i,
			c0r, c0i, c1r, c1i, c2r, c2i, c3r, c3i,
			c4r, c4i, c5r, c5i, c6r, c6i, c7r, c7i,
			d0r, d0i, d1r, d1i, d2r, d2i, d3r, d3i,
			d4r, d4i, d5r, d5i, d6r, d6i, d7r, d7i,
			t5r, t5i, t7r, t7i;

	for (k2 = 0; k2 < c1; ++k2)
	{
		// Load the eight inputs of this butterfly, spaced c1 apart.
		a0r = vIn.re[c1*0 + k2];
		a0i = vIn.im[c1*0 + k2];
		a1r = vIn.re[c1*1 + k2];
		a1i = vIn.im[c1*1 + k2];
		a2r = vIn.re[c1*2 + k2];
		a2i = vIn.im[c1*2 + k2];
		a3r = vIn.re[c1*3 + k2];
		a3i = vIn.im[c1*3 + k2];
		a4r = vIn.re[c1*4 + k2];
		a4i = vIn.im[c1*4 + k2];
		a5r = vIn.re[c1*5 + k2];
		a5i = vIn.im[c1*5 + k2];
		a6r = vIn.re[c1*6 + k2];
		a6i = vIn.im[c1*6 + k2];
		a7r = vIn.re[c1*7 + k2];
		a7i = vIn.im[c1*7 + k2];
		// First stage: combine inputs four apart.  b0-b3 are the sums,
		// b4-b7 the differences.
		b0r = a0r + a4r;			// w = 1.
		b0i = a0i + a4i;
		b1r = a1r + a5r;
		b1i = a1i + a5i;
		b2r = a2r + a6r;
		b2i = a2i + a6i;
		b3r = a3r + a7r;
		b3i = a3i + a7i;
		b4r = a0r - a4r;
		b4i = a0i - a4i;
		b5r = a1r - a5r;
		b5i = a1i - a5i;
		b6r = a2r - a6r;
		b6i = a2i - a6i;
		b7r = a3r - a7r;
		b7i = a3i - a7i;
		// Second stage: combine first-stage values two apart.  The sums
		// (b0-b3) are combined directly; the differences (b4-b7) are
		// combined with the second operand multiplied by +i or -i.
		c0r = b0r + b2r;			// w = 1.
		c0i = b0i + b2i;
		c1r = b1r + b3r;
		c1i = b1i + b3i;
		c2r = b0r - b2r;
		c2i = b0i - b2i;
		c3r = b1r - b3r;
		c3i = b1i - b3i;
		c4r = b4r - b6i;			// w = i.
		c4i = b4i + b6r;
		c5r = b5r - b7i;
		c5i = b5i + b7r;
		c6r = b4r + b6i;
		c6i = b4i - b6r;
		c7r = b5r + b7i;
		c7i = b5i - b7r;
		// Pre-rotate c5 and c7 so the third stage needs only a scaling by
		// sqrt(2)/2.  t5 is c5*(1+i).  t7 holds (c7r+c7i, c7r-c7i); the
		// signs applied to it in the d6/d7 lines below make the product
		// c7*(-1+i).
		t5r = c5r - c5i;
		t5i = c5r + c5i;
		t7r = c7r + c7i;
		t7i = c7r - c7i;
		// Third stage: combine adjacent second-stage values.
		d0r = c0r + c1r;			// w = 1.
		d0i = c0i + c1i;
		d1r = c0r - c1r;
		d1i = c0i - c1i;
		d2r = c2r - c3i;			// w = i.
		d2i = c2i + c3r;
		d3r = c2r + c3i;
		d3i = c2i - c3r;
		d4r = + t5r * sqrt2d2 + c4r;	// w = sqrt(2)/2 * (+1+i).
		d4i = + t5i * sqrt2d2 + c4i;
		d5r = - t5r * sqrt2d2 + c4r;
		d5i = - t5i * sqrt2d2 + c4i;
		d6r = - t7r * sqrt2d2 + c6r;	// w = sqrt(2)/2 * (-1+i).
		d6i = + t7i * sqrt2d2 + c6i;
		d7r = + t7r * sqrt2d2 + c6r;
		d7i = - t7i * sqrt2d2 + c6i;
		// Store the eight results at the positions the inputs came from.
		vOut.re[c1*0 + k2] = d0r;
		vOut.im[c1*0 + k2] = d0i;
		vOut.re[c1*1 + k2] = d1r;
		vOut.im[c1*1 + k2] = d1i;
		vOut.re[c1*2 + k2] = d2r;
		vOut.im[c1*2 + k2] = d2i;
		vOut.re[c1*3 + k2] = d3r;
		vOut.im[c1*3 + k2] = d3i;
		vOut.re[c1*4 + k2] = d4r;
		vOut.im[c1*4 + k2] = d4i;
		vOut.re[c1*5 + k2] = d5r;
		vOut.im[c1*5 + k2] = d5i;
		vOut.re[c1*6 + k2] = d6r;
		vOut.im[c1*6 + k2] = d6i;
		vOut.re[c1*7 + k2] = d7r;
		vOut.im[c1*7 + k2] = d7i;
	}
}


/*	FFT4_1WeightPerCall

	In-place radix-4 butterfly pass over one group of c0 elements of vOut,
	starting at element c0*k0, with a single set of weight values applied
	to every butterfly in the group.

	Each butterfly takes four elements spaced c0/4 apart.  Elements 1-3
	are first multiplied by (1 + i*w?i); the w?r factors are applied as
	the elements are combined.
*/
static void FFT4_1WeightPerCall(
	ComplexArray vOut,	// Address of output vector.
	int k0,				// k0 from equation.
	int c0,				// Coefficient for k0.
	CommonWeight weight	// Values for weight calculations.
)
{
	// Spacing between the four elements of one butterfly (c0 / 4).
	const int stride = c0 >> 2;
	// Index of the first element of the group selected by k0.
	const int group = c0*k0;
	int col;

	for (col = 0; col < stride; ++col)
	{
		// Positions of the four elements of this butterfly.
		const int i0 = group + col;
		const int i1 = group + stride + col;
		const int i2 = group + stride*2 + col;
		const int i3 = group + stride*3 + col;

		// Read all four elements before any of them is overwritten.
		const float x0r = vOut.re[i0], x0i = vOut.im[i0];
		const float x1r = vOut.re[i1], x1i = vOut.im[i1];
		const float x2r = vOut.re[i2], x2i = vOut.im[i2];
		const float x3r = vOut.re[i3], x3i = vOut.im[i3];

		// Multiply elements 1-3 by (1 + i*tangent).
		const float t1r = x1r - x1i * weight.w1i;
		const float t1i = x1i + x1r * weight.w1i;
		const float t2r = x2r - x2i * weight.w2i;
		const float t2i = x2i + x2r * weight.w2i;
		const float t3r = x3r - x3i * weight.w3i;
		const float t3i = x3i + x3r * weight.w3i;

		// First stage: pair element 0 with 2 (scaled by w2r) and element
		// 1 with 3 (scaled by w3r).
		const float u0r = x0r + t2r * weight.w2r;
		const float u0i = x0i + t2i * weight.w2r;
		const float u2r = x0r - t2r * weight.w2r;
		const float u2i = x0i - t2i * weight.w2r;
		const float u1r = t1r + t3r * weight.w3r;
		const float u1i = t1i + t3i * weight.w3r;
		const float u3r = t1r - t3r * weight.w3r;
		const float u3i = t1i - t3i * weight.w3r;

		// Second stage: scale the odd terms by w1r while combining.  The
		// last two outputs use +i*u3 and -i*u3.
		vOut.re[i0] = u0r + u1r * weight.w1r;
		vOut.im[i0] = u0i + u1i * weight.w1r;
		vOut.re[i1] = u0r - u1r * weight.w1r;
		vOut.im[i1] = u0i - u1i * weight.w1r;
		vOut.re[i2] = u2r - u3i * weight.w1r;
		vOut.im[i2] = u2i + u3r * weight.w1r;
		vOut.re[i3] = u2r + u3i * weight.w1r;
		vOut.im[i3] = u2i - u3r * weight.w1r;
	}
}


/*	FFT4_1WeightPerIteration

	In-place radix-4 butterfly pass over u0 consecutive groups of 16
	elements of vOut.  Group k0 uses weights[k0]; within a group there are
	four butterflies, each taking elements spaced 4 apart.  The arithmetic
	is the same as in FFT4_1WeightPerCall with c0 fixed at 16, but the
	loop over k0 is inside this routine.
*/
static void FFT4_1WeightPerIteration(
	ComplexArray vOut,				// Address of output vector.
	int u0,							// Upper bound on k0.
	const CommonWeight weights[]	// Array of weight values.
)
{
	int group, col;

	for (group = 0; group < u0; ++group)
	{
		// Copy the weight values for this group.
		const CommonWeight w = weights[group];
		// Index of the first element of this group.
		const int first = 16*group;

		for (col = 0; col < 4; ++col)
		{
			// Positions of the four elements of this butterfly.
			const int i0 = first + col;
			const int i1 = first + 4 + col;
			const int i2 = first + 8 + col;
			const int i3 = first + 12 + col;

			// Read all four elements before any of them is overwritten.
			const float x0r = vOut.re[i0], x0i = vOut.im[i0];
			const float x1r = vOut.re[i1], x1i = vOut.im[i1];
			const float x2r = vOut.re[i2], x2i = vOut.im[i2];
			const float x3r = vOut.re[i3], x3i = vOut.im[i3];

			// Multiply elements 1-3 by (1 + i*tangent).
			const float t1r = x1r - x1i * w.w1i;
			const float t1i = x1i + x1r * w.w1i;
			const float t2r = x2r - x2i * w.w2i;
			const float t2i = x2i + x2r * w.w2i;
			const float t3r = x3r - x3i * w.w3i;
			const float t3i = x3i + x3r * w.w3i;

			// First stage: pair element 0 with 2 (scaled by w2r) and
			// element 1 with 3 (scaled by w3r).
			const float u0r = x0r + t2r * w.w2r;
			const float u0i = x0i + t2i * w.w2r;
			const float u2r = x0r - t2r * w.w2r;
			const float u2i = x0i - t2i * w.w2r;
			const float u1r = t1r + t3r * w.w3r;
			const float u1i = t1i + t3i * w.w3r;
			const float u3r = t1r - t3r * w.w3r;
			const float u3i = t1i - t3i * w.w3r;

			// Second stage: scale the odd terms by w1r while combining.
			// The last two outputs use +i*u3 and -i*u3.
			vOut.re[i0] = u0r + u1r * w.w1r;
			vOut.im[i0] = u0i + u1i * w.w1r;
			vOut.re[i1] = u0r - u1r * w.w1r;
			vOut.im[i1] = u0i - u1i * w.w1r;
			vOut.re[i2] = u2r - u3i * w.w1r;
			vOut.im[i2] = u2i + u3r * w.w1r;
			vOut.re[i3] = u2r + u3i * w.w1r;
			vOut.im[i3] = u2i - u3r * w.w1r;
		}
	}
}


/*	FFT4_Final

	In-place radix-4 butterfly pass over u0 consecutive groups of four
	adjacent elements of vOut.  Group k0 is one butterfly and uses
	weights[k0].  The arithmetic is the same as in the other weighted
	radix-4 routines in this file.
*/
static void FFT4_Final(
	ComplexArray vOut,				// Address of output vector.
	int u0,							// Upper bound on k0.
	const CommonWeight weights[]	// Array of weight values.
)
{
	int group;

	for (group = 0; group < u0; ++group)
	{
		// Copy the weight values for this butterfly.
		const CommonWeight w = weights[group];

		// Positions of the four adjacent elements of this butterfly.
		const int i0 = 4*group;
		const int i1 = 4*group + 1;
		const int i2 = 4*group + 2;
		const int i3 = 4*group + 3;

		// Read all four elements before any of them is overwritten.
		const float x0r = vOut.re[i0], x0i = vOut.im[i0];
		const float x1r = vOut.re[i1], x1i = vOut.im[i1];
		const float x2r = vOut.re[i2], x2i = vOut.im[i2];
		const float x3r = vOut.re[i3], x3i = vOut.im[i3];

		// Multiply elements 1-3 by (1 + i*tangent).
		const float t1r = x1r - x1i * w.w1i;
		const float t1i = x1i + x1r * w.w1i;
		const float t2r = x2r - x2i * w.w2i;
		const float t2i = x2i + x2r * w.w2i;
		const float t3r = x3r - x3i * w.w3i;
		const float t3i = x3i + x3r * w.w3i;

		// First stage: pair element 0 with 2 (scaled by w2r) and element
		// 1 with 3 (scaled by w3r).
		const float u0r = x0r + t2r * w.w2r;
		const float u0i = x0i + t2i * w.w2r;
		const float u2r = x0r - t2r * w.w2r;
		const float u2i = x0i - t2i * w.w2r;
		const float u1r = t1r + t3r * w.w3r;
		const float u1i = t1i + t3i * w.w3r;
		const float u3r = t1r - t3r * w.w3r;
		const float u3i = t1i - t3i * w.w3r;

		// Second stage: scale the odd terms by w1r while combining.  The
		// last two outputs use +i*u3 and -i*u3.
		vOut.re[i0] = u0r + u1r * w.w1r;
		vOut.im[i0] = u0i + u1i * w.w1r;
		vOut.re[i1] = u0r - u1r * w.w1r;
		vOut.im[i1] = u0i - u1i * w.w1r;
		vOut.re[i2] = u2r - u3i * w.w1r;
		vOut.im[i2] = u2i + u3r * w.w1r;
		vOut.re[i3] = u2r + u3i * w.w1r;
		vOut.im[i3] = u2i - u3r * w.w1r;
	}
}


/*	FFT Kernel Using Specialized Butterfly Routines

	Runs the passes of the transform in order:

		1.	One pass without weights from vIn to vOut: radix-8 when N is
			odd, radix-4 when N is even.
		2.	For passes 1 through P-3, FFT4_1WeightPerCall once per group,
			with 1<<n[pass] groups of 1<<(N-n[pass]) elements each.
		3.	If the pass counter is still below P-1 after step 2, one
			FFT4_1WeightPerIteration pass over groups of 16 elements.
		4.	FFT4_Final over groups of four elements.

	Steps 2-4 work in place on vOut.
*/
static void FFT_Kernel(
	ComplexArray vOut,					// Address of output vector.
	ComplexArray vIn,					// Address of input vector.
	int *n,								// n's from mathematics.
	int N,								// N from mathematics.
	int P,								// P from mathematics.
	const CommonWeight *weights			// Address of weight values.
)
{
	int pass, group;

	// First pass: the only one that reads vIn, and it needs no weights.
	if ((N & 1) != 0)
		FFT8_0Weights(vOut, vIn, 1 << N);
	else
		FFT4_0Weights(vOut, vIn, 1 << N);

	// Middle passes: one call per group, each with its own weight.
	pass = 1;
	while (pass < P-2)
	{
		for (group = 0; group < (1 << n[pass]); ++group)
			FFT4_1WeightPerCall(vOut, group, 1 << (N - n[pass]),
				weights[group]);
		++pass;
	}

	// Next-to-last pass.  The pass counter is 1 if the loop above did not
	// run, so this is skipped when P is 2.
	if (pass < P-1)
		FFT4_1WeightPerIteration(vOut, 1 << (N-4), weights);

	// Last pass.
	FFT4_Final(vOut, 1 << (N-2), weights);
}


/*	This routine, FFT, provides the public interface for the FFT.  It
	allocates necessary memory, chooses a structure for the FFT, calls
	the kernel, and provides the bit-reversal permutation for versions
	of the kernel that do not have it.
*/
extern "C" {
/*	FFT

	Parameters:
		re, im	Real and imaginary components of the vector.  The kernel
				is run with this vector as both input and output, and the
				result is then permuted by BitReversalPermute.
		N		Base-two logarithm of the vector length; must be at
				least 4.
		d		Direction of transform; only 1 is supported.

	Returns 0 on success.  Returns 1 if d or N is unsupported or if the
	weight table cannot be allocated.  If the array describing the pass
	structure cannot be allocated, a message is written to stderr and the
	process exits with status 1.

	The weight table is held in static storage and reused by later calls.
*/
int FFT(
	float *re,	// Address of real components.
	float *im,	// Address of imaginary components.
	int N,		// Base-two logarithm of length of vector.
	int d		// Direction of transform.
)
{
	extern void BitReversalPermute(float *re, float *im, int TwoToTheN);

	static CommonWeight *CommonWeights = NULL;
	static int length = 0;

	int P;

	ComplexArray v(re, im);
	int i, *n;

	// This FFT does not support the reverse transform.
	if (d != 1)
		return 1;

	// This FFT does not support N < 4.
	if (N < 4)
		return 1;

	// Start structuring the FFT by choosing how many passes we will have.
	P = N/2;

	n = (int *) malloc((P+1) * sizeof *n);
	if (n == NULL) {
		fprintf(stderr, "Error allocating memory.\n");
		exit(1);
	}

	// Structure the FFT by choosing the values of n[p].
	n[0] = 0;
	// Start with a radix-8 or radix-4 pass.
	n[1] = N & 1 ? 3 : 2;
	// Use radix-4 passes for all passes after the first.
	for (i = 2; i <= P; ++i)
		n[i] = n[i-1] + 2;

	// Get the common weights.
	if (0 != GenerateCommonWeights(&CommonWeights, &length, 1<<N))
	{
		// Release the pass structure so this error path does not leak it.
		free(n);
		return 1;
	}

	// Do the FFT!
	FFT_Kernel(v, v, n, N, P, CommonWeights);

	// Do the bit-reversal because it is not in this FFT kernel.
	// Convert bit-reversed v[N] to desired mathematical output H.
	BitReversalPermute(re, im, 1<<N);

	free(n);

	return 0;
}
}
