|
// Benchmark kernel: each thread repeatedly loads 8 consecutive float2 values,
// runs four FFT_forward_8 / TWIDDLE_8 passes over them, and stores the result.
//
// thread_id : dispatch-wide thread index; only the x component is used.
//
// Reads  g_SrcData[tid * 8 .. tid * 8 + 7].
// Writes g_DstData at 8 locations spaced `ostride` apart, with the elements
// of D emitted in 3-bit bit-reversed index order (0,4,2,6,1,5,3,7).
//
// NOTE(review): FFT_forward_8, TWIDDLE_8, COS_PI_4_16, thread_count and
// ostride are defined outside this block; their exact semantics are not
// visible here.
void Benchmark_CS1(uint3 thread_id : SV_DispatchThreadID)
{
    // Use the x component explicitly instead of relying on implicit
    // uint3 -> uint truncation (same value, no compiler warning).
    const uint tid = thread_id.x;

    // Surplus threads past the requested count do nothing.
    if (tid >= thread_count)
        return;

    float2 D[8];

    // Each thread owns a contiguous run of 8 source elements.
    const uint iaddr = tid << 3;

    // Output addressing does not depend on the loop counter, so compute it once.
    // NOTE(review): the mask only equals `tid % ostride` if ostride is a
    // power of two - confirm against the host code that sets ostride.
    const uint omod = tid & (ostride - 1);
    const uint oaddr = ((tid - omod) << 3) + omod;

    // Every iteration reloads the same inputs and rewrites the same outputs,
    // so the stored result is the same each time; presumably the repetition
    // only scales the workload for timing purposes.
    for (uint i = 0; i < 32; i++) {
        D[0] = g_SrcData[iaddr + 0];
        D[1] = g_SrcData[iaddr + 1];
        D[2] = g_SrcData[iaddr + 2];
        D[3] = g_SrcData[iaddr + 3];
        D[4] = g_SrcData[iaddr + 4];
        D[5] = g_SrcData[iaddr + 5];
        D[6] = g_SrcData[iaddr + 6];
        D[7] = g_SrcData[iaddr + 7];

        // Four identical transform + twiddle passes over the local array.
        FFT_forward_8(D);
        TWIDDLE_8(D, COS_PI_4_16);
        FFT_forward_8(D);
        TWIDDLE_8(D, COS_PI_4_16);
        FFT_forward_8(D);
        TWIDDLE_8(D, COS_PI_4_16);
        FFT_forward_8(D);
        TWIDDLE_8(D, COS_PI_4_16);

        // Strided store; source indices follow 3-bit bit-reversed order.
        g_DstData[oaddr + 0 * ostride] = D[0];
        g_DstData[oaddr + 1 * ostride] = D[4];
        g_DstData[oaddr + 2 * ostride] = D[2];
        g_DstData[oaddr + 3 * ostride] = D[6];
        g_DstData[oaddr + 4 * ostride] = D[1];
        g_DstData[oaddr + 5 * ostride] = D[5];
        g_DstData[oaddr + 6 * ostride] = D[3];
        g_DstData[oaddr + 7 * ostride] = D[7];
    }
}
蛮奇怪的代码 |
|