高质量静态稳定的TAA改进

2022年4月27日实时渲染

在被TAA中的静态Flickering折磨了两个月后，终于决定花一些时间来修复这些问题。

最后给出源码。

目标

高质量的采样，理想情况下比拟16xSSAA。
精确的Clamp，不能出现一点残影。
静态稳定性，大量减少静态场景的Flickering。
最好的性能，不引入额外的RT。
锐利的输出。

实现目标#1

对于目标1，16x的SSAA意味着TAA的抖动周期至少为16帧。但通常来说，越大的帧周期，画面稳定性越差，并且对帧率的要求更高。

因此，直接设Halton序列的周期为16是比较好的选择。

实现目标#2

我测试了很多Clamp方法，目前最优秀的还是AMD的Variance Clamp做法，他们在算方差前就提前做好了Tonemapper, 最后Clamp出来的效果非常的准确。

实现目标#3和#4

我大量的时间都花在这里了，所以会更加详细的说明思路。

静态Flickering是因为Halton序列的随机抖动导致远处的微小几何体只在随机的某几帧被光栅化，时有时无，因此它们的贡献很容易被History Clamp剔除，最终表现为静态画面下的Flickering。

治标做法是给场景做好网格LOD。

治本做法，标记出这些Flickering的像素，将它们锁住，并增加它们在Clamp时Box的Size，或者根本就不Clamp。

这种做法其实来自AMD的FSR2.0:

思路很清晰但做起来付出的代价却很高。

首先排除LockPixel自己的RT消耗，我们如果要准确标记出Lock Pixel，我们得确保满足如下条件：

这部分像素在上一帧和这一帧没有移动（速度为0）。
这部分像素的深度和历史帧深度存在突变。
这部分像素的亮度和历史帧亮度存在突变。

条件#1确保静态的像素才会被锁住。由于TAA每帧在一个像素内Jitter，所以判断两帧内是否移动，得采样3x3范围内的速度缓冲做判断，并且还得保持历史帧速度缓冲。

条件#2和条件#3同样得在3x3范围内做，并且这个突变的范围需要设置得非常的微妙，需要耐心调整。

我反正是做着做着要吐了。

我最终的做法：

其实调整BlendFactor为一个较低的值可以很好的抑制闪烁。但它会带来拖影。

所以在History.w中存入一个Lerp Factor，根据速度来调整clamp的box size。

这样，不会引入任何额外的RT。

实现目标#5

AMD的CAS是最好的锐化算法了，用在这里非常合适。

我这边实现了LDS优化的版本。一起放在代码里。细节详见如下：

#Pass 0: TAAMain:

#version 460

#include "Common.glsl"

// Temporal Anti-Alias resolve pass (compute).
// Work-group edge length in pixels; also used below to size the shared-memory tile.
#define LOCAL_SIZE_XY 16
layout (local_size_x = LOCAL_SIZE_XY, local_size_y = LOCAL_SIZE_XY, local_size_z = 1) in;

// Descriptor-set index consumed by ViewData.glsl (provides `viewData`, used for camInfo).
#define VIEW_DATA_SET 0
#include "ViewData.glsl"

// Descriptor-set index consumed by FrameData.glsl.
#define FRAME_DATA_SET 2
#include "FrameData.glsl"

// Output of this pass: rgb = resolved colour (still Reinhard-tonemapped), a = velocity lerp factor.
layout (set = 1, binding = 0, rgba16f) uniform image2D outHdrColor;
// Scene depth; fetched per texel and passed through linearizeDepth() before being cached.
layout (set = 1, binding = 1) uniform sampler2D inDepth;  
// History buffer: rgb is sampled with a Catmull-Rom filter, a carries the velocity lerp factor.
layout (set = 1, binding = 2) uniform sampler2D inHistory; 
// Velocity buffer; .xy is subtracted from the pixel uv to reproject into the history.
layout (set = 1, binding = 3) uniform sampler2D inVelocity;
// Current-frame colour; its size defines the working resolution of the pass.
layout (set = 1, binding = 4) uniform sampler2D inHdrColor;

// Push-constant payload supplied by the host for the TAA resolve pass.
struct TAAPushConstant
{
    // Non-zero on the first rendered frame; queried through IsFirstFrame().
    uint firstRender;
    // Non-zero flag queried through IsCamMove(); presumably set while the camera moves - confirm on the host side.
    uint camMove;
};

// Push-constant block exposing the payload as `pushConstant`.
layout(push_constant) uniform block
{
	TAAPushConstant pushConstant;
};

// True when the host flagged this dispatch as the first rendered frame.
bool IsFirstFrame()
{
    const uint flag = pushConstant.firstRender;
    return flag > 0u;
}

// True when the host set the camera-move flag for this dispatch.
bool IsCamMove()
{
    const uint flag = pushConstant.camMove;
    return flag > 0u;
}

// The TAA jitter moves samples by up to one pixel, so neighbourhood queries
// use a full 3x3 footprint to stay safe.
// Offsets of the nine taps, centre included (index 4).
const ivec2 kPattern3x3[9] = {
    ivec2(-1,-1),
    ivec2(-1, 0),
    ivec2(-1, 1),
    ivec2( 0, 1),
    ivec2( 0, 0),
    ivec2( 0,-1),
    ivec2( 1, 1),
    ivec2( 1, 0),
    ivec2( 1,-1)
};
// Reciprocal of the tap count (1/9). Not referenced by the code visible in this shader.
const float kRpc9 = 1.0f / 9.0f;

// The shared-memory tile keeps a one-pixel border around the work group so
// that 3x3 taps of edge pixels stay inside the tile.
const int kBorderSize = 1;
// Work-group edge length in pixels.
const int kGroupSize  = LOCAL_SIZE_XY;
// Tile edge length including the border on both sides.
const int kLdsLength  = kGroupSize + kBorderSize * 2;
// Total number of cells in the tile.
const int kLdsArea    = kLdsLength * kLdsLength;

// Small epsilon added to a denominator in main() to avoid dividing by zero.
const float kTinyFloat   = 1e-8f;
// NOTE(review): neither constant below is referenced in the visible code, and main()
// clamps against the literal 65504.0f instead - confirm whether these values are intended.
const float kMaxFloat16  = 32767.0f;
const float kMaxFloat16u = 65535.0f;

// Per-work-group caches, indexed [x][y] in tile coordinates (border included).
// sharedColor holds Reinhard-tonemapped colour, sharedDepth holds linearizeDepth() output.
shared vec3  sharedColor[kLdsLength][kLdsLength];
shared float sharedDepth[kLdsLength][kLdsLength];

// Reinhard tonemap applied per channel: x -> x / (x + 1).
vec3 reinhard(vec3 hdr)
{
    const vec3 denominator = hdr + 1.0f;
    return hdr / denominator;
}

// Inverse of reinhard(): x -> x / (1 - x), with the denominator floored at 1e-5
// so inputs at or above 1.0 do not divide by zero.
vec3 reinhardInverse(in vec3 sdr)
{
    const vec3 headroom = max(1.0f - sdr, 1e-5f);
    return sdr / headroom;
}

// Reads the cached (tonemapped) colour for a position given in work-group
// coordinates; the border offset is applied here, so -1..kGroupSize is valid.
vec3 ldsLoadColor(ivec2 GId)
{
    const ivec2 cell = GId + ivec2(kBorderSize);
    return sharedColor[cell.x][cell.y];
}

// Writes a colour into the shared tile after Reinhard tonemapping.
// GId is a tile coordinate (border already included), unlike ldsLoadColor().
void ldsStoreColor(ivec2 GId, vec3 color)
{
    const vec3 tonemapped = reinhard(color);
    sharedColor[GId.x][GId.y] = tonemapped;
}

// Reads the cached depth for a position given in work-group coordinates;
// the border offset is applied here.
float ldsLoadDepth(ivec2 GId)
{
    const ivec2 cell = GId + ivec2(kBorderSize);
    return sharedDepth[cell.x][cell.y];
}

// Writes a depth value into the shared tile. GId is a tile coordinate
// (border already included).
void ldsStoreDepth(ivec2 GId, float depth)
{
    const int ix = GId.x;
    const int iy = GId.y;
    sharedDepth[ix][iy] = depth;
}

// Weighted luminance (0.299, 0.587, 0.114), floored at 1e-5 so callers can
// divide by the result.
float luminance(vec3 color)
{
    const vec3 weights = vec3(0.299f, 0.587f, 0.114f);
    const float y = dot(color, weights);
    return max(y, 1e-5f);
}

// Fills one tile cell with the colour and linearised depth of one texel.
// GId:  destination tile coordinate (border included).
// TId:  source texel; clamped into [0, size - 1] so border cells replicate the image edge.
// size: working resolution in texels.
void storeColorDepth(ivec2 GId, ivec2 TId, ivec2 size)
{
    const ivec2 texel = clamp(TId, ivec2(0,0), size - ivec2(1,1));

    // Colour is tonemapped inside ldsStoreColor().
    const vec3 hdr = texelFetch(inHdrColor, texel, 0).rgb;
    ldsStoreColor(GId, hdr);

    // Depth is cached after linearizeDepth(); camInfo.z / camInfo.w are forwarded
    // unchanged (presumably near/far planes - confirm in ViewData.glsl).
    const float rawDepth = texelFetch(inDepth, texel, 0).r;
    ldsStoreDepth(GId, linearizeDepth(rawDepth, viewData.camInfo.z, viewData.camInfo.w));
}

// Cooperative fill of the shared tile. The first kLdsArea / 4 invocations each
// load four cells, one from every quarter of the flattened tile.
// topLeft:    texel that maps to tile cell (0, 0).
// workSize:   working resolution, forwarded for edge clamping.
// groupIndex: flattened local invocation index.
void prepareLds(ivec2 topLeft, ivec2 workSize, int groupIndex)
{
    // Only a quarter of the tile's cell count takes part.
    if (groupIndex >= (kLdsArea >> 2))
    {
        return;
    }

    // Start of each quarter in the flattened tile.
    const int quarterStart[4] = int[4](
        0,
        kLdsArea >> 2,
        kLdsArea >> 1,
        kLdsArea * 3 / 4);

    for (int q = 0; q < 4; ++q)
    {
        const int flat   = groupIndex + quarterStart[q];
        const ivec2 cell = ivec2(flat % kLdsLength, flat / kLdsLength);
        storeColorDepth(cell, topLeft + cell, workSize);
    }
}

// Closest-depth test on cached linear z: when the sample at `pos` is strictly
// smaller than the running minimum, both the minimum and its position are replaced.
void depthGetClosest(ivec2 pos, inout float cloestDepth, inout ivec2 cloestPos)
{
    const float candidate = ldsLoadDepth(pos);
    const bool  bNearer   = candidate < cloestDepth;

    cloestDepth = bNearer ? candidate : cloestDepth;
    cloestPos   = bNearer ? pos       : cloestPos;
}

// Searches the 3x3 neighbourhood of a pixel for the smallest cached depth and
// fetches the velocity stored at that position.
// groupPos: pixel position in work-group coordinates (0 .. kGroupSize - 1).
// topLeft:  texel mapped to tile cell (0, 0), i.e. the work-group origin minus the border.
// velocity: receives inVelocity.xy at the selected position.
// Returns the selected depth, or the initial 1.0 if no tap was smaller.
// NOTE(review): the search starts at 1.0 while the cache holds linearizeDepth()
// output; this only works if that function returns values below 1 - confirm.
float velocitySample3x3Closest(ivec2 groupPos, ivec2 topLeft, out vec2 velocity)
{
    float minDepth = 1.0f;
    ivec2 minPos   = groupPos;

    for (int tap = 0; tap < 9; ++tap)
    {
        depthGetClosest(groupPos + kPattern3x3[tap], minDepth, minPos);
    }

    // minPos is in work-group coordinates while topLeft addresses tile cell (0, 0),
    // so the border has to be added back to land on the matching texel.
    // Border taps may lie outside the image; clamp like the tile fill does.
    const ivec2 lastTexel     = textureSize(inVelocity, 0) - ivec2(1);
    const ivec2 velocityTexel = clamp(topLeft + minPos + ivec2(kBorderSize), ivec2(0), lastTexel);

    velocity = texelFetch(inVelocity, velocityTexel, 0).xy;

    return minDepth;
}

// Catmull-Rom filtering with nine bilinear fetches: the two inner taps of each
// axis are merged into one bilinear sample, the outer taps are sampled directly.
// sTex:       texture bound with a linear, clamping sampler.
// uv:         sample position in normalised coordinates.
// resolution: working render-target resolution in texels.
// Returns the filtered rgb, floored at zero because the kernel has negative lobes.
vec3 catmullRom9Sample(sampler2D sTex, vec2 uv, vec2 resolution)
{
    const vec2 texelPos = uv * resolution;
    const vec2 centre   = floor(texelPos - 0.5f) + 0.5f;

    // Fractional offset from the centre texel.
    const vec2 t = texelPos - centre;

    // Catmull-Rom weights of the four taps per axis.
    const vec2 wOuterLo = t * (-0.5f + t * (1.0f - 0.5f * t));
    const vec2 wInnerLo = 1.0f + t * t * (-2.5f + 1.5f * t);
    const vec2 wInnerHi = t * (0.5f + t * (2.0f - 1.5f * t));
    const vec2 wOuterHi = t * t * (-0.5f + 0.5f * t);

    // Merge the inner taps: combined weight and the offset bilinear filtering needs.
    const vec2 wInner      = wInnerLo + wInnerHi;
    const vec2 innerOffset = wInnerHi / (wInnerLo + wInnerHi);

    // Sample positions (normalised) and weights, ordered low / inner / high.
    vec2 tapUv[3];
    tapUv[0] = (centre - 1.0f) / resolution;
    tapUv[1] = (centre + innerOffset) / resolution;
    tapUv[2] = (centre + 2.0f) / resolution;

    vec2 tapW[3];
    tapW[0] = wOuterLo;
    tapW[1] = wInner;
    tapW[2] = wOuterHi;

    vec3 sum = vec3(0.0f);
    for (int row = 0; row < 3; ++row)
    {
        for (int col = 0; col < 3; ++col)
        {
            const vec2 sampleUv = vec2(tapUv[col].x, tapUv[row].y);
            sum += textureLod(sTex, sampleUv, 0).xyz * tapW[col].x * tapW[row].y;
        }
    }

    return max(sum, vec3(0.0f));
}

// TAA resolve entry point, one invocation per output pixel.
// Steps: fill the shared tile, pick the closest-depth velocity, reproject into the
// history, variance-clamp the history against the 3x3 neighbourhood, then blend.
// Colours stay Reinhard-tonemapped in the output; alpha carries the velocity lerp
// factor that widens the clamp box over time for static pixels.
void main() 
{
    ivec2 workSize = textureSize(inHdrColor, 0).xy;
    ivec2 topLeft  = ivec2(gl_WorkGroupID.xy) * kGroupSize - kBorderSize;
    ivec2 groupPos = ivec2(gl_LocalInvocationID.xy);
    int groupIndex = int(gl_LocalInvocationIndex);

    // Every invocation must reach the barrier, so the bounds check comes afterwards.
    prepareLds(topLeft, workSize, groupIndex);

    groupMemoryBarrier();
    barrier();

    ivec2 pixelPos = ivec2(gl_GlobalInvocationID.xy);
    if (pixelPos.x >= workSize.x || pixelPos.y >= workSize.y)
    {
        return; 
    }

    vec2 texelSize = 1.0f / vec2(workSize);
    vec2 uv        = (vec2(pixelPos) + vec2(0.5)) * texelSize; 

    // Velocity of the closest sample in the 3x3 neighbourhood.
    vec2 velocity;
    const float cloestDepth = velocitySample3x3Closest(groupPos, topLeft, velocity);
    // NOTE(review): cloestDepth comes from the linearised depth cache but is compared
    // with BG_DEPTH here - confirm both are in the same depth space.
    const bool  bSky = cloestDepth <= BG_DEPTH;

    // Reprojected position in the history buffer.
    vec2 reprojectedUV = uv - velocity;

    // Current colour (tonemapped when the tile was filled).
    vec3 colorIn = ldsLoadColor(groupPos);

    // The history is not read at all on the first frame: previously only the
    // alpha read was skipped while rgb was still sampled from undefined content.
    const bool bHistoryValid = !IsFirstFrame();

    float velocityLerp = 0.0f;
    vec3  colorHistory = colorIn;
    if(bHistoryValid)
    {
        // Explicit lod 0: implicit derivatives do not exist in compute shaders.
        velocityLerp = textureLod(inHistory, reprojectedUV, 0.0f).w;

        // History rgb is tonemapped here so it matches the cached neighbourhood.
        colorHistory = reinhard(catmullRom9Sample(inHistory, reprojectedUV, vec2(workSize)));
    }

    // Clamp box size in standard deviations: 0.5 for moving or sky pixels, growing
    // towards ideaStaticBoxSize for pixels whose velocity length is near zero.
    const float ideaStaticBoxSize = 2.5f;
    float staticBoxSize = IsCamMove() ? ideaStaticBoxSize : mix(0.5f, ideaStaticBoxSize, velocityLerp);
    float boxSize = mix(0.5f, staticBoxSize, bSky ? 0.0f : smoothstep(0.02f, 0.0f, length(velocity)));

    // Variance clamp: weighted mean and deviation of the 3x3 neighbourhood.
    vec3 clampHistory;
    {
        float wsum = 0.0f;
        vec3 vsum  = vec3(0.0f, 0.0f, 0.0f);
        vec3 vsum2 = vec3(0.0f, 0.0f, 0.0f);

        for (int y = -1; y <= 1; ++y)
        {
            for (int x = -1; x <= 1; ++x)
            {
                const vec3 neigh = ldsLoadColor(groupPos + ivec2(x, y));
                // Gaussian-like falloff with squared distance from the centre.
                const float w = exp(-0.75f * (x * x + y * y));

                vsum2 += neigh * neigh * w;
                vsum  += neigh * w;
                wsum  += w;
            }
        }

        const vec3 ex  = vsum / wsum;
        const vec3 ex2 = vsum2 / wsum;
        // max() guards against slightly negative variance from rounding.
        const vec3 dev = sqrt(max(ex2 - ex * ex, 0.0f));

        vec3 nmin = ex - dev * boxSize;
        vec3 nmax = ex + dev * boxSize;
        clampHistory = clamp(colorHistory, nmin, nmax);
    }

    // Blend weight of the current frame. The base value is used for sky pixels and
    // pixels without motion; moving pixels lerp towards a velocity-driven weight.
    const float ideaLerpFactor = 0.01f;
    float blendFactor = ideaLerpFactor;
    {   
        const float threshold   = 0.5f;
        const float base        = 0.5f;
        const float gather      = 0.1666f;

        // Velocity magnitude in pixels; shared by both uses below.
        const float pixelVelMag = length(velocity * vec2(workSize));

        // Sub-pixel flicker reduction.
        // NOTE(review): cloestDepth is already linearizeDepth() output and is passed
        // through linearizeDepth() a second time here - confirm this is intended.
        float depth = linearizeDepth(cloestDepth, viewData.camInfo.z, viewData.camInfo.w);
        float texelVelMag    = pixelVelMag * depth;
        float subpixelMotion = clamp(threshold / (texelVelMag + kTinyFloat), 0.0f, 1.0f);

        // Weight for moving content.
        float dynamicBlendFactor = texelVelMag * base + subpixelMotion * gather;

        // Reduce the weight when current and history luminance differ strongly.
        float luminanceHistory = luminance(clampHistory);
        float luminanceCurrent = luminance(colorIn);
        float unbiasedDifference = abs(luminanceCurrent - luminanceHistory) / ((max(luminanceCurrent, luminanceHistory) + 0.3));
        dynamicBlendFactor *= 1.0 - unbiasedDifference;

        dynamicBlendFactor = clamp(dynamicBlendFactor, 0.0f, 0.4f);

        // Reaches 1 once the pixel moves 0.2 pixels or more.
        float lerpFactor = clamp(pixelVelMag * 5.0f, 0.0f, 1.0f);

        blendFactor = bSky ? blendFactor : mix(blendFactor, dynamicBlendFactor, lerpFactor);

        // Any motion above the threshold restarts the accumulated lerp factor.
        velocityLerp = lerpFactor > 0.01f ? 0.0f : velocityLerp;

        // Otherwise it moves a small step towards 1 each frame.
        velocityLerp = mix(velocityLerp, 1.0f, ideaLerpFactor);
    }

    // Without valid history the current frame is written unchanged.
    vec3 colorResolve = bHistoryValid ? mix(clampHistory, colorIn, blendFactor) : colorIn;

    // Keep the value representable in the rgba16f target.
    colorResolve = min(vec3(65504.0f), colorResolve);

    imageStore(outHdrColor, pixelPos, vec4(colorResolve, velocityLerp));
}

#Pass 1: TAASharpen:

#version 460
// Work-group edge length in pixels; also used below to size the shared-memory tile.
#define LOCAL_SIZE_XY 16
layout (local_size_x = LOCAL_SIZE_XY, local_size_y = LOCAL_SIZE_XY, local_size_z = 1) in;

// Output: sharpened colour after the inverse Reinhard tonemap, alpha = 1.
layout (set = 0, binding = 0,rgba16f) uniform image2D hdrImage;

// Also written by this pass: unsharpened colour after the inverse tonemap, alpha kept.
// Presumably the history read by the TAA resolve pass next frame - confirm on the host side.
layout (set = 0, binding = 1,rgba16f) uniform image2D historyImage;
// Input: result of the TAA resolve pass; its size defines the working resolution.
layout (set = 0, binding = 2,rgba16f) uniform image2D inTAAImage;

// Push-constant payload supplied by the host for the sharpen pass.
struct TAASharpenPushConstant
{
    // One of the SHARPEN_* values; selects the filter applied in main().
    uint  sharpenMethod;
    // Strength passed to AMDCASFilter(), which clamps it to [0, 1].
    float sharpness;
};

// Push-constant block exposing the payload as `pushConstant`.
layout(push_constant) uniform block
{
	TAASharpenPushConstant pushConstant;
};

// Sharpen method ids; must stay in sync with the values on the C++ side.
// No sharpening: the centre colour is written unchanged.
#define SHARPEN_OFF        0

// Luma unsharp mask (ApplySharpening). Author's note: bloom flickers with this mode.
#define SHARPEN_RESPONSIVE 1

// AMD CAS (AMDCASFilter). Author's note: bloom stays stable with this mode.
#define SHARPEN_CAS        2

// Width of the border kept around the work group so 3x3 taps stay inside the tile.
const int kBorderSize = 1;
// Work-group edge length in pixels.
const int kGroupSize  = LOCAL_SIZE_XY;
// Tile edge length including the border on both sides.
const int kLdsLength  = kGroupSize + kBorderSize * 2;
// Total number of cells in the tile.
const int kLdsArea    = kLdsLength * kLdsLength;

// Per-work-group colour cache, indexed [x][y] in tile coordinates (border included).
shared vec4 sharedColor[kLdsLength][kLdsLength];

// Reads the cached colour for a position given in work-group coordinates;
// the border offset is applied here, so -1..kGroupSize is valid.
vec4 ldsLoadColor(ivec2 GId)
{
    const ivec2 cell = GId + ivec2(kBorderSize);
    return sharedColor[cell.x][cell.y];
}

// Writes a colour into the shared tile unchanged. GId is a tile coordinate
// (border already included).
void ldsStoreColor(ivec2 GId, vec4 color)
{
    const int ix = GId.x;
    const int iy = GId.y;
    sharedColor[ix][iy] = color;
}

// Fills one tile cell from the TAA result.
// GId:  destination tile coordinate (border included).
// TId:  source texel; clamped into [0, size - 1] so border cells replicate the image edge.
// size: working resolution in texels.
void storeColor(ivec2 GId, ivec2 TId, ivec2 size)
{
    const ivec2 texel = clamp(TId, ivec2(0,0), size - ivec2(1,1));
    const vec4 taaColor = imageLoad(inTAAImage, texel);
    ldsStoreColor(GId, taaColor);
}

// Cooperative fill of the shared tile. The first kLdsArea / 4 invocations each
// load four cells, one from every quarter of the flattened tile.
// topLeft:    texel that maps to tile cell (0, 0).
// workSize:   working resolution, forwarded for edge clamping.
// groupIndex: flattened local invocation index.
void prepareLds(ivec2 topLeft, ivec2 workSize, int groupIndex)
{
    // Only a quarter of the tile's cell count takes part.
    if (groupIndex >= (kLdsArea >> 2))
    {
        return;
    }

    // Start of each quarter in the flattened tile.
    const int quarterStart[4] = int[4](
        0,
        kLdsArea >> 2,
        kLdsArea >> 1,
        kLdsArea * 3 / 4);

    for (int q = 0; q < 4; ++q)
    {
        const int flat   = groupIndex + quarterStart[q];
        const ivec2 cell = ivec2(flat % kLdsLength, flat / kLdsLength);
        storeColor(cell, topLeft + cell, workSize);
    }
}

// Smallest of three scalars.
float min3x(float a, float b, float c)
{
    const float firstPair = min(a, b);
    return min(firstPair, c);
}

// Largest of three scalars.
float max3x(float a, float b, float c)
{
    const float firstPair = max(a, b);
    return max(firstPair, c);
}

// Inverse Reinhard tonemap: x -> x / (1 - x), with the denominator floored at
// 1e-5 so inputs at or above 1.0 do not divide by zero.
vec3 reinhardInverse(in vec3 sdr)
{
    const vec3 headroom = max(1.0f - sdr, 1e-5f);
    return sdr / headroom;
}

// Converts RGB to YCoCg; the result is (Y, Co, Cg) in (x, y, z).
vec3 RGBToYCoCg(in vec3 rgb)
{
    const float luma         =  0.25f * rgb.r + 0.5f * rgb.g + 0.25f * rgb.b;
    const float chromaOrange =  0.5f * rgb.r - 0.5f * rgb.b;
    const float chromaGreen  = -0.25f * rgb.r + 0.5f * rgb.g - 0.25f * rgb.b;

    return vec3(luma, chromaOrange, chromaGreen);
}

// Converts (Y, Co, Cg) back to RGB; inverse of RGBToYCoCg().
vec3 YCoCgToRGB(in vec3 yCoCg)
{
    const float luma         = yCoCg.x;
    const float chromaOrange = yCoCg.y;
    const float chromaGreen  = yCoCg.z;

    const float red   = luma + chromaOrange - chromaGreen;
    const float green = luma + chromaGreen;
    const float blue  = luma - chromaOrange - chromaGreen;

    return vec3(red, green, blue);
}

// Luma-only unsharp mask over the 4-neighbour cross of a pixel.
// groupPos: pixel position in work-group coordinates.
// Returns the sharpened colour in the same (tonemapped) space as the cached input.
vec3 ApplySharpening(ivec2 groupPos)
{
    // Cross neighbours, in the order their luma is subtracted.
    const ivec2 crossOffsets[4] = ivec2[4](
        ivec2( 0,  1),
        ivec2( 0, -1),
        ivec2( 1,  0),
        ivec2(-1,  0));

    vec3 centreYCoCg = RGBToYCoCg(ldsLoadColor(groupPos + ivec2( 0,  0)).xyz);

    // Laplacian of luma: 4 * centre minus the four neighbours.
    float lumaMask = 4.0f * centreYCoCg.x;
    for (int n = 0; n < 4; ++n)
    {
        const vec3 neighbour = ldsLoadColor(groupPos + crossOffsets[n]).xyz;
        lumaMask -= RGBToYCoCg(neighbour).x;
    }

    // Add a quarter of the mask, but never raise luma by more than 10 percent.
    const float sharpened = centreYCoCg.x + 0.25f * lumaMask;
    centreYCoCg.x = min(sharpened, 1.1f * centreYCoCg.x);

    return YCoCgToRGB(centreYCoCg);
}

// AMD Contrast Adaptive Sharpening on the cached 3x3 neighbourhood.
// sharpness: strength, clamped to [0, 1] (0 gives peak -1/8, 1 gives peak -1/5).
// groupPos:  pixel position in work-group coordinates.
// Returns the sharpened colour clamped to [0, 1]; the cached input is expected to be
// in a [0, 1] range as well (main() treats it as Reinhard-tonemapped).
vec3 AMDCASFilter(float sharpness, ivec2 groupPos)
{
    // Neighbourhood layout:
    // a b c
    // d e f
    // g h i
    const vec3 a = ldsLoadColor(groupPos + ivec2(-1, -1)).xyz;
    const vec3 b = ldsLoadColor(groupPos + ivec2( 0, -1)).xyz;
    const vec3 c = ldsLoadColor(groupPos + ivec2( 1, -1)).xyz;
    const vec3 d = ldsLoadColor(groupPos + ivec2(-1,  0)).xyz;
    const vec3 e = ldsLoadColor(groupPos + ivec2( 0,  0)).xyz;
    const vec3 f = ldsLoadColor(groupPos + ivec2( 1,  0)).xyz;
    const vec3 g = ldsLoadColor(groupPos + ivec2(-1,  1)).xyz;
    const vec3 h = ldsLoadColor(groupPos + ivec2( 0,  1)).xyz;
    const vec3 i = ldsLoadColor(groupPos + ivec2( 1,  1)).xyz;

    // Per-channel min / max of the cross (b d e f h) plus the same including the
    // diagonals; summing both gives the soft min / max with a range of [0, 2].
    const vec3 mnCross = min(min(min(d, e), min(f, b)), h);
    const vec3 mnBox   = min(min(mnCross, min(a, c)), min(g, i));
    const vec3 mn      = mnCross + mnBox;

    const vec3 mxCross = max(max(max(d, e), max(f, b)), h);
    const vec3 mxBox   = max(max(mxCross, max(a, c)), max(g, i));
    const vec3 mx      = mxCross + mxBox;

    // Guarded reciprocal: a fully black neighbourhood has mx == 0, and an exact
    // 1 / 0 would turn the product below into 0 * inf = NaN.
    const vec3 rcpM = 1.0f / max(mx, vec3(1e-5f));

    // Sharpening amount, shaped with a square root.
    const vec3 amp = sqrt(clamp(min(mn, vec3(2.0f) - mx) * rcpM, 0.0f, 1.0f));

    // Filter shape:
    //  0 w 0
    //  w 1 w
    //  0 w 0
    const float peak = - 1.0f / mix(8.0f, 5.0f, clamp(sharpness, 0.0f, 1.0f));
    const vec3 w = amp * peak;

    // w lies in [-1/5, 0], so the normalisation term stays within [0.2, 1].
    const vec3 rcpWeight = 1.0f / (1.0f + 4.0f * w);

    return clamp((b * w + d * w + f * w + h * w + e) * rcpWeight, 0.0f, 1.0f);
}

// Sharpen pass entry point, one invocation per pixel.
// Writes two images from the TAA result: the unsharpened colour (alpha kept) to
// historyImage and the optionally sharpened colour to hdrImage. Both are stored
// after the inverse Reinhard tonemap.
void main()
{
    ivec2 workSize = imageSize(inTAAImage);
    ivec2 topLeft  = ivec2(gl_WorkGroupID.xy) * kGroupSize - kBorderSize;
    ivec2 groupPos = ivec2(gl_LocalInvocationID.xy);
    int groupIndex = int(gl_LocalInvocationIndex);

    // Every invocation must reach the barrier, so the bounds check comes afterwards.
    prepareLds(topLeft, workSize, groupIndex);

    groupMemoryBarrier();
    barrier();

    const ivec2 pixelPos = ivec2(gl_GlobalInvocationID.xy);
    if (pixelPos.x >= workSize.x || pixelPos.y >= workSize.y)
    {
        return; 
    }

    // Largest finite half-float value. reinhardInverse() returns about 1e5 for
    // inputs that reach 1.0, which does not fit the rgba16f targets.
    const vec3 kHalfMax = vec3(65504.0f);

    // Centre colour; the w channel is passed through to the history untouched.
    const vec4 colorIn = ldsLoadColor(groupPos);
    const vec3 center  = colorIn.xyz;

    // The TAA result is Reinhard-tonemapped, so it is inverted before being stored.
    const vec3 historyColor = min(reinhardInverse(center), kHalfMax);
    imageStore(historyImage, pixelPos, vec4(historyColor, colorIn.w));

    // Sharpened colour, still in tonemapped space. SHARPEN_OFF keeps the centre.
    vec3 color = center;

    const float sharpness = pushConstant.sharpness;
    if(pushConstant.sharpenMethod == SHARPEN_RESPONSIVE)
    {
        color = ApplySharpening(groupPos);
    }
    else if(pushConstant.sharpenMethod == SHARPEN_CAS)
    {
        color = AMDCASFilter(sharpness, groupPos);
    } 

    // Invert the tonemap and keep the value representable; the lower bound removes
    // negative values the unsharp mask can produce.
    const vec3 hdrColor = clamp(reinhardInverse(color), vec3(0.0f), kHalfMax);
    imageStore(hdrImage, pixelPos, vec4(hdrColor, 1.0f));
}