// Bloom
//

#include "Color.hlsl"

// Per-pass bloom constants, bound to constant buffer slot b11.
// NOTE: member order defines the buffer layout and must match the host side.
cbuffer LumaBloom : register(b11)
{
    // Multiplied with the UV to obtain source texel coordinates, i.e. the
    // size of the source texture in texels.
    float2 src_size;
    // Size of one source texel in UV units (presumably 1.0 / src_size -
    // confirm against the host code that fills this buffer).
    float2 inv_src_size;
    // Direction of the separable Gaussian passes; used both as a component
    // selector (dot) and as the per-tap step direction. Presumably (1, 0) or
    // (0, 1) - confirm against the host code.
    float2 axis;
    // Standard deviation of the Gaussian kernel, in source texels.
    float sigma;
    // Not referenced anywhere in this file.
    float tex_noise_index;
}

// Sampler used for every texture fetch in this file (slot s0).
SamplerState smp : register(s0);
// Input texture of the current pass (slot t0).
Texture2D tex : register(t0);

// User configurable
//

// Allows you to define a custom threshold function that takes one float3 argument (color).
// It is applied to the blurred color in the prefilter pass. Define it before this
// point to override the default, quadratic_threshold().
#ifndef LUMA_BLOOM_THRESHOLD_FUNCTION
#define LUMA_BLOOM_THRESHOLD_FUNCTION(color) quadratic_threshold(color)
#endif

// Brightness (max of R, G, B) above which the response becomes linear.
// Only used in the default threshold function.
#ifndef LUMA_BLOOM_THRESHOLD
#define LUMA_BLOOM_THRESHOLD 1.0
#endif

// Half-width of the quadratic knee around the threshold.
// Only used in the default threshold function.
#ifndef LUMA_BLOOM_SOFT_KNEE
#define LUMA_BLOOM_SOFT_KNEE 1.0
#endif

// RGB tint multiplied into the thresholded color in the prefilter pass; the
// result is then rescaled so its luminance matches the untinted color.
#ifndef LUMA_BLOOM_TINT
#define LUMA_BLOOM_TINT float3(1.0, 1.0, 1.0)
#endif

// Scalar multiplier applied to the final output of the prefilter pass.
#ifndef LUMA_BLOOM_SCALE
#define LUMA_BLOOM_SCALE 1.0
#endif

//

// Vertex shader for a single triangle that covers the whole viewport.
//
// vid:      vertex index; 0, 1, 2 map to UV (0,0), (2,0), (0,2).
// pos:      clip-space position derived from the UV, i.e. (-1,1), (3,1), (-1,-3).
// texcoord: UV passed on to the pixel shader.
void bloom_main_vs(uint vid : SV_VertexID, out float4 pos : SV_Position, out float2 texcoord : TEXCOORD)
{
    // Bit 1 of (vid * 2) and bit 1 of vid select 0 or 2 for each axis.
    const uint u = (vid << 1) & 2;
    const uint v = vid & 2;
    texcoord = float2(u, v);

    // UV [0, 1] -> clip space [-1, 1], with Y flipped so that V grows downwards.
    const float2 clip_xy = texcoord * float2(2.0, -2.0) + float2(-1.0, 1.0);
    pos = float4(clip_xy, 0.0, 1.0);
}

// Default bloom threshold with a soft knee.
//
// The brightness of a pixel is the largest of its R, G, B components. With
// T = LUMA_BLOOM_THRESHOLD and k = LUMA_BLOOM_SOFT_KNEE the response is
//   0                        for brightness <= T - k,
//   (brightness - T + k)^2 / (4k)  inside the knee [T - k, T + k],
//   brightness - T           for brightness >= T + k.
// The input color is scaled by response / brightness, which keeps its hue.
//
// color: input RGB color.
// Returns the thresholded RGB color.
float3 quadratic_threshold(float3 color)
{
    const float tiny = 1e-6;

    // Brightness, kept away from zero because it is used as a divisor below.
    const float peak = max(tiny, max(max(color.r, color.g), color.b));

    // Knee parameters; the knee is kept away from zero for the same reason.
    const float knee = max(tiny, LUMA_BLOOM_SOFT_KNEE);
    const float knee_start = LUMA_BLOOM_THRESHOLD - knee;
    const float knee_width = knee * 2.0;
    const float knee_scale = 0.25 / knee;

    // Quadratic segment, evaluated on the distance into the knee.
    const float into_knee = clamp(peak - knee_start, 0.0, knee_width);
    const float soft = knee_scale * into_knee * into_knee;

    // The linear segment takes over once it exceeds the quadratic one.
    const float response = max(soft, peak - LUMA_BLOOM_THRESHOLD);

    return color * response * rcp(peak);
}

// Unnormalized Gaussian weight exp(-x^2 / (2 * sigma^2)).
//
// x: distance from the kernel center, in the same unit as sigma.
// The caller is expected to normalize by the sum of all weights.
float get_gaussian_weight(float x)
{
    const float inv_two_variance = rcp(2.0 * sigma * sigma);
    return exp(-x * x * inv_two_variance);
}

// Shared 1D Gaussian gather used by the prefilter and downsample passes.
//
// Samples `tex` along `axis` at texel centers around `texcoord` and returns
// the weight-normalized sum. The kernel covers ceil(3 * sigma) texels on
// each side of the sample position.
//
// texcoord: UV of the output pixel.
// Returns the blurred RGB color. If every weight is zero (sigma <= 0, or a
// sigma so small that all weights underflow) the source is fetched directly
// at `texcoord` instead of dividing by zero and returning NaN.
float3 bloom_gaussian_blur_1d(float2 texcoord)
{
    // Fractional distance, along the blur axis, from the texel center at or
    // below the sample position; `tc` is that texel center.
    const float f = dot(frac(texcoord * src_size - 0.5), axis);
    const float2 tc = texcoord - f * inv_src_size * axis;

    float3 csum = 0.0;
    float wsum = 0.0;

    // Calculate kernel radius.
    const float radius = ceil(sigma * 3.0);

    // Taps run from (1 - radius) to radius relative to `tc`, which is
    // symmetric around the true sample position (offset f in [0, 1)).
    for (float i = 1.0 - radius; i <= radius; ++i) {
        const float weight = get_gaussian_weight(i - f);
        csum += tex.SampleLevel(smp, tc + i * inv_src_size * axis, 0.0).rgb * weight;
        wsum += weight;
    }

    float3 result;
    if (wsum > 0.0) {
        // Normalize.
        result = csum * rcp(wsum);
    } else {
        // Degenerate kernel: no tap contributed, so pass the source through.
        result = tex.SampleLevel(smp, texcoord, 0.0).rgb;
    }
    return result;
}

// Prefilter pass: blurs the source along `axis`, applies the bloom threshold,
// then tints and scales the result.
//
// pos:      pixel position (unused).
// texcoord: UV of the output pixel.
// Returns the prefiltered color with alpha 1.
float4 bloom_prefilter_ps(float4 pos : SV_Position, float2 texcoord : TEXCOORD) : SV_Target
{
    const float3 blurred = bloom_gaussian_blur_1d(texcoord);

    // Apply threshold.
    float3 color = LUMA_BLOOM_THRESHOLD_FUNCTION(blurred);

    // Apply tint, then rescale so the tinted color keeps the luminance it had
    // before tinting.
    const float luma = GetLuminance(color);
    color *= LUMA_BLOOM_TINT;
    color *= luma * rcp(max(1e-6, GetLuminance(color)));

    return float4(color * LUMA_BLOOM_SCALE, 1.0);
}

// Downsample pass: blurs the source along `axis` with no further processing.
// NOTE(review): the actual resolution change presumably comes from the size
// of the bound render target - confirm against the host code.
//
// pos:      pixel position (unused).
// texcoord: UV of the output pixel.
// Returns the blurred color with alpha 1.
float4 bloom_downsample_ps(float4 pos : SV_Position, float2 texcoord : TEXCOORD) : SV_Target
{
    return float4(bloom_gaussian_blur_1d(texcoord), 1.0);
}

// Bicubic (uniform cubic B-spline) upsampling in 4 texture fetches.
//
// B-spline kernel:
//   f(x) = (4 + 3 * |x|^3 - 6 * |x|^2) / 6   for 0 <= |x| <= 1
//   f(x) = (2 - |x|)^3 / 6                   for 1 <  |x| <= 2
//   f(x) = 0                                 otherwise
//
// The 4x4 texel footprint is collapsed into 2x2 fetches placed between texel
// pairs, so that the hardware filter blends each pair with the right ratio.
// NOTE(review): this assumes `smp` uses linear filtering - confirm on the
// host side.
//
// Source: https://www.researchgate.net/publication/220494113_Efficient_GPU-Based_Texture_Interpolation_using_Uniform_B-Splines
//
// pos:      pixel position (unused).
// texcoord: UV of the output pixel.
// Returns the filtered color with alpha 1.
float4 bloom_upsample_ps(float4 pos : SV_Position, float2 texcoord : TEXCOORD) : SV_Target
{
    // Move from [0, extent] to [-0.5, extent - 0.5] so that integer values
    // coincide with texel centers.
    const float2 grid = texcoord * src_size - 0.5;
    const float2 base = floor(grid);
    const float2 t = grid - base;

    const float2 s = 1.0 - t;
    const float2 s2 = s * s;
    const float2 t2 = t * t;

    // Per-axis B-spline weights of the four texels around the sample.
    const float2 w0 = 1.0 / 6.0 * s2 * s;
    const float2 w1 = 2.0 / 3.0 - 0.5 * t2 * (2.0 - t);
    const float2 w2 = 2.0 / 3.0 - 0.5 * s2 * (2.0 - s);
    const float2 w3 = 1.0 / 6.0 * t2 * t;

    // Combined weight of each texel pair.
    const float2 g0 = w0 + w1;
    const float2 g1 = w2 + w3;

    // Fetch positions inside each pair, moved back from
    // [-0.5, extent - 0.5] to [0, extent]:
    // h0 = w1/g0 - 1 and h1 = w3/g1 + 1 relative to `base`.
    const float2 h0 = (w1 / g0) - 0.5 + base;
    const float2 h1 = (w3 / g1) + 1.5 + base;

    // Texel units -> UV.
    const float2 uv0 = h0 * inv_src_size;
    const float2 uv1 = h1 * inv_src_size;

    // Fetch the four filtered samples.
    const float3 c00 = tex.SampleLevel(smp, uv0, 0.0).rgb;
    const float3 c10 = tex.SampleLevel(smp, float2(uv1.x, uv0.y), 0.0).rgb;
    const float3 c01 = tex.SampleLevel(smp, float2(uv0.x, uv1.y), 0.0).rgb;
    const float3 c11 = tex.SampleLevel(smp, uv1, 0.0).rgb;

    // Weigh along the y-direction.
    const float3 col0 = lerp(c01, c00, g0.y);
    const float3 col1 = lerp(c11, c10, g0.y);

    // Weigh along the x-direction.
    return float4(lerp(col1, col0, g0.x), 1.0);
}