// Copyright (c) 2015-2018, bacondither
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer
//    in this position and unchanged.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Adaptive sharpen - version 2018-04-14
// EXPECTS FULL RANGE GAMMA LIGHT

#include "ReShadeUI.fxh"

uniform float curve_height < __UNIFORM_SLIDER_FLOAT1
    ui_min = 0.01; ui_max = 2.0;
    ui_label = "Sharpening strength";
    ui_tooltip = "Main control of sharpening strength";
    ui_step = 0.01;
> = 1.0;

uniform float curveslope <
    ui_min = 0.01; ui_max = 2.0;
    ui_tooltip = "Sharpening curve slope, high edge values";
    ui_category = "Advanced";
> = 0.5;

uniform float L_overshoot <
    ui_min = 0.001; ui_max = 0.1;
    ui_tooltip = "Max light overshoot before compression";
    ui_category = "Advanced";
> = 0.003;

uniform float L_compr_low <
    ui_min = 0.0; ui_max = 1.0;
    ui_tooltip = "Light compression, default (0.167=~6x)";
    ui_category = "Advanced";
> = 0.167;

uniform float L_compr_high <
    ui_min = 0.0; ui_max = 1.0;
    ui_tooltip = "Light compression, surrounded by edges (0.334=~3x)";
    ui_category = "Advanced";
> = 0.334;

uniform float D_overshoot <
    ui_min = 0.001; ui_max = 0.1;
    ui_tooltip = "Max dark overshoot before compression";
    ui_category = "Advanced";
> = 0.009;

uniform float D_compr_low <
    ui_min = 0.0; ui_max = 1.0;
    ui_tooltip = "Dark compression, default (0.250=4x)";
    ui_category = "Advanced";
> = 0.250;

uniform float D_compr_high <
    ui_min = 0.0; ui_max = 1.0;
    ui_tooltip = "Dark compression, surrounded by edges (0.500=2x)";
    ui_category = "Advanced";
> = 0.500;

uniform float scale_lim <
    ui_min = 0.01; ui_max = 1.0;
    ui_tooltip = "Abs max change before compression";
    ui_category = "Advanced";
> = 0.1;

uniform float scale_cs <
    ui_min = 0.0; ui_max = 1.0;
    ui_tooltip = "Compression slope above scale_lim";
    ui_category = "Advanced";
> = 0.056;

uniform float pm_p <
    ui_min = 0.01; ui_max = 1.0;
    ui_tooltip = "Power mean p-value";
    ui_category = "Advanced";
> = 0.7;

//-------------------------------------------------------------------------------------------------
#ifndef fast_ops
    #define fast_ops 1 // Faster code path, small difference in quality
#endif
//-------------------------------------------------------------------------------------------------

#include "ReShade.fxh"

texture AS_Pass0Tex < pooled = true; >
{
    Width = BUFFER_WIDTH;
    Height = BUFFER_HEIGHT;
    Format = RG16F;
};

sampler AS_Pass0Sampler { Texture = AS_Pass0Tex; };
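// Pass 0 (AdaptiveSharpenP0) writes the local edge estimate to .x and an approximate
// gamma-light luma to .y of AS_Pass0Tex; pass 1 (AdaptiveSharpenP1) reads those values
// to build the adaptive kernel and apply the soft-limited sharpening to the back buffer.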
// Helper funcs
#define sqr(a)         ( (a)*(a) )
#define max4(a,b,c,d)  ( max(max(a, b), max(c, d)) )

// Get destination pixel values
#define texc(x,y)      ( BUFFER_PIXEL_SIZE*float2(x, y) + tex )
#define getB(x,y)      ( saturate(tex2D(ReShade::BackBuffer, texc(x, y)).rgb) )
#define getT(x,y)      ( tex2D(AS_Pass0Sampler, texc(x, y)).xy )

// Soft if, fast linear approx
#define soft_if(a,b,c) ( saturate((a + b + c + 0.056)*rcp(abs(maxedge) + 0.03) - 0.85) )

// Soft limit, modified tanh
#if (fast_ops == 1) // Tanh approx
    #define soft_lim(v,s)  ( saturate(abs(v/s)*(27 + sqr(v/s))/(27 + 9*sqr(v/s)))*s )
#else
    #define soft_lim(v,s)  ( (exp(2*min(abs(v), s*24)/s) - 1)/(exp(2*min(abs(v), s*24)/s) + 1)*s )
#endif

// Weighted power mean
#define wpmean(a,b,w)  ( pow(abs(w)*pow(abs(a), pm_p) + abs(1-w)*pow(abs(b), pm_p), (1.0/pm_p)) )

// Component-wise distance
#define b_diff(pix)    ( abs(blur - c[pix]) )

// Fast-skip threshold, keep max possible luma error under 0.5/2^bit-depth
#if (fast_ops == 1)
    // Approx of x = tanh(x/y)*y + 0.5/2^bit-depth, y = min(L_overshoot, D_overshoot)
    #define min_overshoot  ( min(abs(L_overshoot), abs(D_overshoot)) )
    #define fskip_th       ( 0.114*pow(min_overshoot, 0.676) + 3.20e-4 ) // 10-bits
    //#define fskip_th     ( 0.045*pow(min_overshoot, 0.667) + 1.75e-5 ) // 14-bits
#else
    // x = tanh(x/y)*y + 0.5/2^bit-depth, y = 0.0001
    #define fskip_th       ( 0.000110882 ) // 14-bits
#endif

// Smoothstep to linearstep approx
//#define SStLS(a,b,x,c) ( clamp(-(6*(c - 1)*(b - x))/(5*(a - b)) - 0.1*c + 1.1, c, 1) )

// Center pixel diff
#define mdiff(a,b,c,d,e,f,g) ( abs(luma[g] - luma[a]) + abs(luma[g] - luma[b])           \
                             + abs(luma[g] - luma[c]) + abs(luma[g] - luma[d])           \
                             + 0.5*(abs(luma[g] - luma[e]) + abs(luma[g] - luma[f])) )

float2 AdaptiveSharpenP0(float4 vpos : SV_Position, float2 tex : TEXCOORD) : SV_Target
{
    // Get points and clip out of range values (BTB & WTW)
    // [           c9          ]
    // [      c1,  c2,  c3     ]
    // [ c10, c4,  c0,  c5, c11]
    // [      c6,  c7,  c8     ]
    // [           c12         ]
    float3 c[13] = { getB( 0, 0), getB(-1,-1), getB( 0,-1), getB( 1,-1), getB(-1, 0),
                     getB( 1, 0), getB(-1, 1), getB( 0, 1), getB( 1, 1), getB( 0,-2),
                     getB(-2, 0), getB( 2, 0), getB( 0, 2) };

    // Colour to luma, fast approx gamma, avg of rec. 709 & 601 luma coeffs
    float luma = sqrt(dot(float3(0.2558, 0.6511, 0.0931), sqr(c[0])));

    // Blur, gauss 3x3
    float3 blur = (2*(c[2]+c[4]+c[5]+c[7]) + (c[1]+c[3]+c[6]+c[8]) + 4*c[0])/16;

    // Contrast compression, center = 0.5, scaled to 1/3
    float c_comp = saturate(4.0/15.0 + 0.9*exp2(dot(blur, -37.0/15.0)));

    // Edge detection
    // Relative matrix weights
    // [        1       ]
    // [    4,  5,  4   ]
    // [ 1, 5,  6,  5, 1]
    // [    4,  5,  4   ]
    // [        1       ]
    float edge = length( 1.38*(b_diff(0))
                       + 1.15*(b_diff(2) + b_diff(4)  + b_diff(5)  + b_diff(7))
                       + 0.92*(b_diff(1) + b_diff(3)  + b_diff(6)  + b_diff(8))
                       + 0.23*(b_diff(9) + b_diff(10) + b_diff(11) + b_diff(12)) );

    return float2(edge*c_comp, luma);
}

float3 AdaptiveSharpenP1(float4 vpos : SV_Position, float2 tex : TEXCOORD) : SV_Target
{
    float3 origsat = getB(0, 0);

    // Get texture points, .x = edge, .y = luma
    // [               d22              ]
    // [          d24, d9,  d23         ]
    // [     d21, d1,  d2,  d3,  d18    ]
    // [d19, d10, d4,  d0,  d5,  d11, d16]
    // [     d20, d6,  d7,  d8,  d17    ]
    // [          d15, d12, d14         ]
    // [               d13              ]
    float2 d[25] = { getT( 0, 0), getT(-1,-1), getT( 0,-1), getT( 1,-1), getT(-1, 0),
                     getT( 1, 0), getT(-1, 1), getT( 0, 1), getT( 1, 1), getT( 0,-2),
                     getT(-2, 0), getT( 2, 0), getT( 0, 2), getT( 0, 3), getT( 1, 2),
                     getT(-1, 2), getT( 3, 0), getT( 2, 1), getT( 2,-1), getT(-3, 0),
                     getT(-2, 1), getT(-2,-1), getT( 0,-3), getT( 1,-2), getT(-1,-2) };

    // Allow for higher overshoot if the current edge pixel is surrounded by similar edge pixels
    float maxedge = max4( max4(d[1].x, d[2].x,  d[3].x,  d[4].x),
                          max4(d[5].x, d[6].x,  d[7].x,  d[8].x),
                          max4(d[9].x, d[10].x, d[11].x, d[12].x), d[0].x );

    // [          x          ]
    // [       z, x, w       ]
    // [    z, z, x, w, w    ]
    // [ y, y, y, 0, y, y, y ]
    // [    w, w, x, z, z    ]
    // [       w, x, z       ]
    // [          x          ]
    float sbe = soft_if(d[2].x, d[9].x,  d[22].x)*soft_if(d[7].x, d[12].x, d[13].x)  // x dir
              + soft_if(d[4].x, d[10].x, d[19].x)*soft_if(d[5].x, d[11].x, d[16].x)  // y dir
              + soft_if(d[1].x, d[24].x, d[21].x)*soft_if(d[8].x, d[14].x, d[17].x)  // z dir
              + soft_if(d[3].x, d[23].x, d[18].x)*soft_if(d[6].x, d[20].x, d[15].x); // w dir

    #if (fast_ops == 1)
        float2 cs = lerp( float2(L_compr_low,  D_compr_low),
                          float2(L_compr_high, D_compr_high), saturate(1.091*sbe - 2.282) );
    #else
        float2 cs = lerp( float2(L_compr_low,  D_compr_low),
                          float2(L_compr_high, D_compr_high), smoothstep(2, 3.1, sbe) );
    #endif

    float luma[25] = { d[0].y,  d[1].y,  d[2].y,  d[3].y,  d[4].y,
                       d[5].y,  d[6].y,  d[7].y,  d[8].y,  d[9].y,
                       d[10].y, d[11].y, d[12].y, d[13].y, d[14].y,
                       d[15].y, d[16].y, d[17].y, d[18].y, d[19].y,
                       d[20].y, d[21].y, d[22].y, d[23].y, d[24].y };

    // Pre-calculated default squared kernel weights
    const float3 W1 = float3(0.5, 1.0, 1.41421356237);           // 0.25, 1.0, 2.0
    const float3 W2 = float3(0.86602540378, 1.0, 0.54772255751); // 0.75, 1.0, 0.3

    // Transition to a concave kernel if the center edge val is above thr
    #if (fast_ops == 1)
        float3 dW = sqr(lerp( W1, W2, saturate(2.4*d[0].x - 0.82) ));
    #else
        float3 dW = sqr(lerp( W1, W2, smoothstep(0.3, 0.8, d[0].x) ));
    #endif

    float mdiff_c0 = 0.02 + 3*( abs(luma[0]-luma[2]) + abs(luma[0]-luma[4])
                              + abs(luma[0]-luma[5]) + abs(luma[0]-luma[7])
                              + 0.25*(abs(luma[0]-luma[1]) + abs(luma[0]-luma[3])
                                     + abs(luma[0]-luma[6]) + abs(luma[0]-luma[8])) );

    // Use lower weights for pixels in a more active area relative to center pixel area
    // This results in narrower and less visible overshoots around sharp edges
    float weights[12] = { ( min(mdiff_c0/mdiff(24, 21,  2,  4,  9, 10, 1),  dW.y) ),   // c1
                          ( dW.x ),                                                    // c2
                          ( min(mdiff_c0/mdiff(23, 18,  5,  2,  9, 11, 3),  dW.y) ),   // c3
                          ( dW.x ),                                                    // c4
                          ( dW.x ),                                                    // c5
                          ( min(mdiff_c0/mdiff( 4, 20, 15,  7, 10, 12, 6),  dW.y) ),   // c6
                          ( dW.x ),                                                    // c7
                          ( min(mdiff_c0/mdiff( 5,  7, 17, 14, 12, 11, 8),  dW.y) ),   // c8
                          ( min(mdiff_c0/mdiff( 2, 24, 23, 22,  1,  3, 9),  dW.z) ),   // c9
                          ( min(mdiff_c0/mdiff(20, 19, 21,  4,  1,  6, 10), dW.z) ),   // c10
                          ( min(mdiff_c0/mdiff(17,  5, 18, 16,  3,  8, 11), dW.z) ),   // c11
                          ( min(mdiff_c0/mdiff(13, 15,  7, 14,  6,  8, 12), dW.z) ) }; // c12

    weights[0] = (max(max((weights[8]  + weights[9])/4,  weights[0]), 0.25) + weights[0])/2;
    weights[2] = (max(max((weights[8]  + weights[10])/4, weights[2]), 0.25) + weights[2])/2;
    weights[5] = (max(max((weights[9]  + weights[11])/4, weights[5]), 0.25) + weights[5])/2;
    weights[7] = (max(max((weights[10] + weights[11])/4, weights[7]), 0.25) + weights[7])/2;

    // Calculate the negative part of the laplace kernel and the low threshold weight
    float lowthrsum   = 0;
    float weightsum   = 0;
    float neg_laplace = 0;

    [unroll]
    for (int pix = 0; pix < 12; ++pix)
    {
        #if (fast_ops == 1)
            float lowthr = clamp((13.2*d[pix + 1].x - 0.221), 0.01, 1);

            neg_laplace += sqr(luma[pix + 1])*(weights[pix]*lowthr);
        #else
            float t = saturate((d[pix + 1].x - 0.01)/0.09);
            float lowthr = t*t*(2.97 - 1.98*t) + 0.01; // t*t*(3 - a*3 - (2 - a*2)*t) + a

            neg_laplace += pow(abs(luma[pix + 1]) + 0.06, 2.4)*(weights[pix]*lowthr);
        #endif
        weightsum += weights[pix]*lowthr;
        lowthrsum += lowthr/12;
    }

    #if (fast_ops == 1)
        neg_laplace = sqrt(neg_laplace/weightsum);
    #else
        neg_laplace = pow(abs(neg_laplace/weightsum), (1.0/2.4)) - 0.06;
    #endif

    // Compute sharpening magnitude function
    float sharpen_val = curve_height/(curve_height*curveslope*pow(abs(d[0].x), 3.5) + 0.625);

    // Calculate sharpening diff and scale
    float sharpdiff = (d[0].y - neg_laplace)*(lowthrsum*sharpen_val + 0.01);

    // Skip limiting on flat areas where sharpdiff is low
    [branch]
    if (abs(sharpdiff) > fskip_th)
    {
        // Calculate local near min & max, partial sort
        // Manually unrolled outer loop, solves OpenGL slowdown
        {
            float temp;
            int i;
            int ii;

            // 1st iteration
            [unroll]
            for (i = 0; i < 24; i += 2)
            {
                temp = luma[i];
                luma[i]   = min(luma[i], luma[i+1]);
                luma[i+1] = max(temp, luma[i+1]);
            }
            [unroll]
            for (ii = 24; ii > 0; ii -= 2)
            {
                temp = luma[0];
                luma[0]    = min(luma[0], luma[ii]);
                luma[ii]   = max(temp, luma[ii]);

                temp = luma[24];
                luma[24]   = max(luma[24], luma[ii-1]);
                luma[ii-1] = min(temp, luma[ii-1]);
            }

            // 2nd iteration
            [unroll]
            for (i = 1; i < 23; i += 2)
            {
                temp = luma[i];
                luma[i]   = min(luma[i], luma[i+1]);
                luma[i+1] = max(temp, luma[i+1]);
            }
            [unroll]
            for (ii = 23; ii > 1; ii -= 2)
            {
                temp = luma[1];
                luma[1]    = min(luma[1], luma[ii]);
                luma[ii]   = max(temp, luma[ii]);

                temp = luma[23];
                luma[23]   = max(luma[23], luma[ii-1]);
                luma[ii-1] = min(temp, luma[ii-1]);
            }

            #if (fast_ops != 1)
                // 3rd iteration
                [unroll]
                for (i = 2; i < 22; i += 2)
                {
                    temp = luma[i];
                    luma[i]   = min(luma[i], luma[i+1]);
                    luma[i+1] = max(temp, luma[i+1]);
                }
                [unroll]
                for (ii = 22; ii > 2; ii -= 2)
                {
                    temp = luma[2];
                    luma[2]    = min(luma[2], luma[ii]);
                    luma[ii]   = max(temp, luma[ii]);

                    temp = luma[22];
                    luma[22]   = max(luma[22], luma[ii-1]);
                    luma[ii-1] = min(temp, luma[ii-1]);
                }
            #endif
        }
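        // The partial sort above leaves the smallest neighbourhood luma values in
        // luma[0..1] (luma[0..2] on the exact path) and the largest in luma[23..24]
        // (luma[22..24]); these feed the local min/max estimates below.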
        // Calculate tanh scale factors
        #if (fast_ops == 1)
            float nmax = (max(luma[23], d[0].y)*2 + luma[24])/3;
            float nmin = (min(luma[1],  d[0].y)*2 + luma[0])/3;

            float min_dist  = min(abs(nmax - d[0].y), abs(d[0].y - nmin));
            float pos_scale = min_dist + L_overshoot;
            float neg_scale = min_dist + D_overshoot;
        #else
            float nmax = (max(luma[22] + luma[23]*2, d[0].y*3) + luma[24])/4;
            float nmin = (min(luma[2]  + luma[1]*2,  d[0].y*3) + luma[0])/4;

            float min_dist  = min(abs(nmax - d[0].y), abs(d[0].y - nmin));
            float pos_scale = min_dist + min(L_overshoot, 1.0001 - min_dist - d[0].y);
            float neg_scale = min_dist + min(D_overshoot, 0.0001 + d[0].y - min_dist);
        #endif

        pos_scale = min(pos_scale, scale_lim*(1 - scale_cs) + pos_scale*scale_cs);
        neg_scale = min(neg_scale, scale_lim*(1 - scale_cs) + neg_scale*scale_cs);

        // Soft limited anti-ringing with tanh, wpmean to control compression slope
        sharpdiff = wpmean( max(sharpdiff, 0), soft_lim( max(sharpdiff, 0), pos_scale ), cs.x )
                  - wpmean( min(sharpdiff, 0), soft_lim( min(sharpdiff, 0), neg_scale ), cs.y );
    }

    // Compensate for saturation loss/gain while making pixels brighter/darker
    float sharpdiff_lim = saturate(d[0].y + sharpdiff) - d[0].y;
    float satmul = (d[0].y + max(sharpdiff_lim*0.9, sharpdiff_lim)*1.03 + 0.03)/(d[0].y + 0.03);
    float3 res = d[0].y + (sharpdiff_lim*3 + sharpdiff)/4 + (origsat - d[0].y)*satmul;

    return saturate(res);
}

technique AdaptiveSharpen
{
    pass AdaptiveSharpenPass1
    {
        VertexShader = PostProcessVS;
        PixelShader  = AdaptiveSharpenP0;
        RenderTarget = AS_Pass0Tex;
    }

    pass AdaptiveSharpenPass2
    {
        VertexShader = PostProcessVS;
        PixelShader  = AdaptiveSharpenP1;
    }
}