/*=============================================================================

    d8b 888b     d888 888b     d888 8888888888 8888888b.   .d8888b.  8888888888
    Y8P 8888b   d8888 8888b   d8888 888        888   Y88b d88P  Y88b 888
        88888b.d88888 88888b.d88888 888        888    888 Y88b.      888
    888 888Y88888P888 888Y88888P888 8888888    888   d88P  "Y888b.   8888888
    888 888 Y888P 888 888 Y888P 888 888        8888888P"      "Y88b. 888
    888 888  Y8P  888 888  Y8P  888 888        888 T88b         "888 888
    888 888   "   888 888   "   888 888        888  T88b  Y88b  d88P 888
    888 888       888 888       888 8888888888 888   T88b  "Y8888P"  8888888888

    Copyright (c) Pascal Gilcher. All rights reserved.

    * Unauthorized copying of this file, via any medium is strictly prohibited
    * Proprietary and confidential

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    THE SOFTWARE.

===============================================================================

    MXAO v1.1

    Author:         Pascal Gilcher

    More info:      https://martysmods.com
                    https://patreon.com/mcflypg
                    https://github.com/martymcmodding

=============================================================================*/

//TODO: fix black lines in bottom and right for DX9 (require threads outside view if not 1:1 mapping)

/*=============================================================================
    Preprocessor settings
=============================================================================*/

//Selects the occlusion integration method at compile time.
//The meaning of the values 0..3 is listed in the HELP1 text further below.
#ifndef MXAO_AO_TYPE
 #define MXAO_AO_TYPE 0
#endif

//When nonzero, mmx_deferred.fxh is included and get_normals() reads normals
//through Deferred::get_normals() instead of reconstructing them from depth.
#ifndef MXAO_USE_LAUNCHPAD_NORMALS
 #define MXAO_USE_LAUNCHPAD_NORMALS 0
#endif

/*=============================================================================
    UI Uniforms
=============================================================================*/

//Index into the samples_per_preset table (slice count / step count per preset).
uniform int MXAO_GLOBAL_SAMPLE_QUALITY_PRESET <
    ui_type = "combo";
    ui_label = "Sample Quality";
    ui_items = "Low\0Medium\0High\0Very High\0Ultra\0Extreme\0IDGAF\0";
    ui_tooltip = "Global quality control, main performance knob. Higher radii might require higher quality.";
    ui_category = "Global";
> = 1;

//Consumed by shading_rate() to decide which deinterleave tiles are skipped in a given frame.
uniform int SHADING_RATE <
    ui_type = "combo";
    ui_label = "Shading Rate";
    ui_items = "Full Rate\0Half Rate\0Quarter Rate\0";
    ui_tooltip = "0: render all pixels each frame\n1: render only 50% of pixels each frame\n2: render only 25% of pixels each frame";
    ui_category = "Global";
> = 1;

//Base radius; MXAOFused derives both the world space and the screen space radius from it.
uniform float MXAO_SAMPLE_RADIUS <
    ui_type = "drag";
    ui_min = 0.5; ui_max = 10.0;
    ui_label = "Sample Radius";
    ui_tooltip = "Sample radius of MXAO, higher means more large-scale occlusion with less fine-scale details.";
    ui_category = "Global";
> = 2.5;

//When set, MXAOFused uses a fixed screen space radius and scales the world space radius with view z.
uniform bool MXAO_WORLDSPACE_ENABLE <
    ui_label = "Increase Radius with Distance";
    ui_category = "Global";
> = false;

//Blend strength applied in Filter2PS. The UI caps it at 1.0, Filter2PS additionally
//contains a branch for values above 1 (reachable only if the value is set outside the UI range).
uniform float MXAO_SSAO_AMOUNT <
    ui_type = "drag";
    ui_min = 0.0; ui_max = 1.0;
    ui_label = "Ambient Occlusion Amount";
    ui_tooltip = "Intensity of AO effect. Can cause pitch black clipping if set too high.";
    ui_category = "Blending";
> = 0.8;

//Divisor for the exponential depth fade in get_fade_factor().
uniform float MXAO_FADE_DEPTH <
    ui_type = "drag";
    ui_label = "Fade Out Distance";
    ui_min = 0.0; ui_max = 1.0;
    ui_tooltip = "Fadeout distance for MXAO. Higher values show MXAO in farther areas.";
    ui_category = "Blending";
> = 0.25;

//0: Filter2PS reads the unfiltered AO, 1: only Filter2PS filters, 2: Filter1PS and Filter2PS both filter.
uniform int MXAO_FILTER_SIZE <
    ui_type = "slider";
    ui_label = "Filter Quality";
    ui_min = 0; ui_max = 2;
    ui_category = "Blending";
> = 1;

//When set, Filter2PS outputs the AO term instead of the shaded color.
uniform bool MXAO_DEBUG_VIEW_ENABLE <
    ui_label = "Show Raw AO";
    ui_category = "Debug";
> = false;

//Stringizes its argument so the preprocessor definition names can be embedded in the help text.
#define TOKENIZE(s) #s

//UI-only entry: its ui_text documents the preprocessor definitions.
uniform int HELP1 <
    ui_type = "radio";
    ui_label = " ";
    ui_category = "Preprocessor definition Documentation";
    ui_category_closed = false;
    ui_text =
        "\n"
        TOKENIZE(MXAO_AO_TYPE)
        ":\n\n0: Ground Truth Ambient Occlusion (high contrast, fast)\n"
        "1: Solid Angle (smoother, fastest)\n"
        "2: Visibility Bitmask (DX11+ only, highest quality, slower)\n"
        "3: Visibility Bitmask w/ Solid Angle (like 2, only smoother)\n"
        "\n"
        TOKENIZE(MXAO_USE_LAUNCHPAD_NORMALS)
        ":\n\n0: Compute normal vectors on the fly (fast)\n"
        "1: Use normals from iMMERSE Launchpad (far slower)\n"
        " This allows to use Launchpad's smooth normals feature.";
>;

/*
uniform float4 tempF1 <
    ui_type = "drag";
    ui_min = -100.0;
    ui_max = 100.0;
> = float4(1,1,1,1);

uniform float4 tempF2 <
    ui_type = "drag";
    ui_min = -100.0;
    ui_max = 100.0;
> = float4(1,1,1,1);

uniform float4 tempF3 <
    ui_type = "drag";
    ui_min = -100.0;
    ui_max = 100.0;
> = float4(1,1,1,1);
*/

/*=============================================================================
    Textures, Samplers, Globals, Structs
=============================================================================*/

//do NOT change anything here. "hurr durr I changed this and now it works"
//you ARE breaking things down the line, if the shader does not work without changes
//here, it's by design.
texture ColorInputTex : COLOR;
texture DepthInputTex : DEPTH;
sampler ColorInput { Texture = ColorInputTex; };
sampler DepthInput { Texture = DepthInputTex; };

//AOTex1 receives the unfiltered AO + guide, AOTex2 is the target of the first filter pass
//(see the technique at the end of the file).
texture AOTex1 { Width = BUFFER_WIDTH; Height = BUFFER_HEIGHT; Format = RG16F; };
texture AOTex2 { Width = BUFFER_WIDTH; Height = BUFFER_HEIGHT; Format = RG16F; };

#if !_COMPUTE_SUPPORTED
//pixel shader path only: tiled AO output of OcclusionWrap1PS, read back by OcclusionWrap2PS
texture AOTexRaw { Width = BUFFER_WIDTH; Height = BUFFER_HEIGHT; Format = RG16F; };
sampler sAOTexRaw { Texture = AOTexRaw; MinFilter=POINT; MipFilter=POINT; MagFilter=POINT; };
#endif

sampler sAOTex1 { Texture = AOTex1; };
sampler sAOTex2 { Texture = AOTex2; };

#include ".\MartysMods\mmx_global.fxh"
#include ".\MartysMods\mmx_depth.fxh"
#include ".\MartysMods\mmx_math.fxh"
#include ".\MartysMods\mmx_camera.fxh"

#if MXAO_USE_LAUNCHPAD_NORMALS
 #include ".\MartysMods\mmx_deferred.fxh"
#endif

//#undef _COMPUTE_SUPPORTED

//Use a 4x4 deinterleave grid when the buffer width is divisible by 4, otherwise a 5x5 grid.
#if ((BUFFER_WIDTH/4)*4) == BUFFER_WIDTH
 #define DEINTERLEAVE_HIGH 0
 #define DEINTERLEAVE_TILE_COUNT 4u
#else
 #define DEINTERLEAVE_HIGH 1
 #define DEINTERLEAVE_TILE_COUNT 5u
#endif

uniform uint FRAMECOUNT < source = "framecount"; >;

#if _COMPUTE_SUPPORTED
storage stAOTex1 { Texture = AOTex1; };
storage stAOTex2 { Texture = AOTex2; };

//deinterleaved view z: one 3D slice per tile of the deinterleave grid
texture3D ZSrc3D { Width = BUFFER_WIDTH/DEINTERLEAVE_TILE_COUNT; Height = BUFFER_HEIGHT/DEINTERLEAVE_TILE_COUNT; Depth = DEINTERLEAVE_TILE_COUNT * DEINTERLEAVE_TILE_COUNT; Format = R16F; };
sampler3D sZSrc3D { Texture = ZSrc3D; MinFilter=POINT; MipFilter=POINT; MagFilter=POINT;};
storage3D stZSrc3D { Texture = ZSrc3D; };
#else
//deinterleaved view z stored as tiles inside one full resolution 2D texture
texture ZSrc { Width = BUFFER_WIDTH; Height = BUFFER_HEIGHT; Format = R16F; };
sampler sZSrc { Texture = ZSrc; MinFilter=POINT; MipFilter=POINT; MagFilter=POINT;};
#endif

struct VSOUT
{
    float4 vpos : SV_Position;
    float2 uv : TEXCOORD0;
};

struct CSIN
{
    uint3 groupthreadid : SV_GroupThreadID; //XYZ idx of thread inside group
    uint3 groupid : SV_GroupID; //XYZ idx of group inside dispatch
    uint3 dispatchthreadid : SV_DispatchThreadID; //XYZ idx of thread inside dispatch
    uint threadid : SV_GroupIndex; //flattened idx of thread inside group
};

//x = slice count, y = step count, indexed by MXAO_GLOBAL_SAMPLE_QUALITY_PRESET.
//The "samples" column equals slices * steps * 2 (MXAOFused walks both sides of every slice).
static const uint2 samples_per_preset[7] =
{
//  slices/steps    preset          samples
    uint2(2, 2),    //Low           8
    uint2(2, 4),    //Medium        16
    uint2(2, 10),   //High          40
    uint2(3, 12),   //Very High     72
    uint2(4, 14),   //Ultra         112
    uint2(6, 16),   //Extreme       192
    uint2(8, 20)    //IDGAF         320
};

/*=============================================================================
    Functions
=============================================================================*/

//Converts a pixel index to the uv of that pixel's center: (pos + 0.5) / texture_size.
float2 pixel_idx_to_uv(float2 pos, float2 texture_size)
{
    float2 inv_texture_size = rcp(texture_size);
    return pos * inv_texture_size + 0.5 * inv_texture_size;
}

//Returns true when pos lies inside the destination, i.e. pos < dest_size on both axes.
bool check_boundaries(uint2 pos, uint2 dest_size)
{
    return pos.x < dest_size.x && pos.y < dest_size.y; //strict < because dest size e.g. 1920 means valid pos is [0, 1919]
}

//Maps an interleaved (screen) position to its position in the tiled layout:
//the tile is chosen by pos % tiles, the position inside the tile is pos / tiles.
uint2 deinterleave_pos(uint2 pos, uint2 tiles, uint2 gridsize)
{
    int2 tilesize = CEIL_DIV(gridsize, tiles); //gridsize / tiles;
    int2 tile_idx = pos % tiles;
    int2 pos_in_tile = pos / tiles;
    return tile_idx * tilesize + pos_in_tile;
}

//Inverse of deinterleave_pos: maps a position in the tiled layout back to the interleaved position.
uint2 reinterleave_pos(uint2 pos, uint2 tiles, uint2 gridsize)
{
    int2 tilesize = CEIL_DIV(gridsize, tiles); //gridsize / tiles;
    int2 tile_idx = pos / tilesize;
    int2 pos_in_tile = pos % tilesize;
    return pos_in_tile * tiles + tile_idx;
}

//uv based counterpart of the tiling: scales uv by the tile count, keeps the fractional
//part and offsets it by a per-tile amount of pixels centered around zero.
float2 deinterleave_uv(float2 uv)
{
    float2 splituv = uv * DEINTERLEAVE_TILE_COUNT;
    float2 splitoffset = floor(splituv) - DEINTERLEAVE_TILE_COUNT * 0.5 + 0.5;
    splituv = frac(splituv) + splitoffset * BUFFER_PIXEL_SIZE;
    return splituv;
}

//uv based counterpart of reinterleave: picks the tile from the pixel index modulo the
//tile count and compresses uv into that tile.
float2 reinterleave_uv(float2 uv)
{
    uint2 whichtile = floor(uv / BUFFER_PIXEL_SIZE) % DEINTERLEAVE_TILE_COUNT;
    float2 newuv = uv + whichtile;
    newuv /= DEINTERLEAVE_TILE_COUNT;
    return newuv;
}

//Returns a normalized normal for uv and writes an edge weight in [0, 1] to edge_weight.
//The edge weight is computed from depth differences to the 4 direct neighbours in both
//configurations; only the normal itself is swapped out when MXAO_USE_LAUNCHPAD_NORMALS is set.
float3 get_normals(in float2 uv, out float edge_weight)
{
    float3 delta = float3(BUFFER_PIXEL_SIZE, 0);
    //similar system to Intel ASSAO/AMD CACAO/XeGTAO and friends with improved weighting and less ALU
    float3 center = Camera::uv_to_proj(uv);
    float3 deltaL = Camera::uv_to_proj(uv - delta.xz) - center;
    float3 deltaR = Camera::uv_to_proj(uv + delta.xz) - center;
    float3 deltaT = Camera::uv_to_proj(uv - delta.zy) - center;
    float3 deltaB = Camera::uv_to_proj(uv + delta.zy) - center;

    float4 zdeltaLRTB = abs(float4(deltaL.z, deltaR.z, deltaT.z, deltaB.z));
    float4 w = zdeltaLRTB.xzyw + zdeltaLRTB.zywx;
    w = rcp(0.001 + w * w); //inverse weighting, larger delta -> lesser weight

    edge_weight = saturate(1.0 - dot(w, 1));

#if MXAO_USE_LAUNCHPAD_NORMALS
    //this is a bit hacky, we need the edge weight for filtering but Launchpad doesn't give them to us, so we compute the data till here and read launchpad normals
    float3 normal = Deferred::get_normals(uv);
#else
    //one candidate normal per neighbour pair, each weighted by w and its own inverse length
    float3 n0 = cross(deltaT, deltaL);
    float3 n1 = cross(deltaR, deltaT);
    float3 n2 = cross(deltaB, deltaR);
    float3 n3 = cross(deltaL, deltaB);

    float4 finalweight = w * rsqrt(float4(dot(n0, n0), dot(n1, n1), dot(n2, n2), dot(n3, n3)));
    float3 normal = n0 * finalweight.x + n1 * finalweight.y + n2 * finalweight.z + n3 * finalweight.w;
    normal *= rsqrt(dot(normal, normal) + 1e-8);
#endif
    return normal;
}

//Per-pixel jitter in (0, 1): the pixel's index inside the tiles x tiles grid is scrambled
//by a constant multiplier and mapped to the center of one of tiles * tiles bins.
float get_jitter(uint2 p)
{
    uint tiles = DEINTERLEAVE_TILE_COUNT;
    uint jitter_idx = dot(p % tiles, uint2(1, tiles));
    jitter_idx *= DEINTERLEAVE_HIGH ? 17u : 11u;
    return ((jitter_idx % (tiles * tiles)) + 0.5) / (tiles * tiles);
}

//Fade factor in [0, 1] for a given depth, product of a fixed and a user controlled fade.
float get_fade_factor(float depth)
{
    float fade = saturate(1 - depth * depth); //fixed fade that smoothly goes to 0 at depth = 1
    depth /= MXAO_FADE_DEPTH;
    return fade * saturate(exp2(-depth * depth)); //overlaying regular exponential fade
}

//=============================================================================
#if _COMPUTE_SUPPORTED
//=============================================================================

//32 visibility sectors, one bit each. A set bit means "sector not occluded".
static uint occlusion_bitfield;

//Marks all 32 sectors as visible.
void bitfield_init()
{
    occlusion_bitfield = 0xFFFFFFFF;
}

//Clears the sectors covered by the horizon pair h (h.x = start, h.y = end, both in [0, 1]).
void process_horizons(float2 h)
{
    uint a = uint(h.x * 32);
    uint b = ceil(saturate(h.y - h.x) * 32); //ceil? using half occlusion here, this attenuates effect when an occluder is so far away that can't cover half a sector

    //b reaches 32 when the pair spans (almost) the whole range. A shift by the full bit width
    //does not produce 0 (the count is masked or the result is undefined, depending on the API),
    //so (1 << 32) - 1 would yield an empty mask. Build the full mask explicitly in that case.
    uint sector_mask = 0xFFFFFFFF;
    if(b < 32u) sector_mask = (1u << b) - 1u;

    uint occlusion = sector_mask << a;
    occlusion_bitfield &= ~occlusion; //somehow "and" is faster than "or" based occlusion
}

//Returns the fraction of sectors that are still visible.
float integrate_sectors()
{
    return saturate(countbits(occlusion_bitfield) / 32.0);
}

//read from deinterleave volume
float read_z(float2 uv, float w)
{
    return tex3Dlod(sZSrc3D, float4(uv, w, 0)).x;
}

//Returns true when the given deinterleave tile is to be skipped this frame.
//SHADING_RATE 1: checkerboard over the tiles, alternating every frame.
//SHADING_RATE 2: tiles are split into 4 groups by (x & 1) + (y & 1) * 2, one group per frame.
bool shading_rate(uint2 tile_idx)
{
    bool skip_pixel = false;
    switch(SHADING_RATE)
    {
        case 1: skip_pixel = ((tile_idx.x + tile_idx.y) & 1) ^ (FRAMECOUNT & 1); break;
        //parentheses are required: + binds tighter than &, so "x & 1 + ..." would mask x with the sum
        case 2: skip_pixel = ((tile_idx.x & 1) + (tile_idx.y & 1) * 2) ^ (FRAMECOUNT & 3); break;
    }
    return skip_pixel;
}

//=============================================================================
#else //Needs this because DX9 is a jackass and doesn't have bitwise ops... so emulate them with floats
//=============================================================================

//Tests a single bit of a bitfield that is stored as a float.
bool bitfield_is_set(float bitfield, int bit)
{
    float state = floor(bitfield * exp2(-bit)); //>>
    return frac(state * 0.5) > 0.25; //& 1
}

//Sets or clears a single bit by adding/subtracting its value depending on the current state.
void bitfield_set(inout float bitfield, int bit, bool value)
{
    bitfield += exp2(bit) * (value - bitfield_is_set(bitfield, bit));
}

//Sets the bits [start, start + stride) and returns the updated bitfield.
float bitfield_set_bits(float bitfield, int start, int stride)
{
    [loop]
    for(int bit = start; bit < start + stride; bit++)
        bitfield_set(bitfield, bit, 1);
    return bitfield;
}

//24 sectors emulated in a float. Unlike the compute path, a set bit means "sector occluded".
static float occlusion_bitfield;

//Marks all sectors as visible (no bit set).
void bitfield_init()
{
    occlusion_bitfield = 0;
}

//Returns 1 minus the normalized count of occluded sectors.
float integrate_sectors()
{
    float sum = 0;
    [loop]
    for(int bit = 0; bit < 24; bit++)
        sum += bitfield_is_set(occlusion_bitfield, bit);
    return saturate(1.0 - sum / 25.0);
}

//Sets the sectors covered by the horizon pair h (h.x = start, h.y = end, both in [0, 1]).
void process_horizons(float2 h)
{
    uint a = floor(h.x * 24);
    uint b = floor(saturate(h.y - h.x) * 25.0); //haven't figured out why this needs to be one more (gives artifacts otherwise) but whatever, somethingsomething float inaccuracy
    occlusion_bitfield = bitfield_set_bits(occlusion_bitfield, a, b);
}

//read from tiled texture
float read_z(float2 uv, float w)
{
    return tex2Dlod(sZSrc, uv, 0).x;
}

//Returns true when the given deinterleave tile is to be skipped this frame (modulo based
//variant of the compute path version, for targets without bitwise ops).
bool shading_rate(uint2 tile_idx)
{
    bool skip_pixel = false;
    switch(SHADING_RATE)
    {
        case 1: skip_pixel = ((tile_idx.x + tile_idx.y) % 2) != (FRAMECOUNT % 2); break;
        case 2: skip_pixel = (tile_idx.x % 2 + (tile_idx.y % 2) * 2) != (FRAMECOUNT % 4); break;
    }
    return skip_pixel;
}

//=============================================================================
#endif //_COMPUTE_SUPPORTED
//=============================================================================

/*=============================================================================
    Shader Entry Points
=============================================================================*/

//Vertex shader shared by all pixel shader passes, forwards to FullscreenTriangleVS.
VSOUT MainVS(in uint id : SV_VertexID)
{
    VSOUT o;
    FullscreenTriangleVS(id, o.vpos, o.uv);
    return o;
}

#if _COMPUTE_SUPPORTED
//Gathers a 2x2 block of depth per thread, converts it to view z and scatters the 4 values
//into the deinterleaved volume ZSrc3D (xy = position inside the tile, z = tile index).
void Deinterleave3DCS(in CSIN i)
{
    if(!check_boundaries(i.dispatchthreadid.xy * 2, BUFFER_SCREEN_SIZE)) return;

    float2 uv = pixel_idx_to_uv(i.dispatchthreadid.xy * 2, BUFFER_SCREEN_SIZE);
    float2 corrected_uv = Depth::correct_uv(uv); //fixed for lookup

#if RESHADE_DEPTH_INPUT_IS_UPSIDE_DOWN
    corrected_uv.y -= BUFFER_PIXEL_SIZE.y * 0.5; //shift upwards since gather looks down and right
    float4 depth_texels = tex2DgatherR(DepthInput, corrected_uv).wzyx;
#else
    float4 depth_texels = tex2DgatherR(DepthInput, corrected_uv);
#endif

    depth_texels = Depth::linearize(depth_texels);
    depth_texels.x = Camera::depth_to_z(depth_texels.x);
    depth_texels.y = Camera::depth_to_z(depth_texels.y);
    depth_texels.z = Camera::depth_to_z(depth_texels.z);
    depth_texels.w = Camera::depth_to_z(depth_texels.w);

    //offsets for xyzw components
    const uint2 offsets[4] = {uint2(0, 1), uint2(1, 1), uint2(1, 0), uint2(0, 0)};

    [unroll]
    for(uint j = 0; j < 4; j++)
    {
        uint2 screenpos = i.dispatchthreadid.xy * 2 + offsets[j];

        const uint tilecount = DEINTERLEAVE_TILE_COUNT;
        uint3 write_pos;
        write_pos.xy = screenpos / tilecount;
        uint2 tile_idx = screenpos - write_pos.xy * tilecount; //screenpos % tilecount
        write_pos.z = tile_idx.x + tile_idx.y * tilecount;
        tex3Dstore(stZSrc3D, write_pos, depth_texels[j]);
    }
}
#else
//Writes view z into the tiled 2D texture ZSrc by sampling depth at the deinterleaved uv.
void DepthInterleavePS(in VSOUT i, out float o : SV_Target0)
{
    float2 get_uv = deinterleave_uv(i.uv);
    o = Camera::depth_to_z(Depth::get_linear_depth(get_uv));
}
#endif

//Core AO kernel shared by the compute and the pixel shader path.
//screenpos:    pixel position used for the jitter pattern
//uv.xy:        coordinate used to read the deinterleaved z (read_z)
//uv.zw:        screen uv of the pixel, used for position and normal reconstruction
//depth_layer:  third coordinate for read_z (ignored by the 2D variant)
//returns:      x = visibility in [0, 1], y = depth, negated when edge_weight > 0.5
float2 MXAOFused(uint2 screenpos, float4 uv, float depth_layer)
{
    float z = read_z(uv.xy, depth_layer);
    float d = Camera::z_to_depth(z);

    //fully faded out pixels get no occlusion and skip all the work below
    [branch]
    if(get_fade_factor(d) < 0.001) return float2(1, d);

    float3 p = Camera::uv_to_proj(uv.zw, z);
    float edge_weight;
    float3 n = get_normals(uv.zw, edge_weight);
    p = p * 0.996;
    float3 v = normalize(-p);

#if _COMPUTE_SUPPORTED
    static const float4 texture_scale = BUFFER_ASPECT_RATIO.xyxy;
#else
    //uv.xy addresses a single tile of the tiled texture, so steps are scaled down by the tile count
    static const float4 texture_scale = float2(1.0 / DEINTERLEAVE_TILE_COUNT, 1.0).xxyy * BUFFER_ASPECT_RATIO.xyxy;
#endif

    uint slice_count = samples_per_preset[MXAO_GLOBAL_SAMPLE_QUALITY_PRESET].x;
    uint sample_count = samples_per_preset[MXAO_GLOBAL_SAMPLE_QUALITY_PRESET].y;

    float jitter = get_jitter(screenpos);

    //jittered start direction and a 2D rotation by PI / slice_count to step from slice to slice
    float3 slice_dir = 0;
    sincos(jitter * PI * (6.0/slice_count), slice_dir.x, slice_dir.y);
    float2x2 rotslice;
    sincos(PI / slice_count, rotslice._21, rotslice._11);
    rotslice._12 = -rotslice._21;
    rotslice._22 = rotslice._11;

    float worldspace_radius = MXAO_SAMPLE_RADIUS * 0.5;
    float screenspace_radius = worldspace_radius / p.z * 0.5;

    [flatten]
    if(MXAO_WORLDSPACE_ENABLE)
    {
        screenspace_radius = MXAO_SAMPLE_RADIUS * 0.03;
        worldspace_radius = screenspace_radius * p.z * 2.0;
    }

    float visibility = 0;
    float slicesum = 0;
    float T = log(1 + worldspace_radius) * 0.3333;//arbitrary thickness that looks good relative to sample radius

    float falloff_factor = rcp(worldspace_radius);
    falloff_factor *= falloff_factor;

    //terms for the GTAO slice weighting logic, math has been extremely simplified but is
    //entirely unrecognizable now. 26 down to 19 instructions though :yeahboiii:
    float2 vcrossn_xy = float2(v.yz * n.zx - v.zx * n.yz);//cross(v, n).xy;
    float ndotv = dot(n, v);

    while(slice_count-- > 0) //1 less register and a bit faster
    {
        slice_dir.xy = mul(slice_dir.xy, rotslice);
        float4 scaled_dir = (slice_dir.xy * screenspace_radius).xyxy * texture_scale;

        float sdotv = dot(slice_dir.xy, v.xy);
        float sdotn = dot(slice_dir.xy, n.xy);
        float ndotns = dot(slice_dir.xy, vcrossn_xy) * rsqrt(saturate(1 - sdotv * sdotv));

        float sliceweight = sqrt(saturate(1 - ndotns * ndotns));//length of projected normal on slice
        float cosn = saturate(ndotv * rcp(sliceweight));
        float normal_angle = Math::fast_acos(cosn);
        normal_angle = sdotn < sdotv * ndotv ? -normal_angle : normal_angle;

        float2 maxhorizoncos = sin(normal_angle);
        maxhorizoncos.y = -maxhorizoncos.y; //cos(normal_angle -+ pi/2)
        bitfield_init();

        [unroll]
        for(int side = 0; side < 2; side++)
        {
            maxhorizoncos = maxhorizoncos.yx; //can't trust Vulkan to unroll, so make indices natively addressable for that little more efficiency
            float lowesthorizoncos = maxhorizoncos.x; //much better falloff than original GTAO :)

            //two taps per iteration
            [loop]
            for(int _sample = 0; _sample < sample_count; _sample += 2)
            {
                float2 s = (_sample + float2(0, 1) + jitter) / sample_count;
                s *= s;

                float4 tap_uv[2] = {uv + s.x * scaled_dir,
                                    uv + s.y * scaled_dir};

                //x - x * x is only positive inside (0, 1): stop marching once the farther tap leaves the screen
                if(!all(saturate(tap_uv[1].zw - tap_uv[1].zw * tap_uv[1].zw))) break;

                float2 zz; //https://developer.nvidia.com/blog/the-peak-performance-analysis-method-for-optimizing-any-gpu-workload/
                zz.x = read_z(tap_uv[0].xy, depth_layer);
                zz.y = read_z(tap_uv[1].xy, depth_layer);

                [unroll] //less VGPR by splitting
                for(uint pair = 0; pair < 2; pair++)
                {
                    float3 deltavec = Camera::uv_to_proj(tap_uv[pair].zw, zz[pair]) - p;
#if MXAO_AO_TYPE < 2
                    float ddotd = dot(deltavec, deltavec);
                    float samplehorizoncos = dot(deltavec, v) * rsqrt(ddotd);
                    float falloff = rcp(1 + ddotd * falloff_factor);
                    samplehorizoncos = lerp(lowesthorizoncos, samplehorizoncos, falloff);
                    maxhorizoncos.x = max(maxhorizoncos.x, samplehorizoncos);
#else
                    float ddotv = dot(deltavec, v);
                    float ddotd = dot(deltavec, deltavec);
                    //front horizon and back horizon of an occluder with thickness T along the view vector
                    float2 h_frontback = float2(ddotv, ddotv - T) * rsqrt(float2(ddotd, ddotd - 2 * T * ddotv + T * T));
                    h_frontback = Math::fast_acos(h_frontback);
                    h_frontback = side ? h_frontback : -h_frontback.yx;//flip sign and sort in the same cmov, efficiency baby!
                    h_frontback = saturate((h_frontback + normal_angle) / PI + 0.5);
#if MXAO_AO_TYPE == 2
                    //this almost perfectly approximates inverse transform sampling for cosine lobe
                    h_frontback = h_frontback * h_frontback * (3.0 - 2.0 * h_frontback);
#endif
                    process_horizons(h_frontback);
#endif //MXAO_AO_TYPE
                }
            }

            scaled_dir = -scaled_dir; //unroll kills that :)
        }

#if MXAO_AO_TYPE == 0
        float2 max_horizon_angle = Math::fast_acos(maxhorizoncos);
        float2 h = float2(-max_horizon_angle.x, max_horizon_angle.y); //already clamped at init
        visibility += dot(cosn + 2.0 * h * sin(normal_angle) - cos(2.0 * h - normal_angle), sliceweight);
        slicesum++;
#elif MXAO_AO_TYPE == 1
        float2 max_horizon_angle = Math::fast_acos(maxhorizoncos);
        visibility += dot(max_horizon_angle, sliceweight);
        slicesum += sliceweight;
#else
        visibility += integrate_sectors() * sliceweight;
        slicesum += sliceweight;
#endif
    }

#if MXAO_AO_TYPE == 0
    visibility /= slicesum * 4;
#elif MXAO_AO_TYPE == 1
    visibility /= slicesum * PI;
#else
    visibility /= slicesum;
#endif

    return float2(saturate(visibility), edge_weight > 0.5 ? -d : d);//store depth negated for pixels with low normal confidence to drive the filter
}

#if _COMPUTE_SUPPORTED
//One thread per pixel of one tile (dispatch z = tile index). Skips tiles rejected by
//shading_rate() and stores AO + guide at the pixel's interleaved screen position in AOTex1.
void OcclusionWrap3DCS(in CSIN i)
{
    const uint tilecount = DEINTERLEAVE_TILE_COUNT;
    const uint2 tilesize = BUFFER_SCREEN_SIZE / tilecount;

    uint2 tile_idx;
    tile_idx.y = i.dispatchthreadid.z / tilecount;
    tile_idx.x = i.dispatchthreadid.z - tile_idx.y * tilecount;

    if(!check_boundaries(i.dispatchthreadid.xy, tilesize) || shading_rate(tile_idx)) return;

    uint2 screen_pos = i.dispatchthreadid.xy * tilecount + tile_idx;

    float4 uv;
    uv.xy = pixel_idx_to_uv(i.dispatchthreadid.xy, tilesize);
    uv.zw = pixel_idx_to_uv(screen_pos, BUFFER_SCREEN_SIZE);
    float depth_layer = i.dispatchthreadid.z * rcp(tilecount * tilecount);

    float2 ao_and_guide = MXAOFused(screen_pos, uv, depth_layer);
    tex2Dstore(stAOTex1, screen_pos, ao_and_guide.xyyy);
}
#else
//Computes AO in the tiled layout. Whole tiles rejected by shading_rate() are discarded,
//so their texels keep whatever the render target held before.
void OcclusionWrap1PS(in VSOUT i, out float2 o : SV_Target0) //writes to AOTexRaw (see technique)
{
    uint2 dispatchthreadid = floor(i.vpos.xy);
    uint2 write_pos = reinterleave_pos(dispatchthreadid, DEINTERLEAVE_TILE_COUNT, BUFFER_SCREEN_SIZE);
    uint2 tile_idx = dispatchthreadid / CEIL_DIV(BUFFER_SCREEN_SIZE, DEINTERLEAVE_TILE_COUNT);

    if(shading_rate(tile_idx)) discard;

    float4 uv;
    uv.xy = pixel_idx_to_uv(dispatchthreadid, BUFFER_SCREEN_SIZE);
    //uv.zw = pixel_idx_to_uv(write_pos, BUFFER_SCREEN_SIZE);
    uv.zw = deinterleave_uv(uv.xy); //no idea why _this_ works but the other doesn't but that's just DX9 being a jackass I guess
    o = MXAOFused(write_pos, uv, 0.0);
}

//Reinterleaves the tiled AO from AOTexRaw into AOTex1 (screen layout).
void OcclusionWrap2PS(in VSOUT i, out float2 o : SV_Target0)
{
    uint2 dispatchthreadid = floor(i.vpos.xy);
    uint2 read_pos = deinterleave_pos(dispatchthreadid, DEINTERLEAVE_TILE_COUNT, BUFFER_SCREEN_SIZE);
    //The tile has to be derived from the position in the tiled layout (read_pos), because that is
    //the layout in which OcclusionWrap1PS skipped tiles. Deriving it from the screen position would
    //discard whole screen regions instead and copy texels that were not rendered this frame.
    uint2 tile_idx = read_pos / CEIL_DIV(BUFFER_SCREEN_SIZE, DEINTERLEAVE_TILE_COUNT);

    //need to do the shading rate check here again because the AO pass only rendered a subset of the tiles,
    //so we only take the new texels and transfer them to AOTex1, so AOTex1 contains unfiltered, reconstructed data
    if(shading_rate(tile_idx)) discard;
    o = tex2Dfetch(sAOTexRaw, read_pos).xy;
}
#endif

/*
float2 filter_crossbilateral(float2 uv, sampler sAO, int iter)
{
    float2 center = tex2Dlod(sAO, uv, 0).xy;
    float2 axis = float2(iter, !iter) * BUFFER_PIXEL_SIZE;

    int k = 5;
    float sigma = (k + 1.0) * 0.5;
    float falloff = rcp(2 * sigma * sigma);

    float4 mv = float4(center.y, center.y * center.y, center.x, center.x * center.y);
    float wsum = 1;

    [unroll]
    for(int j = 1; j < k; j++)
    {
        float2 tap = tex2Dlod(sAO, uv + axis * j, 0).xy;
        float w = exp2(-j*j*falloff);
        mv += float4(tap.y, tap.y * tap.y, tap.x, tap.x * tap.y) * w;
        tap = tex2Dlod(sAO, uv - axis * j, 0).xy;
        mv += float4(tap.y, tap.y * tap.y, tap.x, tap.x * tap.y) * w;
        wsum += 2.0 * w;
    }

    mv /= wsum;

    float b = (mv.w - mv.x * mv.z) / max(mv.y - mv.x * mv.x, exp2(-28));
    float a = mv.z - b * mv.x;
    return float2(saturate(b * center.y + a), center.y);
}
*/

//todo add direct sample method for DX9
//Filters AO (x channel) guided by |depth| (y channel): accumulates the moments of a 4x4 texel
//window with 4 gathers, fits ao ~ a + b * depth and evaluates the fit at the center depth.
//iter flips the window offsets so two consecutive passes use mirrored footprints.
//Returns the filtered AO in x and the unmodified guide (sign included) in y.
float2 filter(float2 uv, sampler sAO, int iter)
{
    float g = tex2D(sAO, uv).y;
    bool blurry = g < 0; //negative guide = pixel flagged by MXAOFused (edge_weight > 0.5)
    float flip = iter ? -1 : 1;

    float4 ao, depth, mv;
    ao = tex2DgatherR(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(-0.5, -0.5));
    depth = abs(tex2DgatherG(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(-0.5, -0.5))); //abs because sign flip for edge pixels!
    mv = float4(dot(depth, 1), dot(depth, depth), dot(ao, 1), dot(ao, depth));

    ao = tex2DgatherR(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(1.5, -0.5));
    depth = abs(tex2DgatherG(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(1.5, -0.5)));
    mv += float4(dot(depth, 1), dot(depth, depth), dot(ao, 1), dot(ao, depth));

    ao = tex2DgatherR(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(-0.5, 1.5));
    depth = abs(tex2DgatherG(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(-0.5, 1.5)));
    mv += float4(dot(depth, 1), dot(depth, depth), dot(ao, 1), dot(ao, depth));

    ao = tex2DgatherR(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(1.5, 1.5));
    depth = abs(tex2DgatherG(sAO, uv + flip * BUFFER_PIXEL_SIZE * float2(1.5, 1.5)));
    mv += float4(dot(depth, 1), dot(depth, depth), dot(ao, 1), dot(ao, depth));

    mv /= 16.0;

    //covariance / variance, with a larger variance floor for flagged pixels
    float b = (mv.w - mv.x * mv.z) / max(mv.y - mv.x * mv.x, exp2(blurry ? -12 : -30));
    float a = mv.z - b * mv.x;
    return float2(saturate(b * abs(g) + a), g); //abs because sign flip for edge pixels!
}

//First filter pass (AOTex1 -> AOTex2), only active for MXAO_FILTER_SIZE == 2.
void Filter1PS(in VSOUT i, out float2 o : SV_Target0)
{
    if(MXAO_FILTER_SIZE < 2) discard;
    o = filter(i.uv, sAOTex1, 0);
}

//Final pass: runs the last filter step (if any), applies amount and fade and blends the
//AO onto the color buffer, or outputs the AO term when MXAO_DEBUG_VIEW_ENABLE is set.
void Filter2PS(in VSOUT i, out float3 o : SV_Target0)
{
    float2 t;

    [branch]
    if(MXAO_FILTER_SIZE == 2)
        t = filter(i.uv, sAOTex2, 1);
    else if(MXAO_FILTER_SIZE == 1)
        t = filter(i.uv, sAOTex1, 1);
    else
        t = tex2Dlod(sAOTex1, i.uv, 0).xy;

    float mxao = t.x, d = abs(t.y); //abs because sign flip for edge pixels!

    mxao = lerp(1, mxao, saturate(MXAO_SSAO_AMOUNT));
    if(MXAO_SSAO_AMOUNT > 1) mxao = lerp(mxao, mxao * mxao, saturate(MXAO_SSAO_AMOUNT - 1)); //if someone _MUST_ use a higher intensity, switch to gamma
    mxao = lerp(1, mxao, get_fade_factor(d));

    float3 color = tex2D(ColorInput, i.uv).rgb;

    //square and expand the color with c / (1.1 - c), apply AO, then invert both steps
    color *= color;
    color = color * rcp(1.1 - color);
    color *= mxao;
    color = 1.1 * color * rcp(color + 1.0);
    color = sqrt(color);

    o = MXAO_DEBUG_VIEW_ENABLE ? mxao : color;
}

/*=============================================================================
    Techniques
=============================================================================*/

technique MartysMods_MXAO
<
    ui_label = "iMMERSE: MXAO";
    ui_tooltip =
        " MartysMods - MXAO \n"
        " MartysMods Epic ReShade Effects (iMMERSE) \n"
        "______________________________________________________________________________\n"
        "\n"
        "MXAO is a high quality, high performance Screen-Space Ambient Occlusion (SSAO)\n"
        "effect which accurately simulates diffuse shadows in dark corners and crevices\n"
        "\n"
        "\n"
        "Visit https://martysmods.com for more information. \n"
        "\n"
        "______________________________________________________________________________";
>
{
#if _COMPUTE_SUPPORTED
    //each thread handles a 2x2 block, hence 32 * 2 = 64 pixels per group and axis
    pass
    {
        ComputeShader = Deinterleave3DCS<32, 32>;
        DispatchSizeX = CEIL_DIV(BUFFER_WIDTH, 64);
        DispatchSizeY = CEIL_DIV(BUFFER_HEIGHT, 64);
    }
    //one z layer of groups per deinterleave tile
    pass
    {
        ComputeShader = OcclusionWrap3DCS<16, 16, 1>;
        DispatchSizeX = CEIL_DIV((BUFFER_WIDTH/DEINTERLEAVE_TILE_COUNT), 16);
        DispatchSizeY = CEIL_DIV((BUFFER_HEIGHT/DEINTERLEAVE_TILE_COUNT), 16);
        DispatchSizeZ = DEINTERLEAVE_TILE_COUNT * DEINTERLEAVE_TILE_COUNT;
    }
#else
    pass
    {
        VertexShader = MainVS;
        PixelShader = DepthInterleavePS;
        RenderTarget = ZSrc;
    }
    pass
    {
        VertexShader = MainVS;
        PixelShader = OcclusionWrap1PS;
        RenderTarget = AOTexRaw;
    }
    pass
    {
        VertexShader = MainVS;
        PixelShader = OcclusionWrap2PS;
        RenderTarget = AOTex1;
    }
#endif
    pass
    {
        VertexShader = MainVS;
        PixelShader = Filter1PS;
        RenderTarget = AOTex2;
    }
    pass
    {
        VertexShader = MainVS;
        PixelShader = Filter2PS;
    }
}