// A more lightweight version of the post-processing shader that rips out all of the
// NTSC-related processing, for people who want a faster MAME experience.
// This took Tapper at 3x oversampling from 330% to 410% on my Radeon 5970.
//
// Code:
//-----------------------------------------------------------------------------
// Post-Processing Effect
//-----------------------------------------------------------------------------

// Texture bound by the host; ps_main reads the base texel and the
// per-channel R/G/B samples from it through DiffuseSampler.
texture Diffuse;

sampler DiffuseSampler = sampler_state
{
	Texture   = <Diffuse>;

	// Coordinates outside [0, 1] are clamped to the edge texel on every axis.
	AddressU  = CLAMP;
	AddressV  = CLAMP;
	AddressW  = CLAMP;

	// Linear filtering for minification, magnification and between mip levels.
	MinFilter = LINEAR;
	MagFilter = LINEAR;
	MipFilter = LINEAR;
};

// Texture bound by the host; ps_main samples it through ShadowSampler and
// blends the result over the image according to UseShadow / ShadowBrightness.
texture Shadow;

sampler ShadowSampler = sampler_state
{
	Texture   = <Shadow>;

	// Coordinates outside [0, 1] wrap around on every axis.
	AddressU  = WRAP;
	AddressV  = WRAP;
	AddressW  = WRAP;

	// Linear filtering for minification, magnification and between mip levels.
	MinFilter = LINEAR;
	MagFilter = LINEAR;
	MipFilter = LINEAR;
};

//-----------------------------------------------------------------------------
// Vertex Definitions
//-----------------------------------------------------------------------------

// Values produced by vs_main and interpolated for the pixel shader.
struct VS_OUTPUT
{
	// Vertex position after vs_main's remapping of the incoming position.
	float4 Position : POSITION;
	// Vertex color, copied unchanged from the input vertex.
	float4 Color : COLOR0;
	// Texture coordinate, copied unchanged from the input vertex.
	float2 TexCoord : TEXCOORD0;
	// Copied unchanged from the input vertex; ps_main uses .x as the blend
	// factor between the plain vertex color and the processed texel.
	float2 ExtraInfo : TEXCOORD1;
	// Per-channel horizontal sample coordinates: .x = red, .y = green, .z = blue.
	float3 CoordX : TEXCOORD2;
	// Per-channel vertical sample coordinates: .x = red, .y = green, .z = blue.
	float3 CoordY : TEXCOORD3;
};

// Per-vertex data consumed by vs_main.
struct VS_INPUT
{
	// Vertex position; vs_main divides x/y by TargetWidth/TargetHeight, so it
	// is presumably supplied in target pixels -- confirm against the host.
	float4 Position : POSITION;
	// Vertex color, passed through to the pixel shader.
	float4 Color : COLOR0;
	// Texture coordinate into the Diffuse texture.
	float2 TexCoord : TEXCOORD0;
	// Passed through to the pixel shader, which reads only the .x component.
	float2 ExtraInfo : TEXCOORD1;
};

// Interpolated values received by ps_main; the semantics mirror VS_OUTPUT
// without the POSITION member.
struct PS_INPUT
{
	// Interpolated vertex color; multiplies the processed texel.
	float4 Color : COLOR0;
	// Base texture coordinate before pincushion displacement.
	float2 TexCoord : TEXCOORD0;
	// .x is the blend factor between Color alone and the processed texel.
	float2 ExtraInfo : TEXCOORD1;
	// Per-channel horizontal sample coordinates: .x = red, .y = green, .z = blue.
	float3 CoordX : TEXCOORD2;
	// Per-channel vertical sample coordinates: .x = red, .y = green, .z = blue.
	float3 CoordY : TEXCOORD3;
};

//-----------------------------------------------------------------------------
// Post-Processing Vertex Shader
//-----------------------------------------------------------------------------

// Per-channel linear convergence offsets. vs_main multiplies each by
// 1 / RawWidth (X) or 1 / RawHeight (Y) before adding it to the channel's
// sample coordinate.
uniform float RedConvergeX;
uniform float RedConvergeY;
uniform float GrnConvergeX;
uniform float GrnConvergeY;
uniform float BluConvergeX;
uniform float BluConvergeY;

// Divisors applied to the incoming vertex position in vs_main.
uniform float TargetWidth;
uniform float TargetHeight;

// Dimensions used to scale convergence offsets (vs_main) and to derive the
// clip borders, scanline phase and shadow-mask coordinates (ps_main).
uniform float RawWidth;
uniform float RawHeight;

// Scale factors between texture coordinates and the [0, 1] range that the
// convergence and pincushion math operates in.
uniform float WidthRatio;
uniform float HeightRatio;

// Per-channel radial convergence. vs_main scales each channel's coordinate
// about the 0.5 center by (1 + value / RawWidth) or (1 + value / RawHeight).
uniform float RedRadialConvergeX;
uniform float RedRadialConvergeY;
uniform float GrnRadialConvergeX;
uniform float GrnRadialConvergeY;
uniform float BluRadialConvergeX;
uniform float BluRadialConvergeY;

// Vertex shader for the post-processing pass.
//
// Remaps the incoming position so that x in [0, TargetWidth] and
// y in [0, TargetHeight] land in [-1, 1] (with y flipped), passes color,
// texture coordinate and ExtraInfo through unchanged, and computes separate
// sample coordinates for the red, green and blue channels
// (CoordX/CoordY .x = red, .y = green, .z = blue).
//
// Each channel coordinate is the texture coordinate scaled about the 0.5
// center by that channel's radial convergence, plus that channel's linear
// convergence offset.
VS_OUTPUT vs_main(VS_INPUT Input)
{
	VS_OUTPUT Output = (VS_OUTPUT)0;

	float2 invDims = float2(1.0f / RawWidth, 1.0f / RawHeight);
	float2 Ratios = float2(1.0f / WidthRatio, 1.0f / HeightRatio);

	// Position: [0, Target] -> [0, 1] -> flip y -> [-0.5, 0.5] -> [-1, 1].
	Output.Position = float4(Input.Position.xyz, 1.0f);
	Output.Position.x /= TargetWidth;
	Output.Position.y /= TargetHeight;
	Output.Position.y = 1.0f - Output.Position.y;
	Output.Position.x -= 0.5f;
	Output.Position.y -= 0.5f;
	Output.Position *= float4(2.0f, 2.0f, 1.0f, 1.0f);

	Output.Color = Input.Color;
	Output.TexCoord = Input.TexCoord;
	Output.ExtraInfo = Input.ExtraInfo;

	// Texture coordinate rescaled by the ratios and centered on 0.
	float2 Centered = Output.TexCoord / Ratios - 0.5f;

	// Building the per-channel parameters as (red, green, blue) vectors keeps
	// each channel paired with its own uniforms. The previous per-component
	// version added GrnConvergeX to the blue X coordinate and BluConvergeY
	// to the green Y coordinate.
	float3 RadialScaleX = 1.0f + float3(RedRadialConvergeX, GrnRadialConvergeX, BluRadialConvergeX) / RawWidth;
	float3 RadialScaleY = 1.0f + float3(RedRadialConvergeY, GrnRadialConvergeY, BluRadialConvergeY) / RawHeight;
	float3 LinearOffsetX = float3(RedConvergeX, GrnConvergeX, BluConvergeX) * invDims.x;
	float3 LinearOffsetY = float3(RedConvergeY, GrnConvergeY, BluConvergeY) * invDims.y;

	Output.CoordX = (Centered.x * RadialScaleX + 0.5f) * Ratios.x + LinearOffsetX;
	Output.CoordY = (Centered.y * RadialScaleY + 0.5f) * Ratios.y + LinearOffsetY;

	return Output;
}

//-----------------------------------------------------------------------------
// Post-Processing Pixel Shader
//-----------------------------------------------------------------------------

// Pi, used by ps_main to convert the scanline coordinate into a sine phase.
uniform float PI = 3.14159265f;

// Pincushion distortion amounts for the X and Y axes.
uniform float PincushionAmountX = 0.1f;
uniform float PincushionAmountY = 0.1f;

// Scanline simulation. ps_main computes
//   abs(sin(CoordY * RawHeight * ScanlineScale * PI + ScanlineOffset * RawHeight))
//     * ScanlineBrightScale + ScanlineBrightOffset
// and blends from 1.0 towards that value by ScanlineAmount.
uniform float ScanlineAmount = 1.0f;
uniform float ScanlineScale = 1.0f;
uniform float ScanlineBrightScale = 1.0f;
uniform float ScanlineBrightOffset = 1.0f;
uniform float ScanlineOffset = 1.0f;

// 3x3 RGB mixing matrix: each output channel is the dot product of the
// sampled (R, G, B) texel with its "<Out>From<In>" row. Defaults are identity.
uniform float RedFromRed = 1.0f;
uniform float RedFromGrn = 0.0f;
uniform float RedFromBlu = 0.0f;
uniform float GrnFromRed = 0.0f;
uniform float GrnFromGrn = 1.0f;
uniform float GrnFromBlu = 0.0f;
uniform float BluFromRed = 0.0f;
uniform float BluFromGrn = 0.0f;
uniform float BluFromBlu = 1.0f;

// NOTE(review): none of the shader code visible in this file reads the Y/I/Q
// matrix below -- presumably left over from the removed NTSC path. Confirm
// whether the host still sets these parameters before removing them.
uniform float YfromY = 1.0f;
uniform float YfromI = 0.0f;
uniform float YfromQ = 0.0f;
uniform float IfromY = 0.0f;
uniform float IfromI = 1.0f;
uniform float IfromQ = 0.0f;
uniform float QfromY = 0.0f;
uniform float QfromI = 0.0f;
uniform float QfromQ = 1.0f;

// Per-channel offset added after the per-channel scale below.
uniform float RedOffset = 0.0f;
uniform float GrnOffset = 0.0f;
uniform float BluOffset = 0.0f;

// Per-channel scale applied to the mixed RGB value.
uniform float RedScale = 1.0f;
uniform float GrnScale = 1.0f;
uniform float BluScale = 1.0f;

// Per-channel floor: ps_main remaps each channel as Floor + (1 - Floor) * value,
// raising the minimum without moving 1.0.
uniform float RedFloor = 0.0f;
uniform float GrnFloor = 0.0f;
uniform float BluFloor = 0.0f;

// Multiplier on the chroma component (color minus luma); 1.0 leaves it unchanged.
uniform float Saturation = 1.0f;

// NOTE(review): the Y/I/Q scale and offset parameters below are not read by
// any shader code visible in this file -- presumably NTSC leftovers; confirm.
uniform float YScale = 1.0f;
uniform float IScale = 1.0f;
uniform float QScale = 1.0f;
uniform float YOffset = 0.0f;
uniform float IOffset = 0.0f;
uniform float QOffset = 0.0f;

// Per-channel exponent applied with pow() after saturation.
uniform float RedPower = 2.2f;
uniform float GrnPower = 2.2f;
uniform float BluPower = 2.2f;

// NOTE(review): not read by any shader code visible in this file -- confirm.
uniform float EdgeDetectScale = 1.0f;
uniform float EdgeToBaseRatio = 0.0f;

// NOTE(review): not read by any shader code visible in this file -- confirm.
uniform float SubsampleLength = 3.0f;

// NOTE(review): the frame/crawl parameters below are not read by any shader
// code visible in this file -- confirm.
uniform float CurrFrame = 0.0f;
uniform float CrawlWidth = 3.0f;
uniform float CrawlHeight = 3.0f;
uniform float CrawlRate = 3.0f;

// Shadow mask. ps_main divides the pixel coordinate by ShadowPixelSizeX/Y,
// takes the fractional part, scales it by ShadowU/V and offsets it by
// 2 / ShadowWidth or 2 / ShadowHeight to form the Shadow texture coordinate.
// UseShadow blends from 1.0 towards the sampled mask; ShadowBrightness then
// blends from 1.0 towards that result.
uniform float UseShadow = 0.0f;
uniform float ShadowBrightness = 1.0f;
uniform float ShadowPixelSizeX = 3.0f;
uniform float ShadowPixelSizeY = 3.0f;
uniform float ShadowU = 0.375f;
uniform float ShadowV = 0.375f;
uniform float ShadowWidth = 8.0f;
uniform float ShadowHeight = 8.0f;

// Pixel shader for the post-processing pass.
//
// Stages, in order: pincushion displacement of the sample coordinates,
// border clipping, per-channel (deconverged) sampling of the Diffuse texture,
// RGB mixing matrix, per-channel scale/offset, saturation, per-channel power
// curve, floor compression, scanline modulation and shadow-mask modulation.
//
// Input.ExtraInfo.x selects the result: 0 returns Input.Color unchanged,
// 1 returns the processed texel multiplied by Input.Color.
float4 ps_main(PS_INPUT Input) : COLOR
{
	float2 Ratios = float2(WidthRatio, HeightRatio);

	// -- Screen Pincushion Calculation --
	float2 UnitCoord = Input.TexCoord * Ratios * 2.0f - 1.0f;

	// Squared radius, normalized by the squared length of the ratio vector.
	// dot(v, v) equals pow(length(v), 2) without the sqrt/pow round trip.
	float PincushionR2 = dot(UnitCoord, UnitCoord) / dot(Ratios, Ratios);

	// Each axis uses its own amount. PincushionAmountY was previously declared
	// but never read, so the X amount was silently applied to both axes.
	float2 PincushionAmount = float2(PincushionAmountX, PincushionAmountY);
	float2 PincushionCurve = UnitCoord * PincushionAmount * PincushionR2;
	float2 BaseCoord = Input.TexCoord + PincushionCurve;

	float4 BaseTexel = tex2D(DiffuseSampler, BaseCoord);

	// -- Alpha Clipping (1px border in drawd3d does not work for some reason) --
	// NOTE(review): the X bounds use 2.0f-based margins while the Y bounds use
	// 1.0f-based ones, and the right edge is 2.0f / WidthRatio rather than
	// 1.0f / WidthRatio. Left exactly as found; confirm against the host's
	// texture layout before changing.
	clip((BaseCoord.x < 2.0f / RawWidth) ? -1 : 1);
	clip((BaseCoord.y < 1.0f / RawHeight) ? -1 : 1);
	clip((BaseCoord.x > (2.0f / WidthRatio - 2.0f / RawWidth)) ? -1 : 1);
	clip((BaseCoord.y > 1.0f / HeightRatio) ? -1 : 1);

	// -- RGB Deconvergence --
	// The same pincushion displacement is added to all three channels'
	// coordinates (.x = red, .y = green, .z = blue).
	float3 CoordX = Input.CoordX + PincushionCurve.x;
	float3 CoordY = Input.CoordY + PincushionCurve.y;
	float RedTexel = tex2D(DiffuseSampler, float2(CoordX.x, CoordY.x)).r;
	float GrnTexel = tex2D(DiffuseSampler, float2(CoordX.y, CoordY.y)).g;
	float BluTexel = tex2D(DiffuseSampler, float2(CoordX.z, CoordY.z)).b;

	float3 Texel = float3(RedTexel, GrnTexel, BluTexel);

	// -- RGB Tint & Shift --
	float ShiftedRed = dot(Texel, float3(RedFromRed, RedFromGrn, RedFromBlu));
	float ShiftedGrn = dot(Texel, float3(GrnFromRed, GrnFromGrn, GrnFromBlu));
	float ShiftedBlu = dot(Texel, float3(BluFromRed, BluFromGrn, BluFromBlu));

	// -- RGB Offset & Scale --
	float3 OutTexel = float3(ShiftedRed, ShiftedGrn, ShiftedBlu) * float3(RedScale, GrnScale, BluScale) + float3(RedOffset, GrnOffset, BluOffset);

	// -- Saturation --
	// Split into luma and chroma, scale only the chroma, then recombine.
	float OutLuma = dot(OutTexel, float3(0.3f, 0.59f, 0.11f));
	float3 OutChroma = OutTexel - float3(OutLuma, OutLuma, OutLuma);
	float3 Saturated = OutLuma + OutChroma * Saturation;

	float3 OutRGB = Saturated;

	// -- Per-channel Power Curve --
	float3 Power = float3(RedPower, GrnPower, BluPower);
	OutRGB = pow(OutRGB, Power);

	// -- Color Compression (increasing the floor of the signal without affecting the ceiling) --
	float3 Floor = float3(RedFloor, GrnFloor, BluFloor);
	OutRGB = Floor + (1.0f - Floor) * OutRGB;

	// -- Scanline Simulation --
	// Evaluated per channel because each channel has its own Y coordinate.
	float3 ScanBrightness = lerp(1.0f, abs(sin(((CoordY * RawHeight * ScanlineScale) * PI + ScanlineOffset * RawHeight))) * ScanlineBrightScale + ScanlineBrightOffset, ScanlineAmount);
	float3 Scanned = OutRGB * ScanBrightness;

	// -- Shadow Mask --
	float2 ShadowCoord = BaseCoord * float2(RawWidth, RawHeight);
	float ShadowCoordX = frac(ShadowCoord.x / ShadowPixelSizeX) * ShadowU + 2.0f / ShadowWidth;
	float ShadowCoordY = frac(ShadowCoord.y / ShadowPixelSizeY) * ShadowV + 2.0f / ShadowHeight;
	// Take .rgb explicitly instead of relying on implicit float4 -> float3 truncation.
	float3 ShadowTexel = lerp(1.0f, tex2D(ShadowSampler, float2(ShadowCoordX, ShadowCoordY)).rgb, UseShadow);

	// -- Final Pixel --
	float3 Masked = Scanned * lerp(1.0f, ShadowTexel, ShadowBrightness);
	float4 Output = lerp(Input.Color, float4(Masked, BaseTexel.a) * Input.Color, Input.ExtraInfo.x);

	return Output;
}

//-----------------------------------------------------------------------------
// Post-Processing Effect
//-----------------------------------------------------------------------------

// Single-pass technique that runs vs_main and ps_main.
technique TestTechnique
{
	pass Pass0
	{
		// Sets the Lighting render state to FALSE for this pass.
		Lighting = FALSE;

		// Disabled explicit sampler binding, kept from the original source.
		//Sampler[0] = <DiffuseSampler>;

		// Both stages are compiled against the shader model 3.0 profiles.
		VertexShader = compile vs_3_0 vs_main();
		PixelShader  = compile ps_3_0 ps_main();
	}
}