找个动画片测了下A卡和N卡播放视频时画面的区别

japhsoncross · 发表于 2008-8-2 22:53

关于这个问题我来说一下，对于A卡来说，驱动中有bug，Overlay下，default size播放视频的时候，两个chroma channel采用point resize，导致楼主说的A卡下字幕的问题，因为红色占据大量的V信号，但是overlay在拖动到任意非default size的情况下，对chroma channel的resize又开始起作用，看到的画面又会比较好，但是overlay本身不论是luma，还是chroma channel的resize都非常锐，导致在小画面拉伸到比较大的size的时候会出现网格（grid）。关于这个问题我多次向ATI反映过，他们第一次踢皮球，第二次索性不理。
目前用VMR9比较多，VMR9下，ATI的驱动对chroma channel默认统统做point resize，所以楼主的问题不会随拖动size大小而解决。我在nVidia的8400GS和8500GT上发现nVidia的VMR9的Chroma channel拉伸做得非常完美，这也是楼主看到的效果比ATI效果要好的原因。但是nVidia的问题在于使用169WHQL驱动的情况下，对SD和HD都使用limited range，即16-235，不能拉伸到Full Range。幸亏VMR9还有shader，网络上有16-235->0-255的shader，不过我认为这种做法太简单，我自己写了一个，仅作参考：
// Source frame, bound to sampler register s0.
sampler s0 : register(s0);
// Shader constants bound to registers c0 and c1.
// NOTE(review): presumably filled in by the host player; the macro names
// below suggest the layout - confirm against the player's documentation.
float4 p0 : register(c0);
float4 p1 : register(c1);

// Named views onto the components of p0 / p1. Only `height` is referenced
// by main() in this shader; the others are kept for completeness.
#define width (p0[0])
#define height (p0[1])
#define counter (p0[2])
#define clock (p0[3])
#define one_over_width (p1[0])
#define one_over_height (p1[1])

#define PI acos(-1)

// Colour-conversion tables used with mul(matrix, colour), i.e. the colour is
// treated as a column vector. In every table the fourth row and fourth column
// are zero, so the .a component of any product is 0.
//
// Forward tables (RGB -> Y'CbCr, offsets not included):
//   row 0 -> Y'  : coefficients sum to 219
//   row 1 -> Cb  : coefficients sum to 0, largest coefficient is 112
//   row 2 -> Cr  : coefficients sum to 0, largest coefficient is 112
// Inverse tables (Y'CbCr with offsets removed -> RGB):
//   column 0 is 1/219 in every row (luma gain); columns 1 and 2 weight Cb, Cr.
// The 601 / 709 suffixes presumably refer to ITU-R BT.601 and BT.709.

// RGB -> Y'CbCr, "601" coefficients.
static float4x4 RGB2YCbCr601 =
{
65.473689369329, 128.426245019123, 25.1000656115476, 0.0,
-37.8187503388942, -74.1812496611058, 112.0, 0.0,
112.0, -93.6890841905523, -18.3109158094477, 0.0,
0.0, 0.0, 0.0, 0.0
};

// Y'CbCr -> RGB, "601" coefficients.
static float4x4 YCbCr6012RGB =
{
0.0045662100456621, 0.0, 0.0062592266238858, 0.0,
0.0045662100456621, -0.00154502886195404, -0.0031910507046555, 0.0,
0.0045662100456621, 0.00790524846658726, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0
};

// RGB -> Y'CbCr, "709" coefficients.
static float4x4 RGB2YCbCr709 =
{
46.5679422858608, 156.621940650139, 15.8101170640007, 0.0,
-25.6686477724839, -86.3313522275161, 112.0, 0.0,
112.0, -101.730835816484, -10.2691641835164, 0.0,
0.0, 0.0, 0.0, 0.0
};

// Y'CbCr -> RGB, "709" coefficients.
static float4x4 YCbCr7092RGB =
{
0.0045662100456621, 0.0 , 0.00703000887614723, 0.0,
0.0045662100456621, -0.000836223613994593, -0.00209021192212653, 0.0,
0.0045662100456621, 0.00828399718427916, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0
};

// Stretches one RGB pixel from limited range towards full range.
//
//   rgb        sampled pixel
//   rgb2ycbcr  forward table (RGB -> Y'CbCr without offsets)
//   ycbcr2rgb  matching inverse table
//
// The pixel is converted to 8-bit style Y'CbCr codes (luma black at 16,
// chroma neutral at 128), each component is stretched, and the result is
// converted back. The returned .a is 0 because the fourth row of the
// conversion tables is zero.
float4 expand_range(float4 rgb, float4x4 rgb2ycbcr, float4x4 ycbcr2rgb)
{
    const float4 code_offset = float4(16, 128, 128, 0);

    float4 ycc = mul(rgb2ycbcr, rgb) + code_offset;

    // Luma: map 16..235 onto 0..255.
    ycc.r = (ycc.r - 16) / 219 * 255;

    // Chroma: apply the 255/224 gain about the neutral code 128.
    // Stretching about 16 instead would move neutral from 128 to 127.5, and
    // the -128 below would then leave a -0.5 residue that tints every grey
    // pixel (R and B slightly down, G slightly up).
    // NOTE(review): if the input is studio-range RGB, chroma would need the
    // same 255/219 gain as luma to keep saturation exact - confirm intent.
    ycc.g = (ycc.g - 128) / 224 * 255 + 128;
    ycc.b = (ycc.b - 128) / 224 * 255 + 128;

    return mul(ycbcr2rgb, ycc - code_offset);
}

// Pixel-shader entry point: expands the sampled pixel to full range, using
// the 601 tables when the frame height is below 720 and the 709 tables
// otherwise.
float4 main(float2 tex : TEXCOORD0) : COLOR
{
    float4 c0 = tex2D(s0, tex);

    if (height < 720)
    {
        return expand_range(c0, RGB2YCbCr601, YCbCr6012RGB);
    }

    return expand_range(c0, RGB2YCbCr709, YCbCr7092RGB);
}
如果有问题欢迎指正。这样n卡使用这个shader也可以达到full range，不过这里界定SD和HD我是依靠Height，也有看Width的，这个没有固定的说法。

待续。。。

japhsoncross · 发表于 2008-8-2 22:54

关于ATI的问题，我自己就用ATI的卡，2600Pro，被这个问题搞得头疼，不过自己写了一个work around的办法，在shader中加入两个，一个是UV Blur，然后再UV Sharpen，使用最简单的高斯Kernel：
UV Blur：
// Source frame, bound to sampler register s0.
sampler s0 : register(s0);
// Shader constants bound to registers c0 and c1.
// NOTE(review): presumably filled in by the host player; the macro names
// below suggest the layout - confirm against the player's documentation.
float4 p0 : register(c0);
float4 p1 : register(c1);

// Named views onto the components of p0 / p1. main() in this shader only
// references `width` and `height`.
#define width (p0[0])
#define height (p0[1])
#define counter (p0[2])
#define clock (p0[3])
#define one_over_width (p1[0])
#define one_over_height (p1[1])

#define PI acos(-1)

// Colour-conversion tables used with mul(matrix, colour), i.e. the colour is
// treated as a column vector. The fourth row and column are zero, so the .a
// component of any product is 0.
//
// Forward tables (RGB -> YUV):
//   row 0 -> Y : luma weights, summing to 1
//   row 1 -> U : row 0 negated with 1 added to the blue weight, i.e. B - Y
//   row 2 -> V : row 0 negated with 1 added to the red weight,  i.e. R - Y
// Inverse tables (YUV -> RGB):
//   R = Y + V,  B = Y + U,  G = Y - a*U - b*V
// The ~1e-17 entries in the first row of the inverse tables are effectively
// zero (presumably round-off left over from a numerical inversion).
// The 709 / 601 suffixes presumably refer to ITU-R BT.709 and BT.601.

// RGB -> YUV, "709" luma weights.
static float4x4 RGB2YUV709Matrix =
{
0.21263900587151, 0.715168678767756, 0.0721923153607337, 0.0,
-0.21263900587151, -0.715168678767756, 0.927807684639266, 0.0,
0.78736099412849, -0.715168678767756, -0.0721923153607337, 0.0,
0.0, 0.0, 0.0, 0.0
};

// YUV -> RGB, inverse of RGB2YUV709Matrix.
static float4x4 YUV7092RGBMatrix =
{
1.0, -0.00000000000000005288, 1.0, 0.0,
1.0, -0.100944458984308, -0.297327067284168, 0.0,
1.0, 1.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0
};

// RGB -> YUV, "601" luma weights.
static float4x4 RGB2YUV601Matrix =
{
0.29896661812479, 0.586421210132983, 0.114612171742227, 0.0,
-0.29896661812479, -0.586421210132983, 0.885387828257773, 0.0,
0.70103338187521, -0.586421210132983, -0.114612171742227, 0.0,
0.0, 0.0, 0.0, 0.0
};

// YUV -> RGB, inverse of RGB2YUV601Matrix.
static float4x4 YUV6012RGBMatrix =
{
1.0, 0.0000000000000000198, 1.0, 0.0,
1.0, -0.195443428310234, -0.509815492616635, 0.0,
1.0, 1.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0
};

// Returns `rgb` with its luma kept and its two chroma components (U, V)
// replaced by those of `chroma_rgb`.
//
//   rgb         pixel that supplies Y
//   chroma_rgb  pixel that supplies U and V
//   rgb2yuv     forward table (RGB -> YUV)
//   yuv2rgb     matching inverse table
//
// The returned .a is 0 because the fourth row of the tables is zero.
float4 replace_chroma(float4 rgb, float4 chroma_rgb, float4x4 rgb2yuv, float4x4 yuv2rgb)
{
    float4 yuv = mul(rgb2yuv, rgb);
    yuv.gb = mul(rgb2yuv, chroma_rgb).gb;
    return mul(yuv2rgb, yuv);
}

// Pixel-shader entry point: blurs only the chroma of the frame with the 3x3
// kernel
//     1 2 1
//     2 4 2   / 16
//     1 2 1
// and leaves luma untouched. Uses the 709 tables when the frame height is
// at least 720 and the 601 tables otherwise.
//
// The RGB -> YUV conversion is linear, so the kernel is applied to the RGB
// samples once and the blurred result is converted once. This gives the same
// U and V (up to float rounding) as converting every tap separately, with
// nine texture samples and two forward conversions per pixel.
float4 main(float2 tex : TEXCOORD0) : COLOR
{
    float4 c0 = tex2D(s0, tex);

    // Tap spacing; one texel if width/height hold the frame size in pixels.
    float dx = 1 / width;
    float dy = 1 / height;

    // Weight-2 taps: the four edge neighbours.
    float4 edges = tex2D(s0, tex + float2(0, dy)) + tex2D(s0, tex + float2(0, -dy))
                 + tex2D(s0, tex + float2(dx, 0)) + tex2D(s0, tex + float2(-dx, 0));

    // Weight-1 taps: the four diagonal neighbours.
    float4 corners = tex2D(s0, tex + float2(dx, dy)) + tex2D(s0, tex + float2(-dx, -dy))
                   + tex2D(s0, tex + float2(-dx, dy)) + tex2D(s0, tex + float2(dx, -dy));

    float4 blurred = (edges * 2 + corners + c0 * 4) / 16.0;

    if (height >= 720)
    {
        return replace_chroma(c0, blurred, RGB2YUV709Matrix, YUV7092RGBMatrix);
    }

    return replace_chroma(c0, blurred, RGB2YUV601Matrix, YUV6012RGBMatrix);
}

UV Sharpen：
// Source frame, bound to sampler register s0.
sampler s0 : register(s0);
// Shader constants bound to registers c0 and c1.
// NOTE(review): presumably filled in by the host player; the macro names
// below suggest the layout - confirm against the player's documentation.
float4 p0 : register(c0);
float4 p1 : register(c1);

// Named views onto the components of p0 / p1. main() in this shader only
// references `width` and `height`.
#define width (p0[0])
#define height (p0[1])
#define counter (p0[2])
#define clock (p0[3])
#define one_over_width (p1[0])
#define one_over_height (p1[1])

#define PI acos(-1)

// Colour-conversion tables used with mul(matrix, colour), i.e. the colour is
// treated as a column vector. The fourth row and column are zero, so the .a
// component of any product is 0.
//
// Forward tables (RGB -> YUV):
//   row 0 -> Y : luma weights, summing to 1
//   row 1 -> U : row 0 negated with 1 added to the blue weight, i.e. B - Y
//   row 2 -> V : row 0 negated with 1 added to the red weight,  i.e. R - Y
// Inverse tables (YUV -> RGB):
//   R = Y + V,  B = Y + U,  G = Y - a*U - b*V
// The ~1e-17 entries in the first row of the inverse tables are effectively
// zero (presumably round-off left over from a numerical inversion).
// The 709 / 601 suffixes presumably refer to ITU-R BT.709 and BT.601.

// RGB -> YUV, "709" luma weights.
static float4x4 RGB2YUV709Matrix =
{
0.21263900587151, 0.715168678767756, 0.0721923153607337, 0.0,
-0.21263900587151, -0.715168678767756, 0.927807684639266, 0.0,
0.78736099412849, -0.715168678767756, -0.0721923153607337, 0.0,
0.0, 0.0, 0.0, 0.0
};

// YUV -> RGB, inverse of RGB2YUV709Matrix.
static float4x4 YUV7092RGBMatrix =
{
1.0, -0.00000000000000005288, 1.0, 0.0,
1.0, -0.100944458984308, -0.297327067284168, 0.0,
1.0, 1.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0
};

// RGB -> YUV, "601" luma weights.
static float4x4 RGB2YUV601Matrix =
{
0.29896661812479, 0.586421210132983, 0.114612171742227, 0.0,
-0.29896661812479, -0.586421210132983, 0.885387828257773, 0.0,
0.70103338187521, -0.586421210132983, -0.114612171742227, 0.0,
0.0, 0.0, 0.0, 0.0
};

// YUV -> RGB, inverse of RGB2YUV601Matrix.
static float4x4 YUV6012RGBMatrix =
{
1.0, 0.0000000000000000198, 1.0, 0.0,
1.0, -0.195443428310234, -0.509815492616635, 0.0,
1.0, 1.0, 0.0, 0.0,
0.0, 0.0, 0.0, 0.0
};

// Returns `rgb` with its luma kept and its two chroma components (U, V)
// replaced by those of `chroma_rgb`.
//
//   rgb         pixel that supplies Y
//   chroma_rgb  pixel that supplies U and V
//   rgb2yuv     forward table (RGB -> YUV)
//   yuv2rgb     matching inverse table
//
// The returned .a is 0 because the fourth row of the tables is zero.
float4 replace_chroma(float4 rgb, float4 chroma_rgb, float4x4 rgb2yuv, float4x4 yuv2rgb)
{
    float4 yuv = mul(rgb2yuv, rgb);
    yuv.gb = mul(rgb2yuv, chroma_rgb).gb;
    return mul(yuv2rgb, yuv);
}

// Pixel-shader entry point: sharpens only the chroma of the frame and leaves
// luma untouched. The new chroma is
//     2 * centre - (sum of the 8 surrounding taps) / 8
// i.e. the centre plus its difference from the neighbourhood mean. Uses the
// 709 tables when the frame height is at least 720 and the 601 tables
// otherwise.
//
// The RGB -> YUV conversion is linear, so the kernel is applied to the RGB
// samples once and the result is converted once. This gives the same U and V
// (up to float rounding) as converting every tap separately, with nine
// texture samples and two forward conversions per pixel.
float4 main(float2 tex : TEXCOORD0) : COLOR
{
    float4 c0 = tex2D(s0, tex);

    // Tap spacing; one texel if width/height hold the frame size in pixels.
    float dx = 1 / width;
    float dy = 1 / height;

    // Sum of the eight taps surrounding the centre.
    float4 ring = tex2D(s0, tex + float2(0, dy)) + tex2D(s0, tex + float2(0, -dy))
                + tex2D(s0, tex + float2(dx, 0)) + tex2D(s0, tex + float2(-dx, 0))
                + tex2D(s0, tex + float2(dx, dy)) + tex2D(s0, tex + float2(-dx, -dy))
                + tex2D(s0, tex + float2(-dx, dy)) + tex2D(s0, tex + float2(dx, -dy));

    float4 sharpened = c0 * 2 - ring / 8.0;

    if (height >= 720)
    {
        return replace_chroma(c0, sharpened, RGB2YUV709Matrix, YUV7092RGBMatrix);
    }

    return replace_chroma(c0, sharpened, RGB2YUV601Matrix, YUV6012RGBMatrix);
}

用MPC的combine shader来达成，注意顺序就可以了，当然ATI最好在注册表中打开UseBT601CSC＝1的功能，这样在SD和HD下默认拉伸到0－255，在这种情况下nVidia和ATI的画面差别微乎其微，在用SMPTE监测的时候有细微的差别，应该是Convert Matrix系数的细微差别了。

以上仅供大家参考。

japhsoncross · 发表于 2008-8-3 10:49

我想我们在讨论的过程中已经偏离了楼主的初衷。
楼主的图，我看过之后只看到两个问题，首先UV Channel在做拉伸的时候的filter问题（ATI在VMR9默认只用box filter），然后是limited range和full range的问题。
我倾向于相信，楼主使用的VMR9，毕竟overlay下是抓不到图的，VMR7的话，使用意义不大，当然如果我说的不对，楼主可以在后面澄清。
VMR9走的色彩方面处理的流程，据我所知和desktop是一样的。那么对我们用户来说，VMR9显示出来的数据，即抓下来的截图如果和我们期待的是一样的，那就说明我们的努力已经到头了，因为再往后面，就是Curve Control（这个功能我在ATI芯片的spec中看见过，相信nVidia也会有类似的功能）和CLUT（当然我这里抛弃了用户调整驱动中相关色彩的功能，但这是可以排除一切外因的条件）。除非我们可以改写驱动，或者bios，那么显卡的Curve Control和CLUT是没有办法控制的。当然这里，我排除了一切传输过程的损失。
对于视频来说，CPU/GPU解码出来的默认都是YV12（除非用户在软件解码器中特别定制，比如ffdshow的color space converter），YV12中UV的size只有Y'的1/4，那么即使在默认大小都需要对UV进行拉伸，拉伸的算法，就会影响后面用户的感觉。楼主说的字幕问题就是典型的UV拉伸的问题，这不是锐度问题，因为这里ATI的做法就是过于简单，甚至错误。
至于Y'UV转换为R'G'B'应该用601还是709，应该用0-255还是16-235(UV16-240)，这是很显然的问题，应该不用再花时间讨论，因为这只有对错，没有好坏。

[ 本帖最后由 japhsoncross 于 2008-8-3 10:50 编辑 ]

japhsoncross · 发表于 2008-8-3 10:56

原帖由水银于 2008-8-2 23:06 发表
楼上是高手啊，能写一个用于intel整合显卡的shader么？
很多笔记本都是intel显卡的，好像也是画面灰白

呵呵，惭愧了，我还没有用过intel整合显卡啊，手头的朋友的笔记本也都是独立显卡的笔记本，连我自己老旧的笔记本也是nVidia的Geforce2Go，呵呵，古董了。

japhsoncross · 发表于 2008-8-3 11:03

楼主不妨试试我在104楼和105楼贴的shader，可以大大改善这个问题。

另：硬解也可以用shader，不过我的2600Pro很奇怪，用硬解Vmr9工作不正常，很郁闷，不能拉伸。

[ 本帖最后由 japhsoncross 于 2008-8-3 11:05 编辑 ]

japhsoncross · 发表于 2008-8-3 11:32

呵呵，忘了提了，指令太多了，看见我的卡1080p也能流畅跑，也就没改。默认需要PS3.0编译。

帐号		自动登录	找回密码
密码			注册