Implement VP9 loop filtering (#550)
Unmerged PR from OG Ryujinx (#4367).

From @gdkchan:

> The main goal of this change is porting the loop filtering from libvpx, which should fix the block artifacts on some VP9 videos on games using NVDEC to decode them. In addition to that, there are two other changes:
>
> - The remaining decoder code required to decode a VP9 video (with headers included) has been added. That was done because it's much better to test the decoder standalone with a video file. I decided to keep that code on the emulator, even if some of it is unused, since it makes standalone testing easier in the future too, and we can include unit tests with video files.
> - Large refactoring of both new and existing code to conform with our conding [sic] styles, done by @TSRBerry (thanks!) Some of it has been automated.
>
> Since we had no loop filtering before, this change will make video decoding slower. That may cause frame drop etc if the decoder is not fast enough in some games. I plan to optimize the decoder more in the future to make up for that, but if possible I'd prefer to not do it as part of this PR, but if the perf loss is too severe I might consider.
>
> This will need to be tested on games that had the block artifacts, it would be nice to confirm if they match hardware now, and get some before/after screenshots etc.

Comment from @Bjorn29512:

> Significantly improves the block artifacts in FE: Engage.
>
> Before: [screenshot omitted]
>
> After: [screenshot omitted]

---------

Co-authored-by: gdkchan <gab.dark.100@gmail.com>
Co-authored-by: TSR Berry <20988865+TSRBerry@users.noreply.github.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
using Ryujinx.Graphics.Nvdec.Vp9.Common;
|
||||
using Ryujinx.Graphics.Nvdec.Vp9.Common;
|
||||
using Ryujinx.Graphics.Nvdec.Vp9.Types;
|
||||
using System;
|
||||
using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.InvTxfm;
|
||||
@@ -8,11 +8,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
internal static class Idct
|
||||
{
|
||||
private delegate void Transform1D(ReadOnlySpan<int> input, Span<int> output);
|
||||
|
||||
private delegate void HighbdTransform1D(ReadOnlySpan<int> input, Span<int> output, int bd);
|
||||
|
||||
private struct Transform2D
|
||||
{
|
||||
public Transform1D Cols, Rows; // Vertical and horizontal
|
||||
public readonly Transform1D Cols; // Vertical and horizontal
|
||||
public readonly Transform1D Rows; // Vertical and horizontal
|
||||
|
||||
public Transform2D(Transform1D cols, Transform1D rows)
|
||||
{
|
||||
@@ -23,7 +25,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
|
||||
private struct HighbdTransform2D
|
||||
{
|
||||
public HighbdTransform1D Cols, Rows; // Vertical and horizontal
|
||||
public readonly HighbdTransform1D Cols; // Vertical and horizontal
|
||||
public readonly HighbdTransform1D Rows; // Vertical and horizontal
|
||||
|
||||
public HighbdTransform2D(HighbdTransform1D cols, HighbdTransform1D rows)
|
||||
{
|
||||
@@ -32,121 +35,124 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
}
|
||||
}
|
||||
|
||||
// 4-point inverse transform pairs, looked up by transform type index.
// Constructor arguments are (columns, rows), i.e. the vertical pass first, then the horizontal pass.
private static readonly Transform2D[] Iht4 =
{
    new Transform2D(Idct4, Idct4),   // DCT_DCT = 0
    new Transform2D(Iadst4, Idct4),  // ADST_DCT = 1
    new Transform2D(Idct4, Iadst4),  // DCT_ADST = 2
    new Transform2D(Iadst4, Iadst4), // ADST_ADST = 3
};
|
||||
|
||||
/// <summary>
/// Runs the 4x4 inverse transform pair selected by <paramref name="txType"/> over
/// <paramref name="input"/> and merges the result into <paramref name="dest"/> via ClipPixelAdd.
/// </summary>
/// <param name="input">Coefficients, consumed as four consecutive rows of four values.</param>
/// <param name="dest">Destination pixels, updated in place.</param>
/// <param name="stride">Element distance between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="txType">Index into the Iht4 table.</param>
public static void Iht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
{
    const int Size = 4;

    Transform2D ht = Iht4[txType];
    Span<int> intermediate = stackalloc int[Size * Size];
    Span<int> column = stackalloc int[Size];
    Span<int> transformed = stackalloc int[Size];

    // Row pass: each input row becomes the matching row of the intermediate buffer.
    ReadOnlySpan<int> srcRow = input;
    Span<int> dstRow = intermediate;
    for (int row = 0; row < Size; row++)
    {
        ht.Rows(srcRow, dstRow);
        srcRow = srcRow.Slice(Size);
        dstRow = dstRow.Slice(Size);
    }

    // Column pass: gather one column, transform it, then add it to the destination.
    for (int col = 0; col < Size; col++)
    {
        for (int row = 0; row < Size; row++)
        {
            column[row] = intermediate[(row * Size) + col];
        }

        ht.Cols(column, transformed);

        for (int row = 0; row < Size; row++)
        {
            int index = (row * stride) + col;
            dest[index] = ClipPixelAdd(dest[index], BitUtils.RoundPowerOfTwo(transformed[row], 4));
        }
    }
}
|
||||
|
||||
// 8-point inverse transform pairs, looked up by transform type index.
// Constructor arguments are (columns, rows), i.e. the vertical pass first, then the horizontal pass.
private static readonly Transform2D[] Iht8 =
{
    new Transform2D(Idct8, Idct8),   // DCT_DCT = 0
    new Transform2D(Iadst8, Idct8),  // ADST_DCT = 1
    new Transform2D(Idct8, Iadst8),  // DCT_ADST = 2
    new Transform2D(Iadst8, Iadst8), // ADST_ADST = 3
};
|
||||
|
||||
/// <summary>
/// Runs the 8x8 inverse transform pair selected by <paramref name="txType"/> over
/// <paramref name="input"/> and merges the result into <paramref name="dest"/> via ClipPixelAdd.
/// </summary>
/// <param name="input">Coefficients, consumed as eight consecutive rows of eight values.</param>
/// <param name="dest">Destination pixels, updated in place.</param>
/// <param name="stride">Element distance between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="txType">Index into the Iht8 table.</param>
public static void Iht8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
{
    const int Size = 8;

    Span<int> intermediate = stackalloc int[Size * Size];
    Span<int> column = stackalloc int[Size];
    Span<int> transformed = stackalloc int[Size];
    Transform2D ht = Iht8[txType];

    // Row pass: each input row becomes the matching row of the intermediate buffer.
    ReadOnlySpan<int> srcRow = input;
    Span<int> dstRow = intermediate;
    for (int row = 0; row < Size; row++)
    {
        ht.Rows(srcRow, dstRow);
        srcRow = srcRow.Slice(Size);
        dstRow = dstRow.Slice(Size);
    }

    // Column pass: gather one column, transform it, then add it to the destination.
    for (int col = 0; col < Size; col++)
    {
        for (int row = 0; row < Size; row++)
        {
            column[row] = intermediate[(row * Size) + col];
        }

        ht.Cols(column, transformed);

        for (int row = 0; row < Size; row++)
        {
            int index = (row * stride) + col;
            dest[index] = ClipPixelAdd(dest[index], BitUtils.RoundPowerOfTwo(transformed[row], 5));
        }
    }
}
|
||||
|
||||
// 16-point inverse transform pairs, looked up by transform type index.
// Constructor arguments are (columns, rows), i.e. the vertical pass first, then the horizontal pass.
private static readonly Transform2D[] Iht16 =
{
    new Transform2D(Idct16, Idct16),   // DCT_DCT = 0
    new Transform2D(Iadst16, Idct16),  // ADST_DCT = 1
    new Transform2D(Idct16, Iadst16),  // DCT_ADST = 2
    new Transform2D(Iadst16, Iadst16), // ADST_ADST = 3
};
|
||||
|
||||
/// <summary>
/// Runs the 16x16 inverse transform pair selected by <paramref name="txType"/> over
/// <paramref name="input"/> and merges the result into <paramref name="dest"/> via ClipPixelAdd.
/// </summary>
/// <param name="input">Coefficients, consumed as sixteen consecutive rows of sixteen values.</param>
/// <param name="dest">Destination pixels, updated in place.</param>
/// <param name="stride">Element distance between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="txType">Index into the Iht16 table.</param>
public static void Iht16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
{
    const int Size = 16;

    Span<int> intermediate = stackalloc int[Size * Size];
    Span<int> column = stackalloc int[Size];
    Span<int> transformed = stackalloc int[Size];
    Transform2D ht = Iht16[txType];

    // Rows: each input row becomes the matching row of the intermediate buffer.
    ReadOnlySpan<int> srcRow = input;
    Span<int> dstRow = intermediate;
    for (int row = 0; row < Size; row++)
    {
        ht.Rows(srcRow, dstRow);
        srcRow = srcRow.Slice(Size);
        dstRow = dstRow.Slice(Size);
    }

    // Columns: gather one column, transform it, then add it to the destination.
    for (int col = 0; col < Size; col++)
    {
        for (int row = 0; row < Size; row++)
        {
            column[row] = intermediate[(row * Size) + col];
        }

        ht.Cols(column, transformed);

        for (int row = 0; row < Size; row++)
        {
            int index = (row * stride) + col;
            dest[index] = ClipPixelAdd(dest[index], BitUtils.RoundPowerOfTwo(transformed[row], 6));
        }
    }
}
|
||||
@@ -268,7 +274,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
}
|
||||
|
||||
public static void Iht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest,
|
||||
int stride, int eob)
|
||||
int stride, int eob)
|
||||
{
|
||||
if (txType == TxType.DctDct)
|
||||
{
|
||||
@@ -280,121 +286,125 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
}
|
||||
}
|
||||
|
||||
// High-bit-depth 4-point inverse transform pairs, looked up by transform type index.
// Constructor arguments are (columns, rows), i.e. the vertical pass first, then the horizontal pass.
private static readonly HighbdTransform2D[] HighbdIht4 =
{
    new HighbdTransform2D(HighbdIdct4, HighbdIdct4),   // DCT_DCT = 0
    new HighbdTransform2D(HighbdIadst4, HighbdIdct4),  // ADST_DCT = 1
    new HighbdTransform2D(HighbdIdct4, HighbdIadst4),  // DCT_ADST = 2
    new HighbdTransform2D(HighbdIadst4, HighbdIadst4), // ADST_ADST = 3
};
|
||||
|
||||
/// <summary>
/// Runs the high-bit-depth 4x4 inverse transform pair selected by <paramref name="txType"/> over
/// <paramref name="input"/> and merges the result into <paramref name="dest"/> via HighbdClipPixelAdd.
/// </summary>
/// <param name="input">Coefficients, consumed as four consecutive rows of four values.</param>
/// <param name="dest">Destination samples, updated in place.</param>
/// <param name="stride">Element distance between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="txType">Index into the HighbdIht4 table.</param>
/// <param name="bd">Forwarded unchanged to the 1-D transforms and to HighbdClipPixelAdd.</param>
public static void HighbdIht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
{
    const int Size = 4;

    HighbdTransform2D ht = HighbdIht4[txType];
    Span<int> intermediate = stackalloc int[Size * Size];
    Span<int> column = stackalloc int[Size];
    Span<int> transformed = stackalloc int[Size];

    // Row pass: each input row becomes the matching row of the intermediate buffer.
    ReadOnlySpan<int> srcRow = input;
    Span<int> dstRow = intermediate;
    for (int row = 0; row < Size; row++)
    {
        ht.Rows(srcRow, dstRow, bd);
        srcRow = srcRow.Slice(Size);
        dstRow = dstRow.Slice(Size);
    }

    // Column pass: gather one column, transform it, then add it to the destination.
    for (int col = 0; col < Size; col++)
    {
        for (int row = 0; row < Size; row++)
        {
            column[row] = intermediate[(row * Size) + col];
        }

        ht.Cols(column, transformed, bd);

        for (int row = 0; row < Size; row++)
        {
            int index = (row * stride) + col;
            dest[index] = HighbdClipPixelAdd(dest[index], BitUtils.RoundPowerOfTwo(transformed[row], 4), bd);
        }
    }
}
|
||||
|
||||
// High-bit-depth 8-point inverse transform pairs, looked up by transform type index.
// Constructor arguments are (columns, rows), i.e. the vertical pass first, then the horizontal pass.
private static readonly HighbdTransform2D[] HighIht8 =
{
    new HighbdTransform2D(HighbdIdct8, HighbdIdct8),   // DCT_DCT = 0
    new HighbdTransform2D(HighbdIadst8, HighbdIdct8),  // ADST_DCT = 1
    new HighbdTransform2D(HighbdIdct8, HighbdIadst8),  // DCT_ADST = 2
    new HighbdTransform2D(HighbdIadst8, HighbdIadst8), // ADST_ADST = 3
};
|
||||
|
||||
/// <summary>
/// Runs the high-bit-depth 8x8 inverse transform pair selected by <paramref name="txType"/> over
/// <paramref name="input"/> and merges the result into <paramref name="dest"/> via HighbdClipPixelAdd.
/// </summary>
/// <param name="input">Coefficients, consumed as eight consecutive rows of eight values.</param>
/// <param name="dest">Destination samples, updated in place.</param>
/// <param name="stride">Element distance between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="txType">Index into the HighIht8 table.</param>
/// <param name="bd">Forwarded unchanged to the 1-D transforms and to HighbdClipPixelAdd.</param>
public static void HighbdIht8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
{
    const int Size = 8;

    Span<int> output = stackalloc int[Size * Size];
    Span<int> outptr = output;
    Span<int> tempIn = stackalloc int[Size];
    Span<int> tempOut = stackalloc int[Size];
    HighbdTransform2D ht = HighIht8[txType];

    // Inverse transform row vectors.
    // The output cursor must advance from its own previous position. Re-slicing from the start of
    // the buffer on every iteration would send rows 1..7 to row 1 and leave rows 2..7 unwritten.
    for (int i = 0; i < Size; ++i)
    {
        ht.Rows(input, outptr, bd);
        input = input.Slice(Size);
        outptr = outptr.Slice(Size);
    }

    // Inverse transform column vectors.
    for (int i = 0; i < Size; ++i)
    {
        for (int j = 0; j < Size; ++j)
        {
            tempIn[j] = output[(j * Size) + i];
        }

        ht.Cols(tempIn, tempOut, bd);

        for (int j = 0; j < Size; ++j)
        {
            int index = (j * stride) + i;
            dest[index] = HighbdClipPixelAdd(dest[index], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
        }
    }
}
|
||||
|
||||
// High-bit-depth 16-point inverse transform pairs, looked up by transform type index.
// Constructor arguments are (columns, rows), i.e. the vertical pass first, then the horizontal pass.
private static readonly HighbdTransform2D[] HighIht16 =
{
    new HighbdTransform2D(HighbdIdct16, HighbdIdct16),   // DCT_DCT = 0
    new HighbdTransform2D(HighbdIadst16, HighbdIdct16),  // ADST_DCT = 1
    new HighbdTransform2D(HighbdIdct16, HighbdIadst16),  // DCT_ADST = 2
    new HighbdTransform2D(HighbdIadst16, HighbdIadst16), // ADST_ADST = 3
};
|
||||
|
||||
/// <summary>
/// Runs the high-bit-depth 16x16 inverse transform pair selected by <paramref name="txType"/> over
/// <paramref name="input"/> and merges the result into <paramref name="dest"/> via HighbdClipPixelAdd.
/// </summary>
/// <param name="input">Coefficients, consumed as sixteen consecutive rows of sixteen values.</param>
/// <param name="dest">Destination samples, updated in place.</param>
/// <param name="stride">Element distance between vertically adjacent entries of <paramref name="dest"/>.</param>
/// <param name="txType">Index into the HighIht16 table.</param>
/// <param name="bd">Forwarded unchanged to the 1-D transforms and to HighbdClipPixelAdd.</param>
public static void HighbdIht16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType,
    int bd)
{
    const int Size = 16;

    Span<int> output = stackalloc int[Size * Size];
    Span<int> outptr = output;
    Span<int> tempIn = stackalloc int[Size];
    Span<int> tempOut = stackalloc int[Size];
    HighbdTransform2D ht = HighIht16[txType];

    // Rows
    // The output cursor must advance from its own previous position. Re-slicing from the start of
    // the buffer on every iteration would send rows 1..15 to row 1 and leave rows 2..15 unwritten.
    for (int i = 0; i < Size; ++i)
    {
        ht.Rows(input, outptr, bd);
        input = input.Slice(Size);
        outptr = outptr.Slice(Size);
    }

    // Columns
    for (int i = 0; i < Size; ++i)
    {
        for (int j = 0; j < Size; ++j)
        {
            tempIn[j] = output[(j * Size) + i];
        }

        ht.Cols(tempIn, tempOut, bd);

        for (int j = 0; j < Size; ++j)
        {
            int index = (j * stride) + i;
            dest[index] = HighbdClipPixelAdd(dest[index], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
        }
    }
}
|
||||
@@ -434,7 +444,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
// DC only DCT coefficient
|
||||
if (eob == 1)
|
||||
{
|
||||
Vpx_Highbdidct8x8_1_add_c(input, dest, stride, bd);
|
||||
VpxHighbdidct8x81AddC(input, dest, stride, bd);
|
||||
}
|
||||
else if (eob <= 12)
|
||||
{
|
||||
@@ -491,7 +501,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
}
|
||||
|
||||
// Iht
|
||||
public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
|
||||
public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride,
|
||||
int eob, int bd)
|
||||
{
|
||||
if (txType == TxType.DctDct)
|
||||
{
|
||||
@@ -503,7 +514,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
}
|
||||
}
|
||||
|
||||
public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
|
||||
public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride,
|
||||
int eob, int bd)
|
||||
{
|
||||
if (txType == TxType.DctDct)
|
||||
{
|
||||
@@ -515,7 +527,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
}
|
||||
}
|
||||
|
||||
public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
|
||||
public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride,
|
||||
int eob, int bd)
|
||||
{
|
||||
if (txType == TxType.DctDct)
|
||||
{
|
||||
@@ -527,4 +540,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user