Previous Thread
Next Thread
Print Thread
#93517 03/26/14 02:13 PM
Joined: Apr 2010
Posts: 57
H
Happy Offline OP
Member
OP Offline
Member
H
Joined: Apr 2010
Posts: 57
I am posting this here to allow any interested parties a chance to comment/respond.

I will admit to not knowing every last detail of SSE, but from what I do understand this is what I can identify as potential issues with this implementation.

VMACF interprets its two operands as s1.15 fractions. It does a multiplication, a precision shift (to line up the fractional part with the accumulator), and then adds the result to the 'l' and 'll' parts of the accumulator. Since the 'll' part isn't really stored it is rounded up.

The // comments are mine.

Code:
/*
 * SIMD (SSE) handler for the RSP VMACF op (per the function name).
 *
 * param is the rsp_state pointer passed as void*. The function reads two
 * source vectors from rsp->xv[], multiplies them lane by lane as signed
 * 16-bit values, doubles each product, and adds the result into the
 * accumulator slices rsp->accum_l / accum_m / accum_h. The saturated
 * middle/high result is written to rsp->xv[VDREG].
 *
 * NOTE(review): VS1REG, VS2REG, VDREG and EL are macros defined outside this
 * view; presumably they decode fields from the local `op` - confirm.
 * RSPZeroExtend16to32, RSPPackLo32to16 and RSPPackHi32to16 are also defined
 * elsewhere; the comments below describe them by name only.
 */
INLINE void cfunc_rsp_vmacf_simd(void *param)
{
	rsp_state *rsp = (rsp_state*)param;
	int op = rsp->impstate->arg0;

	__m128i loProduct, hiProduct, unpackLo, unpackHi;
	__m128i vaccHigh;
	__m128i vdReg, vdRegLo, vdRegHi;

	/* vt is byte-shuffled through the table entry selected by EL before use. */
	__m128i vsReg = rsp->xv[VS1REG];
	__m128i vtReg = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);

	__m128i vaccLow = rsp->accum_l;

	/* Unpack the low accumulator slice into two vectors of 32-bit lanes
	 * (presumably zero-extended, going by the helper's name). */
	RSPZeroExtend16to32(vaccLow, &vaccLow, &vaccHigh);

	/* Begin accumulating the products. */
	/* mullo/mulhi give the low and high 16 bits of each signed 16x16 product;
	 * interleaving them rebuilds the full 32-bit products for lanes 0-3
	 * (loProduct) and lanes 4-7 (hiProduct). */
	unpackLo = _mm_mullo_epi16(vsReg, vtReg);
	unpackHi = _mm_mulhi_epi16(vsReg, vtReg);
	loProduct = _mm_unpacklo_epi16(unpackLo, unpackHi);
	hiProduct = _mm_unpackhi_epi16(unpackLo, unpackHi);

// Possibly throwing away the sign bit
	/* Double each product; bit 31 of the original product is shifted out. */
	loProduct = _mm_slli_epi32(loProduct, 1);
	hiProduct = _mm_slli_epi32(hiProduct, 1);

	/* Shift right then left by 16 to keep only the upper 16 bits of each lane... */
	vdRegLo = _mm_srli_epi32(loProduct, 16);
	vdRegHi = _mm_srli_epi32(hiProduct, 16);
	vdRegLo = _mm_slli_epi32(vdRegLo, 16);
	vdRegHi = _mm_slli_epi32(vdRegHi, 16);
//
	/* ...then XOR with the product to cancel them, leaving only the lower 16 bits. */
	vdRegLo = _mm_xor_si128(vdRegLo, loProduct);
	vdRegHi = _mm_xor_si128(vdRegHi, hiProduct);

// Does a carry/overflow propagate correctly when adding to the 48 accumulator bits using 32 bit adds?
	/* Add the low 16 product bits to the widened low accumulator slice; any
	 * carry out of bit 15 lands in bit 16 of the 32-bit lane. */
	vaccLow = _mm_add_epi32(vaccLow, vdRegLo);
	vaccHigh = _mm_add_epi32(vaccHigh, vdRegHi);

	/* Store the new low slice. vdReg is assigned here but not read again in
	 * this function. */
	rsp->accum_l = vdReg = RSPPackLo32to16(vaccLow, vaccHigh);

	/* Multiply the MSB of sources, accumulate the product. */
	/* Interleave accum_m (low half) with accum_h (high half) so each 32-bit
	 * lane holds the combined middle:high accumulator value. */
	vdRegLo = _mm_unpacklo_epi16(rsp->accum_m, rsp->accum_h);
	vdRegHi = _mm_unpackhi_epi16(rsp->accum_m, rsp->accum_h);

	/* Arithmetic shift: sign-extended upper 16 bits of the doubled product. */
	loProduct = _mm_srai_epi32(loProduct, 16);
	hiProduct = _mm_srai_epi32(hiProduct, 16);
	/* Upper 16 bits of the low-slice sum, i.e. the carry out of the low add. */
	vaccLow = _mm_srai_epi32(vaccLow, 16);
	vaccHigh = _mm_srai_epi32(vaccHigh, 16);

	/* product high part + carry + previous middle:high accumulator. */
	vaccLow = _mm_add_epi32(loProduct, vaccLow);
	vaccHigh = _mm_add_epi32(hiProduct, vaccHigh);
	vaccLow = _mm_add_epi32(vdRegLo, vaccLow);
	vaccHigh = _mm_add_epi32(vdRegHi, vaccHigh);
//

	/* Clamp the accumulator and write it all out. */
	/* _mm_packs_epi32 saturates each signed 32-bit lane to signed 16 bits for
	 * the destination register; the accumulator slices are stored via the
	 * pack helpers (presumably the low and high 16 bits of each lane). */
	rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh);
	rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh);
	rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh);
}

Last edited by Happy; 03/26/14 02:31 PM. Reason: Sudden enlightenment
Joined: May 2009
Posts: 2,036
Likes: 77
J
Very Senior Member
Offline
Very Senior Member
J
Joined: May 2009
Posts: 2,036
Likes: 77
Originally Posted By Happy
I am posting this here to allow any interested parties a chance to comment/respond.

I will admit to not knowing every last detail of SSE, but from what I do understand this is what I can identify as potential issues with this implementation.

VMACF interprets its two operands as s1.15 fractions. It does a multiplication, a precision shift (to line up the fractional part with the accumulator), and then adds the result to the 'l' and 'll' parts of the accumulator. Since the 'll' part isn't really stored it is rounded up.

The // comments are mine.

Code:
/*
 * SIMD (SSE) handler for the RSP VMACF op (per the function name).
 *
 * param is the rsp_state pointer passed as void*. The function reads two
 * source vectors from rsp->xv[], multiplies them lane by lane as signed
 * 16-bit values, doubles each product, and adds the result into the
 * accumulator slices rsp->accum_l / accum_m / accum_h. The saturated
 * middle/high result is written to rsp->xv[VDREG].
 *
 * NOTE(review): VS1REG, VS2REG, VDREG and EL are macros defined outside this
 * view; presumably they decode fields from the local `op` - confirm.
 * RSPZeroExtend16to32, RSPPackLo32to16 and RSPPackHi32to16 are also defined
 * elsewhere; the comments below describe them by name only.
 */
INLINE void cfunc_rsp_vmacf_simd(void *param)
{
	rsp_state *rsp = (rsp_state*)param;
	int op = rsp->impstate->arg0;

	__m128i loProduct, hiProduct, unpackLo, unpackHi;
	__m128i vaccHigh;
	__m128i vdReg, vdRegLo, vdRegHi;

	/* vt is byte-shuffled through the table entry selected by EL before use. */
	__m128i vsReg = rsp->xv[VS1REG];
	__m128i vtReg = _mm_shuffle_epi8(rsp->xv[VS2REG], vec_shuf_inverse[EL]);

	__m128i vaccLow = rsp->accum_l;

	/* Unpack the low accumulator slice into two vectors of 32-bit lanes
	 * (presumably zero-extended, going by the helper's name). */
	RSPZeroExtend16to32(vaccLow, &vaccLow, &vaccHigh);

	/* Begin accumulating the products. */
	/* mullo/mulhi give the low and high 16 bits of each signed 16x16 product;
	 * interleaving them rebuilds the full 32-bit products for lanes 0-3
	 * (loProduct) and lanes 4-7 (hiProduct). */
	unpackLo = _mm_mullo_epi16(vsReg, vtReg);
	unpackHi = _mm_mulhi_epi16(vsReg, vtReg);
	loProduct = _mm_unpacklo_epi16(unpackLo, unpackHi);
	hiProduct = _mm_unpackhi_epi16(unpackLo, unpackHi);

// Possibly throwing away the sign bit
	/* Double each product; bit 31 of the original product is shifted out. */
	loProduct = _mm_slli_epi32(loProduct, 1);
	hiProduct = _mm_slli_epi32(hiProduct, 1);

	/* Shift right then left by 16 to keep only the upper 16 bits of each lane... */
	vdRegLo = _mm_srli_epi32(loProduct, 16);
	vdRegHi = _mm_srli_epi32(hiProduct, 16);
	vdRegLo = _mm_slli_epi32(vdRegLo, 16);
	vdRegHi = _mm_slli_epi32(vdRegHi, 16);
//
	/* ...then XOR with the product to cancel them, leaving only the lower 16 bits. */
	vdRegLo = _mm_xor_si128(vdRegLo, loProduct);
	vdRegHi = _mm_xor_si128(vdRegHi, hiProduct);

// Does a carry/overflow propagate correctly when adding to the 48 accumulator bits using 32 bit adds?
	/* Add the low 16 product bits to the widened low accumulator slice; any
	 * carry out of bit 15 lands in bit 16 of the 32-bit lane. */
	vaccLow = _mm_add_epi32(vaccLow, vdRegLo);
	vaccHigh = _mm_add_epi32(vaccHigh, vdRegHi);

	/* Store the new low slice. vdReg is assigned here but not read again in
	 * this function. */
	rsp->accum_l = vdReg = RSPPackLo32to16(vaccLow, vaccHigh);

	/* Multiply the MSB of sources, accumulate the product. */
	/* Interleave accum_m (low half) with accum_h (high half) so each 32-bit
	 * lane holds the combined middle:high accumulator value. */
	vdRegLo = _mm_unpacklo_epi16(rsp->accum_m, rsp->accum_h);
	vdRegHi = _mm_unpackhi_epi16(rsp->accum_m, rsp->accum_h);

	/* Arithmetic shift: sign-extended upper 16 bits of the doubled product. */
	loProduct = _mm_srai_epi32(loProduct, 16);
	hiProduct = _mm_srai_epi32(hiProduct, 16);
	/* Upper 16 bits of the low-slice sum, i.e. the carry out of the low add. */
	vaccLow = _mm_srai_epi32(vaccLow, 16);
	vaccHigh = _mm_srai_epi32(vaccHigh, 16);

	/* product high part + carry + previous middle:high accumulator. */
	vaccLow = _mm_add_epi32(loProduct, vaccLow);
	vaccHigh = _mm_add_epi32(hiProduct, vaccHigh);
	vaccLow = _mm_add_epi32(vdRegLo, vaccLow);
	vaccHigh = _mm_add_epi32(vdRegHi, vaccHigh);
//

	/* Clamp the accumulator and write it all out. */
	/* _mm_packs_epi32 saturates each signed 32-bit lane to signed 16 bits for
	 * the destination register; the accumulator slices are stored via the
	 * pack helpers (presumably the low and high 16 bits of each lane). */
	rsp->xv[VDREG] = _mm_packs_epi32(vaccLow, vaccHigh);
	rsp->accum_m = RSPPackLo32to16(vaccLow, vaccHigh);
	rsp->accum_h = RSPPackHi32to16(vaccLow, vaccHigh);
}


I have no idea if you're right or not. Take a shot at fixing it and see if SIMUL_SIMD passes.


Link Copied to Clipboard
Who's Online Now
2 members (Pernod, 1 invisible), 21 guests, and 2 robots.
Key: Admin, Global Mod, Mod
ShoutChat
Comment Guidelines: Do post respectful and insightful comments. Don't flame, hate, spam.
Forum Statistics
Forums9
Topics9,086
Posts119,088
Members5,014
Most Online890
Jan 17th, 2020
Our Sponsor
These forums are sponsored by Superior Solitaire, an ad-free card game collection for macOS and iOS. Download it today!

Superior Solitaire
Forum hosted by www.retrogamesformac.com