Alright, I think I have the SCSP's FM synthesis working reasonably well. Shinobi X and NiGHTS sound a bit better now for the most part.

Patch against AO SDK 1.2.0:
Code
diff -Nru aosdk_base/eng_ssf/scsp.c aosdk/eng_ssf/scsp.c
--- aosdk_base/eng_ssf/scsp.c	2007-12-17 12:45:26.000000000 -0800
+++ aosdk/eng_ssf/scsp.c	2008-01-04 19:39:51.000000000 -0800
@@ -36,6 +36,7 @@
 
 
 #define EG_SHIFT	16
+#define FM_DELAY    4    // delay in number of slots processed before samples are written to the FM ring buffer
 
 // include the LFO handling code
 #include "scsplfo.c"
@@ -181,6 +182,10 @@
 	struct _SLOT Slots[32];
 	signed short RINGBUF[64];
 	unsigned char BUFPTR;
+#if FM_DELAY
+	signed short DELAYBUF[FM_DELAY];
+	unsigned char DELAYPTR;
+#endif
 	unsigned char *SCSPRAM;
 	UINT32 SCSPRAM_LENGTH;
 	char Master;
@@ -1114,31 +1119,23 @@
 		addr2=(slot->nxt_addr>>(SHIFT-1))&0x7fffe;
 	}
 
-	/*if(MDL(slot)!=0 || MDXSL(slot)!=0 || MDYSL(slot)!=0)
+	if(MDL(slot)!=0 || MDXSL(slot)!=0 || MDYSL(slot)!=0)
 	{
 		INT32 smp=(SCSP->RINGBUF[(SCSP->BUFPTR+MDXSL(slot))&63]+SCSP->RINGBUF[(SCSP->BUFPTR+MDYSL(slot))&63])/2;
 		INT32 cycle=LEA(slot)-LSA(slot); // cycle corresponds to 2 pi
 
-		smp*=cycle; // associate cycle with full 16-bit sample range
+		smp<<=0xA; // associate cycle with 1024
 		smp>>=0x1A-MDL(slot); // ex. for MDL=0xF, sample range corresponds to +/- 64 pi (32=2^5 cycles) so shift by 11 (16-5 == 0x1A-0xF)
 		while(smp<0) smp+=cycle; smp%=cycle; // keep modulation sampler within a single cycle
 		if(!PCM8B(slot)) smp<<=1;
 		
 		addr1+=smp; addr2+=smp;
-		if(!PCM8B(slot))
-		{
-			addr1&=0x7fffe; addr2&=0x7fffe;
-		}
-		else
-		{
-			addr1&=0x7ffff; addr2&=0x7ffff;
-		}
-	}*/
+	}
 
 	if(PCM8B(slot))	//8 bit signed
 	{
-		INT8 *p1=(signed char *) (SCSP->SCSPRAM+((SA(slot)+addr1)^1));
-		INT8 *p2=(signed char *) (SCSP->SCSPRAM+((SA(slot)+addr2)^1));
+		INT8 *p1=(signed char *) (SCSP->SCSPRAM+(((SA(slot)+addr1)^1)&0x7FFFF));
+		INT8 *p2=(signed char *) (SCSP->SCSPRAM+(((SA(slot)+addr2)^1)&0x7FFFF));
 		//sample=(p[0])<<8;
 		INT32 s;
 		INT32 fpart=slot->cur_addr&((1<<SHIFT)-1);
@@ -1147,8 +1144,8 @@
 	}
 	else	//16 bit signed (endianness?)
 	{
-		INT16 *p1=(signed short *) (slot->base+addr1);
-		INT16 *p2=(signed short *) (slot->base+addr2);
+		INT16 *p1=(signed short *) (SCSP->SCSPRAM+((SA(slot)+addr1)&0x7FFFE));
+		INT16 *p2=(signed short *) (SCSP->SCSPRAM+((SA(slot)+addr2)&0x7FFFE));
 		//sample=LE16(p[0]);
 		INT32 s;
 		INT32 fpart=slot->cur_addr&((1<<SHIFT)-1);
@@ -1248,7 +1245,11 @@
 
 		for(sl=0;sl<32;++sl)
 		{
+#if FM_DELAY
+			RBUFDST=SCSP->DELAYBUF+SCSP->DELAYPTR;
+#else
 			RBUFDST=SCSP->RINGBUF+SCSP->BUFPTR;
+#endif
 			if(SCSP->Slots[sl].active)
 			{
 				struct _SLOT *slot=SCSP->Slots+sl;
@@ -1267,8 +1268,16 @@
 					smpr+=(sample*SCSP->RPANTABLE[Enc])>>SHIFT;
 				}
 			}
-			--SCSP->BUFPTR;
+			
+#if FM_DELAY
+			SCSP->RINGBUF[(SCSP->BUFPTR+64-(FM_DELAY-1))&63] = SCSP->DELAYBUF[(SCSP->DELAYPTR+FM_DELAY-(FM_DELAY-1))%FM_DELAY];
+#endif
+			++SCSP->BUFPTR;
 			SCSP->BUFPTR&=63;
+#if FM_DELAY
+			++SCSP->DELAYPTR;
+			if(SCSP->DELAYPTR>FM_DELAY-1) SCSP->DELAYPTR=0;
+#endif
 		}
 
 		SCSPDSP_Step(&SCSP->DSP);


Changes:
-Improved/enabled FM-synthesis emulation.

Also, I believe the endian XOR in the 8-bit sampler should use BYTE_XOR_BE when this patch is incorporated into MAME (I think a simple "^1" is OK for AO, since SCSP RAM appears to always be organized in a byte-swapped manner). At least, scsp.c used BYTE_XOR_BE before the last MAME SCSP update.