Nice! Here's a first-pass attempt at some improvements:

Code
diff -Nru aosdk_base/eng_dsf/aica.c aosdk/eng_dsf/aica.c
--- aosdk_base/eng_dsf/aica.c	2008-01-29 22:57:08.000000000 -0800
+++ aosdk/eng_dsf/aica.c	2008-01-30 01:05:44.000000000 -0800
@@ -59,7 +59,7 @@
 #define DL(slot)		((slot->udata.data[0x14/2]>>0x5)&0x001F)
 #define RR(slot)		((slot->udata.data[0x14/2]>>0x0)&0x001F)
 
-#define TL(slot)		((slot->udata.data[0x20/2]>>0x0)&0x00FF)
+#define TL(slot)		((slot->udata.data[0x28/2]>>0x8)&0x00FF)
 
 #define OCT(slot)		((slot->udata.data[0x18/2]>>0xB)&0x000F)
 #define FNS(slot)		((slot->udata.data[0x18/2]>>0x0)&0x03FF)
@@ -71,10 +71,10 @@
 #define ALFOWS(slot)		((slot->udata.data[0x1c/2]>>0x3)&0x0003)
 #define ALFOS(slot)		((slot->udata.data[0x1c/2]>>0x0)&0x0007)
 
-#define ISEL(slot)		((slot->udata.data[0x20/2]>>0x3)&0x000F)
-#define IMXL(slot)		((slot->udata.data[0x24/2]>>12)&0x0007)
+#define ISEL(slot)		((slot->udata.data[0x20/2]>>0x4)&0x000F)
+#define IMXL(slot)		((slot->udata.data[0x20/2]>>0x0)&0x000F)
 
-#define DISDL(slot)		((slot->udata.data[0x24/2]>>0x8)&0x0007)
+#define DISDL(slot)		((slot->udata.data[0x24/2]>>0x8)&0x000F)
 #define DIPAN(slot)		((slot->udata.data[0x24/2]>>0x0)&0x001F)
 
 #define EFSDL(slot)		((AICA->EFSPAN[slot/2]>>8)&0x000f)
@@ -113,8 +113,8 @@
 {
 	union
 	{
-		UINT16 data[0x10];	//only 0x1a bytes used
-		UINT8 datab[0x20];
+		UINT16 data[0x40];	//only 0x1a bytes used
+		UINT8 datab[0x80];
 	} udata;
 	UINT8 active;	//this slot is currently playing
 	UINT8 *base;		//samples base address
@@ -126,11 +126,11 @@
 	struct _LFO PLFO;		//Phase LFO
 	struct _LFO ALFO;		//Amplitude LFO
 	int slot;
-	signed short Prev, PPrev; // Previous ADPCM sample (for interpolation)
-	int PrevQuant;
-	int PrevSignal;
-	unsigned int LastDecAddr;	//Last decoded address for ADPCM
-	unsigned int ADStep;
+	int cur_sample;    //current ADPCM sample
+	int nxt_sample;    //next ADPCM sample
+	int cur_quant;     //current ADPCM step
+	int nxt_quant;     //next ADPCM step
+	int do_adpcm;      //do ADPCM decoding
 };
 
 
@@ -185,8 +185,8 @@
 	UINT8 MidiStack[16];
 	UINT8 MidiW,MidiR;
 
-	int LPANTABLE[0x10000];
-	int RPANTABLE[0x10000];
+	int LPANTABLE[0x20000];
+	int RPANTABLE[0x20000];
 
 	int TimPris[3];
 	int TimCnt[3];
@@ -203,7 +203,7 @@
 
 static struct _AICA *AllocedAICA;
 
-static const float SDLT[8]={-1000000.0,-36.0,-30.0,-24.0,-18.0,-12.0,-6.0,0.0};
+static const float SDLT[16]={-1000000.0,-39.0,-36.0,-33.0,-30.0,-27.0,-24.0,-21.0,-18.0,-15.0,-12.0,-9.0,-6.0,-3.0,0.0};
 
 static INT16 *bufferl;
 static INT16 *bufferr;
@@ -428,14 +428,13 @@
 
 	if (PCMS(slot) >= 2)
 	{
-		InitADPCM(&slot->PrevSignal, &slot->PrevQuant);
-		slot->LastDecAddr = slot->cur_addr>>SHIFT;
-		slot->ADStep = 0;
-		slot->Prev = slot->PPrev = 0;
+		slot->do_adpcm=1;
+		InitADPCM(&(slot->cur_sample), &(slot->cur_quant));
+		InitADPCM(&(slot->nxt_sample), &(slot->nxt_quant));
 	}
 
-	printf("StartSlot: SA %x PCMS %x LPCTL %x ALFOS %x TL %x\n", SA(slot), PCMS(slot), LPCTL(slot), ALFOS(slot), TL(slot));
-	printf("           AR %x D1R %x D2R %x RR %x DL %x KRS %x EGHOLD %x LPSLNK %x\n", AR(slot), D1R(slot), D2R(slot), RR(slot), DL(slot), KRS(slot), EGHOLD(slot), LPSLNK(slot));
+	//printf("StartSlot: SA %x PCMS %x LPCTL %x ALFOS %x TL %x\n", SA(slot), PCMS(slot), LPCTL(slot), ALFOS(slot), TL(slot));
+	//printf("           AR %x D1R %x D2R %x RR %x DL %x KRS %x EGHOLD %x LPSLNK %x\n", AR(slot), D1R(slot), D2R(slot), RR(slot), DL(slot), KRS(slot), EGHOLD(slot), LPSLNK(slot));
 }
 
 static void AICA_StopSlot(struct _SLOT *slot,int keyoff)
@@ -497,11 +496,11 @@
 		EG_TABLE[i]=(INT32)(pow(10.0,envDB/20.0)*scale);
 	}
 
-	for(i=0;i<0x10000;++i)
+	for(i=0;i<0x20000;++i)
 	{
 		int iTL =(i>>0x0)&0xff;
 		int iPAN=(i>>0x8)&0x1f;
-		int iSDL=(i>>0xD)&0x07;
+		int iSDL=(i>>0xD)&0x0F;
 		float TL=1.0;
 		float SegaDB=0;
 		float fSDL=1.0;
@@ -575,6 +574,7 @@
 		AICA->Slots[i].slot=i;
 		AICA->Slots[i].active=0;
 		AICA->Slots[i].base=NULL;
+		AICA->Slots[i].EG.state=RELEASE;
 	}
 
 	AICALFO_Init();
@@ -605,9 +605,17 @@
 				{
 					struct _SLOT *s2=AICA->Slots+sl;
 					{
-						if(KEYONB(s2)) // && s2->EG.state==RELEASE/*&& !s2->active*/)
+						if(KEYONB(s2) && s2->EG.state==RELEASE/*&& !s2->active*/)
 						{
 							AICA_StartSlot(AICA, s2);
+							
+							printf("StartSlot[%02X]:   SSCTL %01X SA %06X LSA %04X LEA %04X PCMS %01X LPCTL %01X\n",sl,SSCTL(s2),SA(s2),LSA(s2),LEA(s2),PCMS(s2),LPCTL(s2));
+							printf("                 EGHOLD %01X AR %02X D1R %02X D2R %02X RR %02X DL %02X KRS %01X LPSLNK %01X\n",EGHOLD(s2)>>5,AR(s2),D1R(s2),D2R(s2),RR(s2),DL(s2),KRS(s2),LPSLNK(s2)>>14);
+							printf("                 TL %02X OCT %01X FNS %03X\n",TL(s2),OCT(s2),FNS(s2));
+							printf("                 LFORE %01X LFOF %02X ALFOWS %01X ALFOS %01X PLFOWS %01X PLFOS %01X\n",LFORE(s2),LFOF(s2),ALFOWS(s2),ALFOS(s2),PLFOWS(s2),PLFOS(s2));
+							printf("                 IMXL %01X ISEL %01X DISDL %01X DIPAN %02X\n",IMXL(s2),ISEL(s2),DISDL(s2),DIPAN(s2));
+							printf("\n");
+							fflush(stdout);
 						}
 						if(!KEYONB(s2) /*&& s2->active*/)
 						{
@@ -954,55 +962,38 @@
 		s=(int) LE16(p1[0])*((1<<SHIFT)-fpart)+(int) LE16(p2[0])*fpart;
 		sample=(s>>SHIFT);
 	}
-	else	// ADPCM
+	else	// 4-bit ADPCM
 	{
-		slot->ADStep+=step;
-		if(slot->ADStep>>SHIFT)
+		UINT8 *p1=(unsigned char *) (AICA->AICARAM+((SA(slot)+(addr1>>1))&0x1fffff));
+		UINT8 *p2=(unsigned char *) (AICA->AICARAM+((SA(slot)+(addr2>>1))&0x1fffff));
+		INT32 s;
+		INT32 fpart=slot->cur_addr&((1<<SHIFT)-1);
+		if (slot->do_adpcm)
 		{
-			int hl=(slot->cur_addr>>SHIFT)&1;
-			INT8 *p=(signed char *) (AICA->AICARAM+(((SA(slot)+addr1))&0x1fffff));
-			int ca=slot->cur_addr>>SHIFT;
-			int steps=slot->ADStep>>SHIFT;
-			
-			slot->PPrev=slot->Prev;
-			if(!steps)
-				steps=1;
-			slot->ADStep&=(1<<SHIFT)-1;
-			while(steps--)
-			{
-				slot->Prev=DecodeADPCM(&slot->PrevSignal, (p[0]>>(4*hl))&0xF, &slot->PrevQuant);
-				hl^=1;
-				if(!hl)
-				{
-					++ca;
-					if(ca>=LEA(slot))
-					{
-						ca=LSA(slot);
-						hl=ca&1;
-						p=(unsigned char *) (slot->base+(ca>>1));
-					}
-					else
-						++p;
-				}
-			}
-			slot->LastDecAddr=slot->cur_addr>>SHIFT;
+			int shift1 = 4*((addr1&1)^1);
+			int shift2 = 4*((addr2&1)^1);
+			int delta1 = (*p1>>shift1)&0xF;
+			int delta2 = (*p2>>shift2)&0xF;
+			DecodeADPCM(&(slot->cur_sample),delta1,&(slot->cur_quant));
+			slot->nxt_sample=slot->cur_sample;
+			slot->nxt_quant=slot->cur_quant;
+			DecodeADPCM(&(slot->nxt_sample),delta2,&(slot->nxt_quant));
 		}
-		int s;
-		signed int fpart=slot->ADStep&((1<<SHIFT)-1);
-		s=(int) slot->PPrev*((1<<SHIFT)-fpart)+(int) slot->Prev*fpart;
-		sample=CHOOSE(s>>SHIFT,slot->Prev);
+		s=(slot->cur_sample)*((1<<SHIFT)-fpart)+(slot->nxt_sample)*fpart;
+		sample=(s>>SHIFT);
 	}
 
-	if(slot->Backwards)
-		slot->cur_addr-=step;
-	else
-		slot->cur_addr+=step;
+	// Only do an ADPCM decode when crossing a whole-address boundary
+	if(((slot->cur_addr+step)>>SHIFT)>((slot->cur_addr)>>SHIFT)) slot->do_adpcm=1;
+	else slot->do_adpcm=0;
+	
+	slot->cur_addr+=step;
 	slot->nxt_addr=slot->cur_addr+(1<<SHIFT);
 	
 	addr1=slot->cur_addr>>SHIFT;
 	addr2=slot->nxt_addr>>SHIFT;
 	
-	if(addr1>=LSA(slot) && !(slot->Backwards))
+	if(addr1>=LSA(slot))
 	{
 		if(LPSLNK(slot) && slot->EG.state==ATTACK)
 			slot->EG.state = DECAY1;
@@ -1027,33 +1018,6 @@
 				*slot_addr[addr_select]=(LSA(slot)<<SHIFT) + rem_addr;
 			}
 			break;
-		case 2:	//reverse loop
-			if((*addr[addr_select]>=LSA(slot)) && !(slot->Backwards))
-			{
-				rem_addr = *slot_addr[addr_select] - (LSA(slot)<<SHIFT);
-				*slot_addr[addr_select]=(LEA(slot)<<SHIFT) - rem_addr;
-				slot->Backwards=1;
-			}
-			else if((*addr[addr_select]<LSA(slot) || (*slot_addr[addr_select]&0x80000000)) && slot->Backwards)
-			{
-				rem_addr = (LSA(slot)<<SHIFT) - *slot_addr[addr_select];
-				*slot_addr[addr_select]=(LEA(slot)<<SHIFT) - rem_addr;
-			}
-			break;
-		case 3: //ping-pong
-			if(*addr[addr_select]>=LEA(slot)) //reached end, reverse till start
-			{
-				rem_addr = *slot_addr[addr_select] - (LEA(slot)<<SHIFT); 
-				*slot_addr[addr_select]=(LEA(slot)<<SHIFT) - rem_addr;
-				slot->Backwards=1;
-			}
-			else if((*addr[addr_select]<LSA(slot) || (*slot_addr[addr_select]&0x80000000)) && slot->Backwards)//reached start or negative
-			{
-				rem_addr = (LSA(slot)<<SHIFT) - *slot_addr[addr_select];
-				*slot_addr[addr_select]=(LSA(slot)<<SHIFT) + rem_addr;
-				slot->Backwards=0;
-			}
-			break;
 		}
 	}
 
@@ -1098,7 +1062,8 @@
 
 //				Enc=((TL(slot))<<0x0)|((IMXL(slot))<<0xd);
 //				AICADSP_SetSample(&AICA->DSP,(sample*AICA->LPANTABLE[Enc])>>(SHIFT-2),ISEL(slot),IMXL(slot));
-				Enc=((TL(slot))<<0x0)|((DIPAN(slot))<<0x8)|((DISDL(slot))<<0xd);
+//				Enc=((TL(slot))<<0x0)|((DIPAN(slot))<<0x8)|((DISDL(slot))<<0xd);
+				Enc=((TL(slot))<<0x0)|(0x0<<0x8)|(0x0F<<0xd); // DISDL,DIPAN not being set correctly for sequences so ignore them for now (OSB data is OK, though)
 				{
 					smpl+=(sample*AICA->LPANTABLE[Enc])>>SHIFT;
 					smpr+=(sample*AICA->RPANTABLE[Enc])>>SHIFT;


Changes:
-Corrected some slot register addresses/sizes
-Expanded slot udata arrays to avoid buffer overruns (for me, the overruns were clobbering the EG state values, preventing slots from starting properly)
-Expanded LPANTABLE/RPANTABLE to 0x20000 entries. On the AICA, DISDL/IMXL/EFSDL have a wider value range (0x0-0xF) than on the SCSP (0x0-0x7), so the SDL field in the table index needs 4 bits instead of 3
-Redid the ADPCM sampler in an attempt to make it work better
-Removed the backwards sample looping code (not used on AICA)

Problems/issues I've noticed so far:
-Playback of sakutai3_01.dsf causes a crash (I haven't looked into this yet)
-For sequenced music, it seems that DISDL and DIPAN aren't being written correctly (I get a lot of zeros for DISDL). This problem doesn't seem to occur when I give it OSB data instead (basically samples without any sequence data attached).