And here are my latest updates:

Code
diff -Nru aosdk_base/eng_dsf/aica.c aosdk/eng_dsf/aica.c
--- aosdk_base/eng_dsf/aica.c	2008-02-07 21:46:42.000000000 -0800
+++ aosdk/eng_dsf/aica.c	2008-02-07 20:41:31.000000000 -0800
@@ -69,14 +69,14 @@
 #define ALFOWS(slot)		((slot->udata.data[0x1c/2]>>0x3)&0x0003)
 #define ALFOS(slot)		((slot->udata.data[0x1c/2]>>0x0)&0x0007)
 
-#define ISEL(slot)		((slot->udata.data[0x20/2]>>0x4)&0x000F)
-#define IMXL(slot)		((slot->udata.data[0x20/2]>>0x0)&0x000F)
+#define ISEL(slot)		((slot->udata.data[0x20/2]>>0x0)&0x000F)
+#define IMXL(slot)		((slot->udata.data[0x20/2]>>0x4)&0x000F)
 
 #define DISDL(slot)		((slot->udata.data[0x24/2]>>0x8)&0x000F)
 #define DIPAN(slot)		((slot->udata.data[0x24/2]>>0x0)&0x001F)
 
-#define EFSDL(slot)		((AICA->EFSPAN[slot/2]>>8)&0x000f)
-#define EFPAN(slot)		((AICA->EFSPAN[slot/2]>>0)&0x001f) 
+#define EFSDL(slot)		((AICA->EFSPAN[slot*4]>>8)&0x000f)
+#define EFPAN(slot)		((AICA->EFSPAN[slot*4]>>0)&0x001f) 
 
 //Envelope times in ms
 static const double ARTimes[64]={100000/*infinity*/,100000/*infinity*/,8100.0,6900.0,6000.0,4800.0,4000.0,3400.0,3000.0,2400.0,2000.0,1700.0,1500.0,
@@ -116,6 +116,7 @@
 	} udata;
 	UINT8 active;	//this slot is currently playing
 	UINT8 *base;		//samples base address
+	UINT32 prv_addr;    // previous play address (for ADPCM)
 	UINT32 cur_addr;	//current play address (24.8)
 	UINT32 nxt_addr;	//next play address
 	UINT32 step;		//pitch step (24.8)
@@ -124,18 +125,21 @@
 	struct _LFO PLFO;		//Phase LFO
 	struct _LFO ALFO;		//Amplitude LFO
 	int slot;
-	int cur_sample;    //current ADPCM sample
-	int nxt_sample;    //next ADPCM sample
-	int cur_quant;     //current ADPCM step
-	int nxt_quant;     //next ADPCM step
-	int do_adpcm;      //do ADPCM decoding
+	int cur_sample;       //current ADPCM sample
+	int nxt_sample;       //next ADPCM sample
+	int cur_quant;        //current ADPCM step
+	int nxt_quant;        //next ADPCM step
+	int sample_lsa;       // current ADPCM sample at loop start
+	int quant_lsa;        // current ADPCM step at loop start
+	int do_adpcm;         // do ADPCM decoding - number of iterations
+	int loop_adpcm;       // ADPCM sampler has passed LSA
 };
 
 
 #define MEM4B(aica)		((aica->udata.data[0]>>0x0)&0x0200)
 #define DAC18B(aica)		((aica->udata.data[0]>>0x0)&0x0100)
 #define MVOL(aica)		((aica->udata.data[0]>>0x0)&0x000F)
-#define RBL(aica)		((aica->udata.data[2]>>0x13)&0x0003)
+#define RBL(aica)		((aica->udata.data[2]>>0xD)&0x0003)
 #define RBP(aica)		((aica->udata.data[2]>>0x0)&0x0fff)
 #define MOFULL(aica)   		((aica->udata.data[4]>>0x0)&0x1000)
 #define MOEMPTY(aica)		((aica->udata.data[4]>>0x0)&0x0800)
@@ -415,7 +419,7 @@
 
 	slot->active=1;
 	slot->Backwards=0;
-	slot->cur_addr=0; slot->nxt_addr=1<<SHIFT;
+	slot->cur_addr=0; slot->nxt_addr=1<<SHIFT; slot->prv_addr=-1;
 	start_offset = SA(slot);	// AICA can play 16-bit samples from any boundry
 	slot->base=&AICA->AICARAM[start_offset];
 	slot->step=AICA_Step(slot);
@@ -427,6 +431,7 @@
 	if (PCMS(slot) >= 2)
 	{
 		slot->do_adpcm=1;
+		slot->loop_adpcm=0;
 		InitADPCM(&(slot->cur_sample), &(slot->cur_quant));
 		InitADPCM(&(slot->nxt_sample), &(slot->nxt_quant));
 	}
@@ -445,7 +450,7 @@
 	{
 		slot->active=0;
 	}
-	slot->udata.data[0]&=~0x800;
+	slot->udata.data[0]&=~0x4000;
 }
 
 #define log_base_2(n) (log((float) n)/log((float) 2))
@@ -622,7 +627,7 @@
 						}
 					}
 				}
-				slot->udata.data[0]&=~0x1000;
+				slot->udata.data[0]&=~0x8000;
 			}
 			break;
 		case 0x18:
@@ -657,9 +662,9 @@
 					AICA->DSP.RBL=8*1024;
 				else if(v==1)
 					AICA->DSP.RBL=16*1024;
-				if(v==2)
+				else if(v==2)
 					AICA->DSP.RBL=32*1024;
-				if(v==3)
+				else if(v==3)
 					AICA->DSP.RBL=64*1024;
 			}
 			break;
@@ -857,7 +862,11 @@
 	}
 	else if(addr<0x3000)
 	{
-		if (addr < 0x28be)
+		if (addr <= 0x2044)
+		{
+			v = AICA->EFSPAN[addr&0x7f];
+		}
+		else if (addr < 0x28be)
 		{
 			AICA_UpdateRegR(AICA, addr&0xff);
 			v= *((unsigned short *) (AICA->udata.datab+((addr&0xff))));
@@ -925,6 +934,8 @@
 	UINT32 addr1,addr2,addr_select;                                   // current and next sample addresses
 	UINT32 *addr[2]      = {&addr1, &addr2};                          // used for linear interpolation
 	UINT32 *slot_addr[2] = {&(slot->cur_addr), &(slot->nxt_addr)};    //
+	int    *adpcm_sample[2] = {&(slot->cur_sample), &(slot->nxt_sample)};
+	int    *adpcm_quant[2]  = {&(slot->cur_quant), &(slot->nxt_quant)};
 
 	if(SSCTL(slot)!=0)	//no FM or noise yet
 		return 0;
@@ -942,8 +953,8 @@
 	}
 	else if(PCMS(slot) == 0) 
 	{
-		addr1=(slot->cur_addr>>(SHIFT-1));
-		addr2=(slot->nxt_addr>>(SHIFT-1));
+		addr1=(slot->cur_addr>>(SHIFT-1))&0x1ffffe;
+		addr2=(slot->nxt_addr>>(SHIFT-1))&0x1ffffe;
 	}
 	else
 	{
@@ -963,11 +974,11 @@
 	}
 	else if (PCMS(slot) == 0)	//16 bit signed
 	{
-		UINT8 *p1=(UINT8 *) (AICA->AICARAM+((SA(slot)+addr1)&0x1fffff));
-		UINT8 *p2=(UINT8 *) (AICA->AICARAM+((SA(slot)+addr2)&0x1fffff));
+		INT16 *p1=(signed short *) (AICA->AICARAM+((SA(slot)+addr1)&0x1fffff));
+		INT16 *p2=(signed short *) (AICA->AICARAM+((SA(slot)+addr2)&0x1fffff));
 		INT32 s;
 		INT32 fpart=slot->cur_addr&((1<<SHIFT)-1);
-		s=(int) ((INT16)(p1[1] | (p1[0]<<8)))*((1<<SHIFT)-fpart)+(int) ((INT16)(p2[1] | (p2[0]<<8)))*fpart;
+		s=(int) LE16(p1[0])*((1<<SHIFT)-fpart)+(int) LE16(p2[0])*fpart;
 		sample=(s>>SHIFT);
 	}
 	else	// 4-bit ADPCM
@@ -976,13 +987,19 @@
 		UINT8 *p2=(unsigned char *) (AICA->AICARAM+((SA(slot)+(addr2>>1))&0x1fffff));
 		INT32 s;
 		INT32 fpart=slot->cur_addr&((1<<SHIFT)-1);
-		if (slot->do_adpcm)
+		UINT32 addr=slot->prv_addr>>SHIFT;
+		while (slot->do_adpcm--)
+		{
+			int shift1,delta1;
+			addr += 1;
+			shift1 = 4*((addr&1)^1);
+			delta1 = (*p1>>shift1)&0xF;
+//			printf("DEBUG: addr %04X sample %+06d delta %01X quant %05d\n",addr,slot->cur_sample,delta1,slot->cur_quant);
+			DecodeADPCM(&(slot->cur_sample),delta1,&(slot->cur_quant));
+		}
 		{
-			int shift1 = 4*((addr1&1)^1);
 			int shift2 = 4*((addr2&1)^1);
-			int delta1 = (*p1>>shift1)&0xF;
 			int delta2 = (*p2>>shift2)&0xF;
-			DecodeADPCM(&(slot->cur_sample),delta1,&(slot->cur_quant));
 			slot->nxt_sample=slot->cur_sample;
 			slot->nxt_quant=slot->cur_quant;
 			DecodeADPCM(&(slot->nxt_sample),delta2,&(slot->nxt_quant));
@@ -992,21 +1009,27 @@
 	}
 
 	// Only do an ADPCM decode when crossing a whole-address boundary
-	if(((slot->cur_addr+step)>>SHIFT)>((slot->cur_addr)>>SHIFT)) slot->do_adpcm=1;
-	else slot->do_adpcm=0;
+	slot->do_adpcm = ((slot->cur_addr+step)>>SHIFT)-((slot->cur_addr)>>SHIFT);
 	
+	slot->prv_addr=slot->cur_addr;
 	slot->cur_addr+=step;
 	slot->nxt_addr=slot->cur_addr+(1<<SHIFT);
-	
+
 	addr1=slot->cur_addr>>SHIFT;
 	addr2=slot->nxt_addr>>SHIFT;
-	
+
 	if(addr1>=LSA(slot))
 	{
 		if(LPSLNK(slot) && slot->EG.state==ATTACK)
 			slot->EG.state = DECAY1;
+		if(PCMS(slot)==2 && !(slot->loop_adpcm))
+		{
+			slot->sample_lsa = slot->cur_sample;
+			slot->quant_lsa = slot->cur_quant;
+			slot->loop_adpcm = 1;
+		}
 	}
-	
+
 	for (addr_select=0;addr_select<2;addr_select++)
 	{
 		INT32 rem_addr;
@@ -1024,6 +1047,11 @@
 			{
 				rem_addr = *slot_addr[addr_select] - (LEA(slot)<<SHIFT);
 				*slot_addr[addr_select]=(LSA(slot)<<SHIFT) + rem_addr;
+				if(PCMS(slot)==2 && addr_select==0)
+				{
+					*adpcm_sample[addr_select] = slot->sample_lsa;
+					*adpcm_quant[addr_select] = slot->quant_lsa;
+				}
 			}
 			break;
 		}
@@ -1047,7 +1075,6 @@
 {
 	INT16 *bufr,*bufl;
 	int sl, s, i;
-	struct _SLOT *s2 = &AICA->Slots[39];
 
 	bufr=bufferr;
 	bufl=bufferl;
@@ -1064,7 +1091,7 @@
 			if(AICA->Slots[sl].active)
 			{
 				struct _SLOT *slot=AICA->Slots+sl;
-				unsigned short Enc;
+				unsigned int Enc;
 				signed int sample;
 
 				sample=AICA_UpdateSlot(AICA, slot);
@@ -1081,14 +1108,14 @@
 			AICA->BUFPTR&=63;
 		}
 
-#if 0 	// actually works somewhat, but not yet
+#if 1
 		AICADSP_Step(&AICA->DSP);
 
 		for(i=0;i<16;++i)
 		{
 			if(EFSDL(i))
 			{
-				unsigned short Enc=((EFPAN(i))<<0x8)|((EFSDL(i))<<0xd);
+				unsigned int Enc=((EFPAN(i))<<0x8)|((EFSDL(i))<<0xd);
 				smpl+=(AICA->DSP.EFREG[i]*AICA->LPANTABLE[Enc])>>SHIFT;
 				smpr+=(AICA->DSP.EFREG[i]*AICA->RPANTABLE[Enc])>>SHIFT;
 			}
diff -Nru aosdk_base/eng_dsf/aicadsp.c aosdk/eng_dsf/aicadsp.c
--- aosdk_base/eng_dsf/aicadsp.c	2008-02-07 21:44:10.000000000 -0800
+++ aosdk/eng_dsf/aicadsp.c	2008-02-07 20:37:30.000000000 -0800
@@ -86,41 +86,41 @@
 #endif
 	for(step=0;step</*128*/DSP->LastStep;++step)
 	{
-		UINT16 *IPtr=DSP->MPRO+step*4;
-
+		UINT16 *IPtr=DSP->MPRO+step*8;
+				
 //      if(IPtr[0]==0 && IPtr[1]==0 && IPtr[2]==0 && IPtr[3]==0)
 //          break;
 
-		UINT32 TRA=(IPtr[0]>>8)&0x7F;
-		UINT32 TWT=(IPtr[0]>>7)&0x01;
-		UINT32 TWA=(IPtr[0]>>0)&0x7F;
-
-		UINT32 XSEL=(IPtr[1]>>15)&0x01;
-		UINT32 YSEL=(IPtr[1]>>13)&0x03;
-		UINT32 IRA=(IPtr[1]>>6)&0x3F;
-		UINT32 IWT=(IPtr[1]>>5)&0x01;
-		UINT32 IWA=(IPtr[1]>>0)&0x1F;
-
-		UINT32 TABLE=(IPtr[2]>>15)&0x01;
-		UINT32 MWT=(IPtr[2]>>14)&0x01;
-		UINT32 MRD=(IPtr[2]>>13)&0x01;
-		UINT32 EWT=(IPtr[2]>>12)&0x01;
-		UINT32 EWA=(IPtr[2]>>8)&0x0F;
-		UINT32 ADRL=(IPtr[2]>>7)&0x01;
-		UINT32 FRCL=(IPtr[2]>>6)&0x01;
-		UINT32 SHIFT=(IPtr[2]>>4)&0x03;
-		UINT32 YRL=(IPtr[2]>>3)&0x01;
-		UINT32 NEGB=(IPtr[2]>>2)&0x01;
-		UINT32 ZERO=(IPtr[2]>>1)&0x01;
-		UINT32 BSEL=(IPtr[2]>>0)&0x01;
-
-		UINT32 NOFL=(IPtr[3]>>15)&1;		//????
-		UINT32 COEF=(IPtr[3]>>9)&0x3f;
-
-		UINT32 MASA=(IPtr[3]>>2)&0x1f;	//???
-		UINT32 ADREB=(IPtr[3]>>1)&0x1;
-		UINT32 NXADR=(IPtr[3]>>0)&0x1;
-
+		UINT32 TRA=(IPtr[0]>>9)&0x7F;
+		UINT32 TWT=(IPtr[0]>>8)&0x01;
+		UINT32 TWA=(IPtr[0]>>1)&0x7F;
+
+		UINT32 XSEL=(IPtr[2]>>15)&0x01;
+		UINT32 YSEL=(IPtr[2]>>13)&0x03;
+		UINT32 IRA=(IPtr[2]>>7)&0x3F;
+		UINT32 IWT=(IPtr[2]>>6)&0x01;
+		UINT32 IWA=(IPtr[2]>>1)&0x1F;
+
+		UINT32 TABLE=(IPtr[4]>>15)&0x01;
+		UINT32 MWT=(IPtr[4]>>14)&0x01;
+		UINT32 MRD=(IPtr[4]>>13)&0x01;
+		UINT32 EWT=(IPtr[4]>>12)&0x01;
+		UINT32 EWA=(IPtr[4]>>8)&0x0F;
+		UINT32 ADRL=(IPtr[4]>>7)&0x01;
+		UINT32 FRCL=(IPtr[4]>>6)&0x01;
+		UINT32 SHIFT=(IPtr[4]>>4)&0x03;
+		UINT32 YRL=(IPtr[4]>>3)&0x01;
+		UINT32 NEGB=(IPtr[4]>>2)&0x01;
+		UINT32 ZERO=(IPtr[4]>>1)&0x01;
+		UINT32 BSEL=(IPtr[4]>>0)&0x01;
+
+		UINT32 NOFL=(IPtr[6]>>15)&1;		//????
+		UINT32 COEF=step;
+
+		UINT32 MASA=(IPtr[6]>>9)&0x1f;	//???
+		UINT32 ADREB=(IPtr[6]>>8)&0x1;
+		UINT32 NXADR=(IPtr[6]>>7)&0x1;
+		
 		INT64 v;
 
 		//operations are done at 24 bit precision
@@ -208,7 +208,7 @@
 		if(YSEL==0)
 			Y=FRC_REG;
 		else if(YSEL==1)
-			Y=DSP->COEF[COEF]>>3;	//COEF is 16 bits
+			Y=DSP->COEF[COEF<<1]>>3;	//COEF is 16 bits
 		else if(YSEL==2)
 			Y=(Y_REG>>11)&0x1FFF;
 		else if(YSEL==3)
@@ -276,7 +276,7 @@
 		if(MRD || MWT)
 		//if(0)
 		{
-			ADDR=DSP->MADRS[MASA];
+			ADDR=DSP->MADRS[MASA<<1];
 			if(!TABLE)
 				ADDR+=DSP->DEC;
 			if(ADREB)
@@ -290,7 +290,7 @@
 			//ADDR<<=1;
 			//ADDR+=DSP->RBP<<13;
 			//MEMVAL=DSP->AICARAM[ADDR>>1];
-			ADDR+=DSP->RBP<<12;
+			ADDR+=DSP->RBP<<10;
 			if(MRD && (step&1))	//memory only allowed on odd? DoA inserts NOPs on even
 			{
 				if(NOFL)
@@ -339,9 +339,9 @@
 	DSP->Stopped=0;
 	for(i=127;i>=0;--i)
 	{
-		UINT16 *IPtr=DSP->MPRO+i*4;
+		UINT16 *IPtr=DSP->MPRO+i*8;
 
-		if(IPtr[0]!=0 || IPtr[1]!=0 || IPtr[2]!=0 || IPtr[3]!=0)
+		if(IPtr[0]!=0 || IPtr[2]!=0 || IPtr[4]!=0 || IPtr[6]!=0)
 			break;
 	}
 	DSP->LastStep=i+1;
diff -Nru aosdk_base/eng_dsf/aicadsp.h aosdk/eng_dsf/aicadsp.h
--- aosdk_base/eng_dsf/aicadsp.h	2008-02-07 21:44:10.000000000 -0800
+++ aosdk/eng_dsf/aicadsp.h	2008-02-07 20:37:52.000000000 -0800
@@ -12,9 +12,9 @@
 
 //context
 
-	INT16 COEF[128];		//16 bit signed
-	UINT16 MADRS[64];	//offsets (in words), 16 bit
-	UINT16 MPRO[128*4*2];	//128 steps 64 bit
+	INT16 COEF[128*2];		//16 bit signed
+	UINT16 MADRS[64*2];	//offsets (in words), 16 bit
+	UINT16 MPRO[128*4*2*2];	//128 steps 64 bit
 	INT32 TEMP[128];	//TEMP regs,24 bit signed
 	INT32 MEMS[32];	//MEMS regs,24 bit signed
 	UINT32 DEC;

- Fixed 16-bit samples :)
- Swapped IMXL/ISEL (thanks Deunan!)
- Made all of the necessary changes (or so I think) to get the DSP working
- Changed the LPANTABLE/RPANTABLE lookup index (Enc) from unsigned short to unsigned int, since the index can now reach values up to 0x20000, which no longer fit in 16 bits.
- Changed the ADPCM decoding loop so that it always decodes one sample at a time, iterating once for every whole address the play position crosses in a step. ADPCM still saturates far too much - there is still some work to be done here.