[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [microblaze-uclinux] Interrupts disabled in RETURN macro



Hi Alejando,

Thanks for that - see my comments below.

Alejandro Lucero wrote:

Alejandro Lucero wrote:

I'm looking at the RETURN macro (entry.S) and I'd like to ask you why
interrupts are disabled immediately. I think it's possible to delay it
just until before POP_STATE since is here when the problematic r14
register is modified. In the current way scheduling is called without
interrupts, the same with do_signal.

Your patch works for me (that is, it doesn't crash - mayjor bugs in entry.S usually show themselves pretty quickly).

Do you think the problem you were seeing under high interrupt load was because an IRQ was squeezing inside the state restore? Previously (pre your patch) interrupts should have been disabled during that phase, but fdorcing BIP makes extra sure I suppose.

Delaying disabling interrupts would improve interrupts latency since less
code would be executed with interrupts disabled.

Yes, that seems reasonable.

I have tested this approach and it works even with high interrupts
frequency. Perhaps I'm forgetting something and in this way some race
condition can happen and it's hard to reproduce in my tests.

I've attached a modified version of your patch, which implements the same changes (plus applies same logic to the debug trap), and goes a bit further to cleanup comments and makes better use of the delay slot on branch and return instructions wwere possible - since we're in the mood for saving cycles... :)

Can you, (and as many others as possible please) try this, and see it it works for you? Applies against current CVS head.

Thanks,

John
Index: entry.S
===================================================================
--- entry.S	(revision 403)
+++ entry.S	(working copy)
@@ -1,8 +1,8 @@
 /*
- * arch/microblaze/kernel/entry.S -- Low-level system-call handling, trap handlers,
- *	and context-switching
+ * arch/microblaze/kernel/entry.S -- Low-level system-call handling, 
+ * trap handlers, and context-switching
  *
- *  Copyright (C) 2003	     John Williams <jwilliams@xxxxxxxxxxxxxx>
+ *  Copyright (C) 2003,04,05,06 John Williams <jwilliams@xxxxxxxxxxxxxx>
  *  Copyright (C) 2001,2002  NEC Corporation
  *  Copyright (C) 2001,2002  Miles Bader <miles@xxxxxxx>
  *
@@ -10,8 +10,8 @@
  * Public License.  See the file COPYING in the main directory of this
  * archive for more details.
  *
- * Written by Miles Bader <miles@xxxxxxx>
- * Heavily modified by John Williams for Microblaze
+ * Original v850 version written by Miles Bader <miles@xxxxxxx>
+ *
  */
 
 #include <linux/sys.h>
@@ -26,14 +26,39 @@
 
 #include "microblaze_defs.h"
 
-
 /* Make a slightly more convenient alias for C_SYMBOL_NAME.  */
 #define CSYM	C_SYMBOL_NAME
 
 /* The offset of the struct pt_regs in a `state save frame' on the stack.  */
 #define PTO	STATE_SAVE_PT_OFFSET
 
-/* Standard way of enabling and disabling interrupts in entry.S */
+/* Standard way of enabling and disabling interrupts in entry.S
+   Several different implementations possible, depending on whether
+   CPU supports the msrset/msrclr instructions.
+   For consistency, all implementations expect a scratch register that
+   is clobbered by the macro.
+
+   Also note the seemingly superfluous NOP after each write to the MSR
+   This is necessary because there is a potential pipeline clash with
+   back-to-back writes to MSR.  e.g.
+
+   A code fragment like this
+
+	msrset r0, 0x2;
+	msrclr r0, 0x2;
+
+   Will not produce the expected result.  
+
+   Also, there are problems with back-to-back write/reads.  This doesn't work:
+
+	msrset	r0, 0x2;
+	mfs	r11, rmsr
+
+   r11 doesn't get the updated MSR value that you might expect.
+	
+   Given the way we use lots of macros to operate on the MSR, it's safer 
+   just to pad them.  Inserting the NOP wastes a cycle, but avoids the hazard.
+*/
 #if CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR
 #define ENTRY_CLI(scratch_reg)                                                \
 	msrclr scratch_reg, 0x2;					      \
@@ -43,7 +68,7 @@
 	msrset scratch_reg, 0x2;					      \
 	nop;
 
-#else
+#else  /* !CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR */
 
 #define ENTRY_CLI(scratch_reg)                                                \
         mfs     scratch_reg, rmsr;                                            \
@@ -58,11 +83,23 @@
 	nop;
 #endif
 
-/* Various ways of setting and clearing BIP in flags reg.  This is mucky, but
-   necessary */
-//#ifdef CONFIG_MICROBLAZE_BIPDIRECT
+/* Various ways of setting and clearing BIP in flags reg.  
+   Some things to consider.
+     1.  Early releases of MicroBlaze 3.00.a had a bug whereby the BIP
+         bit could not be directly set/cleared in the MSR.  The work around
+         was to BRKI ahead one instruction, thus setting BIP, or RTBD one 
+	 instruction ahead, to clear it.  This is all fixed on modern CPUs
+         but might sill crop up for people using old versions of EDK.  So
+         we leave it here but #IF'd out, for historical reasons.
+
+     2.  If msrset/clr opcodes are available, we use them.
+  
+     3.  A scratch register must be provided, and is clobbered.
+
+     4.  We pad these with NOPs to avoid potential back-to-back MSR 
+         operations.
+  */
 #if 1
-  /* using microblaze version that allows msr ops to write to BIP */
   #if CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR
 	#define BIPCLR(scratch_reg)					      \
 		msrclr 	scratch_reg, 0x8;				      \
@@ -71,7 +108,7 @@
 	#define BIPSET(scratch_reg)					      \
 		msrset	scratch_reg, 0x8;				      \
 		nop;
-  #else
+  #else	/* !CONFIG_XILINX_MICROBLAZE0_USE_MSR_INSTR */
 	#define BIPCLR(scratch_reg)					      \
 		mfs	scratch_reg, rmsr;				      \
 		andi	scratch_reg, scratch_reg, ~0x8;			      \
@@ -87,11 +124,14 @@
 #else
   /* Older microblaze version that prevents direct access to BIP bit 
      We hack around it using the rtbd and brki instructions */
+
+  /* Use RTBD to jump ahead, thus clearing BIP */
 	#define BIPCLR(scratch_reg)					      \
 		rtbd	r0, 1f;						      \
-		nop;							      \
+		nop;		/* (delay slot) */			      \
 	   1:
 
+  /* Use BRKI to jump ahead, thus setting BIP */
 	#define BIPSET(scratch_reg)					      \
 		brki	r0, 1f;						      \
 	   1:
@@ -105,7 +145,7 @@
 #define RESTORE_REG(reg_num) \
 	lwi	macrology_paste(r,reg_num), r1, PTO+PT_GPR(reg_num);
 
-/* Save argument registers to the struct pt_regs pointed to by EP.  */
+/* Save argument registers to the struct pt_regs pointed to by r1.  */
 #define SAVE_ARG_REGS							      \
 	SAVE_REG(5);							      \
 	SAVE_REG(6);							      \
@@ -114,7 +154,7 @@
 	SAVE_REG(9);							      \
 	SAVE_REG(10);	
 
-/* Restore argument registers from the struct pt_regs pointed to by EP.  */
+/* Restore argument registers from the struct pt_regs pointed to by r1.  */
 #define RESTORE_ARG_REGS						      \
 	RESTORE_REG(5);							      \
 	RESTORE_REG(6);							      \
@@ -123,12 +163,12 @@
 	RESTORE_REG(9);							      \
 	RESTORE_REG(10);
 
-/* Save value return registers to the struct pt_regs pointed to by EP.  */
+/* Save value return registers to the struct pt_regs pointed to by r1.  */
 #define SAVE_RVAL_REGS							      \
 	SAVE_REG(3);							      \
 	SAVE_REG(4);							      
 
-/* Restore value return registers from the struct pt_regs pointed to by EP.  */
+/* Restore value return registers from the struct pt_regs pointed to by r1.  */
 #define RESTORE_RVAL_REGS						      \
 	RESTORE_REG(3);							      \
 	RESTORE_REG(4);	
@@ -145,14 +185,15 @@
 	RESTORE_REG(11);						      \
 	RESTORE_REG(12);						      
 
-/* Save `call clobbered' registers to the struct pt_regs pointed to by EP.  */
+/* Save `call clobbered' registers to the struct pt_regs pointed to by r1.  */
 #define SAVE_CALL_CLOBBERED_REGS					      \
 	SAVE_CALL_CLOBBERED_REGS_BEFORE_ARGS;				      \
 	SAVE_RVAL_REGS;							      \
 	SAVE_ARG_REGS;							      \
 	SAVE_CALL_CLOBBERED_REGS_AFTER_RVAL;
+
 /* Restore `call clobbered' registers from the struct pt_regs pointed to
-   by EP.  */
+   by r1.  */
 #define RESTORE_CALL_CLOBBERED_REGS					      \
 	RESTORE_CALL_CLOBBERED_REGS_BEFORE_ARGS;			      \
 	RESTORE_RVAL_REGS;						      \
@@ -160,26 +201,29 @@
 	RESTORE_CALL_CLOBBERED_REGS_AFTER_RVAL;
 
 /* Save `call clobbered' registers except for the return-value registers
-   to the struct pt_regs pointed to by EP.  */
+   to the struct pt_regs pointed to by r1.  */
 #define SAVE_CALL_CLOBBERED_REGS_NO_RVAL				      \
 	SAVE_CALL_CLOBBERED_REGS_BEFORE_ARGS;				      \
 	SAVE_ARG_REGS;							      \
 	SAVE_CALL_CLOBBERED_REGS_AFTER_RVAL;
+
 /* Restore `call clobbered' registers except for the return-value registers
-   from the struct pt_regs pointed to by EP.  */
+   from the struct pt_regs pointed to by r1.  */
 #define RESTORE_CALL_CLOBBERED_REGS_NO_RVAL				      \
 	RESTORE_CALL_CLOBBERED_REGS_BEFORE_ARGS;			      \
 	RESTORE_ARG_REGS;						      \
 	RESTORE_CALL_CLOBBERED_REGS_AFTER_RVAL;
 
+/* Convenience macro to zero a register */
 #define ZERO_REG(reg_num)						      \
 	add	r ## reg_num, r0, r0;
 
 /* Zero `call clobbered' registers except for the return-value registers.  */
 #define ZERO_CALL_CLOBBERED_REGS_NO_RVAL				      \
-	ZERO_REG(11); ZERO_REG(12);			      
+	ZERO_REG(11); 							      \
+	ZERO_REG(12);			      
 
-/* Save `call saved' registers to the struct pt_regs pointed to by EP.  */
+/* Save `call saved' registers to the struct pt_regs pointed to by r1.  */
 #define SAVE_CALL_SAVED_REGS						      \
 	SAVE_REG(19);							      \
 	SAVE_REG(20);							      \
@@ -195,7 +239,7 @@
 	SAVE_REG(30);							      \
  	SAVE_REG(31);
 
-/* Restore `call saved' registers from the struct pt_regs pointed to by EP.  */
+/* Restore `call saved' registers from the struct pt_regs pointed to by r1.  */
 #define RESTORE_CALL_SAVED_REGS						      \
 	RESTORE_REG(19);						      \
 	RESTORE_REG(20);						      \
@@ -211,8 +255,11 @@
 	RESTORE_REG(30);						      \
 	RESTORE_REG(31);
 
-                                                                                
-/* Save link register into state save frame */
+/* Set the saved program counter (PC) in a saved state from.  This is the
+   link register for the particular kernel entry (IRQ, trap, dbtrap, ...)
+  
+   The actual register that contains the link value is provided as a param.
+*/
 #define SAVE_LINK(linkreg)						      \
 	swi	linkreg, r1, PTO+PT_PC;		/* PC, before IRQ/trap /*     
 
@@ -238,60 +285,93 @@
 #define RESTORE_PSW							      \
 	RESTORE_SPECIAL(msr,PSW)					      
 
-/* Save system registers to the struct pt_regs pointed to by REG.  
-   r11 is clobbered.  */
+/* Special version of SAVE_PSW that also sets BIP and IE bits in the saved
+   PSW on the state frame.  Reason for this is so that when PSW is restored
+   during state restore, BIP and IE will also be set. 
+
+   Note the actual PSW is unchanged by this macro, only the version saved 
+   on the stack.
+
+   BIP masks all interrupts, allowing the state restore to continue without
+   any IRQs getting through.  Filannly, return to userspace is via RTBD
+   (return from BRK) instruction, which will clear BIP, and leave us ready.
+   
+   As a result of this scheme, the return from an IRQ is RTBD, *not* RTID
+   as you might expect.
+ 
+   This seems to fix problems with lockups under high IRQ load.
+
+   Patch submitted by Alejandro Lucero. */
+#define SAVE_PSW_BIP_EI							      \
+	mfs	r11, rmsr;						      \
+	ori	r11, r11, 0xa;		/* BIP and EI set */		      \
+	swi	r11, r1, PTO+PT_PSW;
+
+/* Save system registers to the struct pt_regs pointed to by r1.  
+   r11 is clobbered by SAVE_PSW_BIP_EI.
+   PSW saved on state frame has BIP and IE set - see comment above 
+   SAVE_PSW_BIP_EI */
 #define SAVE_SYS_REGS_FOR_IRQ						      \
 	SAVE_REG(17); 							      \
 	SAVE_LINK(r14);							      \
-	SAVE_PSW;
+	SAVE_PSW_BIP_EI;
 
-/* Restore system registers from the struct pt_regs pointed to by EP.         
-   clobber r14 to restore status register, it gets fixed immediately */    
+/* Restore system registers from the struct pt_regs pointed to by r1.         
+   r11 clobbered by RESTORE_PSW. */
 #define RESTORE_SYS_REGS_FOR_IRQ					      \
 	RESTORE_REG(17); 						      \
 	RESTORE_PSW;							      \
 	RESTORE_LINK(r14);
 
-/* Save system registers to the struct pt_regs pointed to by REG.  
-   r11 is clobbered.  */
+/* Save system registers to the struct pt_regs pointed to by r1.  
+   r11 is clobbered.  
+
+   PSW as saved on state frame has BIP and IE set - see
+   comment above SAVE_PSW_BIP_EI */
 #define SAVE_SYS_REGS_FOR_TRAP						      \
 	SAVE_REG(17); 							      \
 	SAVE_LINK(r14);							      \
-	SAVE_PSW;
+	SAVE_PSW_BIP_EI;
 
-/* Restore system registers from the struct pt_regs pointed to by EP.         
+/* Restore system registers from the struct pt_regs pointed to by r1.
    clobber r11 to restore status register */
 #define RESTORE_SYS_REGS_FOR_TRAP					      \
 	RESTORE_REG(17);						      \
 	RESTORE_PSW;							      \
 	RESTORE_LINK(r14);
 
-/* Save system registers to the struct pt_regs pointed to by REG.  
-   r11 is clobbered.  */
+/* Save system registers to the struct pt_regs pointed to by r1.  
+   r11 is clobbered. 
+
+   PSW as saved on state frame has BIP and IE set - see
+   comment above SAVE_PSW_BIP_EI */
 #define SAVE_SYS_REGS_FOR_DBTRAP					      \
 	SAVE_REG(17);							      \
 	SAVE_LINK(r14);							      \
-	SAVE_PSW;
+	SAVE_PSW_BIP_EI;
 
-/* Restore system registers from the struct pt_regs pointed to by EP.         
+/* Restore system registers from the struct pt_regs pointed to by r1.         
    clobber r11 to restore status register */
 #define RESTORE_SYS_REGS_FOR_DBTRAP					      \
 	RESTORE_REG(17);						      \
 	RESTORE_PSW;							      \
 	RESTORE_LINK(r14);
 
-/* Save system registers to the struct pt_regs pointed to by REG.  This is a
-   NMI-specific version, because NMIs save the PC/PSW in a different place
-   than other interrupt requests.  r11 is clobbered.  */
+/* Save system registers to the struct pt_regs pointed to by r1.  This is a
+   NMI-specific version.
+   r11 is clobbered. 
+
+   NMI not implemented in MicroBlaze. */
 #define SAVE_SYS_REGS_FOR_NMI						      \
 	SAVE_LINK(r16);							      \
 	SAVE_PSW;
 
-/* Restore system registers from the struct pt_regs pointed to by EP.  This is
-   a NMI-specific version, because NMIs save the PC/PSW in a different place
-   than other interrupt requests.  r11 is clobbered (it is used as a scratch
-   register because the POP_STATE macro restores it, and this macro is usually
-   used inside POP_STATE).  */
+/* Restore system registers from the struct pt_regs pointed to by r1.  This is
+   a NMI-specific version.
+
+   r11 is clobbered.
+
+   NMI not implemented for MicroBlaze */
 #define RESTORE_SYS_REGS_FOR_NMI					      \
 	RESTORE_PSW;							      \
 	RESTORE_LINK(r16);
@@ -299,75 +379,55 @@
 /* Push register state, except for the stack pointer, on the stack in the form
    of a struct pt_regs, in preparation for a system call.  This macro makes
    sure that `special' registers, system registers; TYPE identifies the set of
-   extra registers to be saved as well.  EP is clobbered.  
+   extra registers to be saved as well.  
+
+   r1 is clobbered (changed) to make room on stack for pt_regs frame.
+   r11 is clobbered (after being saved to stack).
+
    Uses add(i)k to prevent corrupting carry flags prior to them being saved */
-#ifdef CONFIG_REGISTER_TASK_PTR						      
 #define PUSH_STATE(type)						      \
 	addik	r1, r1, -STATE_SAVE_SIZE; /* Make room on the stack.  */      \
 	SAVE_REG(2);				/* Save SDA */		      \
 	SAVE_REG(13);				/* Save SDA2 */		      \
 	SAVE_REG(15);				/* Save LP */		      \
 	SAVE_REG(18);				/* Save asm scratch reg */    \
-	SAVE_REG(CURRENT_TASK_REGNUM);		/* Save current task reg */   \
 	type ## _STATE_SAVER;
-#else
-#define PUSH_STATE(type)						      \
-	addik	r1, r1, -STATE_SAVE_SIZE; /* Make room on the stack.  */      \
-	SAVE_REG(2);				/* Save SDA */		      \
-	SAVE_REG(13);				/* Save SDA2 */		      \
-	SAVE_REG(15);				/* Save LP */		      \
-	SAVE_REG(18);				/* Save asm scratch reg */    \
-	type ## _STATE_SAVER;
-#endif									      
 
 /* Pop a register state, except for the stack pointer, from the struct pt_regs
    on the stack.  */
-#ifdef CONFIG_REGISTER_TASK_PTR						      
 #define POP_STATE(type)							      \
 	type ## _STATE_RESTORER;					      \
 	RESTORE_REG(2);				/* Restore SDA */	      \
 	RESTORE_REG(13);			/* Restore SDA2 */	      \
 	RESTORE_REG(15);			/* Restore LP */	      \
 	RESTORE_REG(18);			/* Restore asm scratch reg */ \
-	RESTORE_REG(CURRENT_TASK_REGNUM);	/* Restore cur task reg */    \
 	addik	r1, r1, STATE_SAVE_SIZE		/* Clean up stack space.  */
-#else
-#define POP_STATE(type)							      \
-	type ## _STATE_RESTORER;					      \
-	RESTORE_REG(2);				/* Restore SDA */	      \
-	RESTORE_REG(13);			/* Restore SDA2 */	      \
-	RESTORE_REG(15);			/* Restore LP */	      \
-	RESTORE_REG(18);			/* Restore asm scratch reg */ \
-	addik	r1, r1, STATE_SAVE_SIZE		/* Clean up stack space.  */
-#endif
 
 
 /* Switch to the kernel stack if necessary, and push register state on
    the stack in the form of a struct pt_regs.  Also load the current
-   task pointer if switching from user mode.  The stack-pointer (r3)
-   should have already been saved to the memory location SP_SAVE_LOC
-   (the reason for this is that the interrupt vectors may be beyond a
-   22-bit signed offset jump from the actual interrupt handler, and this
-   allows them to save the stack-pointer and use that register to do an
-   indirect jump).  This macro makes sure that `special' registers,
-   system registers, and the stack pointer are saved; TYPE identifies
-   the set of extra registers to be saved as well.  SYSCALL_NUM is the
-   register in which the system-call number this state is for is stored
-   (r0 if this isn't a system call).  Interrupts should already be
-   disabled when calling this.  */
+   task pointer if switching from user mode.  
+   
+   This macro makes sure that `special' registers, system registers, and 
+   the stack pointer are saved; TYPE identifies the set of extra registers 
+   to be saved as well.  
+
+   SYSCALL_NUM is the register in which the system-call number this state 
+   is for is stored (r0 if this isn't a system call).  
+
+   Interrupts should already be disabled when calling this.  */
 #define SAVE_STATE(type, syscall_num, sp_save_loc)			      \
 	swi	r11, r0, R11_SAVE;	/* Save r11 */			      \
         lwi	r11, r0, KM;		/* See if already in kernel mode.  */ \
 	beqi	r11, 1f;		/* Jump ahead if coming from user */  \
 	lwi	r11, r0, R11_SAVE;	/* restore r11 */		      \
-        /* Kernel-mode state save.  */					      \
+        				/* Kernel-mode state save.  */	      \
 	lwi	r1, r0, sp_save_loc;	/* Reload kernel stack-pointer.  */   \
 	swi	r1, r1, (PT_GPR(GPR_SP)-PT_SIZE); /* Save original SP. */     \
         PUSH_STATE(type);						      \
 	addi	r11, r0, 1; 		/* Was in kernel-mode.  */	      \
-        swi	r11, r1, PTO+PT_KERNEL_MODE; 	 			      \
         brid	2f;							      \
-	nop;				/* Fill delay slot */		      \
+        swi	r11, r1, PTO+PT_KERNEL_MODE;	/* (delay slot) */	      \
 1:      /* User-mode state save.  */					      \
 	lwi	r11, r0, R11_SAVE;	/* restore r11 */		      \
         lwi	r1, r0, KSP;		/* Switch to kernel stack.  */	      \
@@ -451,11 +511,14 @@
 #define TRAP_EXTRA_STATE_SAVER						      \
    SAVE_RVAL_REGS;							      \
    SAVE_CALL_SAVED_REGS
+
 #define TRAP_EXTRA_STATE_RESTORER					      \
    RESTORE_RVAL_REGS;							      \
    RESTORE_CALL_SAVED_REGS
+
 #define TRAP_FUNCALL_EXTRA_STATE_SAVER					      \
    SAVE_RVAL_REGS
+
 #define TRAP_FUNCALL_EXTRA_STATE_RESTORER				      \
    RESTORE_RVAL_REGS
 
@@ -531,15 +594,20 @@
    RESTORE_CALL_SAVED_REGS						      \
    RESTORE_PSW
 
-/* Instructions to return from an IRQ */
+/* Instructions to return from an IRQ.
+   This is RTBD (return from BRK) rather than RTID (return from IRQ)
+   because of our use of the BIP bit to protect the state save/restore.
+
+   See comments about SAVE_PSW_BIP_EI for details.
+*/
 #define IRQ_RETURN_INST							      \
-	rtid	r14, 0;		/* Follow interrupt link pointer back to */   \
+	rtbd	r14, 0;		/* Follow interrupt link pointer back to */   \
 				/* next insn after interrupt occured */       \
 	nop;			/* Delay slot */		      
 
-/* Instructions to return from a trap */
-/* Note we use rtbd to clear BIP bit on way out */
-/* Return offset is 4, to execute instruction after syscall brki insn*/
+/* Instructions to return from a trap.
+   RTBD clears BIP bit on way out .
+   Return offset is 4, to execute instruction after syscall brki insn*/
 #define TRAP_RETURN_INST						      \
 	rtbd	r14, 4;		/* r14 used as trap link register */	      \
 	nop;
@@ -559,11 +627,11 @@
 /* Restore register state from the struct pt_regs on the stack, switch back
    to the user stack if necessary, and return from the trap/interrupt.
    EXTRA_STATE_RESTORER is a sequence of assembly language statements to
-   restore anything not restored by this macro.  Only registers not saved by
-   the C compiler are restored (that is, R1(sp), R2(gp), R15(lp), and
-   anything restored by EXTRA_STATE_RESTORER).  */
+   restore anything not restored by this macro.  
+
+   Only registers not saved by the C compiler are restored (that is, 
+   r1, r2, r15, and anything restored by EXTRA_STATE_RESTORER).  */
 #define RETURN(type)							      \
-	ENTRY_CLI(r11);							      \
         lwi	r11, r1, PTO+PT_KERNEL_MODE;				      \
 	bnei	r11, 2f;		/* See if returning to kernel mode, */\
 					/* ... if so, skip resched &c.  */    \
@@ -588,7 +656,8 @@
 	bnei	r11, 4f;		/* Signals to handle, handle them */  \
 									      \
 /* Finally, return to user state.  */					      \
-1:	swi	r0, r0, KM;		/* Now officially in user state. */   \
+1:	BIPSET(r11);							      \
+	swi	r0, r0, KM;		/* Now officially in user state. */   \
 	POP_STATE(type);						      \
 	swi	r1, r0, KSP;		/* Save the kernel stack pointer. */  \
 	lwi	r1, r1, PT_GPR(GPR_SP)-PT_SIZE;				      \
@@ -596,7 +665,8 @@
 	bri	6f;							      \
 									      \
 /* Return to kernel state.  */						      \
-2:	POP_STATE(type);						      \
+2:	BIPSET(r11);							      \
+	POP_STATE(type);						      \
 6:									      \
 type ## _return:		/* Make global symbol for debugging */	      \
 	type ## _RETURN_INST						      \
@@ -605,12 +675,11 @@
 3:	SAVE_EXTRA_STATE_FOR_FUNCALL(type); /* Prepare for funcall. */	      \
 	bralid	r15, CSYM(schedule);	/* Call scheduler */		      \
 	nop;				/* delay slot */		      \
-	ENTRY_CLI(r11);		/* The scheduler enables interrupts */	      \
 	RESTORE_EXTRA_STATE_FOR_FUNCALL(type);				      \
 	brid	5b;		/* Back to continue return processing */      \
 	nop;								      \
 									      \
-/* Handle a signal return; Pending signals should be in r18.  */	      \
+/* Handle a signal return; */						      \
 4:      /* Not all registers are saved by the normal trap/interrupt entry     \
 	   points (for instance, call-saved registers (because the normal     \
 	   C-compiler calling sequence in the kernel makes sure they're	      \
@@ -621,43 +690,34 @@
 	   (in a possibly modified form) after do_signal returns.  */	      \
         SAVE_EXTRA_STATE(type)		/* Save state not saved by entry. */  \
 	la	r5, r1, PTO;		/* Arg 1: struct pt_regs *regs */     \
+	bralid	r15, CSYM(do_signal);	/* Handle any signals */	      \
 	add	r6, r0, r0;		/* Arg 2: sigset_t *oldset */	      \
-	bralid	r15, CSYM(do_signal);	/* Handle any signals */	      \
-	nop;								      \
-	ENTRY_CLI(r11);		/* Sig handling enables interrupts */	      \
+					/* (delay slot) */		      \
+									      \
         RESTORE_EXTRA_STATE(type);	/* Restore extra regs.  */	      \
 	brid	1b;							      \
 	nop;	
 
 
 /* Jump to the appropriate function for the system call number in r12
-   (r12 is not preserved), or return an error if r12 is not valid.  The
-   LP register should point to the location where the called function
+   (r12 is not preserved), or return an error if r12 is not valid.  
+   r15 should point to the location where the called function
    should return.  [note that MAKE_SYS_CALL uses label 1]  */
 #define MAKE_SYS_CALL							      \
 	/* See if the system call number is valid.  */			      \
 	addi	r11, r12, -NR_syscalls;					      \
-	bgei	r11,1f;							      \
+	bgei	r11, 1f;						      \
 	/* Figure out which function to use for this system call.  */	      \
-	/* Note Microblaze barrel shift is optional, so don't rely on it */   \
+	/* MicroBlaze barrel shift is optional, so don't rely on it */        \
 	add	r12, r12, r12;			/* convert num -> ptr */      \
 	add	r12, r12, r12;						      \
 	lwi	r12, r12, CSYM(sys_call_table);	/* Get function pointer */    \
 	/* Make the system call.  */					      \
 	bra	r12;							      \
 	/* The syscall number is invalid, return an error.  */		      \
-1:	addi	r3, r0, -ENOSYS;					      \
-	rtsd	r15,8;		/* looks like a normal subroutine return */
+1:	rtsd	r15,8;		/* looks like a normal subroutine return */   \
+	addi	r3, r0, -ENOSYS;	/* delay slot */
 
-
-	// Ugly kludge to get around some Xilinx mb-gcc wierdness
-	.data
-	.globl CSYM(_shift_temp_loc)
-CSYM(_shift_temp_loc):
-	.long	0
-
-	.text
-
 /*
  * User trap.
  *
@@ -671,13 +731,16 @@
  * are masked.  This is nice, means we don't have to CLI before state save
  */
 G_ENTRY(trap):
-	swi	r1, r0, ENTRY_SP;	// save stack (emulate v850)
+	swi	r1, r0, ENTRY_SP;	 // save stack pointer
 	SAVE_STATE (TRAP, r12, ENTRY_SP) // Save registers. 
-	BIPCLR(r11);			// Clear BIP
-					// BIP now cleared, interrupts enabled
-	la	r15, r0, ret_from_trap-8// where the trap should return
-					// need -8 to adjust for rtsd r15, 8
-	MAKE_SYS_CALL			// Jump to the syscall function. 
+
+	// State is safely saved, we can 
+        // clear BIP to re-enable interrupts 
+
+	BIPCLR(r11);			 // Clear BIP
+	la	r15, r0, ret_from_trap-8;// where the trap should return
+					 // need -8 to adjust for rtsd r15, 8
+	MAKE_SYS_CALL;			 // Jump to the syscall function. 
 END(trap)
 
 /* This is just like ret_from_trap, but first restores extra registers
@@ -688,9 +751,7 @@
 END(restore_extra_regs_and_ret_from_trap)
 
 /* Entry point used to return from a syscall/trap.  */
-/* We re-enable BIP bit before state restore */
 L_ENTRY(ret_from_trap):
-	BIPSET(r11);			//  Ints masked for state restore
 	RETURN(TRAP)
 END(ret_from_trap)
 
@@ -702,44 +763,20 @@
 C_ENTRY(ret_from_fork):
 	bralid	r15, CSYM(schedule_tail); // ...which is schedule_tail's arg
 	add	r3, r5, r0;		// switch_thread returns the prev task.
-					// ( in the delay slot )
+					// (delay slot)
+	brid	ret_from_trap		// Do normal trap return.
 	add	r3, r0, r0;		// Child's fork call should return 0.
-	brid	ret_from_trap		// Do normal trap return.
-	nop;
+					// (delay slot)
 C_END(ret_from_fork)
 
-
-#if !TRAPS_PRESERVE_CALL_CLOBBERED_REGS
-	// Make sure r13 and r14 are preserved, in case we have to restart a
-	// system call because of a signal (ep has already been set by caller).
-	//st.w	r13, PTO+PT_GPR(13)[sp]
-	//st.w	r14, PTO+PT_GPR(13)[sp]
-	la	r15, r0, ret_from_long_syscall-8
-#endif /* !TRAPS_PRESERVE_CALL_CLOBBERED_REGS */
-
-	MAKE_SYS_CALL			// Jump to the syscall function.
-END(syscall_long)	
-
-#if !TRAPS_PRESERVE_CALL_CLOBBERED_REGS
-/* Entry point used to return from a long syscall.  Only needed to restore
-   r13/r14 if the general trap mechanism doesnt' do so.  */
-L_ENTRY(ret_from_long_syscall):
-	//ld.w	PTO+PT_GPR(13)[sp], r13 // Restore the extra registers
-	//ld.w	PTO+PT_GPR(13)[sp], r14
-	brid	ret_from_trap;		// The rest is the same as other traps
-	nop;
-END(ret_from_long_syscall)
-#endif /* !TRAPS_PRESERVE_CALL_CLOBBERED_REGS */
-
-
 /* These syscalls need access to the struct pt_regs on the stack, so we
    implement them in assembly (they're basically all wrappers anyway).  */
 
 L_ENTRY(sys_fork_wrapper):
 #ifdef NO_MM
 	// fork almost works, enough to trick you into looking elsewhere :-(
-	addi	r3, r0, -EINVAL
 	rtsd	r15, 8;
+	addi	r3, r0, -EINVAL;	// Delay slot 
 #else
 	// Save state not saved by entry.  This is actually slight overkill;
 	// it's actually only necessary to save any state restored by
@@ -748,9 +785,8 @@
 	addi	r5, r0, SIGCHLD		// Arg 0: flags
 	lwi	r6, r1, PTO+PT_GPR(GPR_SP) // Arg 1: child SP (use parent's)
 	la	r7, r1, PTO;		// Arg 2: parent context
-	add	r9. r0, r0;		// Arg 3: (unused)
 	brid	CSYM(do_fork)		// Do real work (tail-call).
-	nop;
+	add	r9, r0, r0;		// Arg 3: (unused) (delay slot)
 #endif
 END(sys_fork_wrapper)
 
@@ -762,9 +798,8 @@
 	addi	r5, r0, CLONE_VFORK | CLONE_VM | SIGCHLD // Arg 0: flags
 	lwi	r6, r1, PTO+PT_GPR(GPR_SP) // Arg 1: child SP (use parent's)
 	la	r7, r1, PTO;		// Arg 2: parent context
-	add	r8, r0, r0;			// Arg 3: (unused)
 	brid	CSYM(do_fork)		// Do real work (tail-call).
-	nop
+	add	r8, r0, r0;		// Arg 3: (unused) (delay slot)
 END(sys_vfork_wrapper)
 
 L_ENTRY(sys_clone_wrapper):
@@ -775,81 +810,74 @@
 	bnei	r6, 1f; 		// See if child SP arg (arg 1) is 0.
 	lwi	r6, r1, PTO+PT_GPR(GPR_SP);	// If so, use paret's stack ptr
 1:	la	r7, r1, PTO;		// Arg 2: parent context
-	add	r8, r0, r0;		// Arg 3: (unused)
 	brid	CSYM(do_fork)		// Do real work (tail-call).
-	nop;
+	add	r8, r0, r0;		// Arg 3: (unused) (delay slot)
 END(sys_clone_wrapper)
 
 L_ENTRY(sys_execve_wrapper):
+	brid	CSYM(sys_execve);	// Do real work (tail-call).
 	la	r8, r1, PTO;		// add user context as 4th arg
-	brid	CSYM(sys_execve);	// Do real work (tail-call).
-	nop;
+					// (delay slot)
 END(sys_execve_wrapper)
 
 L_ENTRY(sys_sigsuspend_wrapper):
         SAVE_EXTRA_STATE(TRAP)		// Save state not saved by entry.
+	bralid	r15, CSYM(sys_sigsuspend); // Do real work.
 	la	r6, r1, PTO;		// add user context as 2nd arg
-	bralid	r15, CSYM(sys_sigsuspend); // Do real work.
-	nop;
+					// (delay slot)
+
 	brid	restore_extra_regs_and_ret_from_trap;
-	nop;
+	nop;				// (delay slot)
 END(sys_sigsuspend_wrapper)
 
 L_ENTRY(sys_rt_sigsuspend_wrapper):
         SAVE_EXTRA_STATE(TRAP)		// Save state not saved by entry.
+	brlid	r15, CSYM(sys_rt_sigsuspend);	// Do real work.
 	la	r7, r1, PTO;		// add user context as 3rd arg
-	brlid	r15, CSYM(sys_rt_sigsuspend);	// Do real work.
-	nop;
+					// (delay slot)
 	brid	restore_extra_regs_and_ret_from_trap;
-	nop;
+	nop;				// (delay slot)
 END(sys_rt_sigsuspend_wrapper)
 
 L_ENTRY(sys_sigreturn_wrapper):
         SAVE_EXTRA_STATE(TRAP)		// Save state not saved by entry.
+	brlid	r15, CSYM(sys_sigreturn);	// Do real work.
 	la	r5, r1, PTO;		// add user context as 1st arg
-	brlid	r15, CSYM(sys_sigreturn);	// Do real work.
-	nop;
+					// (delay slot)
 	brid	restore_extra_regs_and_ret_from_trap;
-	nop;
+	nop;				// (delay slot)
 END(sys_sigreturn_wrapper)
 
 L_ENTRY(sys_rt_sigreturn_wrapper):
         SAVE_EXTRA_STATE(TRAP)		// Save state not saved by entry.
+	brlid	r15, CSYM(sys_rt_sigreturn)	// Do real work.
 	la	r5, r1, PTO;		// add user context as 1st arg
-	brlid	r15, CSYM(sys_rt_sigreturn)	// Do real work.
-	nop;
+					// (delay slot)
 	brid	restore_extra_regs_and_ret_from_trap;
-	nop;
+	nop;				// (delay slot)
 END(sys_rt_sigreturn_wrapper)
 
 
 /*
  * Hardware maskable interrupts.
  *
- * The stack-pointer (r1) should have already been saved to the memory
- * location ENTRY_SP.
- * Currently this is just because the v850 code does it.  This should
- * be removed at some point to streamline things
+ * Interrupts are disabled prior to entry by the CPU.
  */
 G_ENTRY(irq):
-	swi	r1, r0, ENTRY_SP;	// save stack (emulate v850)
+	swi	r1, r0, ENTRY_SP;	// save stack pointer
 	SAVE_STATE (IRQ, r0, ENTRY_SP)	// Save registers. 
 
-	add	r5, r0, r0;		// Clear first param
-	la	r6, r1, PTO;		// User regs are arg2
-
-	// It's up to the high level handler to query the
+	// It's up to the arch handler to query the
 	// interrupt controller and get the IRQ number etc
 
-	// Call the high-level interrupt handling code.
+	add	r5, r0, r0;		// Clear first param
 	bralid r15, CSYM(handle_irq);
-	nop;
-
+	la	r6, r1, PTO;		// User regs are arg2
+					// (delay slot)
 	// fall through
 
 /* Entry point used to return from an interrupt (also used by exception
-   handle
-s, below).  */
+   handles, below).  */
 ret_from_irq:
 	RETURN(IRQ)
 END(irq)
@@ -858,73 +886,62 @@
 /*
  * Hardware non-maskable interrupts.
  *
- * The stack-pointer (r1) should have already been saved to the memory
- * location ENTRY_SP (the reason for this is that the interrupt vectors may be
- * beyond a 22-bit signed offset jump from the actual interrupt handler, and
- * this allows them to save the stack-pointer and use that register to do an
- * indirect jump).
+ * Not implemented on MicroBlaze.
  */
 G_ENTRY(nmi):
-	swi	r1, r0, NMI_ENTRY_SP;		/* Save state (emulate v850) */
+	swi	r1, r0, NMI_ENTRY_SP;	    /* Save stack pointer */
 	SAVE_STATE (NMI, r0, NMI_ENTRY_SP); /* Save registers.  */
 
 	add	r5, r0, r0;
-	la	r6, r1, PTO;	/* User regs are arg2.  */
+	la	r6, r1, PTO;		    /* User regs are arg2.  */
 
-	/* Non-maskable interrupts always lie right after maskable interrupts.
-	   Call the generic IRQ handler, with two arguments, the IRQ number,
-	   and a pointer to the user registers, to handle the specifics.
-	   (we subtract one because the first NMI has code 1).  */
+	/* Currently just do this like a normal interrupt */
 	bralid 	r15, CSYM(handle_irq);
 	nop;
 
 	RETURN(NMI)
-END(nmi0)
+END(nmi)
 
 
 /*
  * Illegal instruction trap.
  *
- * The stack-pointer (r3) should have already been saved to the memory
- * location ENTRY_SP (the reason for this is that the interrupt vectors may be
- * beyond a 22-bit signed offset jump from the actual interrupt handler, and
- * this allows them to save the stack-pointer and use that register to do an
- * indirect jump).
+ * Not implemented on MicroBlaze.
  */
 G_ENTRY(illegal_instruction):
-	swi	r1, r0, ENTRY_SP;	// Save stack (emulate v850)
+	swi	r1, r0, ENTRY_SP;	// Save stack pointer
 	SAVE_STATE (IRQ, r0, ENTRY_SP)	// Save registers. 
-	ENTRY_EI(r11);
+	ENTRY_EI(r11);			// Enable interrupts
+
+	/* Send Illegal Instruction signal (SIGILL) to process */
 	addi	r5, r0, SIGILL;		// Arg 0: signal number
 	RETRIEVE_CURRENT_TASK(r6);	// Arg 1: task
+	brid	CSYM(force_sig);
 	la	r15, r0, ret_from_irq-8	// where the handler should return
-	brid	CSYM(force_sig);
-	nop;
+					// (delay slot)
 
 END(illegal_instruction)
 
 
 /*
  * `Debug' trap
- *  We enter dbtrap in "BIP" (breakpoint) mode. 
- *  So we exit the breakpoint mode with an 'rtbd' and proceed with the
- *  original dbtrap.
- *  however, wait to save state first
+ *  We enter dbtrap via a SW breakpoint instruction "brki r14, 0x60".
+ *  Thus, BIP is set on entry (just like a system call).
  */
 G_ENTRY(dbtrap):
 	/* BIP bit is set on entry, no interrupts can occur */
-	swi	r1, r0, ENTRY_SP;	// Save stack (emulate v850)
-	SAVE_STATE (DBTRAP, r0, ENTRY_SP)   // Save registers. 
+	swi	r1, r0, ENTRY_SP;		// Save stack pointer
+	SAVE_STATE (DBTRAP, r0, ENTRY_SP)	// Save registers. 
+
 	BIPCLR(r11);
 	/* BIP bit now clear, interrupts can occur */
-	// Should insert code to detect illegal traps etc
+	/* Should insert code to detect illegal traps etc */
 
-	addi	r5, r0, SIGTRAP;	// send the trap signal
-	RETRIEVE_CURRENT_TASK(r6);	// to the current task
+	addi	r5, r0, SIGTRAP;		// send the trap signal
+	RETRIEVE_CURRENT_TASK(r6);		// to the current task
 	brlid	r15, CSYM(send_sig);
-	add	r7, r0, r0;		// 3rd param zero (delay slot)
+	add	r7, r0, r0;			// 3rd param zero (delay slot)
 
-	BIPSET(r11);		/* Set BIP again, ready for state restore */
 	RETURN(DBTRAP);
 
 END(dbtrap)
@@ -932,15 +949,14 @@
 G_ENTRY(reset_trap):
 
 	/* BIP bit is set on entry, no interrupts can occur */
-	swi	r1, r0, ENTRY_SP;	// Save stack (emulate v850)
-	SAVE_STATE (DBTRAP, r0, ENTRY_SP)   // Save registers. 
+	swi	r1, r0, ENTRY_SP;		// Save stack pointer
+	SAVE_STATE (DBTRAP, r0, ENTRY_SP)   	// Save registers. 
 	BIPCLR(r11);
 
 	// For now, call into kernel for debugging
 	brlid	r15, CSYM(debug_trap);
-	la	r5, r1, PTO
-	// And loop forever! */
-foo:	brid foo
+	la	r5, r1, PTO			// (delay slot)
+reset_loop:	brid reset_loop;		// loop forever 
 	nop
 END(reset_trap)
 
@@ -949,9 +965,9 @@
  */
 L_ENTRY(bad_trap_wrapper):
 	add	r5, r0, r19; 		// Arg 0: trap number
+	brid	CSYM(bad_trap);		// tail call handler
 	la	r6, r1, PTO;		// Arg 1: user regs
-	brid	CSYM(bad_trap);		// tail call handler
-	nop;
+					// (delay slot)
 
 END(bad_trap_wrapper)
 
@@ -966,7 +982,6 @@
 	// Return the previous task
 	// Do this before push_state, so that return value is on stack
 	// Update the current task pointer
-	//GET_CURRENT_TASK(CURRENT_TASK)
 	RETRIEVE_CURRENT_TASK(r3);
 
 	// First, push the current processor state on the stack
@@ -984,7 +999,7 @@
 	GET_CURRENT_TASK(CURRENT_TASK)
 	// Now return into the new thread
 	rtsd	r15,8;
-	nop;
+	nop;				// (delay slot)
 C_END(switch_thread)
 
 	.section .rodata