[backport from gcc-4.7/trunk ] Date: Sun, 13 Mar 2011 16:31:48 +0000 From: Andrew Stubbs Subject: [PATCH][ARM] Discourage use of NEON on Cortex-A8 List-Archive: This patch discourages the use of NEON for integer operations on ARM Cortex-A8. The problem is that transferring data from NEON/VFP registers to core registers is prohibitively expensive on A8. This should not affect Cortex-A9 in the same way. This change gives a 6% increase in performance on SPEC2000 crafty, on an imx51 board. An older version of the patch has been used for some time in the CodeSourcery and Linaro toolchains, so it's fairly well tested. OK (for stage 1)? gcc/ 2011-03-25 Bernd Schmidt Andrew Stubbs * config/arm/vfp.md (arm_movdi_vfp): Enable only when not tuning for Cortex-A8. (arm_movdi_vfp_cortexa8): New pattern. * config/arm/neon.md (adddi3_neon, subdi3_neon, anddi3_neon, iordi3_neon, xordi3_neon): Add alternatives to discourage Neon instructions when tuning for Cortex-A8. Set attribute "arch". * config/arm/arm.md: Move include arm-tune.md up a bit. (define_attr "arch"): Add "onlya8" and "nota8" values. (define_attr "arch_enabled"): Handle "onlya8" and "nota8". --- gcc-4.6.0/gcc/config/arm/arm.md.~1~ 2011-03-15 20:59:25.000000000 +0100 +++ gcc-4.6.0/gcc/config/arm/arm.md 2011-05-14 15:49:56.000000000 +0200 @@ -149,6 +149,9 @@ (define_constants ;;--------------------------------------------------------------------------- ;; Attributes +;; Processor type. This is created automatically from arm-cores.def. +(include "arm-tune.md") + ; IS_THUMB is set to 'yes' when we are generating Thumb code, and 'no' when ; generating ARM code. This is used to control the length of some insn ; patterns that share the same RTL in both ARM and Thumb code. @@ -192,7 +195,7 @@ (define_attr "length" "" ; for ARM or Thumb-2 with arm_arch6, and nov6 for ARM without ; arm_arch6. This attribute is used to compute attribute "enabled", ; use type "any" to enable an alternative in all cases. -(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6" +(define_attr "arch" "any,a,t,32,t1,t2,v6,nov6,onlya8,nota8" (const_string "any")) (define_attr "arch_enabled" "no,yes" @@ -225,6 +228,14 @@ (define_attr "arch_enabled" "no,yes" (and (eq_attr "arch" "nov6") (ne (symbol_ref "(TARGET_32BIT && !arm_arch6)") (const_int 0))) + (const_string "yes") + + (and (eq_attr "arch" "onlya8") + (eq_attr "tune" "cortexa8")) + (const_string "yes") + + (and (eq_attr "arch" "nota8") + (not (eq_attr "tune" "cortexa8"))) (const_string "yes")] (const_string "no"))) @@ -485,9 +496,6 @@ (define_attr "ce_count" "" (const_int 1) ;;--------------------------------------------------------------------------- ;; Pipeline descriptions -;; Processor type. This is created automatically from arm-cores.def. -(include "arm-tune.md") - (define_attr "tune_cortexr4" "yes,no" (const (if_then_else (eq_attr "tune" "cortexr4,cortexr4f") --- gcc-4.6.0/gcc/config/arm/neon.md.~1~ 2011-01-03 21:52:22.000000000 +0100 +++ gcc-4.6.0/gcc/config/arm/neon.md 2011-05-14 15:49:56.000000000 +0200 @@ -583,23 +583,25 @@ (define_insn "*add3_neon" ) (define_insn "adddi3_neon" - [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r") - (plus:DI (match_operand:DI 1 "s_register_operand" "%w,0,0") - (match_operand:DI 2 "s_register_operand" "w,r,0"))) + [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?w") + (plus:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,w") + (match_operand:DI 2 "s_register_operand" "w,r,0,w"))) (clobber (reg:CC CC_REGNUM))] "TARGET_NEON" { switch (which_alternative) { - case 0: return "vadd.i64\t%P0, %P1, %P2"; + case 0: /* fall through */ + case 3: return "vadd.i64\t%P0, %P1, %P2"; case 1: return "#"; case 2: return "#"; default: gcc_unreachable (); } } - [(set_attr "neon_type" "neon_int_1,*,*") - (set_attr "conds" "*,clob,clob") - (set_attr "length" "*,8,8")] + [(set_attr "neon_type" "neon_int_1,*,*,neon_int_1") + (set_attr "conds" "*,clob,clob,*") + (set_attr "length" "*,8,8,*") + (set_attr "arch" "nota8,*,*,onlya8")] ) (define_insn "*sub3_neon" @@ -617,24 +619,26 @@ (define_insn "*sub3_neon" ) (define_insn "subdi3_neon" - [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?&r") - (minus:DI (match_operand:DI 1 "s_register_operand" "w,0,r,0") - (match_operand:DI 2 "s_register_operand" "w,r,0,0"))) + [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?&r,?w") + (minus:DI (match_operand:DI 1 "s_register_operand" "w,0,r,0,w") + (match_operand:DI 2 "s_register_operand" "w,r,0,0,w"))) (clobber (reg:CC CC_REGNUM))] "TARGET_NEON" { switch (which_alternative) { - case 0: return "vsub.i64\t%P0, %P1, %P2"; + case 0: /* fall through */ + case 4: return "vsub.i64\t%P0, %P1, %P2"; case 1: /* fall through */ case 2: /* fall through */ case 3: return "subs\\t%Q0, %Q1, %Q2\;sbc\\t%R0, %R1, %R2"; default: gcc_unreachable (); } } - [(set_attr "neon_type" "neon_int_2,*,*,*") - (set_attr "conds" "*,clob,clob,clob") - (set_attr "length" "*,8,8,8")] + [(set_attr "neon_type" "neon_int_2,*,*,*,neon_int_2") + (set_attr "conds" "*,clob,clob,clob,*") + (set_attr "length" "*,8,8,8,*") + (set_attr "arch" "nota8,*,*,*,onlya8")] ) (define_insn "*mul3_neon" @@ -720,23 +724,26 @@ (define_insn "ior3" ) (define_insn "iordi3_neon" - [(set (match_operand:DI 0 "s_register_operand" "=w,w,?&r,?&r") - (ior:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,r") - (match_operand:DI 2 "neon_logic_op2" "w,Dl,r,r")))] + [(set (match_operand:DI 0 "s_register_operand" "=w,w,?&r,?&r,?w,?w") + (ior:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,r,w,0") + (match_operand:DI 2 "neon_logic_op2" "w,Dl,r,r,w,Dl")))] "TARGET_NEON" { switch (which_alternative) { - case 0: return "vorr\t%P0, %P1, %P2"; - case 1: return neon_output_logic_immediate ("vorr", &operands[2], + case 0: /* fall through */ + case 4: return "vorr\t%P0, %P1, %P2"; + case 1: /* fall through */ + case 5: return neon_output_logic_immediate ("vorr", &operands[2], DImode, 0, VALID_NEON_QREG_MODE (DImode)); case 2: return "#"; case 3: return "#"; default: gcc_unreachable (); } } - [(set_attr "neon_type" "neon_int_1,neon_int_1,*,*") - (set_attr "length" "*,*,8,8")] + [(set_attr "neon_type" "neon_int_1,neon_int_1,*,*,neon_int_1,neon_int_1") + (set_attr "length" "*,*,8,8,*,*") + (set_attr "arch" "nota8,nota8,*,*,onlya8,onlya8")] ) ;; The concrete forms of the Neon immediate-logic instructions are vbic and @@ -762,23 +769,26 @@ (define_insn "and3" ) (define_insn "anddi3_neon" - [(set (match_operand:DI 0 "s_register_operand" "=w,w,?&r,?&r") - (and:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,r") - (match_operand:DI 2 "neon_inv_logic_op2" "w,DL,r,r")))] + [(set (match_operand:DI 0 "s_register_operand" "=w,w,?&r,?&r,?w,?w") + (and:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,r,w,0") + (match_operand:DI 2 "neon_inv_logic_op2" "w,DL,r,r,w,DL")))] "TARGET_NEON" { switch (which_alternative) { - case 0: return "vand\t%P0, %P1, %P2"; - case 1: return neon_output_logic_immediate ("vand", &operands[2], + case 0: /* fall through */ + case 4: return "vand\t%P0, %P1, %P2"; + case 1: /* fall through */ + case 5: return neon_output_logic_immediate ("vand", &operands[2], DImode, 1, VALID_NEON_QREG_MODE (DImode)); case 2: return "#"; case 3: return "#"; default: gcc_unreachable (); } } - [(set_attr "neon_type" "neon_int_1,neon_int_1,*,*") - (set_attr "length" "*,*,8,8")] + [(set_attr "neon_type" "neon_int_1,neon_int_1,*,*,neon_int_1,neon_int_1") + (set_attr "length" "*,*,8,8,*,*") + (set_attr "arch" "nota8,nota8,*,*,onlya8,onlya8")] ) (define_insn "orn3_neon" @@ -836,16 +846,18 @@ (define_insn "xor3" ) (define_insn "xordi3_neon" - [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r") - (xor:DI (match_operand:DI 1 "s_register_operand" "%w,0,r") - (match_operand:DI 2 "s_register_operand" "w,r,r")))] + [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?w") + (xor:DI (match_operand:DI 1 "s_register_operand" "%w,0,r,w") + (match_operand:DI 2 "s_register_operand" "w,r,r,w")))] "TARGET_NEON" "@ veor\t%P0, %P1, %P2 # - #" - [(set_attr "neon_type" "neon_int_1,*,*") - (set_attr "length" "*,8,8")] + # + veor\t%P0, %P1, %P2" + [(set_attr "neon_type" "neon_int_1,*,*,neon_int_1") + (set_attr "length" "*,8,8,*") + (set_attr "arch" "nota8,*,*,onlya8")] ) (define_insn "one_cmpl2" --- gcc-4.6.0/gcc/config/arm/vfp.md.~1~ 2011-01-20 23:03:29.000000000 +0100 +++ gcc-4.6.0/gcc/config/arm/vfp.md 2011-05-14 15:49:56.000000000 +0200 @@ -134,9 +134,51 @@ (define_insn "*thumb2_movsi_vfp" ;; DImode moves (define_insn "*arm_movdi_vfp" - [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r,m,w,r,w,w, Uv") + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r, m,w,r,w,w, Uv") (match_operand:DI 1 "di_operand" "rIK,mi,r,r,w,w,Uvi,w"))] - "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune != cortexa8 + && ( register_operand (operands[0], DImode) + || register_operand (operands[1], DImode))" + "* + switch (which_alternative) + { + case 0: + return \"#\"; + case 1: + case 2: + return output_move_double (operands); + case 3: + return \"fmdrr%?\\t%P0, %Q1, %R1\\t%@ int\"; + case 4: + return \"fmrrd%?\\t%Q0, %R0, %P1\\t%@ int\"; + case 5: + if (TARGET_VFP_SINGLE) + return \"fcpys%?\\t%0, %1\\t%@ int\;fcpys%?\\t%p0, %p1\\t%@ int\"; + else + return \"fcpyd%?\\t%P0, %P1\\t%@ int\"; + case 6: case 7: + return output_move_vfp (operands); + default: + gcc_unreachable (); + } + " + [(set_attr "type" "*,load2,store2,r_2_f,f_2_r,ffarithd,f_loadd,f_stored") + (set_attr "neon_type" "*,*,*,neon_mcr_2_mcrr,neon_mrrc,neon_vmov,*,*") + (set (attr "length") (cond [(eq_attr "alternative" "0,1,2") (const_int 8) + (eq_attr "alternative" "5") + (if_then_else + (eq (symbol_ref "TARGET_VFP_SINGLE") (const_int 1)) + (const_int 8) + (const_int 4))] + (const_int 4))) + (set_attr "pool_range" "*,1020,*,*,*,*,1020,*") + (set_attr "neg_pool_range" "*,1008,*,*,*,*,1008,*")] +) + +(define_insn "*arm_movdi_vfp_cortexa8" + [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r, r,m,w,!r,w,w, Uv") + (match_operand:DI 1 "di_operand" "rIK,mi,r,r,w,w,Uvi,w"))] + "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_VFP && arm_tune == cortexa8 && ( register_operand (operands[0], DImode) || register_operand (operands[1], DImode))" "*