[backport proposed 4.7 patch for ARM PR48941 ] From: Richard Sandiford Subject: [ARM] PR 48941: poor code generated for vzip*() and vunzp*() Date: Fri, 09 Dec 2011 16:41:23 +0000 List-Archive: The buit-in functions that underlie ARM's vzip*() and vunzp*() functions pass the result by reference rather than value. This leads to very poor output, as demonstrated in PR 48941. This patch makes them return the vectors by value instead, using the structure modes TImode and OImode. Tested on arm-linux-gnueabi. OK to install? Richard gcc/ PR target/48941 * config/arm/arm.c (arm_init_neon_builtins): Make RESULTPAIR intrinsics return by value rather than reference. (arm_expand_neon_builtin): Update accordingly. (neon_emit_pair_result_insn): Likewise. * config/arm/neon.md (neon_vtrn): Split into double and quad patterns. Make the former return TImode and the latter OImode. (neon_vzip, neon_vuzp): Likewise. * config/arm/neon.ml (features): Remove ReturnPtr. (ops): Remove ReturnPtr from Vtrn, Vzip and Vuzp feature lists. * config/arm/neon-gen.ml (return_by_ptr): Delete. (return, params, print_variant): Remove return-by-pointer handling. * config/arm/arm_neon.h: Regenerate. --- gcc-4.6.2/gcc/config/arm/arm.c.~1~ 2011-10-20 11:24:06.000000000 +0200 +++ gcc-4.6.2/gcc/config/arm/arm.c 2011-12-11 15:57:30.000000000 +0100 @@ -18643,26 +18643,16 @@ arm_init_neon_builtins (void) tree intCI_type_node; tree intXI_type_node; - tree V8QI_pointer_node; - tree V4HI_pointer_node; - tree V2SI_pointer_node; - tree V2SF_pointer_node; - tree V16QI_pointer_node; - tree V8HI_pointer_node; - tree V4SI_pointer_node; - tree V4SF_pointer_node; - tree V2DI_pointer_node; - - tree void_ftype_pv8qi_v8qi_v8qi; - tree void_ftype_pv4hi_v4hi_v4hi; - tree void_ftype_pv2si_v2si_v2si; - tree void_ftype_pv2sf_v2sf_v2sf; - tree void_ftype_pdi_di_di; - tree void_ftype_pv16qi_v16qi_v16qi; - tree void_ftype_pv8hi_v8hi_v8hi; - tree void_ftype_pv4si_v4si_v4si; - tree void_ftype_pv4sf_v4sf_v4sf; - tree void_ftype_pv2di_v2di_v2di; + tree ti_ftype_v8qi_v8qi; + tree ti_ftype_v4hi_v4hi; + tree ti_ftype_v2si_v2si; + tree ti_ftype_v2sf_v2sf; + tree ti_ftype_di_di; + tree oi_ftype_v16qi_v16qi; + tree oi_ftype_v8hi_v8hi; + tree oi_ftype_v4si_v4si; + tree oi_ftype_v4sf_v4sf; + tree oi_ftype_v2di_v2di; tree reinterp_ftype_dreg[5][5]; tree reinterp_ftype_qreg[5][5]; @@ -18776,47 +18766,36 @@ arm_init_neon_builtins (void) (*lang_hooks.types.register_builtin_type) (intXI_type_node, "__builtin_neon_xi"); - /* Pointers to vector types. */ - V8QI_pointer_node = build_pointer_type (V8QI_type_node); - V4HI_pointer_node = build_pointer_type (V4HI_type_node); - V2SI_pointer_node = build_pointer_type (V2SI_type_node); - V2SF_pointer_node = build_pointer_type (V2SF_type_node); - V16QI_pointer_node = build_pointer_type (V16QI_type_node); - V8HI_pointer_node = build_pointer_type (V8HI_type_node); - V4SI_pointer_node = build_pointer_type (V4SI_type_node); - V4SF_pointer_node = build_pointer_type (V4SF_type_node); - V2DI_pointer_node = build_pointer_type (V2DI_type_node); - /* Operations which return results as pairs. */ - void_ftype_pv8qi_v8qi_v8qi = - build_function_type_list (void_type_node, V8QI_pointer_node, V8QI_type_node, + ti_ftype_v8qi_v8qi = + build_function_type_list (intTI_type_node, V8QI_type_node, V8QI_type_node, NULL); - void_ftype_pv4hi_v4hi_v4hi = - build_function_type_list (void_type_node, V4HI_pointer_node, V4HI_type_node, + ti_ftype_v4hi_v4hi = + build_function_type_list (intTI_type_node, V4HI_type_node, V4HI_type_node, NULL); - void_ftype_pv2si_v2si_v2si = - build_function_type_list (void_type_node, V2SI_pointer_node, V2SI_type_node, + ti_ftype_v2si_v2si = + build_function_type_list (intTI_type_node, V2SI_type_node, V2SI_type_node, NULL); - void_ftype_pv2sf_v2sf_v2sf = - build_function_type_list (void_type_node, V2SF_pointer_node, V2SF_type_node, + ti_ftype_v2sf_v2sf = + build_function_type_list (intTI_type_node, V2SF_type_node, V2SF_type_node, NULL); - void_ftype_pdi_di_di = - build_function_type_list (void_type_node, intDI_pointer_node, - neon_intDI_type_node, neon_intDI_type_node, NULL); - void_ftype_pv16qi_v16qi_v16qi = - build_function_type_list (void_type_node, V16QI_pointer_node, - V16QI_type_node, V16QI_type_node, NULL); - void_ftype_pv8hi_v8hi_v8hi = - build_function_type_list (void_type_node, V8HI_pointer_node, V8HI_type_node, + ti_ftype_di_di = + build_function_type_list (intTI_type_node, neon_intDI_type_node, + neon_intDI_type_node, NULL); + oi_ftype_v16qi_v16qi = + build_function_type_list (intOI_type_node, V16QI_type_node, + V16QI_type_node, NULL); + oi_ftype_v8hi_v8hi = + build_function_type_list (intOI_type_node, V8HI_type_node, V8HI_type_node, NULL); - void_ftype_pv4si_v4si_v4si = - build_function_type_list (void_type_node, V4SI_pointer_node, V4SI_type_node, + oi_ftype_v4si_v4si = + build_function_type_list (intOI_type_node, V4SI_type_node, V4SI_type_node, NULL); - void_ftype_pv4sf_v4sf_v4sf = - build_function_type_list (void_type_node, V4SF_pointer_node, V4SF_type_node, + oi_ftype_v4sf_v4sf = + build_function_type_list (intOI_type_node, V4SF_type_node, V4SF_type_node, NULL); - void_ftype_pv2di_v2di_v2di = - build_function_type_list (void_type_node, V2DI_pointer_node, V2DI_type_node, + oi_ftype_v2di_v2di = + build_function_type_list (intOI_type_node, V2DI_type_node, V2DI_type_node, NULL); dreg_types[0] = V8QI_type_node; @@ -19040,16 +19019,16 @@ arm_init_neon_builtins (void) { switch (insn_data[icode].operand[1].mode) { - case V8QImode: ftype = void_ftype_pv8qi_v8qi_v8qi; break; - case V4HImode: ftype = void_ftype_pv4hi_v4hi_v4hi; break; - case V2SImode: ftype = void_ftype_pv2si_v2si_v2si; break; - case V2SFmode: ftype = void_ftype_pv2sf_v2sf_v2sf; break; - case DImode: ftype = void_ftype_pdi_di_di; break; - case V16QImode: ftype = void_ftype_pv16qi_v16qi_v16qi; break; - case V8HImode: ftype = void_ftype_pv8hi_v8hi_v8hi; break; - case V4SImode: ftype = void_ftype_pv4si_v4si_v4si; break; - case V4SFmode: ftype = void_ftype_pv4sf_v4sf_v4sf; break; - case V2DImode: ftype = void_ftype_pv2di_v2di_v2di; break; + case V8QImode: ftype = ti_ftype_v8qi_v8qi; break; + case V4HImode: ftype = ti_ftype_v4hi_v4hi; break; + case V2SImode: ftype = ti_ftype_v2si_v2si; break; + case V2SFmode: ftype = ti_ftype_v2sf_v2sf; break; + case DImode: ftype = ti_ftype_di_di; break; + case V16QImode: ftype = oi_ftype_v16qi_v16qi; break; + case V8HImode: ftype = oi_ftype_v8hi_v8hi; break; + case V4SImode: ftype = oi_ftype_v4si_v4si; break; + case V4SFmode: ftype = oi_ftype_v4sf_v4sf; break; + case V2DImode: ftype = oi_ftype_v2di_v2di; break; default: gcc_unreachable (); } } @@ -19488,9 +19467,8 @@ arm_expand_neon_builtin (int fcode, tree NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_STOP); case NEON_RESULTPAIR: - return arm_expand_neon_args (target, icode, 0, exp, - NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, - NEON_ARG_STOP); + return arm_expand_neon_args (target, icode, 1, exp, + NEON_ARG_COPY_TO_REG, NEON_ARG_COPY_TO_REG, NEON_ARG_STOP); case NEON_LANEMUL: case NEON_LANEMULL: @@ -19557,18 +19535,16 @@ neon_reinterpret (rtx dest, rtx src) registers). */ void neon_emit_pair_result_insn (enum machine_mode mode, - rtx (*intfn) (rtx, rtx, rtx, rtx), rtx destaddr, + rtx (*intfn) (rtx, rtx, rtx, rtx), rtx pair, rtx op1, rtx op2) { - rtx mem = gen_rtx_MEM (mode, destaddr); - rtx tmp1 = gen_reg_rtx (mode); - rtx tmp2 = gen_reg_rtx (mode); - - emit_insn (intfn (tmp1, op1, op2, tmp2)); - - emit_move_insn (mem, tmp1); - mem = adjust_address (mem, mode, GET_MODE_SIZE (mode)); - emit_move_insn (mem, tmp2); + rtx first, second; + + first = simplify_gen_subreg (mode, pair, GET_MODE (pair), 0); + second = simplify_gen_subreg (mode, pair, GET_MODE (pair), + GET_MODE_SIZE (mode)); + + emit_insn (intfn (first, op1, op2, second)); } /* Set up operands for a register copy from src to dest, taking care not to --- gcc-4.6.2/gcc/config/arm/arm_neon.h.~1~ 2010-07-03 02:31:43.000000000 +0200 +++ gcc-4.6.2/gcc/config/arm/arm_neon.h 2011-12-11 15:57:30.000000000 +0100 @@ -7395,433 +7395,433 @@ vbslq_p16 (uint16x8_t __a, poly16x8_t __ __extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) vtrn_s8 (int8x8_t __a, int8x8_t __b) { - int8x8x2_t __rv; - __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b); - return __rv; + union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv8qi (__a, __b); + return __rv.__i; } __extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) vtrn_s16 (int16x4_t __a, int16x4_t __b) { - int16x4x2_t __rv; - __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b); - return __rv; + union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv4hi (__a, __b); + return __rv.__i; } __extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) vtrn_s32 (int32x2_t __a, int32x2_t __b) { - int32x2x2_t __rv; - __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b); - return __rv; + union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv2si (__a, __b); + return __rv.__i; } __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vtrn_f32 (float32x2_t __a, float32x2_t __b) { - float32x2x2_t __rv; - __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b); - return __rv; + union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv2sf (__a, __b); + return __rv.__i; } __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) vtrn_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8x2_t __rv; - __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); - return __rv; + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv8qi ((int8x8_t) __a, (int8x8_t) __b); + return __rv.__i; } __extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) vtrn_u16 (uint16x4_t __a, uint16x4_t __b) { - uint16x4x2_t __rv; - __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); - return __rv; + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv4hi ((int16x4_t) __a, (int16x4_t) __b); + return __rv.__i; } __extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) vtrn_u32 (uint32x2_t __a, uint32x2_t __b) { - uint32x2x2_t __rv; - __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b); - return __rv; + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv2si ((int32x2_t) __a, (int32x2_t) __b); + return __rv.__i; } __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) vtrn_p8 (poly8x8_t __a, poly8x8_t __b) { - poly8x8x2_t __rv; - __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); - return __rv; + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv8qi ((int8x8_t) __a, (int8x8_t) __b); + return __rv.__i; } __extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) vtrn_p16 (poly16x4_t __a, poly16x4_t __b) { - poly16x4x2_t __rv; - __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); - return __rv; + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vtrnv4hi ((int16x4_t) __a, (int16x4_t) __b); + return __rv.__i; } __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) vtrnq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16x2_t __rv; - __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b); - return __rv; + union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv16qi (__a, __b); + return __rv.__i; } __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) vtrnq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8x2_t __rv; - __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b); - return __rv; + union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv8hi (__a, __b); + return __rv.__i; } __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) vtrnq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4x2_t __rv; - __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b); - return __rv; + union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv4si (__a, __b); + return __rv.__i; } __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vtrnq_f32 (float32x4_t __a, float32x4_t __b) { - float32x4x2_t __rv; - __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b); - return __rv; + union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv4sf (__a, __b); + return __rv.__i; } __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) vtrnq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16x2_t __rv; - __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); - return __rv; + union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv16qi ((int8x16_t) __a, (int8x16_t) __b); + return __rv.__i; } __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) vtrnq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8x2_t __rv; - __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); - return __rv; + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv8hi ((int16x8_t) __a, (int16x8_t) __b); + return __rv.__i; } __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) vtrnq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4x2_t __rv; - __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b); - return __rv; + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv4si ((int32x4_t) __a, (int32x4_t) __b); + return __rv.__i; } __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) vtrnq_p8 (poly8x16_t __a, poly8x16_t __b) { - poly8x16x2_t __rv; - __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); - return __rv; + union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv16qi ((int8x16_t) __a, (int8x16_t) __b); + return __rv.__i; } __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) vtrnq_p16 (poly16x8_t __a, poly16x8_t __b) { - poly16x8x2_t __rv; - __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); - return __rv; + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vtrnv8hi ((int16x8_t) __a, (int16x8_t) __b); + return __rv.__i; } __extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) vzip_s8 (int8x8_t __a, int8x8_t __b) { - int8x8x2_t __rv; - __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b); - return __rv; + union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv8qi (__a, __b); + return __rv.__i; } __extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) vzip_s16 (int16x4_t __a, int16x4_t __b) { - int16x4x2_t __rv; - __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b); - return __rv; + union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv4hi (__a, __b); + return __rv.__i; } __extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) vzip_s32 (int32x2_t __a, int32x2_t __b) { - int32x2x2_t __rv; - __builtin_neon_vzipv2si (&__rv.val[0], __a, __b); - return __rv; + union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv2si (__a, __b); + return __rv.__i; } __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vzip_f32 (float32x2_t __a, float32x2_t __b) { - float32x2x2_t __rv; - __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b); - return __rv; + union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv2sf (__a, __b); + return __rv.__i; } __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) vzip_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8x2_t __rv; - __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); - return __rv; + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv8qi ((int8x8_t) __a, (int8x8_t) __b); + return __rv.__i; } __extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) vzip_u16 (uint16x4_t __a, uint16x4_t __b) { - uint16x4x2_t __rv; - __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); - return __rv; + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv4hi ((int16x4_t) __a, (int16x4_t) __b); + return __rv.__i; } __extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) vzip_u32 (uint32x2_t __a, uint32x2_t __b) { - uint32x2x2_t __rv; - __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b); - return __rv; + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv2si ((int32x2_t) __a, (int32x2_t) __b); + return __rv.__i; } __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) vzip_p8 (poly8x8_t __a, poly8x8_t __b) { - poly8x8x2_t __rv; - __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); - return __rv; + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv8qi ((int8x8_t) __a, (int8x8_t) __b); + return __rv.__i; } __extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) vzip_p16 (poly16x4_t __a, poly16x4_t __b) { - poly16x4x2_t __rv; - __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); - return __rv; + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vzipv4hi ((int16x4_t) __a, (int16x4_t) __b); + return __rv.__i; } __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) vzipq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16x2_t __rv; - __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b); - return __rv; + union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv16qi (__a, __b); + return __rv.__i; } __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) vzipq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8x2_t __rv; - __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b); - return __rv; + union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv8hi (__a, __b); + return __rv.__i; } __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) vzipq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4x2_t __rv; - __builtin_neon_vzipv4si (&__rv.val[0], __a, __b); - return __rv; + union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv4si (__a, __b); + return __rv.__i; } __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vzipq_f32 (float32x4_t __a, float32x4_t __b) { - float32x4x2_t __rv; - __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b); - return __rv; + union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv4sf (__a, __b); + return __rv.__i; } __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) vzipq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16x2_t __rv; - __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); - return __rv; + union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv16qi ((int8x16_t) __a, (int8x16_t) __b); + return __rv.__i; } __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) vzipq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8x2_t __rv; - __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); - return __rv; + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv8hi ((int16x8_t) __a, (int16x8_t) __b); + return __rv.__i; } __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) vzipq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4x2_t __rv; - __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b); - return __rv; + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv4si ((int32x4_t) __a, (int32x4_t) __b); + return __rv.__i; } __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) vzipq_p8 (poly8x16_t __a, poly8x16_t __b) { - poly8x16x2_t __rv; - __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); - return __rv; + union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv16qi ((int8x16_t) __a, (int8x16_t) __b); + return __rv.__i; } __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) vzipq_p16 (poly16x8_t __a, poly16x8_t __b) { - poly16x8x2_t __rv; - __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); - return __rv; + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vzipv8hi ((int16x8_t) __a, (int16x8_t) __b); + return __rv.__i; } __extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__)) vuzp_s8 (int8x8_t __a, int8x8_t __b) { - int8x8x2_t __rv; - __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b); - return __rv; + union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv8qi (__a, __b); + return __rv.__i; } __extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) vuzp_s16 (int16x4_t __a, int16x4_t __b) { - int16x4x2_t __rv; - __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b); - return __rv; + union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv4hi (__a, __b); + return __rv.__i; } __extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__)) vuzp_s32 (int32x2_t __a, int32x2_t __b) { - int32x2x2_t __rv; - __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b); - return __rv; + union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv2si (__a, __b); + return __rv.__i; } __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vuzp_f32 (float32x2_t __a, float32x2_t __b) { - float32x2x2_t __rv; - __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b); - return __rv; + union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv2sf (__a, __b); + return __rv.__i; } __extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__)) vuzp_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8x2_t __rv; - __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); - return __rv; + union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv8qi ((int8x8_t) __a, (int8x8_t) __b); + return __rv.__i; } __extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__)) vuzp_u16 (uint16x4_t __a, uint16x4_t __b) { - uint16x4x2_t __rv; - __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); - return __rv; + union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv4hi ((int16x4_t) __a, (int16x4_t) __b); + return __rv.__i; } __extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__)) vuzp_u32 (uint32x2_t __a, uint32x2_t __b) { - uint32x2x2_t __rv; - __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b); - return __rv; + union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv2si ((int32x2_t) __a, (int32x2_t) __b); + return __rv.__i; } __extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__)) vuzp_p8 (poly8x8_t __a, poly8x8_t __b) { - poly8x8x2_t __rv; - __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b); - return __rv; + union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv8qi ((int8x8_t) __a, (int8x8_t) __b); + return __rv.__i; } __extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__)) vuzp_p16 (poly16x4_t __a, poly16x4_t __b) { - poly16x4x2_t __rv; - __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b); - return __rv; + union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vuzpv4hi ((int16x4_t) __a, (int16x4_t) __b); + return __rv.__i; } __extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__)) vuzpq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16x2_t __rv; - __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b); - return __rv; + union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv16qi (__a, __b); + return __rv.__i; } __extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__)) vuzpq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8x2_t __rv; - __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b); - return __rv; + union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv8hi (__a, __b); + return __rv.__i; } __extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__)) vuzpq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4x2_t __rv; - __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b); - return __rv; + union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv4si (__a, __b); + return __rv.__i; } __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vuzpq_f32 (float32x4_t __a, float32x4_t __b) { - float32x4x2_t __rv; - __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b); - return __rv; + union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv4sf (__a, __b); + return __rv.__i; } __extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__)) vuzpq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16x2_t __rv; - __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); - return __rv; + union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv16qi ((int8x16_t) __a, (int8x16_t) __b); + return __rv.__i; } __extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__)) vuzpq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8x2_t __rv; - __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); - return __rv; + union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv8hi ((int16x8_t) __a, (int16x8_t) __b); + return __rv.__i; } __extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__)) vuzpq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4x2_t __rv; - __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b); - return __rv; + union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv4si ((int32x4_t) __a, (int32x4_t) __b); + return __rv.__i; } __extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__)) vuzpq_p8 (poly8x16_t __a, poly8x16_t __b) { - poly8x16x2_t __rv; - __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b); - return __rv; + union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv16qi ((int8x16_t) __a, (int8x16_t) __b); + return __rv.__i; } __extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__)) vuzpq_p16 (poly16x8_t __a, poly16x8_t __b) { - poly16x8x2_t __rv; - __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b); - return __rv; + union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vuzpv8hi ((int16x8_t) __a, (int16x8_t) __b); + return __rv.__i; } __extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) --- gcc-4.6.2/gcc/config/arm/neon-gen.ml.~1~ 2009-11-03 18:58:59.000000000 +0100 +++ gcc-4.6.2/gcc/config/arm/neon-gen.ml 2011-12-11 15:57:30.000000000 +0100 @@ -98,8 +98,6 @@ let print_function arity fnname body = close_braceblock ffmt; end_function ffmt -let return_by_ptr features = List.mem ReturnPtr features - let union_string num elts base = let itype = inttype_for_array num elts in let iname = string_of_inttype itype @@ -141,19 +139,14 @@ let cast_for_return to_ty = "(" ^ (strin (* Return a tuple of a list of declarations to go at the start of the function, and a list of statements needed to return THING. *) -let return arity return_by_ptr thing = +let return arity thing = match arity with Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _) | Arity4 (ret, _, _, _, _) -> match ret with T_arrayof (num, vec) -> - if return_by_ptr then - let sname = string_of_vectype ret in - [Printf.sprintf "%s __rv;" sname], - [thing ^ ";"; "return __rv;"] - else - let uname = union_string num vec "__rv" in - [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"] + let uname = union_string num vec "__rv" in + [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"] | T_void -> [], [thing ^ ";"] | _ -> [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"] @@ -163,7 +156,7 @@ let rec element_type ctype = T_arrayof (_, v) -> element_type v | _ -> ctype -let params return_by_ptr ps = +let params ps = let pdecls = ref [] in let ptype t p = match t with @@ -183,10 +176,7 @@ let params return_by_ptr ps = match ps with Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _) | Arity4 (ret, _, _, _, _) -> - if return_by_ptr then - !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist - else - !pdecls, plist + !pdecls, plist let modify_params features plist = let is_flipped = @@ -242,14 +232,13 @@ let rec mode_suffix elttype shape = let print_variant opcode features shape name (ctype, asmtype, elttype) = let bits = infoword_value elttype features in let modesuf = mode_suffix elttype shape in - let return_by_ptr = return_by_ptr features in - let pdecls, paramlist = params return_by_ptr ctype in + let pdecls, paramlist = params ctype in let paramlist' = modify_params features paramlist in let paramlist'' = extra_word shape features paramlist' bits in let parstr = String.concat ", " paramlist'' in let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)" (builtin_name features name) modesuf parstr in - let rdecls, stmts = return ctype return_by_ptr builtin in + let rdecls, stmts = return ctype builtin in let body = pdecls @ rdecls @ stmts and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in print_function ctype fnname body --- gcc-4.6.2/gcc/config/arm/neon.md.~1~ 2011-06-20 13:14:50.000000000 +0200 +++ gcc-4.6.2/gcc/config/arm/neon.md 2011-12-11 15:57:30.000000000 +0100 @@ -4094,9 +4094,20 @@ (define_insn "neon_vtrn_internal" ) (define_expand "neon_vtrn" - [(match_operand:SI 0 "s_register_operand" "r") - (match_operand:VDQW 1 "s_register_operand" "w") - (match_operand:VDQW 2 "s_register_operand" "w")] + [(match_operand:TI 0 "s_register_operand") + (match_operand:VD 1 "s_register_operand") + (match_operand:VD 2 "s_register_operand")] + "TARGET_NEON" +{ + neon_emit_pair_result_insn (mode, gen_neon_vtrn_internal, + operands[0], operands[1], operands[2]); + DONE; +}) + +(define_expand "neon_vtrn" + [(match_operand:OI 0 "s_register_operand") + (match_operand:VQ 1 "s_register_operand") + (match_operand:VQ 2 "s_register_operand")] "TARGET_NEON" { neon_emit_pair_result_insn (mode, gen_neon_vtrn_internal, @@ -4121,9 +4132,20 @@ (define_insn "neon_vzip_internal" ) (define_expand "neon_vzip" - [(match_operand:SI 0 "s_register_operand" "r") - (match_operand:VDQW 1 "s_register_operand" "w") - (match_operand:VDQW 2 "s_register_operand" "w")] + [(match_operand:TI 0 "s_register_operand") + (match_operand:VD 1 "s_register_operand") + (match_operand:VD 2 "s_register_operand")] + "TARGET_NEON" +{ + neon_emit_pair_result_insn (mode, gen_neon_vzip_internal, + operands[0], operands[1], operands[2]); + DONE; +}) + +(define_expand "neon_vzip" + [(match_operand:OI 0 "s_register_operand") + (match_operand:VQ 1 "s_register_operand") + (match_operand:VQ 2 "s_register_operand")] "TARGET_NEON" { neon_emit_pair_result_insn (mode, gen_neon_vzip_internal, @@ -4148,9 +4170,20 @@ (define_insn "neon_vuzp_internal" ) (define_expand "neon_vuzp" - [(match_operand:SI 0 "s_register_operand" "r") - (match_operand:VDQW 1 "s_register_operand" "w") - (match_operand:VDQW 2 "s_register_operand" "w")] + [(match_operand:TI 0 "s_register_operand") + (match_operand:VD 1 "s_register_operand") + (match_operand:VD 2 "s_register_operand")] + "TARGET_NEON" +{ + neon_emit_pair_result_insn (mode, gen_neon_vuzp_internal, + operands[0], operands[1], operands[2]); + DONE; +}) + +(define_expand "neon_vuzp" + [(match_operand:OI 0 "s_register_operand") + (match_operand:VQ 1 "s_register_operand") + (match_operand:VQ 2 "s_register_operand")] "TARGET_NEON" { neon_emit_pair_result_insn (mode, gen_neon_vuzp_internal, --- gcc-4.6.2/gcc/config/arm/neon.ml.~1~ 2010-07-03 02:31:43.000000000 +0200 +++ gcc-4.6.2/gcc/config/arm/neon.ml 2011-12-11 15:57:30.000000000 +0100 @@ -214,7 +214,6 @@ type features = | Flipped of string (* Builtin name to use with flipped arguments. *) | InfoWord (* Pass an extra word for signage/rounding etc. (always passed for All _, Long, Wide, Narrow shape_forms. *) - | ReturnPtr (* Pass explicit pointer to return value as first argument. *) (* A specification as to the shape of instruction expected upon disassembly, used if it differs from the shape used to build the intrinsic prototype. Multiple entries in the constructor's argument @@ -1308,25 +1307,16 @@ let ops = Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select, pf_su_8_64; - (* Transpose elements. **NOTE** ReturnPtr goes some of the way towards - generating good code for intrinsics which return structure types -- - builtins work well by themselves (and understand that the values being - stored on e.g. the stack also reside in registers, so can optimise the - stores away entirely if the results are used immediately), but - intrinsics are very much less efficient. Maybe something can be improved - re: inlining, or tweaking the ABI used for intrinsics (a special call - attribute?). - *) - Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32; - Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32; + Vtrn, [], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32; + Vtrn, [], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32; (* Zip elements. *) - Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32; - Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32; + Vzip, [], Pair_result Dreg, "vzip", bits_2, pf_su_8_32; + Vzip, [], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32; (* Unzip elements. *) - Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32; - Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32; + Vuzp, [], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32; + Vuzp, [], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32; (* Element/structure loads. VLD1 variants. *) Vldx 1,