[backport from gcc-4.8/trunk r187999 ]

Date: Wed, 30 May 2012 03:03:45 +0100
Subject: Re: [Patch ARM] Fix off by one error in neon_evpc_vrev.
From: Ramana Radhakrishnan
List-Archive:

On 29 May 2012 18:30, Richard Henderson wrote:
> On 05/26/2012 01:27 AM, Ramana Radhakrishnan wrote:
>>
>> -  for (i = 0; i < nelt; i += diff)
>> +  for (i = 0; i < nelt ; i += (diff + 1))
>>      for (j = 0; j <= diff; j += 1)
>> -      if (d->perm[i + j] != i + diff - j)
>> -        return false;
>> +      {
>> +        /* This is guaranteed to be true as the value of diff
>> +           is 7, 3, 1 and we should have enough elements in the
>> +           queue to generate this. Getting a vector mask with a
>> +           value of diff other than these values implies that
>> +           something is wrong by the time we get here.  */
>> +        gcc_assert ((i + j) < nelt);
>
> Yep, that all looks correct.  Unnecessary () in both lines though.

Bah - thanks - don't know why I put those in :(.  Committed to trunk with
those changes.  If the release managers don't object, I would like to
backport this to the 4.7 branch after a couple of weeks, to allow the
auto-testers to pick it up, since it really turns on this functionality in
this particular case.  This is a significant performance issue in 4.7 for
cases where we reverse vectors, and it would be nice to fix it there
(2 loads + 2 generic permutes vs. a single reverse instruction).

regards,
Ramana

gcc/

2012-05-30  Ramana Radhakrishnan

	* config/arm/arm.c (arm_evpc_neon_vrev): Adjust off by one error.

gcc/testsuite/

2012-05-30  Ramana Radhakrishnan

	* gcc.target/arm/neon-vrev.c: New.

--- gcc-4.7.0/gcc/config/arm/arm.c.~1~	2012-02-28 17:13:52.000000000 +0100
+++ gcc-4.7.0/gcc/config/arm/arm.c	2012-06-10 11:03:48.000000000 +0200
@@ -25268,10 +25268,18 @@ arm_evpc_neon_vrev (struct expand_vec_pe
       return false;
     }
 
-  for (i = 0; i < nelt; i += diff)
+  for (i = 0; i < nelt ; i += diff + 1)
     for (j = 0; j <= diff; j += 1)
-      if (d->perm[i + j] != i + diff - j)
-        return false;
+      {
+        /* This is guaranteed to be true as the value of diff
+           is 7, 3, 1 and we should have enough elements in the
+           queue to generate this. Getting a vector mask with a
+           value of diff other than these values implies that
+           something is wrong by the time we get here.  */
+        gcc_assert (i + j < nelt);
+        if (d->perm[i + j] != i + diff - j)
+          return false;
+      }
 
   /* Success!  */
   if (d->testing_p)

--- gcc-4.7.0/gcc/testsuite/gcc.target/arm/neon-vrev.c.~1~	1970-01-01 01:00:00.000000000 +0100
+++ gcc-4.7.0/gcc/testsuite/gcc.target/arm/neon-vrev.c	2012-06-10 11:03:48.000000000 +0200
@@ -0,0 +1,105 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_neon_ok } */
+/* { dg-options "-O2" } */
+/* { dg-add-options arm_neon } */
+
+#include <arm_neon.h>
+
+uint16x4_t
+tst_vrev642_u16 (uint16x4_t __a)
+{
+  uint16x4_t __rv;
+  uint16x4_t __mask1 = { 3, 2, 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x8_t
+tst_vrev64q2_u16 (uint16x8_t __a)
+{
+  uint16x8_t __rv;
+  uint16x8_t __mask1 = {3, 2, 1, 0, 7, 6, 5, 4 };
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x8_t
+tst_vrev642_u8 (uint8x8_t __a)
+{
+  uint8x8_t __rv;
+  uint8x8_t __mask1 = { 7, 6, 5, 4, 3, 2, 1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint8x16_t
+tst_vrev64q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __rv;
+  uint8x16_t __mask1 = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+  return __builtin_shuffle ( __a, __mask1) ;
+
+}
+
+uint32x2_t
+tst_vrev642_u32 (uint32x2_t __a)
+{
+  uint32x2_t __rv;
+  uint32x2_t __mask1 = {1, 0};
+  return __builtin_shuffle ( __a, __mask1) ;
+
+}
+
+uint32x4_t
+tst_vrev64q2_u32 (uint32x4_t __a)
+{
+  uint32x4_t __rv;
+  uint32x4_t __mask1 = {1, 0, 3, 2};
+  return __builtin_shuffle ( __a, __mask1) ;
+}
+
+uint16x4_t
+tst_vrev322_u16 (uint16x4_t __a)
+{
+  uint16x4_t __mask1 = { 1, 0, 3, 2 };
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint16x8_t
+tst_vrev32q2_u16 (uint16x8_t __a)
+{
+  uint16x8_t __mask1 = { 1, 0, 3, 2, 5, 4, 7, 6 };
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x8_t
+tst_vrev322_u8 (uint8x8_t __a)
+{
+  uint8x8_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4};
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x16_t
+tst_vrev32q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __mask1 = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12};
+  return __builtin_shuffle (__a, __mask1);
+}
+
+uint8x8_t
+tst_vrev162_u8 (uint8x8_t __a)
+{
+  uint8x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6};
+  return __builtin_shuffle (__a, __mask);
+}
+
+uint8x16_t
+tst_vrev16q2_u8 (uint8x16_t __a)
+{
+  uint8x16_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+  return __builtin_shuffle (__a, __mask);
+}
+
+/* { dg-final {scan-assembler-times "vrev32\.16\\t" 2} } */
+/* { dg-final {scan-assembler-times "vrev32\.8\\t" 2} } */
+/* { dg-final {scan-assembler-times "vrev16\.8\\t" 2} } */
+/* { dg-final {scan-assembler-times "vrev64\.8\\t" 2} } */
+/* { dg-final {scan-assembler-times "vrev64\.32\\t" 2} } */
+/* { dg-final {scan-assembler-times "vrev64\.16\\t" 2} } */
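
For reference, the off-by-one is easiest to see with a worked case (this note
and the sketch below are illustration only, not part of the patch, and the
helper name is invented).  For vrev64.16 on a 128-bit vector the permute mask
is {3, 2, 1, 0, 7, 6, 5, 4}, so nelt is 8 and diff, taken from perm[0], is 3.
With the old stride of diff the outer loop restarts mid-block at i = 3, where
the expected index i + diff - j is 6 but perm[3] is 0, so a valid reversal
mask is rejected and the expansion falls back to the generic permute (the
2 loads + 2 permutes mentioned above); the old bound also let i + j run past
nelt in principle, which the new gcc_assert documents cannot happen with the
corrected stride.  With a stride of diff + 1 the loop steps block by block
(i = 0, 4) and the mask is accepted.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Standalone model of the loop in arm_evpc_neon_vrev: does PERM reverse the
   elements inside each block of DIFF + 1 lanes?  STRIDE is the outer-loop
   increment, so stride == diff models the old behaviour and
   stride == diff + 1 the fixed one.  */
static bool
is_vrev_mask (const unsigned *perm, unsigned nelt, unsigned diff,
              unsigned stride)
{
  for (unsigned i = 0; i < nelt; i += stride)
    for (unsigned j = 0; j <= diff; j++)
      {
        assert (i + j < nelt);   /* mirrors the new gcc_assert */
        if (perm[i + j] != i + diff - j)
          return false;
      }
  return true;
}

int
main (void)
{
  /* vrev64.16 on a q register: reverse 16-bit lanes within each 64-bit half,
     i.e. nelt = 8 and diff = perm[0] = 3.  */
  const unsigned perm[8] = { 3, 2, 1, 0, 7, 6, 5, 4 };

  /* Prints "rejected" for the old stride and "vrev" for the fixed one.  */
  printf ("old stride: %s\n", is_vrev_mask (perm, 8, 3, 3) ? "vrev" : "rejected");
  printf ("new stride: %s\n", is_vrev_mask (perm, 8, 3, 3 + 1) ? "vrev" : "rejected");
  return 0;
}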