/* Copyright (c) 2010 CodeSourcery, Inc.
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of CodeSourcery nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY CODESOURCERY, INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL CODESOURCERY BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Everything below is written with "f"-constrained inline assembly
   (fmul, frsqrte, fmadd, fnmsub), so stop the build with a clear message
   when the compiler reports, via __NO_FPRS__, that it cannot be used.  */
#ifdef __NO_FPRS__
#error Need a hard-float machine.
#endif

/* Overlay of a float and its raw bit pattern.  Store through one member
   and read through the other to convert between the two views.
   Assumes unsigned int and float have the same size.  */
typedef union {
  unsigned word;   /* raw bits of the float */
  float value;     /* the same storage, seen as a float */
} ieee_float_shape_type;

/* Overlay of a double and its two 32-bit halves.

   parts.msw is the most significant word (sign, exponent and the top of
   the mantissa); parts.lsw is the least significant word.  Their order in
   memory follows the target byte order, so it is selected at compile time:

     1. __BIG_ENDIAN__ / __LITTLE_ENDIAN__ are honoured first;
     2. otherwise GCC's generic __BYTE_ORDER__ macro is consulted;
     3. if neither is available the build stops with an error.

   Assumes unsigned int is 32 bits wide.  */
typedef union {
  struct {
#if defined(__BIG_ENDIAN__)
    unsigned int msw;
    unsigned int lsw;
#elif defined(__LITTLE_ENDIAN__)
    unsigned int lsw;
    unsigned int msw;
#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) \
      && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    unsigned int msw;
    unsigned int lsw;
#elif defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) \
      && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    unsigned int lsw;
    unsigned int msw;
#else
#error Must define endianness appropriately.
#endif
  } parts;
  double value;
} ieee_double_shape_type;

/* Special single-precision values, built from their bit patterns and read
   back as floats through the .value member:
     a_inf  0x7f800000  exponent all ones, fraction zero      -> +Infinity
     a_nan  0x7fc00000  exponent all ones, top fraction bit   -> quiet NaN  */
static const ieee_float_shape_type a_inf = { .word = 0x7f800000u };
static const ieee_float_shape_type a_nan = { .word = 0x7fc00000u };
/* Scalar constants, kept as floats.  The two scale factors are exact
   powers of two and satisfy twom54 * twom54 * two108 == 1.  */
static const float zero = 0.0f;
static const float half = 0.5f;
static const float one = 1.0f;
static const float two108 = 0x1p108f;   /* 2^108 */
static const float twom54 = 0x1p-54f;   /* 2^-54 */

/* f_wash (x): multiply X by the file-level constant `one' with an explicit
   fmul instruction and yield the product as a double.

   The multiply is emitted as volatile inline assembly, so the compiler can
   neither fold it away as "x * 1.0" nor drop it.
   NOTE(review): presumably the point is to push special values through a
   real floating-point operation before they are returned -- confirm.

   The __asm__/__volatile__ spellings are used (as elsewhere in this file)
   so the macro also compiles in strict ISO modes, and the temporary has a
   reserved-style name so it cannot capture an identifier used in X.  */
#define f_wash(x)                                       \
  ({ double __washed;                                   \
  __asm__ __volatile__ ("fmul %[r],%[a],%[b]"           \
                        : [r] "=f" (__washed)           \
                        : [a] "f" (x), [b] "f" (one));  \
  __washed; })

/* The method is based on the descriptions in:

   _The Handbook of Floating-Point Arithmetic_ by Muller et al., chapter 5;
   _IA-64 and Elementary Functions: Speed and Precision_ by Markstein, chapter 9

   We find the actual square root and half of its reciprocal
   simultaneously.  */

/* Wrappers around the PowerPC fused multiply-add instructions used by the
   refinement steps in __ieee754_sqrt:

     FMADD (a, c, b)   emits fmadd:   a * c + b
     FNMSUB (a, c, b)  emits fnmsub:  b - a * c

   They are deliberately not #undef'd, so they remain visible to any code
   that follows in this file.  */
#define FMADD(a_, c_, b_)                                               \
          ({ double __r;                                                \
          __asm__ ("fmadd %[r], %[a], %[c], %[b]\n"                     \
                   : [r] "=f" (__r) : [a] "f" (a_), [c] "f" (c_), [b] "f" (b_)); \
          __r;})
#define FNMSUB(a_, c_, b_)                                          \
          ({ double __r;                                                \
          __asm__ ("fnmsub %[r], %[a], %[c], %[b]\n"                     \
                   : [r] "=f" (__r) : [a] "f" (a_), [c] "f" (c_), [b] "f" (b_)); \
          __r;})

/* Return the square root of B.

   B > 0 and finite   the frsqrte estimate refined as described below;
   B == +Inf          B, passed through f_wash;
   B == +/-0, NaN     B, passed through f_wash;
   B < 0              the quiet NaN a_nan, passed through f_wash.

   NOTE(review): the B < 0 path only substitutes a NaN constant; nothing in
   this function explicitly raises an invalid-operation exception.  Confirm
   that this is intended for this target.  */
double
__ieee754_sqrt (double b)
{
  double est, root, half_rsqrt, resid, corr;
  ieee_double_shape_type bits;

  /* Zeros, NaNs and negative arguments.  Written as !(b > zero) so that a
     NaN, for which every ordered comparison is false, is caught here.  */
  if (__builtin_expect (!(b > zero), 0))
    {
      if (b < zero)
        b = a_nan.value;
      return f_wash (b);
    }

  /* +Inf is returned as is.  */
  if (__builtin_expect (b == a_inf.value, 0))
    return f_wash (b);

  bits.value = b;

  /* Hardware estimate of 1/sqrt(b).  */
  __asm__ ("frsqrte %[estimate], %[x]\n"
           : [estimate] "=f" (est) : [x] "f" (b));

  /* Following Muller et al, page 168, equation 5.20.

     half_rsqrt goes to 1/(2*sqrt(b))
     root goes to sqrt(b).

     We need three iterations to get within 1ulp.  */

  /* These two products are written before the small-argument branch to
     indicate that they can be performed prior to it.  GCC insists on
     sinking them below the branch, however; it seems like they'd be
     better before the branch so that we can cover any latency from
     storing the argument and loading its high word.  Oh well.  */
  root = b * est;
  half_rsqrt = half * est;

  /* Handle small numbers by scaling.  The mask selects the exponent field
     of the high word; when it is at most 0x020 the argument is multiplied
     by 2^108 (two108) and the result of the recursive call is multiplied
     by 2^-54 (twom54), the square root of the inverse scale.  */
  if (__builtin_expect ((bits.parts.msw & 0x7ff00000) <= 0x02000000, 0))
    return __ieee754_sqrt (b * two108) * twom54;

  /* Iteration 1: corr = 1/2 - root * half_rsqrt, then both values are
     nudged by their own product with corr.  */
  corr = FNMSUB (root, half_rsqrt, half);
  root = FMADD (root, corr, root);
  half_rsqrt = FMADD (half_rsqrt, corr, half_rsqrt);

  /* Iteration 2.  */
  corr = FNMSUB (root, half_rsqrt, half);
  root = FMADD (root, corr, root);
  half_rsqrt = FMADD (half_rsqrt, corr, half_rsqrt);

  /* Iteration 3.  */
  corr = FNMSUB (root, half_rsqrt, half);
  root = FMADD (root, corr, root);
  half_rsqrt = FMADD (half_rsqrt, corr, half_rsqrt);

  /* root is now +/- 1ulp, or exactly equal to, the square root of b.  */

  /* Final refinement: resid = b - root^2, folded back in through
     1/(2*sqrt(b)).  */
  resid = FNMSUB (root, root, b);

  return FMADD (resid, half_rsqrt, root);
}
