/*
This file is part of mfaktc.
Copyright (C) 2012  George Woltman (woltman@alum.mit.edu)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/


__device__ static int cmp_ge_128(int128 a, int128 b)
/* returns non-zero if a >= b, zero otherwise.
The words are compared from the most significant (d3) down to the least
significant (d0); the first pair of words that differs decides the result. */
{
  if(a.d3 == b.d3)
  {
    if(a.d2 == b.d2)
    {
      /* upper three words equal: the lowest differing word decides, equality counts as ">=" */
      if(a.d1 == b.d1) return(a.d0 >= b.d0);
      return(a.d1 > b.d1);
    }
    return(a.d2 > b.d2);
  }
  return(a.d3 > b.d3);
}


__device__ static void sub_128(int128 *res, int128 a, int128 b)
/* res = a - b
a must be greater or equal b! Nothing is done with a borrow out of the most
significant word (d3), so the precondition is not checked here.

The four statements form one borrow chain running from d0 up to d3 and must
stay in exactly this order.
NOTE(review): __sub_cc / __subc_cc / __subc are defined outside this section;
by their names they presumably wrap the PTX sub.cc / subc.cc / subc
instructions - confirm against their definitions. */
{
  res->d0 = __sub_cc (a.d0, b.d0);  /* least significant word, starts the chain */
  res->d1 = __subc_cc(a.d1, b.d1);
  res->d2 = __subc_cc(a.d2, b.d2);
  res->d3 = __subc   (a.d3, b.d3);  /* most significant word, ends the chain */
}

__device__ static void square_128_256(int256 *res, int128 a)
/* res = a^2, assuming that a is < 2^127 (a.d3 < 2^31)!

Operand numbering inside the asm block:
  %0 .. %7  = res->d0 .. res->d7 (outputs)
  %8 .. %11 = a.d0 .. a.d3       (inputs)

The computation has three phases:
  1. accumulate the mixed products a.di * a.dj (i < j) into %1 .. %6
  2. double that partial sum (each mixed product occurs twice in a^2)
  3. add the squares a.di * a.di of the individual words

The doubling in phase 2 ends with a plain addc (no carry out of %6); this is
where the a < 2^127 precondition is relied upon. */
{
  asm("{\n\t"
      /* phase 1: mixed products, low halves of the a.d0 row first */
      "mul.lo.u32      %1, %8, %9;\n\t"         /* (a.d0 * a.d1).lo */
      "mul.lo.u32      %2, %8, %10;\n\t"        /* (a.d0 * a.d2).lo */
      "mul.lo.u32      %3, %8, %11;\n\t"        /* (a.d0 * a.d3).lo */

      "mad.hi.cc.u32   %2, %8, %9, %2;\n\t"     /* (a.d0 * a.d1).hi */
      "madc.hi.cc.u32  %3, %8, %10, %3;\n\t"    /* (a.d0 * a.d2).hi */
      "madc.hi.u32     %4, %8, %11, 0;\n\t"     /* (a.d0 * a.d3).hi */

      "mad.lo.cc.u32   %3, %9, %10, %3;\n\t"    /* (a.d1 * a.d2).lo */
      "madc.hi.cc.u32  %4, %9, %10, %4;\n\t"    /* (a.d1 * a.d2).hi */
      "madc.hi.u32     %5, %9, %11, 0;\n\t"     /* (a.d1 * a.d3).hi */

      "mad.lo.cc.u32   %4, %9, %11, %4;\n\t"    /* (a.d1 * a.d3).lo */
      "madc.lo.cc.u32  %5, %10, %11, %5;\n\t"   /* (a.d2 * a.d3).lo */
      "madc.hi.u32     %6, %10, %11, 0;\n\t"    /* (a.d2 * a.d3).hi */

      /* phase 2 */
      "add.cc.u32      %1, %1, %1;\n\t"         /* Double the partial results */
      "addc.cc.u32     %2, %2, %2;\n\t"
      "addc.cc.u32     %3, %3, %3;\n\t"
      "addc.cc.u32     %4, %4, %4;\n\t"
      "addc.cc.u32     %5, %5, %5;\n\t"
      "addc.u32        %6, %6, %6;\n\t"

      /* phase 3: squares of the individual words, one carry chain from %1 to %7 */
      "mul.lo.u32      %0, %8, %8;\n\t"         /* (a.d0 * a.d0).lo */
      "mad.hi.cc.u32   %1, %8, %8, %1;\n\t"     /* (a.d0 * a.d0).hi */
      "madc.lo.cc.u32  %2, %9, %9, %2;\n\t"     /* (a.d1 * a.d1).lo */
      "madc.hi.cc.u32  %3, %9, %9, %3;\n\t"     /* (a.d1 * a.d1).hi */
      "madc.lo.cc.u32  %4, %10, %10, %4;\n\t"   /* (a.d2 * a.d2).lo */
      "madc.hi.cc.u32  %5, %10, %10, %5;\n\t"   /* (a.d2 * a.d2).hi */
      "madc.lo.cc.u32  %6, %11, %11, %6;\n\t"   /* (a.d3 * a.d3).lo */
      "madc.hi.u32     %7, %11, %11, 0;\n\t"    /* (a.d3 * a.d3).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5),
        "=r" (res->d6), "=r" (res->d7)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3));
}


__device__ static void square_128_224(int224 *res, int128 a)
/* res = a^2, assuming that a is < 2^112 (a.d3 < 2^16)!

Operand numbering inside the asm block:
  %0 .. %6  = res->d0 .. res->d6 (outputs)
  %7 .. %10 = a.d0 .. a.d3       (inputs)

Same three phases as square_128_256 (mixed products, doubling, squares of the
individual words), but only seven result words are produced: the high half of
a.d3 * a.d3 is never computed, which is where the a < 2^112 precondition is
relied upon. */
{
  asm("{\n\t"
      /* phase 1: mixed products a.di * a.dj with i < j */
      "mul.lo.u32      %1, %7, %8;\n\t"       /* (a.d0 * a.d1).lo */
      "mul.lo.u32      %2, %7, %9;\n\t"       /* (a.d0 * a.d2).lo */
      "mul.lo.u32      %3, %7, %10;\n\t"      /* (a.d0 * a.d3).lo */

      "mad.hi.cc.u32   %2, %7, %8, %2;\n\t"   /* (a.d0 * a.d1).hi */
      "madc.hi.cc.u32  %3, %7, %9, %3;\n\t"   /* (a.d0 * a.d2).hi */
      "madc.hi.u32     %4, %7, %10, 0;\n\t"   /* (a.d0 * a.d3).hi */

      "mad.lo.cc.u32   %3, %8, %9, %3;\n\t"   /* (a.d1 * a.d2).lo */
      "madc.hi.cc.u32  %4, %8, %9, %4;\n\t"   /* (a.d1 * a.d2).hi */
      "madc.hi.u32     %5, %8, %10, 0;\n\t"   /* (a.d1 * a.d3).hi */

      "mad.lo.cc.u32   %4, %8, %10, %4;\n\t"  /* (a.d1 * a.d3).lo */
      "madc.lo.cc.u32  %5, %9, %10, %5;\n\t"  /* (a.d2 * a.d3).lo */
      "madc.hi.u32     %6, %9, %10, 0;\n\t"   /* (a.d2 * a.d3).hi */

      /* phase 2 */
      "add.cc.u32      %1, %1, %1;\n\t"       /* Double the partial results */
      "addc.cc.u32     %2, %2, %2;\n\t"
      "addc.cc.u32     %3, %3, %3;\n\t"
      "addc.cc.u32     %4, %4, %4;\n\t"
      "addc.cc.u32     %5, %5, %5;\n\t"
      "addc.u32        %6, %6, %6;\n\t"

      /* phase 3: squares of the individual words, one carry chain from %1 to %6 */
      "mul.lo.u32      %0, %7, %7;\n\t"       /* (a.d0 * a.d0).lo */
      "mad.hi.cc.u32   %1, %7, %7, %1;\n\t"   /* (a.d0 * a.d0).hi */
      "madc.lo.cc.u32  %2, %8, %8, %2;\n\t"   /* (a.d1 * a.d1).lo */
      "madc.hi.cc.u32  %3, %8, %8, %3;\n\t"   /* (a.d1 * a.d1).hi */
      "madc.lo.cc.u32  %4, %9, %9, %4;\n\t"   /* (a.d2 * a.d2).lo */
      "madc.hi.cc.u32  %5, %9, %9, %5;\n\t"   /* (a.d2 * a.d2).hi */
      "madc.lo.u32     %6, %10, %10, %6;\n\t" /* (a.d3 * a.d3).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4), "=r" (res->d5),
        "=r" (res->d6)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3));
}


__device__ static void mul_128_256_no_low4(int128 *res, int128 a, int128 b)
/*
res ~= a * b / 2^128
Carries into res.d0 are NOT computed. So the result differs from a full mul_128_256() / 2^128.
In a full mul_128_256() there are six possible carries from res.d3 to res.d4. So ignoring the carries
the result is 0 to 6 lower than a full mul_128_256() / 2^128.

Operand numbering inside the asm block:
  %0 .. %3  = res->d0 .. res->d3 (outputs)
  %4 .. %7  = a.d0 .. a.d3       (inputs)
  %8 .. %11 = b.d0 .. b.d3       (inputs)

Only partial products that land in the upper four words of the 256-bit
product are evaluated: the high half of every a.di * b.dj with i + j == 3 and
both halves of every a.di * b.dj with i + j >= 4.
*/
{
  asm("{\n\t"
      "mul.hi.u32      %0, %4, %11;\n\t"     /* (a.d0 * b.d3).hi */

      "mad.lo.cc.u32   %0, %5, %11, %0;\n\t" /* (a.d1 * b.d3).lo */
      "addc.u32        %1, 0, 0;\n\t"        /* start %1 with the carry out of %0 */

      "mad.hi.cc.u32   %0, %5, %10, %0;\n\t" /* (a.d1 * b.d2).hi */
      "madc.hi.u32     %1, %5, %11, %1;\n\t" /* (a.d1 * b.d3).hi */

      "mad.lo.cc.u32   %0, %6, %10, %0;\n\t" /* (a.d2 * b.d2).lo */
      "madc.lo.cc.u32  %1, %6, %11, %1;\n\t" /* (a.d2 * b.d3).lo */
      "addc.u32        %2, 0, 0;\n\t"        /* start %2 with the carry out of %1 */

      "mad.hi.cc.u32   %0, %6, %9, %0;\n\t"  /* (a.d2 * b.d1).hi */
      "madc.hi.cc.u32  %1, %6, %10, %1;\n\t" /* (a.d2 * b.d2).hi */
      "madc.hi.u32     %2, %6, %11, %2;\n\t" /* (a.d2 * b.d3).hi */

      "mad.lo.cc.u32   %0, %7, %9, %0;\n\t"  /* (a.d3 * b.d1).lo */
      "madc.lo.cc.u32  %1, %7, %10, %1;\n\t" /* (a.d3 * b.d2).lo */
      "madc.lo.cc.u32  %2, %7, %11, %2;\n\t" /* (a.d3 * b.d3).lo */
      "addc.u32        %3, 0, 0;\n\t"        /* start %3 with the carry out of %2 */

      "mad.hi.cc.u32   %0, %7, %8, %0;\n\t"  /* (a.d3 * b.d0).hi */
      "madc.hi.cc.u32  %1, %7, %9, %1;\n\t"  /* (a.d3 * b.d1).hi */
      "madc.hi.cc.u32  %2, %7, %10, %2;\n\t" /* (a.d3 * b.d2).hi */
      "madc.hi.u32     %3, %7, %11, %3;\n\t" /* (a.d3 * b.d3).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3));
}


__device__ static void mul_128(int128 *res, int128 a, int128 b)
/* res = a * b (only lower 128 bits of the result)

Operand numbering inside the asm block:
  %0 .. %3  = res->d0 .. res->d3 (outputs)
  %4 .. %7  = a.d0 .. a.d3       (inputs)
  %8 .. %11 = b.d0 .. b.d3       (inputs)

Only partial products that contribute to the low four result words are
evaluated; carries out of %3 are intentionally dropped (plain mad/madc without
.cc on the last instruction of every chain). */
{
  asm("{\n\t"
      "mul.lo.u32      %0, %4, %8;\n\t"      /* (a.d0 * b.d0).lo */
      "mul.lo.u32      %1, %4, %9;\n\t"      /* (a.d0 * b.d1).lo */
      "mul.lo.u32      %2, %4, %10;\n\t"     /* (a.d0 * b.d2).lo */
      "mul.lo.u32      %3, %4, %11;\n\t"     /* (a.d0 * b.d3).lo */

      "mad.hi.cc.u32   %1, %4, %8, %1;\n\t"  /* (a.d0 * b.d0).hi */
      "madc.hi.cc.u32  %2, %4, %9, %2;\n\t"  /* (a.d0 * b.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t" /* (a.d0 * b.d2).hi */

      "mad.lo.cc.u32   %1, %5, %8, %1;\n\t"  /* (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %5, %9, %2;\n\t"  /* (a.d1 * b.d1).lo */
      "madc.lo.u32     %3, %5, %10, %3;\n\t" /* (a.d1 * b.d2).lo */

      "mad.hi.cc.u32   %2, %5, %8, %2;\n\t"  /* (a.d1 * b.d0).hi */
      "madc.hi.u32     %3, %5, %9, %3;\n\t"  /* (a.d1 * b.d1).hi */

      "mad.lo.cc.u32   %2, %6, %8, %2;\n\t"  /* (a.d2 * b.d0).lo */
      "madc.lo.u32     %3, %6, %9, %3;\n\t"  /* (a.d2 * b.d1).lo */

      "mad.hi.u32      %3, %6, %8, %3;\n\t"  /* (a.d2 * b.d0).hi */

      "mad.lo.u32      %3, %7, %8, %3;\n\t"  /* (a.d3 * b.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      /* every input operand must be comma separated, including across the line break */
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3));
}


__device__ static void mulsub_128(int128 *res, int256 c, int128 a, int128 negb)
/* res = c - a * b (only lower 128 bits of the result)

The subtraction is carried out as an addition: the code computes
c + a * negb on the low four words.
NOTE(review): negb is presumably the 128-bit two's complement negation of b,
prepared by the caller - confirm against the call sites.

Operand numbering inside the asm block:
  %0 .. %3   = res->d0 .. res->d3   (outputs)
  %4 .. %7   = a.d0 .. a.d3         (inputs)
  %8 .. %11  = negb.d0 .. negb.d3   (inputs)
  %12 .. %15 = c.d0 .. c.d3         (inputs, the upper words of c are not read) */
{
  asm("{\n\t"
      /* first row also loads c into the accumulators */
      "mad.lo.cc.u32    %0, %4, %8, %12;\n\t"  /* c += (a.d0 * negb.d0).lo */
      "madc.lo.cc.u32   %1, %4, %9, %13;\n\t"  /* c += (a.d0 * negb.d1).lo */
      "madc.lo.cc.u32   %2, %4, %10, %14;\n\t" /* c += (a.d0 * negb.d2).lo */
      "madc.lo.u32      %3, %4, %11, %15;\n\t" /* c += (a.d0 * negb.d3).lo */

      "mad.hi.cc.u32   %1, %4, %8, %1;\n\t"    /* c += (a.d0 * negb.d0).hi */
      "madc.hi.cc.u32  %2, %4, %9, %2;\n\t"    /* c += (a.d0 * negb.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t"   /* c += (a.d0 * negb.d2).hi */

      "mad.lo.cc.u32   %1, %5, %8, %1;\n\t"    /* c += (a.d1 * negb.d0).lo */
      "madc.lo.cc.u32  %2, %5, %9, %2;\n\t"    /* c += (a.d1 * negb.d1).lo */
      "madc.lo.u32     %3, %5, %10, %3;\n\t"   /* c += (a.d1 * negb.d2).lo */

      "mad.hi.cc.u32   %2, %5, %8, %2;\n\t"    /* c += (a.d1 * negb.d0).hi */
      "madc.hi.u32     %3, %5, %9, %3;\n\t"    /* c += (a.d1 * negb.d1).hi */

      "mad.lo.cc.u32   %2, %6, %8, %2;\n\t"    /* c += (a.d2 * negb.d0).lo */
      "madc.lo.u32     %3, %6, %9, %3;\n\t"    /* c += (a.d2 * negb.d1).lo */

      "mad.hi.u32      %3, %6, %8, %3;\n\t"    /* c += (a.d2 * negb.d0).hi */

      "mad.lo.u32      %3, %7, %8, %3;\n\t"    /* c += (a.d3 * negb.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (negb.d0), "r" (negb.d1), "r" (negb.d2), "r" (negb.d3),
        "r" (c.d0), "r" (c.d1), "r" (c.d2), "r" (c.d3));
}


__device__ static void mulsub_128_224(int128 *res, int224 c, int128 a, int128 negb)
/* res = c - a * b (only lower 128 bits of the result)

Variant of the multiply-subtract that takes a 224-bit c. The subtraction is
carried out as an addition: the code computes c + a * negb on the low four
words.
NOTE(review): negb is presumably the 128-bit two's complement negation of b,
prepared by the caller - confirm against the call sites.

Operand numbering inside the asm block:
  %0 .. %3   = res->d0 .. res->d3   (outputs)
  %4 .. %7   = a.d0 .. a.d3         (inputs)
  %8 .. %11  = negb.d0 .. negb.d3   (inputs)
  %12 .. %15 = c.d0 .. c.d3         (inputs, the upper words of c are not read) */
{
  asm("{\n\t"
      /* first row also loads c into the accumulators */
      "mad.lo.cc.u32    %0, %4, %8, %12;\n\t"  /* c += (a.d0 * negb.d0).lo */
      "madc.lo.cc.u32   %1, %4, %9, %13;\n\t"  /* c += (a.d0 * negb.d1).lo */
      "madc.lo.cc.u32   %2, %4, %10, %14;\n\t" /* c += (a.d0 * negb.d2).lo */
      "madc.lo.u32      %3, %4, %11, %15;\n\t" /* c += (a.d0 * negb.d3).lo */

      "mad.hi.cc.u32   %1, %4, %8, %1;\n\t"    /* c += (a.d0 * negb.d0).hi */
      "madc.hi.cc.u32  %2, %4, %9, %2;\n\t"    /* c += (a.d0 * negb.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t"   /* c += (a.d0 * negb.d2).hi */

      "mad.lo.cc.u32   %1, %5, %8, %1;\n\t"    /* c += (a.d1 * negb.d0).lo */
      "madc.lo.cc.u32  %2, %5, %9, %2;\n\t"    /* c += (a.d1 * negb.d1).lo */
      "madc.lo.u32     %3, %5, %10, %3;\n\t"   /* c += (a.d1 * negb.d2).lo */

      "mad.hi.cc.u32   %2, %5, %8, %2;\n\t"    /* c += (a.d1 * negb.d0).hi */
      "madc.hi.u32     %3, %5, %9, %3;\n\t"    /* c += (a.d1 * negb.d1).hi */

      "mad.lo.cc.u32   %2, %6, %8, %2;\n\t"    /* c += (a.d2 * negb.d0).lo */
      "madc.lo.u32     %3, %6, %9, %3;\n\t"    /* c += (a.d2 * negb.d1).lo */

      "mad.hi.u32      %3, %6, %8, %3;\n\t"    /* c += (a.d2 * negb.d0).hi */

      "mad.lo.u32      %3, %7, %8, %3;\n\t"    /* c += (a.d3 * negb.d0).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (negb.d0), "r" (negb.d1), "r" (negb.d2), "r" (negb.d3),
        "r" (c.d0), "r" (c.d1), "r" (c.d2), "r" (c.d3));
}


__device__ static void mul_128_96(int128 *res, int128 a, int96 b)
/* res = a * b (only lower 128 bits of the result)

Operand numbering inside the asm block:
  %0 .. %3  = res->d0 .. res->d3 (outputs)
  %4 .. %7  = a.d0 .. a.d3       (inputs)
  %8 .. %10 = b.d0 .. b.d2       (inputs, b has only three words)

Only partial products that contribute to the low four result words are
evaluated; carries out of %3 are intentionally dropped. */
{
  asm("{\n\t"
      "mul.lo.u32      %0, %4, %8;\n\t"      /* (a.d0 * b.d0).lo */
      "mul.lo.u32      %1, %4, %9;\n\t"      /* (a.d0 * b.d1).lo */
      "mul.lo.u32      %2, %4, %10;\n\t"     /* (a.d0 * b.d2).lo */
      "mul.lo.u32      %3, %7, %8;\n\t"      /* (a.d3 * b.d0).lo */

      "mad.hi.cc.u32   %1, %4, %8, %1;\n\t"  /* (a.d0 * b.d0).hi */
      "madc.hi.cc.u32  %2, %4, %9, %2;\n\t"  /* (a.d0 * b.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t" /* (a.d0 * b.d2).hi */

      "mad.lo.cc.u32   %1, %5, %8, %1;\n\t"  /* (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %5, %9, %2;\n\t"  /* (a.d1 * b.d1).lo */
      "madc.lo.u32     %3, %5, %10, %3;\n\t" /* (a.d1 * b.d2).lo */

      "mad.hi.cc.u32   %2, %5, %8, %2;\n\t"  /* (a.d1 * b.d0).hi */
      "madc.hi.u32     %3, %5, %9, %3;\n\t"  /* (a.d1 * b.d1).hi */

      "mad.lo.cc.u32   %2, %6, %8, %2;\n\t"  /* (a.d2 * b.d0).lo */
      "madc.lo.u32     %3, %6, %9, %3;\n\t"  /* (a.d2 * b.d1).lo */

      "mad.hi.u32      %3, %6, %8, %3;\n\t"  /* (a.d2 * b.d0).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      /* every input operand must be comma separated, including across the line break */
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (b.d0), "r" (b.d1), "r" (b.d2));
}


__device__ static void div_288_128(int160 *res, int288 q, int128 n, float nf)
/* res = q / n (integer division)

res : 160-bit quotient (d0 .. d4)
q   : 288-bit dividend (d0 .. d8); passed by value and reduced step by step
n   : 128-bit divisor
nf  : float factor used to estimate each quotient digit.
      NOTE(review): the scale factors used below are consistent with
      nf ~= 2^64 / n - confirm against the caller.

The quotient is built in nine steps (numbered 2 .. 10 below). Every step
  1. converts the leading words of the current remainder to float and
     multiplies by nf to estimate a quotient digit qi (rounded towards zero),
  2. adds qi, shifted to the offset of the step, to res,
  3. subtracts n * qi, shifted to the same offset, from q.
After the last step the remainder may still be >= n, which is fixed by a
final +1 correction of the quotient.

The __add_cc/__addc_cc/__addc and __sub_cc/__subc_cc/__subc sequences are
carry/borrow chains and must stay in exactly this order. */
{
  float qf;
  unsigned int qi;
  int288 nn;
  int128 tmp128;

/********** Step 2, Offset 2^155 (4*32 + 27) **********/
  qf= __uint2float_rn(q.d8);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d7);
  qf*= 32.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q2: %X\n", qi);

  res->d4 = qi << 27;

// nn = n * qi
  nn.d4  =                                 __umul32(n.d0, qi);
  nn.d5  = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d6  = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d7  = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d8  = __addc_cc(__umul32hi(n.d3, qi),                  0);

// shiftleft nn 27 bits
  nn.d8  = (nn.d8  << 27) + (nn.d7  >> 5);
  nn.d7  = (nn.d7  << 27) + (nn.d6  >> 5);
  nn.d6  = (nn.d6  << 27) + (nn.d5  >> 5);
  nn.d5  = (nn.d5  << 27) + (nn.d4  >> 5);
  nn.d4  =  nn.d4  << 27;

//  q = q - nn
  q.d4  = __sub_cc (q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc_cc(q.d6,  nn.d6);
  q.d7  = __subc_cc(q.d7,  nn.d7);
  q.d8  = __subc   (q.d8,  nn.d8);

/********** Step 3, Offset 2^135 (4*32 + 7) **********/
  qf= __uint2float_rn(q.d8);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d7);
  qf*= 33554432.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q3: %X %X %X\n", qi, q.d8, q.d7);

// qi is shifted here, so nn needs no separate shift in this step
  qi <<= 7;
  res->d4 += qi;

// nn = n * qi
  nn.d4  =                                 __umul32(n.d0, qi);
  nn.d5  = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d6  = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d7  = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d8  = __addc_cc(__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d4  = __sub_cc (q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc_cc(q.d6,  nn.d6);
  q.d7  = __subc_cc(q.d7,  nn.d7);
  q.d8  = __subc   (q.d8,  nn.d8);

/********** Step 4, Offset 2^115 (3*32 + 19) **********/
  qf= __uint2float_rn(q.d8);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d7);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d6);
  qf*= 8192.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q4: %X, %X, %X\n", qi, q.d7, q.d6);

  res->d3 = qi << 19;
  res->d4 += qi >> 13;

// nn = n * qi
  nn.d3 =                                 __umul32(n.d0, qi);
  nn.d4 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d3, qi),                  0);

// shiftleft nn 19 bits
  nn.d7  = (nn.d7 << 19) + (nn.d6 >> 13);
  nn.d6  = (nn.d6 << 19) + (nn.d5 >> 13);
  nn.d5  = (nn.d5 << 19) + (nn.d4 >> 13);
  nn.d4  = (nn.d4 << 19) + (nn.d3 >> 13);
  nn.d3  =  nn.d3 << 19;

//  q = q - nn
  q.d3  = __sub_cc (q.d3,  nn.d3);
  q.d4  = __subc_cc(q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc_cc(q.d6,  nn.d6);
  q.d7  = __subc   (q.d7,  nn.d7);

/********** Step 5, Offset 2^95 (2*32 + 31) **********/
  qf= __uint2float_rn(q.d7);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf*= 2.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q5: %X\n", qi);

  res->d2 = qi << 31;
  res->d3 = __add_cc (res->d3, qi >> 1);
  res->d4 = __addc   (res->d4, 0);

// nn = n * qi
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//if (nn.d6 >> 1 != q.d7) printf ("1/f fail 7\n");
// shiftleft nn 31 bits
  nn.d6 = (nn.d6 << 31) + (nn.d5 >> 1);
  nn.d5 = (nn.d5 << 31) + (nn.d4 >> 1);
  nn.d4 = (nn.d4 << 31) + (nn.d3 >> 1);
  nn.d3 = (nn.d3 << 31) + (nn.d2 >> 1);
  nn.d2 =  nn.d2 << 31;

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc   (q.d6, nn.d6);

/********** Step 6, Offset 2^75 (2*32 + 11) **********/
  qf= __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf*= 2097152.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q6: %X\n", qi);

// qi is shifted here, so nn needs no separate shift in this step
  qi <<= 11;
  res->d2 = __add_cc (res->d2, qi);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc   (res->d4, 0);

// nn = n * qi
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc   (q.d6, nn.d6);

/********** Step 7, Offset 2^55 (1*32 + 23) **********/
  qf= __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf*= 512.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q7: %X\n", qi);

  res->d1 = qi << 23;
  res->d2 = __add_cc (res->d2, qi >> 9);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc   (res->d4, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//if (nn.d5 >> 9 != q.d6) printf ("1/f fail 7\n");
// shiftleft nn 23 bits
  nn.d5 = (nn.d5 << 23) + (nn.d4 >> 9);
  nn.d4 = (nn.d4 << 23) + (nn.d3 >> 9);
  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
  nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
  nn.d1 =  nn.d1 << 23;

// q = q - nn
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc   (q.d5, nn.d5);

/********** Step 8, Offset 2^35 (1*32 + 3) **********/

  qf= __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf*= 536870912.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q8: %X\n", qi);

// qi is shifted here, so nn needs no separate shift in this step
  qi <<= 3;
  res->d1 = __add_cc (res->d1, qi);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc   (res->d4, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc   (q.d5, nn.d5);

/********** Step 9, Offset 2^15 (0*32 + 15) **********/

  qf= __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
  qf*= 131072.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q9: %X\n", qi);

  res->d0 = qi << 15;
  res->d1 = __add_cc (res->d1, qi >> 17);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc   (res->d4, 0);

// nn = n * qi
  nn.d0 =                                 __umul32(n.d0, qi);
  nn.d1 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d4 = __addc   (__umul32hi(n.d3, qi),                  0);

//if (blockIdx.x==12 && threadIdx.x == 4) if (nn.d5 >> 17 != q.d6) printf ("1/f fail 9\n");

// shiftleft nn 15 bits
  nn.d4 = (nn.d4 << 15) + (nn.d3 >> 17);
  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
  nn.d0 =  nn.d0 << 15;

//  q = q - nn
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc   (q.d4, nn.d4);

/********** Step 10, Offset 2^0 (0*32 + 0) **********/

  qf= __uint2float_rn(q.d4);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d2);

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==0 && threadIdx.x == 4) printf ("q10: %X\n", qi);

  res->d0 = __add_cc (res->d0, qi);
  res->d1 = __addc_cc(res->d1, 0);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc_cc(res->d3, 0);
  res->d4 = __addc   (res->d4, 0);

// nn = n * qi
// the first addition starts a new carry chain, so it must be __add_cc: a
// carry-in variant here would pick up a stale carry flag
  nn.d0 =                                  __umul32(n.d0, qi);
  nn.d1 = __add_cc  (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc (__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc_cc (__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d4 = __addc    (__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc   (q.d4, nn.d4);

/*
qi is always a little bit too small, this is OK for all steps except the last
one. Sometimes the result is a little bit bigger than n
*/

//if (blockIdx.x == 12 && threadIdx.x == 4)
//printf ("  rem: %X %X %X %X\r\n", q.d3, q.d2, q.d1, q.d0);

  tmp128.d0 = q.d0;
  tmp128.d1 = q.d1;
  tmp128.d2 = q.d2;
  tmp128.d3 = q.d3;

// remainder still >= n (q.d4 != 0 means it does not even fit into 128 bits):
// increment the quotient, carrying through all five words of res
  if(q.d4 || cmp_ge_128(tmp128,n))
  {
    res->d0 = __add_cc (res->d0, 1);
    res->d1 = __addc_cc(res->d1, 0);
    res->d2 = __addc_cc(res->d2, 0);
    res->d3 = __addc_cc(res->d3, 0);
    res->d4 = __addc   (res->d4, 0);
  }
}


__device__ static void div_256_128(int128 *res, int256 q, int128 n, float nf)
/* res = q / n (integer division)

res : 128-bit quotient (d0 .. d3)
q   : 256-bit dividend (d0 .. d7); passed by value and reduced step by step
n   : 128-bit divisor
nf  : float factor used to estimate each quotient digit.
      NOTE(review): the scale factors used below are consistent with
      nf ~= 2^64 / n - confirm against the caller.

The quotient is built in seven steps (numbered 4 .. 10 below). Every step
  1. converts the leading words of the current remainder to float and
     multiplies by nf to estimate a quotient digit qi (rounded towards zero),
  2. adds qi, shifted to the offset of the step, to res,
  3. subtracts n * qi, shifted to the same offset, from q.
After the last step the remainder may still be >= n, which is fixed by a
final +1 correction of the quotient.

The __add_cc/__addc_cc/__addc and __sub_cc/__subc_cc/__subc sequences are
carry/borrow chains and must stay in exactly this order. */
{
  float qf;
  unsigned int qi;
  int256 nn;
  int128 tmp128;

/********** Step 4, Offset 2^115 (3*32 + 19) **********/
  qf= __uint2float_rn(q.d7);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d6);
  qf*= 8192.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q4: %X\n", qi);

  res->d3 = qi << 19;

// nn = n * qi
  nn.d3 =                                 __umul32(n.d0, qi);
  nn.d4 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d7 = __addc_cc(__umul32hi(n.d3, qi),                  0);

// shiftleft nn 19 bits
  nn.d7  = (nn.d7 << 19) + (nn.d6 >> 13);
  nn.d6  = (nn.d6 << 19) + (nn.d5 >> 13);
  nn.d5  = (nn.d5 << 19) + (nn.d4 >> 13);
  nn.d4  = (nn.d4 << 19) + (nn.d3 >> 13);
  nn.d3  =  nn.d3 << 19;

//  q = q - nn
  q.d3  = __sub_cc (q.d3,  nn.d3);
  q.d4  = __subc_cc(q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc_cc(q.d6,  nn.d6);
  q.d7  = __subc   (q.d7,  nn.d7);

/********** Step 5, Offset 2^95 (2*32 + 31) **********/
  qf= __uint2float_rn(q.d7);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf*= 2.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q5: %X\n", qi);

  res->d2  = qi << 31;
  res->d3 += qi >> 1;

// nn = n * qi
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//if (nn.d6 >> 1 != q.d7) printf ("1/f fail 7\n");
// shiftleft nn 31 bits
  nn.d6 = (nn.d6 << 31) + (nn.d5 >> 1);
  nn.d5 = (nn.d5 << 31) + (nn.d4 >> 1);
  nn.d4 = (nn.d4 << 31) + (nn.d3 >> 1);
  nn.d3 = (nn.d3 << 31) + (nn.d2 >> 1);
  nn.d2 =  nn.d2 << 31;

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc   (q.d6, nn.d6);

/********** Step 6, Offset 2^75 (2*32 + 11) **********/
  qf= __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf*= 2097152.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q6: %X\n", qi);

// qi is shifted here, so nn needs no separate shift in this step
  qi <<= 11;
  res->d2 = __add_cc (res->d2, qi);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc   (q.d6, nn.d6);

/********** Step 7, Offset 2^55 (1*32 + 23) **********/
  qf= __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf*= 512.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q7: %X\n", qi);

  res->d1 = qi << 23;
  res->d2 = __add_cc (res->d2, qi >> 9);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//if (nn.d5 >> 9 != q.d6) printf ("1/f fail 7\n");
// shiftleft nn 23 bits
  nn.d5 = (nn.d5 << 23) + (nn.d4 >> 9);
  nn.d4 = (nn.d4 << 23) + (nn.d3 >> 9);
  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
  nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
  nn.d1 =  nn.d1 << 23;

// q = q - nn
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc   (q.d5, nn.d5);

/********** Step 8, Offset 2^35 (1*32 + 3) **********/

  qf= __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf*= 536870912.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q8: %X\n", qi);

// qi is shifted here, so nn needs no separate shift in this step
  qi <<= 3;
  res->d1 = __add_cc (res->d1, qi);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc   (q.d5, nn.d5);

/********** Step 9, Offset 2^15 (0*32 + 15) **********/

  qf= __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
  qf*= 131072.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q9: %X\n", qi);

  res->d0 = qi << 15;
  res->d1 = __add_cc (res->d1, qi >> 17);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
  nn.d0 =                                 __umul32(n.d0, qi);
  nn.d1 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d4 = __addc   (__umul32hi(n.d3, qi),                  0);

//if (blockIdx.x==12 && threadIdx.x == 4) if (nn.d5 >> 17 != q.d6) printf ("1/f fail 9\n");

// shiftleft nn 15 bits
  nn.d4 = (nn.d4 << 15) + (nn.d3 >> 17);
  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
  nn.d0 =  nn.d0 << 15;

//  q = q - nn
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc   (q.d4, nn.d4);

/********** Step 10, Offset 2^0 (0*32 + 0) **********/

  qf= __uint2float_rn(q.d4);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d2);

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q10: %X\n", qi);

  res->d0 = __add_cc (res->d0, qi);
  res->d1 = __addc_cc(res->d1, 0);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
// only the low four words of the product are formed in this last step
  nn.d0 =                                  __umul32(n.d0, qi);
  nn.d1 = __add_cc  (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc (__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc    (__umul32hi(n.d2, qi), __umul32(n.d3, qi));

//  q = q - nn
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc   (q.d3, nn.d3);

/*
qi is always a little bit too small, this is OK for all steps except the last
one. Sometimes the result is a little bit bigger than n
*/

//if (blockIdx.x == 12 && threadIdx.x == 4)
//printf ("  rem: %X %X %X %X\r\n", q.d3, q.d2, q.d1, q.d0);

  tmp128.d0 = q.d0;
  tmp128.d1 = q.d1;
  tmp128.d2 = q.d2;
  tmp128.d3 = q.d3;

// remainder still >= n: increment the quotient by one
  if(cmp_ge_128(tmp128,n))
  {
    res->d0 = __add_cc (res->d0, 1);
    res->d1 = __addc_cc(res->d1, 0);
    res->d2 = __addc_cc(res->d2, 0);
    res->d3 = __addc   (res->d3, 0);
  }
}


__device__ static void div_224_128(int128 *res, int224 q, int128 n, float nf)
/*
res = q / n (integer division)

The quotient is assembled in seven steps (numbered 4 to 10 below). Every step
follows the same pattern:
  1. convert the most significant still-relevant words of the running
     remainder q to a float and scale it by a power of two,
  2. multiply by nf and truncate towards zero to get a partial quotient qi,
  3. add qi into res at the bit offset named in the step header,
  4. compute nn = n * qi aligned to that same offset and subtract it from q.

q is passed by value, so the caller's copy is not modified.
nf is only ever used as the multiplier in step 2; presumably it is a (scaled)
float approximation of 1/n prepared by the caller - TODO confirm.

The __add_cc/__addc_cc/__addc and __sub_cc/__subc_cc/__subc sequences are
multi-word carry/borrow chains: each statement relies on the carry produced by
the statement directly before it, so the order of the statements within a
chain must not be changed.
*/
{
  float qf;
  unsigned int qi;
  int224 nn;
  int128 tmp128;

/********** Step 4, Offset 2^115 (3*32 + 19) **********/
// only the top word of q is used; 8192 = 2^13 = 2^(32-19)
  qf= __uint2float_rn(q.d6);
  qf*= 8192.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q4: %X\n", qi);

// first write to res->d3 (plain assignment, later steps add to it)
  res->d3 = qi << 19;

// nn = n * qi
// (the high half of n.d3 * qi is not computed in this step; the carry out of
//  the nn.d6 addition is not consumed by any following instruction)
  nn.d3 =                                 __umul32(n.d0, qi);
  nn.d4 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));

// shiftleft nn 19 bits
// (top word first, so each word still sees the unshifted word below it)
  nn.d6  = (nn.d6 << 19) + (nn.d5 >> 13);
  nn.d5  = (nn.d5 << 19) + (nn.d4 >> 13);
  nn.d4  = (nn.d4 << 19) + (nn.d3 >> 13);
  nn.d3  =  nn.d3 << 19;

//  q = q - nn
  q.d3  = __sub_cc (q.d3,  nn.d3);
  q.d4  = __subc_cc(q.d4,  nn.d4);
  q.d5  = __subc_cc(q.d5,  nn.d5);
  q.d6  = __subc   (q.d6,  nn.d6);

/********** Step 5, Offset 2^95 (2*32 + 31) **********/
// top two words of q are used; 4294967296 = 2^32, 2 = 2^(32-31)
  qf= __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf*= 2.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q5: %X\n", qi);

// qi << 31 straddles two words: bit 0 goes to the top of res->d2,
// the remaining bits are added to res->d3
  res->d2  = qi << 31;
  res->d3 += qi >> 1;

// nn = n * qi
// (full 160 bit product; the carry out of nn.d6 is not consumed)
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi),                  0);

// shiftleft nn 31 bits
  nn.d6 = (nn.d6 << 31) + (nn.d5 >> 1);
  nn.d5 = (nn.d5 << 31) + (nn.d4 >> 1);
  nn.d4 = (nn.d4 << 31) + (nn.d3 >> 1);
  nn.d3 = (nn.d3 << 31) + (nn.d2 >> 1);
  nn.d2 =  nn.d2 << 31;

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc   (q.d6, nn.d6);

/********** Step 6, Offset 2^75 (2*32 + 11) **********/
// 2097152 = 2^21 = 2^(32-11)
  qf= __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf*= 2097152.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q6: %X\n", qi);

// qi itself is shifted to the bit offset inside the word, so it can be added
// to res->d2 directly and nn below needs no separate shift.
// NOTE(review): this drops any bits of qi above bit 20; presumably qi is
// known to fit in 21 bits at this point - verify.
  qi <<= 11;
  res->d2 = __add_cc (res->d2, qi);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
// (qi is already shifted, see above)
  nn.d2 =                                 __umul32(n.d0, qi);
  nn.d3 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d6 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d2 = __sub_cc (q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc_cc(q.d5, nn.d5);
  q.d6 = __subc   (q.d6, nn.d6);

/********** Step 7, Offset 2^55 (1*32 + 23) **********/
// top three words of q are used; 512 = 2^9 = 2^(32-23)
  qf= __uint2float_rn(q.d6);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf*= 512.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q7: %X\n", qi);

// first write to res->d1; the bits of qi that do not fit are carried into
// res->d2 and res->d3
  res->d1 = qi << 23;
  res->d2 = __add_cc (res->d2, qi >> 9);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//if (nn.d5 >> 9 != q.d6) printf ("1/f fail 7\n");
// shiftleft nn 23 bits
  nn.d5 = (nn.d5 << 23) + (nn.d4 >> 9);
  nn.d4 = (nn.d4 << 23) + (nn.d3 >> 9);
  nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
  nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
  nn.d1 =  nn.d1 << 23;

// q = q - nn
// (from here on q.d6 is neither updated nor read again; presumably it is
//  expected to have been reduced to zero by this subtraction - verify)
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc   (q.d5, nn.d5);

/********** Step 8, Offset 2^35 (1*32 + 3) **********/

// 536870912 = 2^29 = 2^(32-3)
  qf= __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf*= 536870912.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q8: %X\n", qi);

// as in step 6, qi is shifted in place and then used for both res and nn
  qi <<= 3;
  res->d1 = __add_cc (res->d1, qi);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
  nn.d1 =                                 __umul32(n.d0, qi);
  nn.d2 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d4 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d5 = __addc_cc(__umul32hi(n.d3, qi),                  0);

//  q = q - nn
  q.d1 = __sub_cc (q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc_cc(q.d4, nn.d4);
  q.d5 = __subc   (q.d5, nn.d5);

/********** Step 9, Offset 2^15 (0*32 + 15) **********/

// 131072 = 2^17 = 2^(32-15)
  qf= __uint2float_rn(q.d5);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d4);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
  qf*= 131072.0f;

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q9: %X\n", qi);

// first write to res->d0; the upper bits of qi are carried into res->d1..d3
  res->d0 = qi << 15;
  res->d1 = __add_cc (res->d1, qi >> 17);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
  nn.d0 =                                 __umul32(n.d0, qi);
  nn.d1 = __add_cc (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc(__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc_cc(__umul32hi(n.d2, qi), __umul32(n.d3, qi));
  nn.d4 = __addc   (__umul32hi(n.d3, qi),                  0);

//if (blockIdx.x==12 && threadIdx.x == 4) if (nn.d5 >> 17 != q.d6) printf ("1/f fail 9\n");

// shiftleft nn 15 bits
  nn.d4 = (nn.d4 << 15) + (nn.d3 >> 17);
  nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
  nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
  nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
  nn.d0 =  nn.d0 << 15;

//  q = q - nn
// (q.d5 is not part of this subtraction and is not read again)
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc_cc(q.d3, nn.d3);
  q.d4 = __subc   (q.d4, nn.d4);

/********** Step 10, Offset 2^0 (0*32 + 0) **********/

// no extra scaling in the last step
  qf= __uint2float_rn(q.d4);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d3);
  qf= qf * 4294967296.0f + __uint2float_rn(q.d2);

  qi=__float2uint_rz(qf*nf);
//if (blockIdx.x==12 && threadIdx.x == 4) printf ("q10: %X\n", qi);

  res->d0 = __add_cc (res->d0, qi);
  res->d1 = __addc_cc(res->d1, 0);
  res->d2 = __addc_cc(res->d2, 0);
  res->d3 = __addc   (res->d3, 0);

// nn = n * qi
// (only the low 128 bits of the product are computed)
  nn.d0 =                                  __umul32(n.d0, qi);
  nn.d1 = __add_cc  (__umul32hi(n.d0, qi), __umul32(n.d1, qi));
  nn.d2 = __addc_cc (__umul32hi(n.d1, qi), __umul32(n.d2, qi));
  nn.d3 = __addc    (__umul32hi(n.d2, qi), __umul32(n.d3, qi));

//  q = q - nn
// (only the low 128 bits of q are updated; they hold the remainder candidate)
  q.d0 = __sub_cc (q.d0, nn.d0);
  q.d1 = __subc_cc(q.d1, nn.d1);
  q.d2 = __subc_cc(q.d2, nn.d2);
  q.d3 = __subc   (q.d3, nn.d3);

/*
qi is always a little bit too small, this is OK for all steps except the last
one. Sometimes the result is a little bit bigger than n
*/

//if (blockIdx.x == 12 && threadIdx.x == 4)
//printf ("  rem: %X %X %X %X\r\n", q.d3, q.d2, q.d1, q.d0);

// copy the low four words of q into an int128 so that cmp_ge_128() can be used
  tmp128.d0 = q.d0;
  tmp128.d1 = q.d1;
  tmp128.d2 = q.d2;
  tmp128.d3 = q.d3;

// if the remaining value is still >= n the quotient is one too small:
// add 1 to res (with carry propagation). Only a single correction is done.
  if(cmp_ge_128(tmp128,n))
  {
    res->d0 = __add_cc (res->d0, 1);
    res->d1 = __addc_cc(res->d1, 0);
    res->d2 = __addc_cc(res->d2, 0);
    res->d3 = __addc   (res->d3, 0);
  }
}


__device__ static void mod_simple_160_128(int128 *res, int160 q, int128 n, float nf)
/*
res = q mod n

Refinement step for the barrett modular multiplication: q is expected to be
only a small multiple of n (q < Xn for a small integer X).

A single quotient estimate is taken from the three most significant words of
q multiplied by nf, n times that estimate is subtracted from q, and one final
subtraction of n is applied if the difference is still not below n.
nf is only used as the multiplier for the estimate; presumably a (scaled)
float approximation of 1/n - TODO confirm.
*/
{
  const float two_pow_32 = 4294967296.0f;
  int160 prod;
  float lead;
  unsigned int quot;
  unsigned int top_word;
  int needs_fix;

  /* float image of the three most significant words of q */
  lead = __uint2float_rn(q.d4);
  lead = lead * two_pow_32 + __uint2float_rn(q.d3);
  lead = lead * two_pow_32 + __uint2float_rn(q.d2);

  /* truncated quotient estimate */
  quot = __float2uint_rz(lead * nf);

  /* prod = n * quot, 160 bits; carry chain, keep the statement order */
  prod.d0 = __umul32(n.d0, quot);
  prod.d1 = __umad32hi_cc (n.d0, quot, __umul32(n.d1, quot));
  prod.d2 = __umad32hic_cc(n.d1, quot, __umul32(n.d2, quot));
  prod.d3 = __umad32hic_cc(n.d2, quot, __umul32(n.d3, quot));
  prod.d4 = __umad32hic   (n.d3, quot, 0);

  /* q - prod: low 128 bits go to *res, the fifth word is kept locally;
     borrow chain, keep the statement order */
  res->d0  = __sub_cc (q.d0, prod.d0);
  res->d1  = __subc_cc(q.d1, prod.d1);
  res->d2  = __subc_cc(q.d2, prod.d2);
  res->d3  = __subc_cc(q.d3, prod.d3);
  top_word = __subc   (q.d4, prod.d4);

  /* one more n has to go if the difference does not fit into 128 bits
     or if its low 128 bits are still >= n */
  needs_fix = (top_word != 0) || cmp_ge_128(*res, n);
  if(needs_fix)
  {
    sub_128(res, *res, n);
  }
}


__device__ static void mod_simple_128(int128 *res, int128 q, int128 n, float nf)
/*
res = q mod n

Refinement step for the barrett modular multiplication: q is expected to be
only a small multiple of n (q < Xn for a small integer X).

A single quotient estimate is taken from the two most significant words of q
multiplied by nf, n times that estimate is subtracted from q, and one final
subtraction of n is applied if the difference is still >= n.
nf is only used as the multiplier for the estimate; presumably a (scaled)
float approximation of 1/n - TODO confirm.
*/
{
  const float two_pow_32 = 4294967296.0f;
  int128 prod;
  float lead;
  unsigned int quot;

  /* float image of the two most significant words of q */
  lead = __uint2float_rn(q.d3);
  lead = lead * two_pow_32 + __uint2float_rn(q.d2);

  /* truncated quotient estimate */
  quot = __float2uint_rz(lead * nf);

  /* prod = low 128 bits of n * quot; carry chain, keep the statement order */
  prod.d0 = __umul32(n.d0, quot);
  prod.d1 = __umad32hi_cc (n.d0, quot, __umul32(n.d1, quot));
  prod.d2 = __umad32hic_cc(n.d1, quot, __umul32(n.d2, quot));
  prod.d3 = __umad32hic   (n.d2, quot, __umul32(n.d3, quot));

  /* *res = q - prod */
  sub_128(res, q, prod);

  /* final adjustment in case the difference is still >= n */
  if(cmp_ge_128(*res, n))
  {
    sub_128(res, *res, n);
  }
}
