/*
This file is part of mfaktc.
Copyright (C) 2012  George Woltman (woltman@alum.mit.edu)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/


__device__ static void mul_96_F32_63_initial_special(int96 *res, int96 a, int96 b)
/* res = a * b (only lower 96 bits of the result), a.d2 is zero, b.d0 is one

   Both preconditions are relied upon, not checked: the PTX below never reads
   a.d2 (%5) or b.d0 (%6).  Partial products with a.d2 are simply omitted and
   every "x * b.d0" term is replaced by x itself.

   Operand map: %0..%2 = res->d0..d2, %3..%5 = a.d0..d2, %6..%8 = b.d0..d2
   (d0 is the least significant 32-bit word).

   Result words:
     res->d0 = a.d0
     res->d1 = (a.d0 * b.d1).lo + a.d1
     res->d2 = (a.d0 * b.d2).lo + (a.d0 * b.d1).hi + (a.d1 * b.d1).lo
               + carry out of res->d1 */
{
  asm("{\n\t"
      /* Seed res->d2 first so the carry chain below can accumulate into it. */
      "mul.lo.u32    %2, %3, %8;\n\t"       /* (a.d0 * b.d2).lo */

      /* Column sums; the .cc/madc pair carries from res->d1 into res->d2. */
      "mov.u32       %0,         %3;\n\t"   /*                    (a.d0 * b.d0).lo */
      "mad.lo.cc.u32 %1, %3, %7, %4;\n\t"   /* (a.d0 * b.d1).lo + (a.d1 * b.d0).lo */
      "madc.hi.u32   %2, %3, %7, %2;\n\t"   /* (a.d0 * b.d1).hi */

      /* Last term of the top word; any carry out of it would land above bit 95 and is dropped. */
      "mad.lo.u32    %2, %4, %7, %2;\n\t"   /* (a.d1 * b.d1).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (b.d0), "r" (b.d1), "r" (b.d2));
}


__device__ static void mul_96_F32_63_special(int96 *res, int96 a, int96 b)
/* res = a * b (only lower 96 bits of the result), b.d0 is one

   The precondition b.d0 == 1 is relied upon, not checked: b.d0 (%6) is never
   read, and every "x * b.d0" term is replaced by x itself.

   Operand map: %0..%2 = res->d0..d2, %3..%5 = a.d0..d2, %6..%8 = b.d0..d2
   (d0 is the least significant 32-bit word).

   Result words:
     res->d0 = a.d0
     res->d1 = (a.d0 * b.d1).lo + a.d1
     res->d2 = (a.d0 * b.d1).hi + a.d2 + (a.d0 * b.d2).lo + (a.d1 * b.d1).lo
               + carry out of res->d1 */
{
  asm("{\n\t"
      /* Column sums; the .cc/madc pair carries from res->d1 into res->d2. */
      "mov.u32       %0,         %3;\n\t"   /*                    (a.d0 * b.d0).lo */
      "mad.lo.cc.u32 %1, %3, %7, %4;\n\t"   /* (a.d0 * b.d1).lo + (a.d1 * b.d0).lo */
      "madc.hi.u32   %2, %3, %7, %5;\n\t"   /* (a.d0 * b.d1).hi + (a.d2 * b.d0).lo */

      /* Remaining terms of the top word; carries out of it lie above bit 95 and are dropped. */
      "mad.lo.u32    %2, %3, %8, %2;\n\t"   /* (a.d0 * b.d2).lo */

      "mad.lo.u32    %2, %4, %7, %2;\n\t"   /* (a.d1 * b.d1).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2)
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (b.d0), "r" (b.d1), "r" (b.d2));
}


__device__ static void mul_128_96_F32_63_initial_special16(int128 *res, int128 a, int96 b)
/* res = a * b (only lower 128 bits of the result), a.d2, a.d3 are zero, b.d0 is one

   The preconditions are relied upon, not checked: a.d2 (%6), a.d3 (%7) and
   b.d0 (%8) are never read by the PTX below.

   Operand map: %0..%3 = res->d0..d3, %4..%7 = a.d0..d3, %8..%10 = b.d0..d2
   (d0 is the least significant 32-bit word).

   Result words:
     res->d0 = a.d0
     res->d1 = (a.d0 * b.d1).lo + a.d1
     res->d2 = (a.d1 * b.d1).lo + (a.d0 * b.d2).lo + (a.d0 * b.d1).hi + carries
     res->d3 = (a.d1 * b.d1).hi + (a.d1 * b.d2).lo + (a.d0 * b.d2).hi + carries */
{
  asm("{\n\t"
      /* Seed the two upper words with the a.d1 * b.d1 product. */
      "mul.lo.u32      %2, %5, %9;\n\t"      /* (a.d1 * b.d1).lo */
      "mul.hi.u32      %3, %5, %9;\n\t"      /* (a.d1 * b.d1).hi */

      /* First carry chain: low halves, rippling from res->d1 up to res->d3. */
      "mov.u32         %0,         %4;\n\t"  /*                    (a.d0 * b.d0).lo */
      "mad.lo.cc.u32   %1, %4, %9, %5;\n\t"  /* (a.d0 * b.d1).lo + (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %4, %10, %2;\n\t" /* (a.d0 * b.d2).lo */
      "madc.lo.u32     %3, %5, %10, %3;\n\t" /* (a.d1 * b.d2).lo */

      /* Second carry chain: high halves, rippling from res->d2 into res->d3. */
      "mad.hi.cc.u32   %2, %4, %9, %2;\n\t"  /* (a.d0 * b.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t" /* (a.d0 * b.d2).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      /* Operands must be comma separated; the %8..%10 numbering above depends on it. */
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (b.d0), "r" (b.d1), "r" (b.d2));
}


__device__ static void mul_128_96_F32_63_special(int128 *res, int128 a, int96 b)
/* res = a * b (only lower 128 bits of the result), b.d0 is one

   The precondition b.d0 == 1 is relied upon, not checked: b.d0 (%8) is never
   read, and every "x * b.d0" term is replaced by x itself.

   Operand map: %0..%3 = res->d0..d3, %4..%7 = a.d0..d3, %8..%10 = b.d0..d2
   (d0 is the least significant 32-bit word).

   Result words:
     res->d0 = a.d0
     res->d1 = (a.d0 * b.d1).lo + a.d1
     res->d2 = (a.d0 * b.d2).lo + a.d2 + (a.d0 * b.d1).hi + (a.d1 * b.d1).lo + carries
     res->d3 = (a.d2 * b.d1).lo + a.d3 + (a.d0 * b.d2).hi + (a.d1 * b.d2).lo
               + (a.d1 * b.d1).hi + carries */
{
  asm("{\n\t"
      /* First carry chain: a.dN * b.d0 terms (b.d0 == 1) plus low halves. */
      "mov.u32         %0,         %4;\n\t"  /*                    (a.d0 * b.d0).lo */
      "mad.lo.cc.u32   %1, %4, %9, %5;\n\t"  /* (a.d0 * b.d1).lo + (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %4, %10, %6;\n\t" /* (a.d0 * b.d2).lo + (a.d2 * b.d0).lo */
      "madc.lo.u32     %3, %6, %9, %7;\n\t"  /* (a.d2 * b.d1).lo + (a.d3 * b.d0).lo */

      /* Second carry chain: high halves of the a.d0 products. */
      "mad.hi.cc.u32   %2, %4, %9, %2;\n\t"  /* (a.d0 * b.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t" /* (a.d0 * b.d2).hi */

      /* Third carry chain: low halves of the a.d1 products. */
      "mad.lo.cc.u32   %2, %5, %9, %2;\n\t"  /* (a.d1 * b.d1).lo */
      "madc.lo.u32     %3, %5, %10, %3;\n\t" /* (a.d1 * b.d2).lo */

      /* Final term of the top word; carries out of it lie above bit 127 and are dropped. */
      "mad.hi.u32      %3, %5, %9, %3;\n\t"  /* (a.d1 * b.d1).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      /* Operands must be comma separated; the %8..%10 numbering above depends on it. */
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (b.d0), "r" (b.d1), "r" (b.d2));
}


__device__ static void mul_128_F32_63_initial_special(int128 *res, int128 a, int128 b)
/* res = a * b (only lower 128 bits of the result), a.d1, a.d2, a.d3 are zero, b.d0 is one

   The preconditions are relied upon, not checked: a.d1..a.d3 (%5..%7) and
   b.d0 (%8) are never read by the PTX below, so the product reduces to the
   single word a.d0 times b.

   Operand map: %0..%3 = res->d0..d3, %4..%7 = a.d0..d3, %8..%11 = b.d0..d3
   (d0 is the least significant 32-bit word).

   Result words:
     res->d0 = a.d0
     res->d1 = (a.d0 * b.d1).lo
     res->d2 = (a.d0 * b.d2).lo + (a.d0 * b.d1).hi
     res->d3 = (a.d0 * b.d3).lo + (a.d0 * b.d2).hi + carry out of res->d2 */
{
  asm("{\n\t"
      /* Low halves of each a.d0 * b.dN product, one per result word. */
      "mov.u32         %0, %4;\n\t"          /* (a.d0 * b.d0).lo */
      "mul.lo.u32      %1, %4, %9;\n\t"      /* (a.d0 * b.d1).lo */
      "mul.lo.u32      %2, %4, %10;\n\t"     /* (a.d0 * b.d2).lo */
      "mul.lo.u32      %3, %4, %11;\n\t"     /* (a.d0 * b.d3).lo */

      /* High halves, shifted up one word, with a carry from res->d2 into res->d3. */
      "mad.hi.cc.u32   %2, %4, %9, %2;\n\t"  /* (a.d0 * b.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t" /* (a.d0 * b.d2).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      /* Operands must be comma separated; the %8..%11 numbering above depends on it. */
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3));
}


__device__ static void mul_128_F32_63_special(int128 *res, int128 a, int128 b)
/* res = a * b (only lower 128 bits of the result), b.d0 is one

   The precondition b.d0 == 1 is relied upon, not checked: b.d0 (%8) is never
   read, and every "x * b.d0" term is replaced by x itself.

   Operand map: %0..%3 = res->d0..d3, %4..%7 = a.d0..d3, %8..%11 = b.d0..d3
   (d0 is the least significant 32-bit word).

   Result words:
     res->d0 = a.d0
     res->d1 = (a.d0 * b.d1).lo + a.d1
     res->d2 = (a.d0 * b.d2).lo + a.d2 + (a.d0 * b.d1).hi + (a.d1 * b.d1).lo + carries
     res->d3 = (a.d0 * b.d3).lo + a.d3 + (a.d0 * b.d2).hi + (a.d1 * b.d2).lo
               + (a.d1 * b.d1).hi + (a.d2 * b.d1).lo + carries */
{
  asm("{\n\t"
      /* First carry chain: a.dN * b.d0 terms (b.d0 == 1) plus low halves of a.d0 * b.dN. */
      "mov.u32         %0,         %4;\n\t"  /*                    (a.d0 * b.d0).lo */
      "mad.lo.cc.u32   %1, %4, %9, %5;\n\t"  /* (a.d0 * b.d1).lo + (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %4, %10, %6;\n\t" /* (a.d0 * b.d2).lo + (a.d2 * b.d0).lo */
      "madc.lo.u32     %3, %4, %11, %7;\n\t" /* (a.d0 * b.d3).lo + (a.d3 * b.d0).lo */

      /* Second carry chain: high halves of the a.d0 products. */
      "mad.hi.cc.u32   %2, %4, %9, %2;\n\t"  /* (a.d0 * b.d1).hi */
      "madc.hi.u32     %3, %4, %10, %3;\n\t" /* (a.d0 * b.d2).hi */

      /* Third carry chain: low halves of the a.d1 products. */
      "mad.lo.cc.u32   %2, %5, %9, %2;\n\t"  /* (a.d1 * b.d1).lo */
      "madc.lo.u32     %3, %5, %10, %3;\n\t" /* (a.d1 * b.d2).lo */

      /* Remaining terms of the top word; carries out of it lie above bit 127 and are dropped. */
      "mad.hi.u32      %3, %5, %9, %3;\n\t"  /* (a.d1 * b.d1).hi */

      "mad.lo.u32      %3, %6, %9, %3;\n\t"  /* (a.d2 * b.d1).lo */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3)
      /* Operands must be comma separated; the %8..%11 numbering above depends on it. */
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3));
}


__device__ static void mul_160_128_F32_63_special(int160 *res, int160 a, int128 b)
/* res = a * b (only lower 160 bits of the result), b.d0 is one

   The precondition b.d0 == 1 is relied upon, not checked: b.d0 (%10) is never
   read, and every "x * b.d0" term is replaced by x itself.

   Operand map: %0..%4 = res->d0..d4, %5..%9 = a.d0..d4, %10..%13 = b.d0..d3
   (d0 is the least significant 32-bit word).

   Each group of instructions below is one carry chain that adds one diagonal
   of partial products; a chain ends at res->d4 because anything carried out
   of that word lies above bit 159 and is not part of the result. */
{
  asm("{\n\t"
      /* Chain 1: a.dN * b.d0 terms (b.d0 == 1) plus low halves. */
      "mov.u32         %0,          %5;\n\t" /*                    (a.d0 * b.d0).lo */
      "mad.lo.cc.u32   %1, %5, %11, %6;\n\t" /* (a.d0 * b.d1).lo + (a.d1 * b.d0).lo */
      "madc.lo.cc.u32  %2, %5, %12, %7;\n\t" /* (a.d0 * b.d2).lo + (a.d2 * b.d0).lo */
      "madc.lo.cc.u32  %3, %5, %13, %8;\n\t" /* (a.d0 * b.d3).lo + (a.d3 * b.d0).lo */
      "madc.lo.u32     %4, %8, %11, %9;\n\t" /* (a.d3 * b.d1).lo + (a.d4 * b.d0).lo */

      /* Chain 2: high halves of the a.d0 products. */
      "mad.hi.cc.u32   %2, %5, %11, %2;\n\t" /* (a.d0 * b.d1).hi */
      "madc.hi.cc.u32  %3, %5, %12, %3;\n\t" /* (a.d0 * b.d2).hi */
      "madc.hi.u32     %4, %5, %13, %4;\n\t" /* (a.d0 * b.d3).hi */

      /* Chain 3: low halves of the a.d1 products. */
      "mad.lo.cc.u32   %2, %6, %11, %2;\n\t" /* (a.d1 * b.d1).lo */
      "madc.lo.cc.u32  %3, %6, %12, %3;\n\t" /* (a.d1 * b.d2).lo */
      "madc.lo.u32     %4, %6, %13, %4;\n\t" /* (a.d1 * b.d3).lo */

      /* Chain 4: high halves of the a.d1 products. */
      "mad.hi.cc.u32   %3, %6, %11, %3;\n\t" /* (a.d1 * b.d1).hi */
      "madc.hi.u32     %4, %6, %12, %4;\n\t" /* (a.d1 * b.d2).hi */

      /* Chain 5: low halves of the a.d2 products. */
      "mad.lo.cc.u32   %3, %7, %11, %3;\n\t" /* (a.d2 * b.d1).lo */
      "madc.lo.u32     %4, %7, %12, %4;\n\t" /* (a.d2 * b.d2).lo */

      /* Final term of the top word. */
      "mad.hi.u32      %4, %7, %11, %4;\n\t" /* (a.d2 * b.d1).hi */
      "}"
      : "=r" (res->d0), "=r" (res->d1), "=r" (res->d2), "=r" (res->d3), "=r" (res->d4)
      /* Operands must be comma separated; the %10..%13 numbering above depends on it. */
      : "r" (a.d0), "r" (a.d1), "r" (a.d2), "r" (a.d3), "r" (a.d4),
        "r" (b.d0), "r" (b.d1), "r" (b.d2), "r" (b.d3));
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 32 <= N <= 63.  Works on f between 65 and 89 bits inclusive
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett89_F32_63gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, int bits_max)
/* Trial-factoring kernel for candidates f = k * 2^exp + 1 that fit in 96 bits.

   Phase 1: every thread counts the set bits in its slice of bit_array, the block
            builds an inclusive prefix sum of those counts in bitcount[], and each
            thread then writes the positions of its set bits into smem[].
   Phase 2: the threads walk smem[] with a stride of THREADS_PER_BLOCK; for each
            entry they build f, start from 2^128 mod f, square (exp - 2 - 7) times
            with Barrett reduction, and report f when the final remainder is
            f - 1 or 1.

   Parameters
     exp              the N in f = k * 2^N + 1.  The shifts that build f use (exp - 32)
                      and (32 - (exp - 32)) as shift counts.
     k_base           k for bit 0 of block 0; only d0 and d1 are used here.
     bit_array        sieve bits; a set bit marks a k to be tested.
     bits_to_process  number of sieve bits handled by one block.  The code assumes
                      256 threads per block and bits_to_process a non-zero multiple
                      of 8192 (256 threads * 32 bits); this is not checked.
     RES              result buffer: RES[0] is a hit counter, RES[1] is set to 3 on a hit,
                      up to 10 hits are stored from RES[2] on, and a validation record
                      is written at RESULTS_ARRAY_VALIDATION_OFFSET by block 0 / thread 0.
     bits_max         shift amount used to normalise the Barrett operands.
                      NOTE(review): presumably (bit length of f) - 64, since the code
                      treats 2^(160 + bits_max - 1) as 2^(95 + bits_in_f) - confirm
                      against the caller.

   Dynamic shared memory: smem[] must hold at least one unsigned short per set bit
   in the block's slice of bit_array; the launcher has to size it. */
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int96 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads a slot that another thread wrote in an earlier step.
  // The first five steps stay within one warp, but warp lock-step execution is not
  // guaranteed on GPUs with independent thread scheduling, so each step is preceded
  // by a block barrier.  All threads reach every barrier (none is inside a branch).
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// The first smem slot for this thread is total_bit_count minus its inclusive prefix sum.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int96 a, u, tmp96;
    int192 b, tmp192;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    // NOTE(review): for exp == 32 the right shift below has a count of 32 - confirm
    // the target handles that as intended.
    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = (k.d0 << (exp - 32));
    f.d2 = (k.d1 << (exp - 32)) + (k.d0 >> (32 - (exp - 32)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d2);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d1);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp192.d5 = 1 << (bits_max - 1);			// tmp192 = 2^(95 + bits_in_f)
    tmp192.d4 = 0; tmp192.d3 = 0; tmp192.d2 = 0; tmp192.d1 = 0; tmp192.d0 = 0;

    // Could write optimized div_192_96 with so many tmp192 elements known to be zero
    div_192_96(&u,tmp192,f,ff);				// u = floor(2^(95 + bits_in_f) / f), giving 96 bits of precision

							// b_preinit = 2^128
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp192 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (95 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp192 / 2^96, which if we do the math simplifies to the quotient: b_preinit / f
    // a.d2 is left unset: mul_96_F32_63_initial_special never reads it.
    a.d0 = (u.d1 >> (bits_max - 1)) + (u.d2 << (32 - (bits_max - 1)));
    a.d1 = (u.d2 >> (bits_max - 1));

    mul_96_F32_63_initial_special(&tmp96, a, f);	// tmp96 = quotient * f, we only compute the low 96-bits here

    a.d0 = __sub_cc (0, tmp96.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp96.d1);			// we do not need the upper digits of b_preinit and tmp96 because the result is 0 after subtraction!
    a.d2 = __subc   (0, tmp96.d2);

    // a starts as 2^128 = 2^(2^7) mod f (up to a small multiple of f); each pass squares it.
    for (shifter = 0; shifter < exp - 2 - 7; shifter++)
    {
							// On input a is at most 91.807 bits (see end of this loop)

      square_96_192(&b, a);				// b = a^2, b is at most 183.614 bits

      tmp96.d0 = (b.d2 >> (bits_max - 1)) + (b.d3 << (32 - (bits_max - 1))); // a = b / (2 ^ (bits_in_f - 1)), a is at most 95.614 bits
      tmp96.d1 = (b.d3 >> (bits_max - 1)) + (b.d4 << (32 - (bits_max - 1)));
      tmp96.d2 = (b.d4 >> (bits_max - 1)) + (b.d5 << (32 - (bits_max - 1)));

      mul_96_192_no_low3(&a, tmp96, u);			// a = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (95 + bits_in_f) / f) / 2^96   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 6.  A full mul_96_192 would add 5 partial results
							// into tmp192.d2 which could have generated 4 carries into tmp192.d3.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d2 could have been added into
							// tmp192.d2 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d2 could have been added into tmp192.d2 possibly generating a carry.
							// A grand total of up to 6 carries lost.

      mul_96_F32_63_special(&tmp96, a, f);		// tmp96 = quotient * f, we only compute the low 96-bits here

      a.d0 = __sub_cc (b.d0, tmp96.d0);			// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp96.d1);			// we do not need the upper digits of b and tmp96 because the result is 0 after subtraction!
      a.d2 = __subc   (b.d2, tmp96.d2);
							// Since the quotient was up to 6 too small, the remainder has a maximum value of 7*f,
							// or 89 bits + log2 (7) bits, which is 91.807 bits.
    }

    mod_simple_96(&finalrem, a, f, ff);			// Adjustment.  The code above may produce an a that is too large by up to 6 times f.

#if 0
    if(cmp_ge_96(finalrem,f) && f.d2)
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_96_192_no_low3(&tmp96,u,f);
printf ("    f: %08X%08X%08X\r\n", f.d2, f.d1, f.d0);
printf ("u    : %X %X %X\r\n", u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X\r\n", tmp96.d2, tmp96.d1, tmp96.d0);
printf ("  rem: %08X%08X%08X\r\n", finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    // First test: finalrem == f - 1 (f.d0 is 1, so f - 1 has d0 == 0).  Second test: finalrem == 1.
    if((finalrem.d1 == f.d1 && finalrem.d0 == 0 && finalrem.d2 == f.d2) ||
       (finalrem.d1 == 0    && finalrem.d0 == 1 && finalrem.d2 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=3;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*3 + 2]=1;			/* f.d0, which is always 1 */
	RES[index*3 + 3]=f.d1;
	RES[index*3 + 4]=f.d2;
      }
    }
  }

/* finally write occasional result for validation by C code */

  // When total_bit_count is 0 the loop above never ran, so f and finalrem hold no
  // meaningful values; the first word (0) flags that case.
  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 3);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=f.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d2;
  }
}

//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 32 <= N <= 63.  Works on f between 89 and 96 bits inclusive
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett96_F32_63gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
/* Trial-factoring kernel for candidates f = k * 2^exp + 1 that fit in 96 bits,
   using 128-bit intermediates and a fixed normalisation shift of 80 bits.

   Phase 1: every thread counts the set bits in its slice of bit_array, the block
            builds an inclusive prefix sum of those counts in bitcount[], and each
            thread then writes the positions of its set bits into smem[].
   Phase 2: the threads walk smem[] with a stride of THREADS_PER_BLOCK; for each
            entry they build f, start from 2^128 mod f, square (exp - 2 - 7) times
            with Barrett reduction, and report f when the final remainder is
            f - 1 or 1.

   Parameters
     exp              the N in f = k * 2^N + 1.  The shifts that build f use (exp - 32)
                      and (32 - (exp - 32)) as shift counts.
     k_base           k for bit 0 of block 0; only d0 and d1 are used here.
     bit_array        sieve bits; a set bit marks a k to be tested.
     bits_to_process  number of sieve bits handled by one block.  The code assumes
                      256 threads per block and bits_to_process a non-zero multiple
                      of 8192 (256 threads * 32 bits); this is not checked.
     RES              result buffer: RES[0] is a hit counter, RES[1] is set to 3 on a hit,
                      up to 10 hits are stored from RES[2] on, and a validation record
                      is written at RESULTS_ARRAY_VALIDATION_OFFSET by block 0 / thread 0.

   Dynamic shared memory: smem[] must hold at least one unsigned short per set bit
   in the block's slice of bit_array; the launcher has to size it. */
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int96 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Every tally step reads a slot that another thread wrote in an earlier step.
  // The first five steps stay within one warp, but warp lock-step execution is not
  // guaranteed on GPUs with independent thread scheduling, so each step is preceded
  // by a block barrier.  All threads reach every barrier (none is inside a branch).
  __syncthreads();
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  __syncthreads();
  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  __syncthreads();
  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  __syncthreads();
  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  __syncthreads();
  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.
// The first smem slot for this thread is total_bit_count minus its inclusive prefix sum.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word (highest set bit first) and clear it

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int128 a, u, tmp128;
    int224 b, tmp224;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    // NOTE(review): for exp == 32 the right shift below has a count of 32 - confirm
    // the target handles that as intended.
    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = (k.d0 << (exp - 32));
    f.d2 = (k.d1 << (exp - 32)) + (k.d0 >> (32 - (exp - 32)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d2);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d1);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp224.d6 = 0x10000;				// tmp224 is 2^208
    tmp224.d5 = 0; tmp224.d4 = 0; tmp224.d3 = 0; tmp224.d2 = 0; tmp224.d1 = 0; tmp224.d0 = 0;

    // Could write optimized div_224_96 with so many tmp224 elements known to be zero
    div_224_96(&u,tmp224,f,ff);				// u = floor(2^208 / f).  This requires f >= 81 bits.

							// b_preinit = 2^128
							// a = b_preinit / 2^80 = 2^48
							// tmp256 = a * u = (b_preinit / 2^80) * (2^208 / f)     (ignore the floor functions for now)
    // a.d2 and a.d3 are left unset: mul_128_96_F32_63_initial_special16 never reads them.
    a.d0 = (u.d3 << 16) + (u.d2 >> 16);			// a = tmp256 / 2^128, which if we do the math simplifies to the quotient: b_preinit / f
    a.d1 =                (u.d3 >> 16);

    mul_128_96_F32_63_initial_special16(&tmp128, a, f);	// tmp128 = quotient * f, we only compute the low 128-bits here

    a.d0 = __sub_cc (0, tmp128.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp128.d2);
    a.d3 = __subc   (0, tmp128.d3);

    // a starts as 2^128 = 2^(2^7) mod f (up to a small multiple of f); each pass squares it.
    for (shifter = 0; shifter < exp - 2 - 7; shifter++)
    {
							// On input a is at most 99.17 bits (see end of this loop)

      square_128_224(&b, a);				// b = a^2, b is at most 198.34 bits

      tmp128.d0 = (b.d3 << 16) + (b.d2 >> 16);		// a = b / 2^80, a is at most 118.34 bits
      tmp128.d1 = (b.d4 << 16) + (b.d3 >> 16);
      tmp128.d2 = (b.d5 << 16) + (b.d4 >> 16);
      tmp128.d3 = (b.d6 << 16) + (b.d5 >> 16);

      mul_128_256_no_low4(&a, tmp128, u);		// a = (b / 2^80) * (2^208 / f) / 2^128   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 8.  A full mul_128_256 would add 7 partial results
							// into tmp256.d3 which could have generated 6 carries into tmp256.d4.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d3 could have been added into
							// tmp256.d3 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d3 could have been added into tmp256.d3 possibly generating a carry.
							// A grand total of up to 8 carries lost.

      mul_128_96_F32_63_special(&tmp128, a, f);		// tmp128 = quotient * f, we only compute the low 128-bits here

      a.d0 = __sub_cc (b.d0, tmp128.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp128.d1);		// we do not need the upper digits of b and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp128.d2);
      a.d3 = __subc   (b.d3, tmp128.d3);
							// Since the quotient was up to 8 too small, the remainder has a maximum value of 9*f,
							// or 96 bits + log2 (9) bits, which is 99.17 bits.
    }

    mod_simple_128_96(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 8 times f.

#if 0
    if(cmp_ge_96(finalrem,f) && (f.d2 & 0xFFFF0000))
    {
      printf("EEEEEK, final rem is >= f\n");
    }
//    if(cmp_ge_96(finalrem,f) && (f.d2 & 0xFFFF0000)) {
if ((blockIdx.x == 0 && threadIdx.x == 4)){
int128 f128;	    
f128.d0 = f.d0;
f128.d1 = f.d1;
f128.d2 = f.d2;
f128.d3 = 0;
mul_128_256_no_low4(&tmp128,u,f128);
printf ("    f: %08X%08X%08X\r\n", f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X\r\n", u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X\r\n", tmp128.d3, tmp128.d2, tmp128.d1, tmp128.d0);
printf ("  rem: %08X%08X%08X\r\n", finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    // First test: finalrem == f - 1 (f.d0 is 1, so f - 1 has d0 == 0).  Second test: finalrem == 1.
    if((finalrem.d1 == f.d1 && finalrem.d0 == 0 && finalrem.d2 == f.d2) ||
       (finalrem.d1 == 0    && finalrem.d0 == 1 && finalrem.d2 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=3;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*3 + 2]=1;			/* f.d0, which is always 1 */
	RES[index*3 + 3]=f.d1;
	RES[index*3 + 4]=f.d2;
      }
    }
  }

/* finally write occasional result for validation by C code */

  // When total_bit_count is 0 the loop above never ran, so f and finalrem hold no
  // meaningful values; the first word (0) flags that case.
  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 3);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=f.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d2;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 32 <= N <= 63.  Works on f between 97 and 108 bits inclusive
//

__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett108_F32_63gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
{
  int i, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int128 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // First five tallies remain within one warp.  Should be in lock-step.
  if (threadIdx.x & 1)        // If we are running on any thread 0bxxxxxxx1, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[threadIdx.x - 1];

  if (threadIdx.x & 2)        // If we are running on any thread 0bxxxxxx1x, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 2) | 1];

  if (threadIdx.x & 4)        // If we are running on any thread 0bxxxxx1xx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 4) | 3];

  if (threadIdx.x & 8)        // If we are running on any thread 0bxxxx1xxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 8) | 7];

  if (threadIdx.x & 16)       // If we are running on any thread 0bxxx1xxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 16) | 15];

  // Further tallies are across warps.  Must synchronize
  __syncthreads();
  if (threadIdx.x  & 32)      // If we are running on any thread 0bxx1xxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 32) | 31];

  __syncthreads();
  if (threadIdx.x & 64)       // If we are running on any thread 0bx1xxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[(threadIdx.x - 64) | 63];

  __syncthreads();
  if (threadIdx.x & 128)       // If we are running on any thread 0b1xxxxxxx, tally neighbor's count.
    bitcount[threadIdx.x] += bitcount[127];

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Loop til this thread's section of the bit array is finished.

  sieve_word = *bit_array;
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (i = total_bit_count - bitcount[threadIdx.x]; ; i++) {
    int bit_to_test;

// Make sure we have a non-zero sieve word

    while (sieve_word == 0) {
      if (--words_per_thread == 0) break;
      sieve_word = *++bit_array;
      k_bit_base += 32;
    }

// Check if this thread has processed all its set bits

    if (sieve_word == 0) break;

// Find a bit to test in the sieve word

    bit_to_test = 31 - ___clz (sieve_word);
    sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

    smem[i] = k_bit_base + bit_to_test;
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int128 a, u, tmp128;
    int224 b, tmp224;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = (k.d0 << (exp - 32));
    f.d2 = (k.d1 << (exp - 32)) + (k.d0 >> (32 - (exp - 32)));
    f.d3 =                        (k.d1 >> (32 - (exp - 32)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d3);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d2);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp224.d6 = 0xFFFFFFFF;				// tmp224 is nearly 2^224
    tmp224.d5 = 0xFFFFFFFF; tmp224.d4 = 0xFFFFFFFF; tmp224.d3 = 0xFFFFFFFF;
    tmp224.d2 = 0xFFFFFFFF; tmp224.d1 = 0xFFFFFFFF; tmp224.d0 = 0xFFFFFFFF;

    // Could write optimized div_224_128 with so many tmp224 elements known to be zero
    div_224_128(&u,tmp224,f,ff);			// u = floor(2^224 / f).  This requires f >= 97 bits.

							// b_preinit = 2^128
							// a = b_preinit / 2^96 = 2^32
							// tmp256 = a * u = (b_preinit / 2^96) * (2^224 / f)     (ignore the floor functions for now)
    a.d0 = u.d3;					// a = tmp256 / 2^128, which if we do the math simplifies to the quotient: b_preinit / f

    mul_128_F32_63_initial_special(&tmp128, a, f);	// tmp128 = quotient * f, we only compute the low 128-bits here

    a.d0 = __sub_cc (0, tmp128.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp128.d2);
    a.d3 = __subc   (0, tmp128.d3);

    for (shifter = 0; shifter < exp - 2 - 7; shifter++)
    {
							// On input a is at most 111.17 bits (see end of this loop)

      square_128_224(&b, a);				// b = a^2, b is at most 222.34 bits

      tmp128.d0 = b.d3;					// a = b / 2^96, a is at most 126.34 bits
      tmp128.d1 = b.d4;
      tmp128.d2 = b.d5;
      tmp128.d3 = b.d6;

      mul_128_256_no_low4(&a, tmp128, u);		// a = (b / 2^96) * (2^224 / f) / 2^128    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 8.  A full mul_128_256 would add 7 partial results
							// into tmp256.d3 which could have generated 6 carries into tmp256.d4.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d3 could have been added into
							// tmp256.d3 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d3 could have been added into tmp256.d3 possibly generating a carry.
							// A grand total of up to 8 carries lost.

      mul_128_F32_63_special(&tmp128, a, f);		// tmp128 = quotient * f, we only compute the low 128-bits here

      a.d0 = __sub_cc (b.d0, tmp128.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp128.d1);		// we do not need the upper digits of b and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp128.d2);
      a.d3 = __subc   (b.d3, tmp128.d3);
							// Since the quotient was up to 8 too small, the remainder has a maximum value of 9*f,
							// or 108 bits + log2 (9) bits, which is 111.17 bits.
    }

    mod_simple_128(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 8 times f.

#if 0
    if(cmp_ge_128(finalrem,f) && f.d3)
    {
      printf("EEEEEK, final finalrem is >= f\n, f: %X %X %X %X, a: %X %X %X %X", f.d3, f.d2, f.d1, f.d0, finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
    }
if (blockIdx.x == 0 && threadIdx.x == 4){
mul_128_256_no_low4(&tmp128,u,f);
printf ("    f: %08X%08X%08X%08X\r\n", f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X\r\n", u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X\r\n", tmp128.d3, tmp128.d2, tmp128.d1, tmp128.d0);
printf ("  rem: %08X%08X%08X%08X\r\n", finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    if((finalrem.d2 == f.d2 && finalrem.d1 == f.d1 && finalrem.d0 == 0 && finalrem.d3 == f.d3) ||
       (finalrem.d2 == 0    && finalrem.d1 == 0    && finalrem.d0 == 1 && finalrem.d3 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=4;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*4 + 2]=1;
	RES[index*4 + 3]=f.d1;
	RES[index*4 + 4]=f.d2;
	RES[index*4 + 5]=f.d3;
      }
    }
  }

/* finally write occasional result for validation by C code */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 4);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=f.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=f.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d3;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 32 <= N <= 63.  Works on f between 109 and 120 bits inclusive
//

/*
Launch requirements and parameters of mfaktc_barrett120_F32_63gs:

  - The block must consist of 256 threads: the prefix sum below is hard-wired to bitcount[256]
    and each thread's share of the sieve is computed as bits_to_process / (256 * 32).
  - The dynamic shared memory (smem[]) must provide one unsigned short per set bit in this
    block's section of the sieve.  The kernel has no way to verify this.

  exp             - the N in f = k*2^N+1 (32 <= N <= 63)
  k_base          - k value belonging to bit 0 of bit_array; bit j stands for k_base + j * NUM_CLASSES.
                    Only d0 and d1 are used (k values are limited to 64 bits).
  bit_array       - sieve bits; block b processes bits b*bits_to_process .. (b+1)*bits_to_process-1
  bits_to_process - number of sieve bits handled by each block
  RES             - result array.  RES[0] counts the factors found, the first 10 factors are stored
                    as 4 words each starting at RES[2], and block 0 / thread 0 writes validation
                    data starting at RES[RESULTS_ARRAY_VALIDATION_OFFSET].
  bits_max        - controls the scaling of the Barrett reciprocal through shifts by (bits_max - 1).
                    NOTE(review): the in-code comment "tmp256 = 2^(127 + bits_in_f)" implies
                    bits_max = bits_in_f - 96 -- confirm against the launching code.
*/
__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett120_F32_63gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES, unsigned int bits_max)
{
  int i, w, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int128 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // The validation write at the end of the kernel reads f and finalrem even when this thread
  // never tested a candidate.  Give them defined values for that case.

  f.d0 = 0; f.d1 = 0; f.d2 = 0; f.d3 = 0;
  finalrem.d0 = 0; finalrem.d1 = 0; finalrem.d2 = 0; finalrem.d3 = 0;

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Step i (i = 1, 2, 4, ..., 128):  every thread with bit i set in its index adds the count of the
  // highest-numbered thread of the neighboring lower group, (threadIdx.x - i) | (i - 1).  That thread
  // has bit i clear, so it is not written during this step and a barrier between steps is all that is
  // needed.  Every step is preceded by a block barrier, including the steps that stay within one warp:
  // warps are not guaranteed to execute in lock-step on GPUs with independent thread scheduling.
  for (i = 1; i < 256; i <<= 1) {
    __syncthreads();
    if (threadIdx.x & i)
      bitcount[threadIdx.x] += bitcount[(threadIdx.x - i) | (i - 1)];
  }

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Walk this thread's section of the bit array and copy the position of every set bit to shared memory.
// This thread's slots start at total_bit_count - bitcount[threadIdx.x].  The loop is bounded by
// words_per_thread, so nothing is read when a thread has no words to process.

  i = total_bit_count - bitcount[threadIdx.x];
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (w = 0; w < words_per_thread; w++, k_bit_base += 32) {
    sieve_word = bit_array[w];

    while (sieve_word != 0) {
      int bit_to_test;

// Find a bit to test in the sieve word (highest set bit first) and clear it

      bit_to_test = 31 - ___clz (sieve_word);
      sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

      smem[i++] = k_bit_base + bit_to_test;
    }
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int128 a, u, tmp128;
    int256 b, tmp256;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = (k.d0 << (exp - 32));
    f.d2 = (k.d1 << (exp - 32)) + (k.d0 >> (32 - (exp - 32)));
    f.d3 =                        (k.d1 >> (32 - (exp - 32)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d3);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d2);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp256.d7 = 1 << (bits_max - 1);			// tmp256 = 2^(127 + bits_in_f)
    tmp256.d6 = 0; tmp256.d5 = 0; tmp256.d4 = 0;
    tmp256.d3 = 0; tmp256.d2 = 0; tmp256.d1 = 0; tmp256.d0 = 0;

    // Could write optimized div_256_128 with so many tmp256 elements known to be zero
    div_256_128(&u,tmp256,f,ff);			// u = floor(2^(127 + bits_in_f) / f), giving 128 bits of precision

							// b_preinit = 2^128
							// a = b_preinit / 2 ^ (bits_in_f - 1)
							// tmp256 = a * u = (b_preinit / 2 ^ (bits_in_f - 1)) * (2 ^ (127 + bits_in_f) / f)     (ignore the floor functions for now)
							// a = tmp256 / 2^128, which if we do the math simplifies to the quotient: b_preinit / f
    a.d0 = (u.d3 >> (bits_max - 1));

    mul_128_F32_63_initial_special(&tmp128, a, f);	// tmp128 = quotient * f, we only compute the low 128-bits here

    a.d0 = __sub_cc (0, tmp128.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp128.d1);			// we do not need the upper digits of b_preinit and tmp128 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp128.d2);
    a.d3 = __subc   (0, tmp128.d3);

    for (shifter = 0; shifter < exp - 2 - 7; shifter++)
    {
							// On input a is at most 123.17 bits (see end of this loop)

      square_128_256(&b, a);				// b = a^2, b is at most 246.34 bits

      tmp128.d0 = (b.d3 >> (bits_max - 1)) + (b.d4 << (32 - (bits_max - 1))); // a = b / (2 ^ (bits_in_f - 1)), a is at most 127.34 bits
      tmp128.d1 = (b.d4 >> (bits_max - 1)) + (b.d5 << (32 - (bits_max - 1)));
      tmp128.d2 = (b.d5 >> (bits_max - 1)) + (b.d6 << (32 - (bits_max - 1)));
      tmp128.d3 = (b.d6 >> (bits_max - 1)) + (b.d7 << (32 - (bits_max - 1)));

      mul_128_256_no_low4(&a, tmp128, u);		// a = (b / 2 ^ (bits_in_f - 1)) * (2 ^ (127 + bits_in_f) / f) / 2^128   (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 8.  A full mul_128_256 would add 7 partial results
							// into tmp256.d3 which could have generated 6 carries into tmp256.d4.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d3 could have been added into
							// tmp256.d3 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d3 could have been added into tmp256.d3 possibly generating a carry.
							// A grand total of up to 8 carries lost.

      mul_128_F32_63_special(&tmp128, a, f);		// tmp128 = quotient * f, we only compute the low 128-bits here

      a.d0 = __sub_cc (b.d0, tmp128.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp128.d1);		// we do not need the upper digits of b and tmp128 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp128.d2);
      a.d3 = __subc   (b.d3, tmp128.d3);
							// Since the quotient was up to 8 too small, the remainder has a maximum value of 9*f,
							// or 120 bits + log2 (9) bits, which is 123.17 bits.
    }

    mod_simple_128(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 8 times f.

#if 0
    if(cmp_ge_128(finalrem,f))
    {
      printf("EEEEEK, final finalrem is >= f\n");
    }
if (cmp_ge_128(finalrem,f) || (blockIdx.x == 12 && threadIdx.x == 4)){
mul_128_256_no_low4(&tmp128,u,f);
printf ("    f: %08X%08X%08X%08X\r\n", f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X\r\n", u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X\r\n", tmp128.d3, tmp128.d2, tmp128.d1, tmp128.d0);
printf ("  rem: %08X%08X%08X%08X\r\n", finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    // A candidate is reported when the final remainder is f - 1 (f.d0 is always 1, so the low word is 0) or 1.
    if((finalrem.d2 == f.d2 && finalrem.d1 == f.d1 && finalrem.d0 == 0 && finalrem.d3 == f.d3) ||
       (finalrem.d2 == 0    && finalrem.d1 == 0    && finalrem.d0 == 1 && finalrem.d3 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=4;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*4 + 2]=1;			/* the literal 1 matches f.d0, which is always 1 */
	RES[index*4 + 3]=f.d1;
	RES[index*4 + 4]=f.d2;
	RES[index*4 + 5]=f.d3;
      }
    }
  }

/* finally write occasional result for validation by C code */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 4);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=f.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=f.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d3;
  }
}


//
// Kernel to factor Fermat numbers with factors f = k*2^N+1, 32 <= N <= 63.  Works on f between 121 and 128 bits inclusive
//

/*
Launch requirements and parameters of mfaktc_barrett128_F32_63gs:

  - The block must consist of 256 threads: the prefix sum below is hard-wired to bitcount[256]
    and each thread's share of the sieve is computed as bits_to_process / (256 * 32).
  - The dynamic shared memory (smem[]) must provide one unsigned short per set bit in this
    block's section of the sieve.  The kernel has no way to verify this.

  exp             - the N in f = k*2^N+1 (32 <= N <= 63)
  k_base          - k value belonging to bit 0 of bit_array; bit j stands for k_base + j * NUM_CLASSES.
                    Only d0 and d1 are used (k values are limited to 64 bits).
  bit_array       - sieve bits; block b processes bits b*bits_to_process .. (b+1)*bits_to_process-1
  bits_to_process - number of sieve bits handled by each block
  RES             - result array.  RES[0] counts the factors found, the first 10 factors are stored
                    as 4 words each starting at RES[2], and block 0 / thread 0 writes validation
                    data starting at RES[RESULTS_ARRAY_VALIDATION_OFFSET].
*/
__global__ void
__launch_bounds__(THREADS_PER_BLOCK, KERNEL_MIN_BLOCKS) mfaktc_barrett128_F32_63gs(unsigned int exp, int96 k_base, unsigned int *bit_array, unsigned int bits_to_process, unsigned int *RES)
{
  int i, w, words_per_thread, sieve_word, k_bit_base, total_bit_count;
  int128 f, finalrem;
  __shared__ volatile unsigned short bitcount[256];	// Each thread of our block puts bit-counts here
  extern __shared__ unsigned short smem[];		// Write bits to test here.  Launching program must estimate
							// how much shared memory to allocate based on number of primes sieved.

  // The validation write at the end of the kernel reads f and finalrem even when this thread
  // never tested a candidate.  Give them defined values for that case.

  f.d0 = 0; f.d1 = 0; f.d2 = 0; f.d3 = 0;
  finalrem.d0 = 0; finalrem.d1 = 0; finalrem.d2 = 0; finalrem.d3 = 0;

  // Get pointer to section of the bit_array this thread is processing.

  words_per_thread = bits_to_process / 8192;
  bit_array += blockIdx.x * bits_to_process / 32 + threadIdx.x * words_per_thread;

// Count number of bits set in this thread's word(s) from the bit_array

  bitcount[threadIdx.x] = 0;
  for (i = 0; i < words_per_thread; i++)
    bitcount[threadIdx.x] += ___popcnt(bit_array[i]);

// Create total count of bits set in block up to and including this threads popcnt.
// Kudos to Rocke Verser for the population counting code.
// CAUTION:  Following requires 256 threads per block

  // Step i (i = 1, 2, 4, ..., 128):  every thread with bit i set in its index adds the count of the
  // highest-numbered thread of the neighboring lower group, (threadIdx.x - i) | (i - 1).  That thread
  // has bit i clear, so it is not written during this step and a barrier between steps is all that is
  // needed.  Every step is preceded by a block barrier, including the steps that stay within one warp:
  // warps are not guaranteed to execute in lock-step on GPUs with independent thread scheduling.
  for (i = 1; i < 256; i <<= 1) {
    __syncthreads();
    if (threadIdx.x & i)
      bitcount[threadIdx.x] += bitcount[(threadIdx.x - i) | (i - 1)];
  }

  // At this point, bitcount[...] contains the total number of bits for the indexed
  // thread plus all lower-numbered threads.  I.e., bitcount[255] is the total count.

  __syncthreads();
  total_bit_count = bitcount[255];

//POSSIBLE OPTIMIZATION - bitcounts and smem could use the same memory space if we'd read bitcount into a register
// and sync threads before doing any writes to smem.

//POSSIBLE SANITY CHECK -- is there any way to test if total_bit_count exceeds the amount of shared memory allocated?

// Walk this thread's section of the bit array and copy the position of every set bit to shared memory.
// This thread's slots start at total_bit_count - bitcount[threadIdx.x].  The loop is bounded by
// words_per_thread, so nothing is read when a thread has no words to process.

  i = total_bit_count - bitcount[threadIdx.x];
  k_bit_base = threadIdx.x * words_per_thread * 32;
  for (w = 0; w < words_per_thread; w++, k_bit_base += 32) {
    sieve_word = bit_array[w];

    while (sieve_word != 0) {
      int bit_to_test;

// Find a bit to test in the sieve word (highest set bit first) and clear it

      bit_to_test = 31 - ___clz (sieve_word);
      sieve_word &= ~(1 << bit_to_test);

// Copy the k value to the shared memory array

      smem[i++] = k_bit_base + bit_to_test;
    }
  }

  __syncthreads();

// Here, all warps in our block have placed their candidates in shared memory.
// Now we can start TFing candidates.

// Pre-init values used to compute factor

  // Compute base k value
  k_base.d0 = __add_cc (k_base.d0, __umul32  (blockIdx.x * bits_to_process, NUM_CLASSES));
  k_base.d1 = __addc   (k_base.d1, __umul32hi(blockIdx.x * bits_to_process, NUM_CLASSES)); /* k values are limited to 64 bits */

// Loop til the k values written to shared memory are exhausted

  for (i = threadIdx.x; i < total_bit_count; i += THREADS_PER_BLOCK) {
    int96 k;
    int160 a, u, tmp160;
    int288 b, tmp288;
    float ff;
    int k_delta, shifter;

// Get the (k - k_base) value to test

    k_delta = smem[i];

// Compute f = k * 2^exp + 1

    k.d0 = __add_cc (k_base.d0, k_delta * NUM_CLASSES);	// k
    k.d1 = __addc   (k_base.d1, 0);

    f.d0 = 1;						// f = k * 2^exp + 1
    f.d1 = (k.d0 << (exp - 32));
    f.d2 = (k.d1 << (exp - 32)) + (k.d0 >> (32 - (exp - 32)));
    f.d3 =                        (k.d1 >> (32 - (exp - 32)));

/*
ff = f as float
Precalculated here since it is the same for all steps in the following loop */
    ff= __uint2float_rn(f.d3);
    ff= ff * 4294967296.0f + __uint2float_rn(f.d2);
    ff=__int_as_float(0x3f7ffffb) / ff;			// just a little bit below 1.0f so we always underestimate the quotient

    tmp288.d8 = 0x10000;				// tmp288 is 2^272
    tmp288.d7 = 0; tmp288.d6 = 0; tmp288.d5 = 0; tmp288.d4 = 0;
    tmp288.d3 = 0; tmp288.d2 = 0; tmp288.d1 = 0; tmp288.d0 = 0;

    // Could write optimized div_288_128 with so many tmp288 elements known to be zero
    div_288_128(&u,tmp288,f,ff);			// u = floor(2^272 / f).  This requires f >= 113 bits.

							// b_preinit = 2^256
							// a = b_preinit / 2^112 = 2^144
							// tmp320 = a * u = (b_preinit / 2^112) * (2^272 / f)     (ignore the floor functions for now)
    a.d0 = (u.d1 << 16) + (u.d0 >> 16);			// a = tmp320 / 2^160, which if we do the math simplifies to the quotient: b_preinit / f
    a.d1 = (u.d2 << 16) + (u.d1 >> 16);
    a.d2 = (u.d3 << 16) + (u.d2 >> 16);
    a.d3 = (u.d4 << 16) + (u.d3 >> 16);
    a.d4 =                (u.d4 >> 16);

    mul_160_128_F32_63_special(&tmp160, a, f);		// tmp160 = quotient * f, we only compute the low 160-bits here

    a.d0 = __sub_cc (0, tmp160.d0);			// Compute the remainder
    a.d1 = __subc_cc(0, tmp160.d1);			// we do not need the upper digits of b_preinit and tmp160 because the result is 0 after subtraction!
    a.d2 = __subc_cc(0, tmp160.d2);
    a.d3 = __subc_cc(0, tmp160.d3);
    a.d4 = __subc   (0, tmp160.d4);

    for (shifter = 0; shifter < exp - 2 - 8; shifter++)
    {
							// On input a is at most 131.459 bits (see end of this loop)

      square_160_288(&b, a);				// b = a^2, b is at most 262.918 bits

      tmp160.d0 = (b.d4 << 16) + (b.d3 >> 16);		// a = b / 2^112, a is at most 150.918 bits
      tmp160.d1 = (b.d5 << 16) + (b.d4 >> 16);
      tmp160.d2 = (b.d6 << 16) + (b.d5 >> 16);
      tmp160.d3 = (b.d7 << 16) + (b.d6 >> 16);
      tmp160.d4 = (b.d8 << 16) + (b.d7 >> 16);

      mul_160_320_no_low5(&a, tmp160, u);		// a = (b / 2^112) * (2^272 / f) / 2^160    (ignore the floor functions for now)
							// which if we do the math simplifies to the quotient: b / f

							// The quotient is off by at most 10.  A full mul_160_320 would add 9 partial results
							// into tmp320.d4 which could have generated 8 carries into tmp320.d5.
							// Also, since u was generated with the floor function, it could be low by up to
							// almost 1.  If we account for this a value up to a.d4 could have been added into
							// tmp320.d4 possibly generating a carry.  Similarly, a was generated by a floor
							// function, and could thus be low by almost 1.  If we account for this a value up
							// to u.d4 could have been added into tmp320.d4 possibly generating a carry.
							// A grand total of up to 10 carries lost.

      mul_160_128_F32_63_special(&tmp160, a, f);	// tmp160 = quotient * f, we only compute the low 160-bits here

      a.d0 = __sub_cc (b.d0, tmp160.d0);		// Compute the remainder
      a.d1 = __subc_cc(b.d1, tmp160.d1);		// we do not need the upper digits of b and tmp160 because the result is 0 after subtraction!
      a.d2 = __subc_cc(b.d2, tmp160.d2);
      a.d3 = __subc_cc(b.d3, tmp160.d3);
      a.d4 = __subc   (b.d4, tmp160.d4);
							// Since the quotient was up to 10 too small, the remainder has a maximum value of 11*f,
							// or 128 bits + log2 (11) bits, which is 131.459 bits.
    }

    mod_simple_160_128(&finalrem, a, f, ff);		// Adjustment.  The code above may produce an a that is too large by up to 10 times f.

#if 0
    if(cmp_ge_128(finalrem,f))
    {
      printf("EEEEEK, final rem is >= f\n");
    }
if ((blockIdx.x == 0 && threadIdx.x == 4)){
int160 f160;	    
f160.d0 = f.d0;
f160.d1 = f.d1;
f160.d2 = f.d2;
f160.d3 = f.d3;
f160.d4 = 0;
mul_160_320_no_low5(&tmp160,u,f160);
printf ("    f: %08X%08X%08X%08X\r\n", f.d3, f.d2, f.d1, f.d0);
printf ("u    : %X %X %X %X %X\r\n", u.d4, u.d3, u.d2, u.d1, u.d0);
printf ("u * f: %X %X %X %X %X\r\n", tmp160.d4, tmp160.d3, tmp160.d2, tmp160.d1, tmp160.d0);
printf ("  rem: %08X%08X%08X%08X\r\n", finalrem.d3, finalrem.d2, finalrem.d1, finalrem.d0);
}
#endif

/* check if we found a factor and write the factor to RES[] */

    // A candidate is reported when the final remainder is f - 1 (f.d0 is always 1, so the low word is 0) or 1.
    if((finalrem.d2 == f.d2 && finalrem.d1 == f.d1 && finalrem.d0 == 0 && finalrem.d3 == f.d3) ||
       (finalrem.d2 == 0    && finalrem.d1 == 0    && finalrem.d0 == 1 && finalrem.d3 == 0))
    {
      int index=atomicInc(&RES[0],10000);
      RES[1]=4;
      if(index<10)				/* limit to 10 factors per class */
      {
	RES[index*4 + 2]=1;			/* the literal 1 matches f.d0, which is always 1 */
	RES[index*4 + 3]=f.d1;
	RES[index*4 + 4]=f.d2;
	RES[index*4 + 5]=f.d3;
      }
    }
  }

/* finally write occasional result for validation by C code */

  if(blockIdx.x == 0 && threadIdx.x == 0)
  {
    RES[RESULTS_ARRAY_VALIDATION_OFFSET]=(total_bit_count == 0 ? 0 : 4);
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+1]=1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+2]=f.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+3]=f.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+4]=f.d3;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+5]=finalrem.d0;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+6]=finalrem.d1;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+7]=finalrem.d2;
    RES[RESULTS_ARRAY_VALIDATION_OFFSET+8]=finalrem.d3;
  }
}
