/*
This file is part of mfaktc.
Copyright (C) 2009, 2010, 2011, 2012  Oliver Weihe (o.weihe@t-online.de)

mfaktc is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

mfaktc is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
                                
You should have received a copy of the GNU General Public License
along with mfaktc.  If not, see <http://www.gnu.org/licenses/>.
*/

/* int72 and int96 are the same but this way the compiler warns
when an int96 is passed to a function designed to handle 72 bit int.
The same applies to int144 and int192, too. */

/* 96-bit integer stored as three 32-bit words, least significant first:
   D = d0 + d1*(2^32) + d2*(2^64) */
typedef struct {
  unsigned int d0, d1, d2;
} int96;

/* 128-bit integer stored as four 32-bit words, least significant first:
   D = d0 + d1*(2^32) + d2*(2^64) + d3*(2^96) */
typedef struct {
  unsigned int d0, d1, d2, d3;
} int128;

/* 160-bit integer stored as five 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d4*(2^128) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4;
} int160;

/* 192-bit integer stored as six 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d5*(2^160) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5;
} int192;

/* 224-bit integer stored as seven 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d6*(2^192) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6;
} int224;

/* 256-bit integer stored as eight 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d7*(2^224) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
} int256;

/* 288-bit integer stored as nine 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d8*(2^256) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8;
} int288;

/* 320-bit integer stored as ten 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d9*(2^288) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8, d9;
} int320;

/* 352-bit integer stored as eleven 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d10*(2^320) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8, d9, d10;
} int352;

/* 384-bit integer stored as twelve 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d11*(2^352) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8, d9, d10, d11;
} int384;

/* 416-bit integer stored as thirteen 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d12*(2^384) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8, d9, d10, d11;
  unsigned int d12;
} int416;

/* 448-bit integer stored as fourteen 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d13*(2^416) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8, d9, d10, d11;
  unsigned int d12, d13;
} int448;

/* 480-bit integer stored as fifteen 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d14*(2^448) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8, d9, d10, d11;
  unsigned int d12, d13, d14;
} int480;

/* 512-bit integer stored as sixteen 32-bit words, least significant first:
   D = d0 + d1*(2^32) + ... + d15*(2^480) */
typedef struct {
  unsigned int d0, d1, d2, d3;
  unsigned int d4, d5, d6, d7;
  unsigned int d8, d9, d10, d11;
  unsigned int d12, d13, d14, d15;
} int512;


/* Progress-line format strings plus bookkeeping about the most recently
   processed class and the current job. */
typedef struct {
  /* user-configurable progress line */
  char progressformat[256];
  /* user-configurable GPU sieving progress header */
  char gpuprogressheader[256];

  /* the number of the last processed class */
  int class_number;
  /* number of grids processed in the last processed class */
  int grid_count;
  /* time (in ms) needed to process the last processed class */
  unsigned long long class_time;
  /* percentage CPU was waiting for the GPU */
  float cpu_wait;

  /* count how often the status line was written since last headline */
  int output_counter;
  /* number of finished classes of the current job */
  int class_counter;

  /* NOTE(review): presumably the name of the GPU kernel in use -- confirm */
  char kernelname[30];
} stats_t;


/* Central state record: result buffers, the current assignment, configuration
   values, GPU sieve settings and statistics.
   FIXME: proper name needed */
typedef struct {
  /* --- results array --- */
  unsigned int *h_RES;                      /* Host copy of the results array */
  unsigned int *d_RES;                      /* GPU copy of the results array */

  /* --- current assignment --- */
  int fermat_factoring;                     /* TRUE if we are factoring Fermat numbers */
  int dont_checksum;                        /* TRUE if we're bypassing the checksum when reading save files */
  unsigned int exponent;                    /* the exponent we're currently working on */
  int bit_min;                              /* where do we start TFing */
  int bit_max_assignment;                   /* the upper size of factors we're searching for */
  int bit_max_stage;                        /* as above, but only for the current stage */
  /* lower/upper k bounds; used for work assignments that are only part of a bit range */
  unsigned long long k_lower_bound;
  unsigned long long k_upper_bound;

  /* --- file names --- */
  char workfile[51];                        /* allow filenames up to 50 chars... */
  char resultfile[51];                      /* allow filenames up to 50 chars... */

  /* --- device properties --- */
  int compcapa_major;                       /* compute capability major */
  int compcapa_minor;                       /* compute capability minor */

  /* --- run configuration --- */
  int checkpoints;
  int checkpointdelay;
  int mode;                                 /* NOTE(review): presumably a value of enum MODES -- confirm */
  int stages;
  int stopafterfactor;
  int threads_per_grid_max;
  int threads_per_grid;

  char exponent_string[32];

  /* --- GPU sieve --- */
  int gpu_sieving;                          /* TRUE if we're letting the GPU do the sieving */
  int gpu_sieve_size;                       /* Size (in bits) of the GPU sieve.  Default is 128M bits. */
  int gpu_sieve_primes;                     /* the actual number of primes used for sieving */
  int gpu_sieve_processing_size;            /* The number of GPU sieve bits each thread in a Barrett kernel will process.  Default is 2K bits. */
  int gpu_sieve_primes_auto;                /* Change the number of sieve primes for each work unit */
  unsigned int *d_bitarray;                 /* 128M bit array for GPU sieve */
  unsigned int *d_sieve_info;               /* Device array containing compressed info needed for prime number GPU sieves */
  unsigned int *d_calc_bit_to_clear_info;   /* Device array containing uncompressed info needed to calculate initial bit-to-clear */

  /* --- output / control --- */
  int printmode;
  int print_timestamp;
  int quit;
  int verbosity;                            /* 0 = reduced number of screen prints, 1 = default, >1 currently unused */

  stats_t stats;                            /* stuff for statistics, etc. */

  /* primenet V5UserID and ComputerID; currently only used for screen/result output */
  char V5UserID[51];
  char ComputerID[51];
} mystuff_t;

/* The results array returns any factors found by the GPU as well as sample
   data used for validation.  Both constants below are counted in ints, so the
   span from the validation offset (80) to the end of the array (100) is
   20 ints. */

#define RESULTS_ARRAY_SIZE		100	/* Number of ints in the GPU and host results array */
#define RESULTS_ARRAY_VALIDATION_OFFSET	80	/* Offset (in ints) to the validation info in the results array */


/* Identifiers for the available GPU kernels.  The numeric values are the ones
   the compiler would assign implicitly (0, 1, 2, ...); they are spelled out so
   that inserting or reordering an entry cannot silently renumber the rest. */
enum GPUKernels {
  AUTOSELECT_KERNEL  = 0,

  _71BIT_MUL24       = 1,
  _75BIT_MUL32       = 2,
  _95BIT_MUL32       = 3,

  BARRETT76_MUL32    = 4,
  BARRETT77_MUL32    = 5,
  BARRETT79_MUL32    = 6,
  BARRETT87_MUL32    = 7,
  BARRETT88_MUL32    = 8,
  BARRETT92_MUL32    = 9,

  /* NOTE(review): the _GS suffix presumably marks the GPU-sieving variants
     of the Barrett kernels above -- confirm */
  BARRETT76_MUL32_GS = 10,
  BARRETT77_MUL32_GS = 11,
  BARRETT79_MUL32_GS = 12,
  BARRETT87_MUL32_GS = 13,
  BARRETT88_MUL32_GS = 14,
  BARRETT92_MUL32_GS = 15
};

/* Operating modes: a normal run or one of the two self-test variants.
   Values match the implicit numbering (0, 1, 2). */
enum MODES {
  MODE_NORMAL         = 0,
  MODE_SELFTEST_SHORT = 1,
  MODE_SELFTEST_FULL  = 2
};

/* Special return codes.  Both values fit in a 32-bit signed int.
   NOTE(review): presumably returned by the factoring routines to signal a
   CUDA failure or a user-requested quit -- confirm against the callers. */
#define RET_CUDA_ERROR 1000000001
#define RET_QUIT       1000000002



/* GPU architecture generations.
   NOTE(review): the values look like CUDA's __CUDA_ARCH__ encoding of compute
   capability 1.0 / 2.0 / 3.0 and may be used in preprocessor comparisons --
   confirm against usage before changing them or turning them into an enum. */
#define TESLA  100
#define FERMI  200
#define KEPLER 300
