#ifndef _ORDERGRAPH_KERNEL_H_
#define _ORDERGRAPH_KERNEL_H_

#include <stdio.h>

#ifdef DATA_25
#include "data25.cu"
#endif
#ifdef DATA_30
#include "data30.cu"
#endif
#ifdef DATA_40
#include "data40.cu"
#endif
#ifdef DATA_45
#include "data45.cu"
#endif
#ifdef DATA_50
#include "data50.cu"
#endif
#ifdef DATA_125
#include "data125.cu"
#endif
;

#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>

using namespace nvcuda::experimental;

#define PREFETCH_COUNT 2

#define BLOCK_SIZE 256
#define MAX_NBLOCKS 1024
#define MIN_NBATCHES 16


__device__ void Dincr(int *bit, int n);
__device__ void DincrS(int *bit, int n);
__device__ bool D_getState(int parN, int *sta, int time);
__device__ void D_findComb(int *comb, int l, int n);
__device__ int D_findindex(int *arr, int size);
__device__ int D_C(int n, int a);

__global__ void genScoreKernel(int sizepernode, float *D_localscore,
                               int *D_data, float *D_LG) {
  int id = blockIdx.x * BLOCK_SIZE + threadIdx.x;
  int node, index;
  bool flag;
  int parent[5] = {0};
  int pre[NODE_N] = {0};
  int state[5] = {0};
  int i, j, parN = 0, tmp, t;
  int t1 = 0, t2 = 0;
  float ls = 0;
  int Nij[STATE_N] = {0};

  if (id < sizepernode) {

    D_findComb(parent, id, NODE_N - 1);

    for (i = 0; i < 4; i++) {
      if (parent[i] > 0)
        parN++;
    }

    for (node = 0; node < NODE_N; node++) {

      j = 1;
      for (i = 0; i < NODE_N; i++) {
        if (i != node)
          pre[j++] = i;
      }

      for (tmp = 0; tmp < parN; tmp++)
        state[tmp] = 0;

      index = sizepernode * node + id;

      // priors
      /*
      for(tmp=1;tmp<=4;tmp++){
          localscore[index]+=100*(prior[node][pre[parent[tmp]]]-0.5)*(prior[node][pre[parent[tmp]]]-0.5)*(prior[node][pre[parent[tmp]]]-0.5);
      }
      */
      t = 0;
      while (D_getState(parN, state, t++)) { // for get state
                                             // printf("test %u\n",id);
        ls = 0;
        for (tmp = 0; tmp < STATE_N; tmp++)
          Nij[tmp] = 0;

        for (t1 = 0; t1 < DATA_N; t1++) {
          flag = true;
          for (t2 = 0; t2 < parN; t2++) {
            if (D_data[t1 * NODE_N + pre[parent[t2]]] != state[t2]) {
              flag = false;
              break;
            }
          }
          if (!flag)
            continue;

          Nij[D_data[t1 * NODE_N + node]]++;
        }

        tmp = STATE_N - 1;

        for (t1 = 0; t1 < STATE_N; t1++) {
          ls += D_LG[Nij[t1]];
          tmp += Nij[t1];
        }

        ls -= D_LG[tmp];
        ls += D_LG[STATE_N - 1];

        D_localscore[index] += ls;
      }
    }
  }
}

__global__ void computeKernel(int taskperthr, int sizepernode,
                              float *D_localscore, bool *D_parent, int node,
                              int total, float *D_Score, int *D_resP,
                              int nbatches) {
  cooperative_groups::thread_block block =
      cooperative_groups::this_thread_block();
  pipeline pipe;
  __shared__ float lsinblock[PREFETCH_COUNT][BLOCK_SIZE];

  int fetch = 0;
  int end_tile = fetch + nbatches;
  int bestparent[4] = {0}, parent[5] = {-1};

  for (int compute = fetch; compute < end_tile; compute++) {
    for (; fetch < end_tile && fetch < compute + PREFETCH_COUNT; fetch++) {
      unsigned int bid = blockIdx.x * nbatches + fetch;
      unsigned int tid = threadIdx.x;
      unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid;

      int posN = 1, i, index, tmp;
      int pre[NODE_N] = {0};
      int parN = 0;

      float bestls = -999999999999999, ls;

      for (i = 0; i < NODE_N; i++) {
        if (D_parent[i] == 1) {
          pre[posN++] = i;
        }
      }

      for (i = 0; i < taskperthr && ((id * taskperthr + i) < total); i++) {

        D_findComb(parent, id * taskperthr + i, posN);

        for (parN = 0; parN < 4; parN++) {
          if (parent[parN] < 0)
            break;
          if (pre[parent[parN]] > node)
            parent[parN] = pre[parent[parN]];
          else
            parent[parN] = pre[parent[parN]] + 1;
        }

        for (tmp = parN; tmp > 0; tmp--) {
          parent[tmp] = parent[tmp - 1];
        }
        parent[0] = 0;

        index = D_findindex(parent, parN);
        index += sizepernode * node;

        ls = D_localscore[index];

        if (ls > bestls) {
          bestls = ls;
          for (tmp = 0; tmp < 4; tmp++)
            bestparent[tmp] = parent[tmp + 1];
        }
      }

      memcpy_async(lsinblock[fetch % PREFETCH_COUNT][tid], bestls, pipe);
      pipe.commit();
    }
    if (fetch == end_tile) {
      for (int i = 0; i < PREFETCH_COUNT - 1; ++i) {
        pipe.commit();
      }
      ++fetch;
    }
    pipe.wait_prior<PREFETCH_COUNT - 1>();
    block.sync();

    int i, t;
    unsigned int bid = blockIdx.x * nbatches + compute;
    unsigned int tid = threadIdx.x;
    // unsigned int id = bid * (BLOCK_SIZE * nbatches) + tid;

    for (i = BLOCK_SIZE / 2; i >= 1; i /= 2) {
      if (tid < i) {
        if (lsinblock[compute % PREFETCH_COUNT][tid + i] >
                lsinblock[compute % PREFETCH_COUNT][tid] &&
            lsinblock[compute % PREFETCH_COUNT][tid + i] < 0) {
          lsinblock[compute % PREFETCH_COUNT][tid] =
              lsinblock[compute % PREFETCH_COUNT][tid + i];
          lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i);
        } else if (lsinblock[compute % PREFETCH_COUNT][tid + i] <
                       lsinblock[compute % PREFETCH_COUNT][tid] &&
                   lsinblock[compute % PREFETCH_COUNT][tid] < 0) {
          lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid;
        } else if (lsinblock[tid] > 0 && lsinblock[tid + i] < 0) {
          lsinblock[compute % PREFETCH_COUNT][tid] =
              lsinblock[compute % PREFETCH_COUNT][tid + i];
          lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)(tid + i);
        } else if (lsinblock[compute % PREFETCH_COUNT][tid] < 0 &&
                   lsinblock[compute % PREFETCH_COUNT][tid + i] > 0) {
          lsinblock[compute % PREFETCH_COUNT][tid + i] = (float)tid;
        }
      }
      block.sync();
    }
    block.sync();

    if (tid == 0) {
      D_Score[bid] = lsinblock[compute % PREFETCH_COUNT][0];
      t = 0;
      for (i = 0; i < 7 && t < 128 && t >= 0; i++) {
        t = (int)lsinblock[compute % PREFETCH_COUNT][(int)powf(2.0, i) + t];
      }
      lsinblock[compute % PREFETCH_COUNT][0] = (float)t;
    }
    block.sync();

    if (tid == (int)lsinblock[compute % PREFETCH_COUNT][0]) {
      for (i = 0; i < 4; i++) {
        D_resP[bid * 4 + i] = bestparent[i];
      }
    }
  }
}

__device__ void Dincr(int *bit, int n) {

  while (n <= NODE_N) {
    bit[n]++;
    if (bit[n] >= 2) {
      bit[n] = 0;
      n++;
    } else {
      break;
    }
  }

  return;
}

__device__ void DincrS(int *bit, int n) {

  bit[n]++;
  if (bit[n] >= STATE_N) {
    bit[n] = 0;
    Dincr(bit, n + 1);
  }

  return;
}

__device__ bool D_getState(int parN, int *sta, int time) {
  int i, j = 1;

  for (i = 0; i < parN; i++) {
    j *= STATE_N;
  }
  j--;
  if (time > j)
    return false;

  if (time >= 1)
    DincrS(sta, 0);

  return true;
}

__device__ void D_findComb(int *comb, int l, int n) {
  const int len = 4;
  if (l == 0) {
    for (int i = 0; i < len; i++)
      comb[i] = -1;
    return;
  }
  int sum = 0;
  int k = 1;

  while (sum < l)
    sum += D_C(n, k++);
  l -= sum - D_C(n, --k);
  int low = 0;
  int pos = 0;
  while (k > 1) {
    sum = 0;
    int s = 1;
    while (sum < l)
      sum += D_C(n - s++, k - 1);
    l -= sum - D_C(n - (--s), --k);
    low += s;
    comb[pos++] = low;
    n -= s;
  }
  comb[pos] = low + l;
  for (int i = pos + 1; i < 4; i++)
    comb[i] = -1;
}

__device__ int D_findindex(int *arr,
                           int size) { // reminder: arr[0] has to be 0 && size
                                       // == array size-1 && index start from 0
  int i, j, index = 0;

  for (i = 1; i < size; i++) {
    index += D_C(NODE_N - 1, i);
  }

  for (i = 1; i <= size - 1; i++) {
    for (j = arr[i - 1] + 1; j <= arr[i] - 1; j++) {
      index += D_C(NODE_N - 1 - j, size - i);
    }
  }

  index += arr[size] - arr[size - 1];

  return index;
}

__device__ int D_C(int n, int a) {
  int i, res = 1, atmp = a;

  for (i = 0; i < atmp; i++) {
    res *= n;
    n--;
  }

  for (i = 0; i < atmp; i++) {
    res /= a;
    a--;
  }

  return res;
}

#endif