package model

import (
	"encoding/json"
	"fmt"
	"math"
	"math/rand"
	"os"
	"sort"
)

// Value is one scalar node of the autograd graph. It holds the forward
// result, the gradient accumulated for it by Backward, and the operands it
// was computed from together with the partial derivative for each operand.
type Value struct {
	// Data is the forward value of the node.
	Data       float64
	// Grad is the gradient written by Backward; it is zero on a fresh node.
	Grad       float64
	// Children are the operand nodes; leaves created by V have none.
	Children   []*Value
	// LocalGrads[j] is the partial derivative of this node with respect to
	// Children[j]; Backward indexes it in lockstep with Children.
	LocalGrads []float64
}

// V wraps x in a leaf Value: no children, no local gradients, Grad zero.
func V(x float64) *Value {
	leaf := new(Value)
	leaf.Data = x
	return leaf
}

// Add returns a new node holding a.Data + b.Data. The local gradient is 1
// with respect to each operand.
func Add(a, b *Value) *Value {
	return &Value{
		Data:       a.Data + b.Data,
		Children:   []*Value{a, b},
		LocalGrads: []float64{1, 1},
	}
}

// Sub returns a new node holding a.Data - b.Data. The local gradient is 1
// with respect to a and -1 with respect to b.
func Sub(a, b *Value) *Value {
	return &Value{
		Data:       a.Data - b.Data,
		Children:   []*Value{a, b},
		LocalGrads: []float64{1, -1},
	}
}

// Mul returns a new node holding a.Data * b.Data. Each operand's local
// gradient is the other operand's forward value.
func Mul(a, b *Value) *Value {
	return &Value{
		Data:       a.Data * b.Data,
		Children:   []*Value{a, b},
		LocalGrads: []float64{b.Data, a.Data},
	}
}

// Pow returns a new node holding a.Data raised to the constant power p.
// The exponent is a plain float64, so no gradient flows to it; the local
// gradient with respect to a is p * a.Data^(p-1).
//
// For p == 0 the forward value is the constant 1 (math.Pow(x, 0) is 1 for
// every x), so the gradient is set to 0 directly. Evaluating the formula
// there would give 0 * math.Pow(0, -1) = 0 * +Inf = NaN when a.Data is 0.
func Pow(a *Value, p float64) *Value {
	grad := 0.0
	if p != 0 {
		grad = p * math.Pow(a.Data, p-1)
	}
	return &Value{
		Data:       math.Pow(a.Data, p),
		Children:   []*Value{a},
		LocalGrads: []float64{grad},
	}
}

// Div returns a / b, built as a multiplied by b raised to the power -1 so
// that gradients come from Mul and Pow.
func Div(a, b *Value) *Value {
	reciprocal := Pow(b, -1)
	return Mul(a, reciprocal)
}

// Neg returns -a, built as a multiplied by a fresh constant leaf -1.
func Neg(a *Value) *Value {
	minusOne := V(-1)
	return Mul(a, minusOne)
}

// Log returns a new node holding the natural logarithm of a.Data; the local
// gradient is 1 / a.Data. No domain check is performed on a.Data.
func Log(a *Value) *Value {
	return &Value{
		Data:       math.Log(a.Data),
		Children:   []*Value{a},
		LocalGrads: []float64{1 / a.Data},
	}
}

// Exp returns a new node holding e raised to a.Data; the local gradient
// equals the forward value.
func Exp(a *Value) *Value {
	e := math.Exp(a.Data)
	return &Value{
		Data:       e,
		Children:   []*Value{a},
		LocalGrads: []float64{e},
	}
}

// ReLU passes a.Data through with local gradient 1 when it is strictly
// positive; otherwise both the forward value and the local gradient are 0.
func ReLU(a *Value) *Value {
	if a.Data > 0 {
		return &Value{Data: a.Data, Children: []*Value{a}, LocalGrads: []float64{1}}
	}
	return &Value{Data: 0, Children: []*Value{a}, LocalGrads: []float64{0}}
}

// Backward runs reverse-mode differentiation from out. It collects every
// node reachable from out in post-order (operands before the node that uses
// them), resets all of their Grad fields to zero, seeds out.Grad with 1, and
// then walks the order backwards, adding LocalGrads[j] * Grad into each
// operand. Because of the reset, gradients do not accumulate across calls.
func Backward(out *Value) {
	seen := map[*Value]bool{}
	var order []*Value

	var visit func(node *Value)
	visit = func(node *Value) {
		if seen[node] {
			return
		}
		seen[node] = true
		for _, operand := range node.Children {
			visit(operand)
		}
		order = append(order, node)
	}
	visit(out)

	for _, node := range order {
		node.Grad = 0
	}
	out.Grad = 1

	// Reverse post-order: a node's gradient is complete before it is
	// propagated to its operands.
	for idx := len(order) - 1; idx >= 0; idx-- {
		node := order[idx]
		for j := range node.Children {
			node.Children[j].Grad += node.LocalGrads[j] * node.Grad
		}
	}
}

// linear computes a matrix-vector product: element i of the result is the
// sum over j of x[j] * w[i][j]. There is no bias term. Every row of w must
// have at least len(x) entries.
func linear(x []*Value, w [][]*Value) []*Value {
	result := make([]*Value, len(w))
	for i, row := range w {
		acc := V(0)
		for j, xj := range x {
			acc = Add(acc, Mul(xj, row[j]))
		}
		result[i] = acc
	}
	return result
}

// softmax builds differentiable softmax outputs for logits. The largest
// logit is subtracted as a plain constant before exponentiation, so it only
// stabilises the numbers and takes no part in the gradient.
func softmax(logits []*Value) []*Value {
	peak := -math.MaxFloat64
	for i := range logits {
		if logits[i].Data > peak {
			peak = logits[i].Data
		}
	}

	n := len(logits)
	shifted := make([]*Value, n)
	total := V(0)
	for i := range logits {
		e := Exp(Sub(logits[i], V(peak)))
		shifted[i] = e
		total = Add(total, e)
	}

	// Normalise by multiplying with a single shared reciprocal node.
	norm := Div(V(1), total)
	probs := make([]*Value, n)
	for i, e := range shifted {
		probs[i] = Mul(e, norm)
	}
	return probs
}

// rmsnorm scales every element of x by 1 / sqrt(mean(x^2) + 1e-6). There is
// no learned gain and no mean subtraction.
func rmsnorm(x []*Value) []*Value {
	sumSq := V(0)
	for i := range x {
		sumSq = Add(sumSq, Pow(x[i], 2))
	}
	meanSq := Mul(V(1/float64(len(x))), sumSq)

	// The epsilon keeps the square root away from zero.
	rms := Pow(Add(meanSq, V(1e-6)), 0.5)
	invStd := Div(V(1), rms)

	normed := make([]*Value, len(x))
	for i := range x {
		normed[i] = Mul(x[i], invStd)
	}
	return normed
}

// TrainingCheckpoint is the JSON document decoded by LoadCheckpoint.
//
// NOTE(review): Version, CreatedAt, Tokenization, BPEEncoding, BPETokenIDs
// and Vocab are not read or validated anywhere in this file; they look like
// format and tokenizer metadata, to be confirmed against the code that
// writes checkpoints.
type TrainingCheckpoint struct {
	Version      int                      `json:"version"`
	CreatedAt    string                   `json:"created_at"`
	// Config carries the model dimensions; LoadCheckpoint rejects
	// checkpoints whose Config fails its range checks.
	Config       TrainingCheckpointConfig `json:"config"`
	Tokenization string                   `json:"tokenization,omitempty"`
	BPEEncoding  string                   `json:"bpe_encoding,omitempty"`
	BPETokenIDs  []int                    `json:"bpe_token_ids,omitempty"`
	Vocab        []string                 `json:"vocab,omitempty"`
	// State maps a name to a matrix of plain floats; it has the type
	// ImportState accepts for conversion into autograd Values.
	State        map[string][][]float64   `json:"state"`
}

// TrainingCheckpointConfig holds the model dimensions stored in a
// checkpoint. LoadCheckpoint requires NLayer, NEmbd and NHead to be at least
// 1, BlockSize to be at least 2, and NEmbd to be divisible by NHead.
//
// NOTE(review): presumably NLayer, NEmbd and NHead are the values passed to
// BuildGPT, and BlockSize is the maximum sequence length; confirm against
// the caller.
type TrainingCheckpointConfig struct {
	NLayer    int `json:"n_layer"`
	NEmbd     int `json:"n_embd"`
	NHead     int `json:"n_head"`
	BlockSize int `json:"block_size"`
}

// ImportState converts a checkpoint's float matrices into matrices of fresh
// leaf Values, keeping the same names and shapes. The input is not modified.
func ImportState(src map[string][][]float64) map[string][][]*Value {
	state := make(map[string][][]*Value, len(src))
	for key, matrix := range src {
		converted := make([][]*Value, 0, len(matrix))
		for _, row := range matrix {
			wrapped := make([]*Value, len(row))
			for col := range row {
				wrapped[col] = V(row[col])
			}
			converted = append(converted, wrapped)
		}
		state[key] = converted
	}
	return state
}

// LoadCheckpoint reads the JSON checkpoint at path, decodes it, and
// validates its Config.
//
// It returns an error when the file cannot be read, when the contents are
// not valid JSON for TrainingCheckpoint, when NLayer, NEmbd or NHead is
// below 1, when BlockSize is below 2, or when NEmbd is not divisible by
// NHead. Read and decode failures are wrapped with %w, so the underlying
// error stays reachable through errors.Is and errors.As. On any error the
// returned checkpoint is the zero value.
func LoadCheckpoint(path string) (TrainingCheckpoint, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		return TrainingCheckpoint{}, fmt.Errorf("reading checkpoint %q: %w", path, err)
	}
	var ckpt TrainingCheckpoint
	if err := json.Unmarshal(b, &ckpt); err != nil {
		return TrainingCheckpoint{}, fmt.Errorf("decoding checkpoint %q: %w", path, err)
	}
	cfg := ckpt.Config
	if cfg.NLayer < 1 || cfg.NEmbd < 1 || cfg.NHead < 1 || cfg.BlockSize < 2 {
		return TrainingCheckpoint{}, fmt.Errorf("invalid checkpoint config")
	}
	// Attention heads are sliced out of the embedding in equal parts, so the
	// embedding width must split evenly across heads.
	if cfg.NEmbd%cfg.NHead != 0 {
		return TrainingCheckpoint{}, fmt.Errorf("invalid checkpoint: n_embd must be divisible by n_head")
	}
	return ckpt, nil
}

// BuildGPT returns a closure that runs the model forward for one token at
// one position and returns the output of the final "lm_head" projection.
//
// state supplies the weight matrices, looked up by these keys: "wte", "wpe",
// "lm_head", and for every layer index li "layer<li>.attn_wq", ".attn_wk",
// ".attn_wv", ".attn_wo", ".mlp_fc1" and ".mlp_fc2". nLayer is the number of
// layers iterated; nEmbd/nHead is the per-head slice width.
//
// keys and values act as a per-layer cache: on every call the closure
// appends this position's key and value vectors to keys[li] and values[li]
// (writing through to the caller's slices) and attends over everything
// stored there so far. Both must therefore have at least nLayer entries.
// No bounds checks are made on tokenID, posID or the state shapes.
func BuildGPT(state map[string][][]*Value, nLayer, nEmbd, nHead int) func(tokenID, posID int, keys, values [][][]*Value) []*Value {
	headDim := nEmbd / nHead
	return func(tokenID, posID int, keys, values [][][]*Value) []*Value {
		// Input embedding: row tokenID of "wte" plus row posID of "wpe",
		// followed by an initial normalisation.
		tokEmb := state["wte"][tokenID]
		posEmb := state["wpe"][posID]
		x := make([]*Value, len(tokEmb))
		for i := range tokEmb {
			x[i] = Add(tokEmb[i], posEmb[i])
		}
		x = rmsnorm(x)

		for li := 0; li < nLayer; li++ {
			// Attention sub-block: normalise, project to q/k/v, and keep the
			// un-normalised input for the residual connection.
			xResidual := x
			x = rmsnorm(x)
			q := linear(x, state[fmt.Sprintf("layer%d.attn_wq", li)])
			k := linear(x, state[fmt.Sprintf("layer%d.attn_wk", li)])
			v := linear(x, state[fmt.Sprintf("layer%d.attn_wv", li)])
			// Extend the cache first so the current position attends to itself.
			keys[li] = append(keys[li], k)
			values[li] = append(values[li], v)

			xAttn := make([]*Value, 0, nEmbd)
			for h := 0; h < nHead; h++ {
				// Each head works on its own contiguous headDim-wide slice.
				hs := h * headDim
				qH := q[hs : hs+headDim]

				kH := make([][]*Value, len(keys[li]))
				vH := make([][]*Value, len(values[li]))
				for t := 0; t < len(keys[li]); t++ {
					kH[t] = keys[li][t][hs : hs+headDim]
					vH[t] = values[li][t][hs : hs+headDim]
				}

				// Scaled dot-product score against every cached position.
				attnLogits := make([]*Value, len(kH))
				for t := 0; t < len(kH); t++ {
					score := V(0)
					for j := 0; j < headDim; j++ {
						score = Add(score, Mul(qH[j], kH[t][j]))
					}
					attnLogits[t] = Div(score, V(math.Sqrt(float64(headDim))))
				}
				attnWeights := softmax(attnLogits)

				// Head output: attention-weighted sum of the cached values.
				headOut := make([]*Value, headDim)
				for j := 0; j < headDim; j++ {
					s := V(0)
					for t := 0; t < len(vH); t++ {
						s = Add(s, Mul(attnWeights[t], vH[t][j]))
					}
					headOut[j] = s
				}
				// Heads are concatenated in order before the output projection.
				xAttn = append(xAttn, headOut...)
			}

			x = linear(xAttn, state[fmt.Sprintf("layer%d.attn_wo", li)])
			for i := range x {
				x[i] = Add(x[i], xResidual[i])
			}

			// MLP sub-block: normalise, fc1, ReLU, fc2, then add the residual.
			xResidual = x
			x = rmsnorm(x)
			x = linear(x, state[fmt.Sprintf("layer%d.mlp_fc1", li)])
			for i := range x {
				x[i] = ReLU(x[i])
			}
			x = linear(x, state[fmt.Sprintf("layer%d.mlp_fc2", li)])
			for i := range x {
				x[i] = Add(x[i], xResidual[i])
			}
		}

		// One output Value per row of "lm_head".
		return linear(x, state["lm_head"])
	}
}

// Sampling functions

// SampleWeighted draws an index with probability proportional to its weight.
// The weights need not sum to 1.
//
// Only strictly positive weights can be selected. Zero, negative and NaN
// entries are skipped, so an index zeroed out by ApplyTopK or ApplyTopP is
// never returned and one NaN entry does not poison the whole draw. Exactly
// one value is consumed from the global math/rand source per call.
//
// When no weight is positive there is nothing to sample: the result is 0 for
// a non-empty slice and -1 for an empty one.
func SampleWeighted(weights []float64) int {
	total := 0.0
	for _, w := range weights {
		if w > 0 {
			total += w
		}
	}

	r := rand.Float64() * total
	running := 0.0
	last := -1
	for i, w := range weights {
		if !(w > 0) { // also true for NaN
			continue
		}
		running += w
		last = i
		// Strict comparison: r == 0 must not match an empty prefix.
		if r < running {
			return i
		}
	}

	// Rounding can leave r equal to the total; the last positive entry owns
	// that upper edge.
	if last >= 0 {
		return last
	}
	if len(weights) == 0 {
		return -1
	}
	return 0
}

// SoftmaxFloat returns the softmax of logits as plain floats. The largest
// logit is subtracted before exponentiation for numerical stability. An
// empty input yields an empty result.
func SoftmaxFloat(logits []float64) []float64 {
	peak := -math.MaxFloat64
	for i := range logits {
		if logits[i] > peak {
			peak = logits[i]
		}
	}

	probs := make([]float64, len(logits))
	var total float64
	for i := range logits {
		e := math.Exp(logits[i] - peak)
		probs[i] = e
		total += e
	}
	for i, e := range probs {
		probs[i] = e / total
	}
	return probs
}

// NextTokenWeights converts model logits into sampling weights suitable for
// SampleWeighted.
//
// Steps, in order:
//  1. Repetition penalty: for every index i with recent[i] true, a
//     non-negative logit is divided by repetitionPenalty and a negative one
//     is multiplied by it. A non-positive repetitionPenalty disables this
//     step instead of dividing by zero or flipping signs.
//  2. Temperature: logits are divided by temperature. A non-positive
//     temperature selects greedy decoding: the result is 1 at the index of
//     the largest penalised logit and 0 elsewhere, and steps 3 and 4 are
//     skipped because they cannot change a one-hot vector. Without this
//     guard a zero temperature produces all-NaN weights.
//  3. SoftmaxFloat over the scaled logits.
//  4. ApplyTopK when topK > 0, then ApplyTopP when 0 < topP < 1.
//
// The returned weights are not renormalised after filtering. logits is only
// read; recent may be nil.
func NextTokenWeights(logits []*Value, temperature float64, topK int, topP float64, recent map[int]bool, repetitionPenalty float64) []float64 {
	penalize := repetitionPenalty > 0

	l := make([]float64, len(logits))
	for i, v := range logits {
		l[i] = v.Data
		if penalize && recent[i] {
			// Both branches push the logit towards "less likely".
			if l[i] >= 0 {
				l[i] /= repetitionPenalty
			} else {
				l[i] *= repetitionPenalty
			}
		}
	}

	if temperature <= 0 {
		w := make([]float64, len(l))
		if len(l) > 0 {
			best := 0
			for i, x := range l {
				if x > l[best] {
					best = i
				}
			}
			w[best] = 1
		}
		return w
	}

	for i := range l {
		l[i] /= temperature
	}
	w := SoftmaxFloat(l)
	if topK > 0 {
		w = ApplyTopK(w, topK)
	}
	if topP > 0 && topP < 1.0 {
		w = ApplyTopP(w, topP)
	}
	return w
}

// ApplyTopK keeps the k largest weights at their original positions and
// zeroes every other entry. The kept weights are not renormalised. When k is
// at least len(weights) the input slice itself is returned; otherwise a new
// slice is allocated and the input is left untouched.
func ApplyTopK(weights []float64, k int) []float64 {
	n := len(weights)
	if k >= n {
		return weights
	}

	type entry struct {
		idx    int
		weight float64
	}
	ranked := make([]entry, 0, n)
	for idx, weight := range weights {
		ranked = append(ranked, entry{idx: idx, weight: weight})
	}
	// Heaviest first.
	sort.Slice(ranked, func(a, b int) bool { return ranked[b].weight < ranked[a].weight })

	kept := make([]float64, n)
	for rank, e := range ranked {
		if rank >= k {
			break
		}
		kept[e.idx] = e.weight
	}
	return kept
}

// ApplyTopP keeps the heaviest weights, taken in descending order, until
// their running sum reaches p; every other entry is zeroed. The entry that
// crosses the threshold is kept, so at least one weight survives for a
// non-empty input. The threshold is compared against the raw running sum and
// the kept weights are not renormalised. A new slice is always returned.
func ApplyTopP(weights []float64, p float64) []float64 {
	type entry struct {
		idx    int
		weight float64
	}
	ranked := make([]entry, 0, len(weights))
	for idx, weight := range weights {
		ranked = append(ranked, entry{idx: idx, weight: weight})
	}
	// Heaviest first.
	sort.Slice(ranked, func(a, b int) bool { return ranked[b].weight < ranked[a].weight })

	kept := make([]float64, len(weights))
	cumulative := 0.0
	for _, e := range ranked {
		cumulative += e.weight
		kept[e.idx] = e.weight
		if cumulative >= p {
			break
		}
	}
	return kept
}