package model

import (
	"encoding/json"
	"fmt"
	"math"
	"math/rand"
	"os"
	"sort"
)
// Value represents a scalar node in the autograd graph. Each op records its
// inputs (Children) and the local derivative of its output with respect to
// each input (LocalGrads), so Backward can apply the chain rule.
type Value struct {
	Data       float64   // scalar value
	Grad       float64   // d(output)/d(this), filled in by Backward
	Children   []*Value  // inputs to the op that produced this Value
	LocalGrads []float64 // d(this)/d(child) for each child, in order
}

// V wraps a float64 in a leaf Value with no children.
func V(x float64) *Value {
	return &Value{Data: x}
}
// Add returns a + b; the local gradient w.r.t. each input is 1.
func Add(a, b *Value) *Value {
	return &Value{Data: a.Data + b.Data, Children: []*Value{a, b}, LocalGrads: []float64{1, 1}}
}

// Sub returns a - b.
func Sub(a, b *Value) *Value {
	return &Value{Data: a.Data - b.Data, Children: []*Value{a, b}, LocalGrads: []float64{1, -1}}
}

// Mul returns a * b; the local gradient w.r.t. each input is the other input.
func Mul(a, b *Value) *Value {
	return &Value{Data: a.Data * b.Data, Children: []*Value{a, b}, LocalGrads: []float64{b.Data, a.Data}}
}

// Pow returns a^p for a constant exponent p; d(a^p)/da = p*a^(p-1).
func Pow(a *Value, p float64) *Value {
	return &Value{Data: math.Pow(a.Data, p), Children: []*Value{a}, LocalGrads: []float64{p * math.Pow(a.Data, p-1)}}
}

// Div returns a / b, expressed as a * b^-1 so it reuses the Mul and Pow gradients.
func Div(a, b *Value) *Value {
	return Mul(a, Pow(b, -1))
}

// Neg returns -a.
func Neg(a *Value) *Value {
	return Mul(a, V(-1))
}

// Log returns ln(a); d(ln a)/da = 1/a.
func Log(a *Value) *Value {
	return &Value{Data: math.Log(a.Data), Children: []*Value{a}, LocalGrads: []float64{1 / a.Data}}
}

// Exp returns e^a, which is its own derivative.
func Exp(a *Value) *Value {
	return &Value{Data: math.Exp(a.Data), Children: []*Value{a}, LocalGrads: []float64{math.Exp(a.Data)}}
}

// ReLU returns max(0, a), with gradient 1 where a > 0 and 0 elsewhere.
func ReLU(a *Value) *Value {
	val, grad := 0.0, 0.0
	if a.Data > 0 {
		val, grad = a.Data, 1
	}
	return &Value{Data: val, Children: []*Value{a}, LocalGrads: []float64{grad}}
}
// Backward computes d(out)/dv for every Value v reachable from out. It builds
// a topological order of the graph, zeroes any stale gradients, seeds
// out.Grad = 1, then propagates gradients in reverse order via the chain rule.
func Backward(out *Value) {
	topo := make([]*Value, 0)
	visited := make(map[*Value]bool)
	var buildTopo func(*Value)
	buildTopo = func(v *Value) {
		if !visited[v] {
			visited[v] = true
			for _, child := range v.Children {
				buildTopo(child)
			}
			topo = append(topo, v)
		}
	}
	buildTopo(out)
	for _, v := range topo {
		v.Grad = 0
	}
	out.Grad = 1
	for i := len(topo) - 1; i >= 0; i-- {
		v := topo[i]
		for j, child := range v.Children {
			child.Grad += v.LocalGrads[j] * v.Grad
		}
	}
}
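
// A quick sanity check of the autograd engine (illustrative only; the function
// name is ours, not part of the original model). For f = (a*b + c)^2, the
// chain rule gives df/da = 2*(a*b + c)*b.
func exampleBackward() {
	a, b, c := V(2), V(3), V(1)
	f := Pow(Add(Mul(a, b), c), 2) // f = (2*3 + 1)^2 = 49
	Backward(f)
	fmt.Println(f.Data, a.Grad) // 49, 2*7*3 = 42
}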
// linear computes the matrix-vector product w*x (no bias term):
// out[i] = sum_j w[i][j] * x[j].
func linear(x []*Value, w [][]*Value) []*Value {
	out := make([]*Value, len(w))
	for i := range w {
		s := V(0)
		for j := range x {
			s = Add(s, Mul(x[j], w[i][j]))
		}
		out[i] = s
	}
	return out
}
// softmax converts logits to a probability distribution. The max logit is
// subtracted before exponentiating for numerical stability.
func softmax(logits []*Value) []*Value {
	maxVal := -math.MaxFloat64
	for _, l := range logits {
		if l.Data > maxVal {
			maxVal = l.Data
		}
	}
	exps := make([]*Value, len(logits))
	sumExp := V(0)
	for i, l := range logits {
		exps[i] = Exp(Sub(l, V(maxVal)))
		sumExp = Add(sumExp, exps[i])
	}
	out := make([]*Value, len(logits))
	invSum := Div(V(1), sumExp)
	for i := range exps {
		out[i] = Mul(exps[i], invSum)
	}
	return out
}
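
// Softmax is invariant to shifting every logit by a constant, which is why
// subtracting the max above changes nothing mathematically; without it, Exp
// would overflow on large logits. A minimal check (function name is ours):
func exampleSoftmaxStability() {
	p := softmax([]*Value{V(1000), V(1001)})        // naive exp(1000) is +Inf
	fmt.Printf("%.4f %.4f\n", p[0].Data, p[1].Data) // ~0.2689 0.7311
}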
// rmsnorm scales each component of x by the reciprocal of the vector's root
// mean square, with a 1e-6 epsilon guarding against division by zero.
func rmsnorm(x []*Value) []*Value {
	meanSq := V(0)
	for _, v := range x {
		meanSq = Add(meanSq, Pow(v, 2))
	}
	meanSq = Mul(V(1/float64(len(x))), meanSq)
	invStd := Div(V(1), Pow(Add(meanSq, V(1e-6)), 0.5))
	out := make([]*Value, len(x))
	for i, v := range x {
		out[i] = Mul(v, invStd)
	}
	return out
}
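
// In formula form, rmsnorm(x)_i = x_i / sqrt(mean_j(x_j^2) + 1e-6). A worked
// check (illustrative only): for x = {3, 4}, mean(x^2) = 12.5, so the RMS is
// about 3.5355.
func exampleRMSNorm() {
	out := rmsnorm([]*Value{V(3), V(4)})
	fmt.Printf("%.4f %.4f\n", out[0].Data, out[1].Data) // ~0.8485 1.1314
}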
// TrainingCheckpoint is the on-disk JSON format for a trained model: the
// architecture config, optional tokenizer data, and a name -> matrix map of
// raw weights.
type TrainingCheckpoint struct {
	Version      int                      `json:"version"`
	CreatedAt    string                   `json:"created_at"`
	Config       TrainingCheckpointConfig `json:"config"`
	Tokenization string                   `json:"tokenization,omitempty"`
	BPEEncoding  string                   `json:"bpe_encoding,omitempty"`
	BPETokenIDs  []int                    `json:"bpe_token_ids,omitempty"`
	Vocab        []string                 `json:"vocab,omitempty"`
	State        map[string][][]float64   `json:"state"`
}

// TrainingCheckpointConfig holds the model hyperparameters.
type TrainingCheckpointConfig struct {
	NLayer    int `json:"n_layer"`
	NEmbd     int `json:"n_embd"`
	NHead     int `json:"n_head"`
	BlockSize int `json:"block_size"`
}
// ImportState converts the checkpoint's raw float matrices into autograd
// Values so the loaded weights can participate in the computation graph.
func ImportState(src map[string][][]float64) map[string][][]*Value {
	out := make(map[string][][]*Value, len(src))
	for name, mat := range src {
		rows := make([][]*Value, len(mat))
		for i, row := range mat {
			r := make([]*Value, len(row))
			for j, v := range row {
				r[j] = V(v)
			}
			rows[i] = r
		}
		out[name] = rows
	}
	return out
}
// LoadCheckpoint reads and parses a checkpoint file, rejecting configs that
// cannot describe a valid model (non-positive sizes, or an embedding width
// that does not divide evenly across attention heads).
func LoadCheckpoint(path string) (TrainingCheckpoint, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		return TrainingCheckpoint{}, err
	}
	var ckpt TrainingCheckpoint
	if err := json.Unmarshal(b, &ckpt); err != nil {
		return TrainingCheckpoint{}, err
	}
	if ckpt.Config.NLayer < 1 || ckpt.Config.NEmbd < 1 || ckpt.Config.NHead < 1 || ckpt.Config.BlockSize < 2 {
		return TrainingCheckpoint{}, fmt.Errorf("invalid checkpoint config")
	}
	if ckpt.Config.NEmbd%ckpt.Config.NHead != 0 {
		return TrainingCheckpoint{}, fmt.Errorf("invalid checkpoint: n_embd must be divisible by n_head")
	}
	return ckpt, nil
}
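
// Hedged usage sketch: load a checkpoint and convert its weights for the
// forward pass. The path "model.json" and the function name are placeholders,
// not values from the original code.
func exampleLoadCheckpoint() {
	ckpt, err := LoadCheckpoint("model.json")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		return
	}
	state := ImportState(ckpt.State)
	fmt.Println(ckpt.Config.NLayer, len(state))
}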
// BuildGPT returns a single-token forward pass over the loaded weights. Each
// call embeds one token, runs it through nLayer pre-norm transformer blocks,
// and returns next-token logits. keys and values are caller-owned per-layer
// KV caches; the new key/value vectors are appended so attention can see the
// whole history without recomputing it.
func BuildGPT(state map[string][][]*Value, nLayer, nEmbd, nHead int) func(tokenID, posID int, keys, values [][][]*Value) []*Value {
	headDim := nEmbd / nHead
	return func(tokenID, posID int, keys, values [][][]*Value) []*Value {
		// Token embedding plus learned position embedding.
		tokEmb := state["wte"][tokenID]
		posEmb := state["wpe"][posID]
		x := make([]*Value, len(tokEmb))
		for i := range tokEmb {
			x[i] = Add(tokEmb[i], posEmb[i])
		}
		x = rmsnorm(x)
		for li := 0; li < nLayer; li++ {
			// Attention sub-block (pre-norm).
			xResidual := x
			x = rmsnorm(x)
			q := linear(x, state[fmt.Sprintf("layer%d.attn_wq", li)])
			k := linear(x, state[fmt.Sprintf("layer%d.attn_wk", li)])
			v := linear(x, state[fmt.Sprintf("layer%d.attn_wv", li)])
			keys[li] = append(keys[li], k)
			values[li] = append(values[li], v)
			xAttn := make([]*Value, 0, nEmbd)
			for h := 0; h < nHead; h++ {
				// Slice out this head's span of the q/k/v vectors.
				hs := h * headDim
				qH := q[hs : hs+headDim]
				kH := make([][]*Value, len(keys[li]))
				vH := make([][]*Value, len(values[li]))
				for t := 0; t < len(keys[li]); t++ {
					kH[t] = keys[li][t][hs : hs+headDim]
					vH[t] = values[li][t][hs : hs+headDim]
				}
				// Scaled dot-product scores against every cached position.
				attnLogits := make([]*Value, len(kH))
				for t := 0; t < len(kH); t++ {
					score := V(0)
					for j := 0; j < headDim; j++ {
						score = Add(score, Mul(qH[j], kH[t][j]))
					}
					attnLogits[t] = Div(score, V(math.Sqrt(float64(headDim))))
				}
				attnWeights := softmax(attnLogits)
				// Attention-weighted sum of the cached values.
				headOut := make([]*Value, headDim)
				for j := 0; j < headDim; j++ {
					s := V(0)
					for t := 0; t < len(vH); t++ {
						s = Add(s, Mul(attnWeights[t], vH[t][j]))
					}
					headOut[j] = s
				}
				xAttn = append(xAttn, headOut...)
			}
			// Project the concatenated heads back to model width, add the residual.
			x = linear(xAttn, state[fmt.Sprintf("layer%d.attn_wo", li)])
			for i := range x {
				x[i] = Add(x[i], xResidual[i])
			}
			// MLP sub-block (pre-norm): fc1 -> ReLU -> fc2, plus residual.
			xResidual = x
			x = rmsnorm(x)
			x = linear(x, state[fmt.Sprintf("layer%d.mlp_fc1", li)])
			for i := range x {
				x[i] = ReLU(x[i])
			}
			x = linear(x, state[fmt.Sprintf("layer%d.mlp_fc2", li)])
			for i := range x {
				x[i] = Add(x[i], xResidual[i])
			}
		}
		// Final projection to vocabulary logits.
		return linear(x, state["lm_head"])
	}
}
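
// Hedged sketch of a single decoding step: build the forward closure, allocate
// one (initially empty) KV cache slot per layer, and score the token following
// token 0 at position 0. Token IDs and shapes depend on the checkpoint; this
// function is illustrative, not part of the original model.
func exampleForwardStep(ckpt TrainingCheckpoint) []*Value {
	cfg := ckpt.Config
	forward := BuildGPT(ImportState(ckpt.State), cfg.NLayer, cfg.NEmbd, cfg.NHead)
	keys := make([][][]*Value, cfg.NLayer)
	values := make([][][]*Value, cfg.NLayer)
	return forward(0, 0, keys, values) // logits over the vocabulary
}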
// Sampling functions.

// SampleWeighted draws a random index with probability proportional to its
// weight. Weights need not sum to 1: r is scaled by their total, and the
// final return guards against floating-point round-off in the running sum.
func SampleWeighted(weights []float64) int {
	sum := 0.0
	for _, w := range weights {
		sum += w
	}
	r := rand.Float64() * sum
	running := 0.0
	for i, w := range weights {
		running += w
		if r <= running {
			return i
		}
	}
	return len(weights) - 1
}
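
// Illustrative check (function name is ours): with unnormalized weights
// {1, 7, 2}, index 1 should come back roughly 70% of the time.
func exampleSampleWeighted() {
	counts := make([]int, 3)
	for i := 0; i < 10000; i++ {
		counts[SampleWeighted([]float64{1, 7, 2})]++
	}
	fmt.Println(counts) // roughly [1000 7000 2000]
}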
// SoftmaxFloat is a plain-float64 softmax with the same max-subtraction trick
// as softmax above, used on the sampling path where no gradients are needed.
func SoftmaxFloat(logits []float64) []float64 {
	maxLogit := -math.MaxFloat64
	for _, l := range logits {
		if l > maxLogit {
			maxLogit = l
		}
	}
	sum := 0.0
	out := make([]float64, len(logits))
	for i, l := range logits {
		out[i] = math.Exp(l - maxLogit)
		sum += out[i]
	}
	for i := range out {
		out[i] /= sum
	}
	return out
}
// NextTokenWeights converts logits into sampling weights. Recently generated
// tokens are discouraged by the repetition penalty (positive logits divided by
// it, negative ones multiplied), all logits are scaled by 1/temperature
// (sharper below 1, flatter above; temperature must be > 0), and the softmax
// output is optionally truncated by top-k and nucleus (top-p) filtering.
func NextTokenWeights(logits []*Value, temperature float64, topK int, topP float64, recent map[int]bool, repetitionPenalty float64) []float64 {
	l := make([]float64, len(logits))
	for i, v := range logits {
		l[i] = v.Data
		if recent[i] {
			if l[i] >= 0 {
				l[i] /= repetitionPenalty
			} else {
				l[i] *= repetitionPenalty
			}
		}
		l[i] /= temperature
	}
	w := SoftmaxFloat(l)
	if topK > 0 {
		w = ApplyTopK(w, topK)
	}
	if topP > 0 && topP < 1.0 {
		w = ApplyTopP(w, topP)
	}
	return w
}
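
// Hedged end-to-end sketch tying the pieces together: a small sampling loop
// over the forward closure. Every hyperparameter here (temperature 0.8,
// top-k 40, top-p 0.9, penalty 1.1, 20 steps) is an illustrative choice, not
// a value taken from the original code.
func exampleGenerate(ckpt TrainingCheckpoint, startToken int) []int {
	cfg := ckpt.Config
	forward := BuildGPT(ImportState(ckpt.State), cfg.NLayer, cfg.NEmbd, cfg.NHead)
	keys := make([][][]*Value, cfg.NLayer)
	values := make([][][]*Value, cfg.NLayer)
	tokens := []int{startToken}
	for pos := 0; pos < 20 && pos < cfg.BlockSize; pos++ {
		logits := forward(tokens[len(tokens)-1], pos, keys, values)
		recent := make(map[int]bool)
		for _, t := range tokens {
			recent[t] = true
		}
		w := NextTokenWeights(logits, 0.8, 40, 0.9, recent, 1.1)
		tokens = append(tokens, SampleWeighted(w))
	}
	return tokens
}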
// ApplyTopK zeroes all but the k largest weights. The survivors keep their
// raw values rather than being renormalized; SampleWeighted handles that
// implicitly by scaling against the remaining sum.
func ApplyTopK(weights []float64, k int) []float64 {
	if k >= len(weights) {
		return weights
	}
	type kv struct {
		i int
		w float64
	}
	arr := make([]kv, len(weights))
	for i, w := range weights {
		arr[i] = kv{i, w}
	}
	sort.Slice(arr, func(i, j int) bool { return arr[i].w > arr[j].w })
	out := make([]float64, len(weights))
	for i := 0; i < k; i++ {
		out[arr[i].i] = arr[i].w
	}
	return out
}
// ApplyTopP implements nucleus sampling: it keeps the smallest set of
// highest-probability weights whose cumulative mass reaches p and zeroes the
// rest. As with ApplyTopK, the survivors are not renormalized.
func ApplyTopP(weights []float64, p float64) []float64 {
	type kv struct {
		i int
		w float64
	}
	arr := make([]kv, len(weights))
	for i, w := range weights {
		arr[i] = kv{i, w}
	}
	sort.Slice(arr, func(i, j int) bool { return arr[i].w > arr[j].w })
	out := make([]float64, len(weights))
	sum := 0.0
	for i := 0; i < len(arr); i++ {
		sum += arr[i].w
		out[arr[i].i] = arr[i].w
		if sum >= p {
			break
		}
	}
	return out
}
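
// Worked example of the nucleus cut (illustrative only): with p = 0.8, the two
// largest weights (0.5 and 0.3) reach the 0.8 mass threshold, so everything
// else is zeroed.
func exampleTopP() {
	fmt.Println(ApplyTopP([]float64{0.5, 0.1, 0.3, 0.1}, 0.8)) // [0.5 0 0.3 0]
}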