// Uploaded via huggingface_hub (commit bc742a1, verified).
package model
import (
"encoding/json"
"fmt"
"math"
"math/rand"
"os"
"sort"
)
// Value represents a scalar for autograd. Each arithmetic helper
// (Add, Mul, ...) records its operands and the partial derivatives
// taken at construction time, so Backward can replay the chain rule.
type Value struct {
	Data       float64   // scalar produced by the forward pass
	Grad       float64   // accumulated gradient d(output)/d(this); filled in by Backward
	Children   []*Value  // operands this value was computed from (empty for leaves)
	LocalGrads []float64 // d(Data)/d(Children[i]), captured when the op was built
}
// V wraps a plain float64 in a leaf Value with no parents.
func V(x float64) *Value {
	leaf := Value{Data: x}
	return &leaf
}
// Add returns a+b; both operands get a unit local gradient.
func Add(a, b *Value) *Value {
	return &Value{
		Data:       a.Data + b.Data,
		Children:   []*Value{a, b},
		LocalGrads: []float64{1, 1},
	}
}
// Sub returns a-b; local gradients are +1 for a and -1 for b.
func Sub(a, b *Value) *Value {
	return &Value{
		Data:       a.Data - b.Data,
		Children:   []*Value{a, b},
		LocalGrads: []float64{1, -1},
	}
}
// Mul returns a*b; each operand's local gradient is the other operand.
func Mul(a, b *Value) *Value {
	return &Value{
		Data:       a.Data * b.Data,
		Children:   []*Value{a, b},
		LocalGrads: []float64{b.Data, a.Data},
	}
}
// Pow raises a to the constant power p; local gradient is p*a^(p-1).
func Pow(a *Value, p float64) *Value {
	return &Value{
		Data:       math.Pow(a.Data, p),
		Children:   []*Value{a},
		LocalGrads: []float64{p * math.Pow(a.Data, p-1)},
	}
}
// Div returns a/b, expressed as a * b^-1 so the chain rule flows
// through the existing Mul and Pow nodes.
func Div(a, b *Value) *Value {
	inv := Pow(b, -1)
	return Mul(a, inv)
}
// Neg returns -a via multiplication by the constant -1.
func Neg(a *Value) *Value {
	minusOne := V(-1)
	return Mul(a, minusOne)
}
// Log returns the natural logarithm of a; d/dx ln(x) = 1/x.
func Log(a *Value) *Value {
	return &Value{
		Data:       math.Log(a.Data),
		Children:   []*Value{a},
		LocalGrads: []float64{1 / a.Data},
	}
}
// Exp returns e^a. The derivative of exp(x) is exp(x) itself, so the
// forward result is computed once and reused as the local gradient
// (the original called math.Exp twice with the same argument).
func Exp(a *Value) *Value {
	e := math.Exp(a.Data)
	return &Value{Data: e, Children: []*Value{a}, LocalGrads: []float64{e}}
}
// ReLU clamps negatives to zero; the gradient is 1 on the strictly
// positive side and 0 elsewhere (including at exactly 0).
func ReLU(a *Value) *Value {
	out := &Value{Children: []*Value{a}, LocalGrads: []float64{0}}
	if a.Data > 0 {
		out.Data = a.Data
		out.LocalGrads[0] = 1
	}
	return out
}
// Backward runs reverse-mode automatic differentiation from out.
// It builds a post-order (topological) ordering of the graph, zeroes
// every reachable gradient, seeds out.Grad with 1, and propagates
// gradients to all ancestors via the chain rule.
func Backward(out *Value) {
	var order []*Value
	seen := make(map[*Value]bool)
	var visit func(*Value)
	visit = func(node *Value) {
		if seen[node] {
			return
		}
		seen[node] = true
		for _, child := range node.Children {
			visit(child)
		}
		order = append(order, node)
	}
	visit(out)
	// Reset stale gradients from any previous backward pass.
	for _, node := range order {
		node.Grad = 0
	}
	out.Grad = 1
	// Walk in reverse topological order so each node's Grad is final
	// before it is pushed to its children.
	for i := len(order) - 1; i >= 0; i-- {
		node := order[i]
		for j, child := range node.Children {
			child.Grad += node.LocalGrads[j] * node.Grad
		}
	}
}
// linear computes the matrix-vector product w*x on the autograd
// graph: out[i] = sum_j w[i][j] * x[j]. No bias term.
func linear(x []*Value, w [][]*Value) []*Value {
	out := make([]*Value, len(w))
	for row, weights := range w {
		acc := V(0)
		for col, xi := range x {
			acc = Add(acc, Mul(xi, weights[col]))
		}
		out[row] = acc
	}
	return out
}
// softmax maps logits to probabilities on the autograd graph.
// The maximum logit is subtracted before exponentiation for
// numerical stability; the shift cancels after normalization.
func softmax(logits []*Value) []*Value {
	shift := -math.MaxFloat64
	for _, l := range logits {
		shift = math.Max(shift, l.Data)
	}
	exps := make([]*Value, len(logits))
	total := V(0)
	for i, l := range logits {
		e := Exp(Sub(l, V(shift)))
		exps[i] = e
		total = Add(total, e)
	}
	norm := Div(V(1), total)
	probs := make([]*Value, len(logits))
	for i, e := range exps {
		probs[i] = Mul(e, norm)
	}
	return probs
}
// rmsnorm normalizes x by its root-mean-square magnitude
// (x / sqrt(mean(x^2) + 1e-6)), with no learned gain parameter.
func rmsnorm(x []*Value) []*Value {
	sumSq := V(0)
	for _, xi := range x {
		sumSq = Add(sumSq, Pow(xi, 2))
	}
	meanSq := Mul(V(1/float64(len(x))), sumSq)
	// epsilon keeps the denominator nonzero for all-zero input
	scale := Div(V(1), Pow(Add(meanSq, V(1e-6)), 0.5))
	normed := make([]*Value, len(x))
	for i, xi := range x {
		normed[i] = Mul(xi, scale)
	}
	return normed
}
// TrainingCheckpoint structs

// TrainingCheckpoint is the on-disk JSON representation of a trained
// model: format metadata, model hyperparameters, tokenizer data, and
// every weight matrix keyed by name.
type TrainingCheckpoint struct {
	Version      int                      `json:"version"`                // checkpoint format version
	CreatedAt    string                   `json:"created_at"`             // creation timestamp (string; format not enforced here)
	Config       TrainingCheckpointConfig `json:"config"`                 // model hyperparameters, validated by LoadCheckpoint
	Tokenization string                   `json:"tokenization,omitempty"` // tokenization scheme identifier — presumably selects char vs BPE; confirm against writer
	BPEEncoding  string                   `json:"bpe_encoding,omitempty"` // serialized BPE merge/encoding data (opaque here)
	BPETokenIDs  []int                    `json:"bpe_token_ids,omitempty"`
	Vocab        []string                 `json:"vocab,omitempty"` // token strings indexed by token ID
	State        map[string][][]float64   `json:"state"`           // weight matrices by name; convert with ImportState
}
// TrainingCheckpointConfig holds the model hyperparameters stored
// alongside the weights.
type TrainingCheckpointConfig struct {
	NLayer    int `json:"n_layer"`    // number of transformer layers
	NEmbd     int `json:"n_embd"`     // embedding width; must be divisible by NHead (checked in LoadCheckpoint)
	NHead     int `json:"n_head"`     // number of attention heads
	BlockSize int `json:"block_size"` // context length; must be >= 2 (checked in LoadCheckpoint)
}
// ImportState converts a checkpoint's raw float matrices into
// autograd Values so they can participate in the computation graph.
func ImportState(src map[string][][]float64) map[string][][]*Value {
	dst := make(map[string][][]*Value, len(src))
	for name, mat := range src {
		converted := make([][]*Value, len(mat))
		for r, row := range mat {
			vals := make([]*Value, len(row))
			for c, x := range row {
				vals[c] = V(x)
			}
			converted[r] = vals
		}
		dst[name] = converted
	}
	return dst
}
// LoadCheckpoint reads and decodes a TrainingCheckpoint from the
// JSON file at path, then validates its config: all hyperparameters
// must be positive, block size at least 2, and the embedding width
// divisible by the head count. I/O and decode errors are wrapped
// with context and the original cause (unwrap with errors.Is/As).
func LoadCheckpoint(path string) (TrainingCheckpoint, error) {
	b, err := os.ReadFile(path)
	if err != nil {
		return TrainingCheckpoint{}, fmt.Errorf("reading checkpoint %q: %w", path, err)
	}
	var ckpt TrainingCheckpoint
	if err := json.Unmarshal(b, &ckpt); err != nil {
		return TrainingCheckpoint{}, fmt.Errorf("decoding checkpoint %q: %w", path, err)
	}
	if ckpt.Config.NLayer < 1 || ckpt.Config.NEmbd < 1 || ckpt.Config.NHead < 1 || ckpt.Config.BlockSize < 2 {
		return TrainingCheckpoint{}, fmt.Errorf("invalid checkpoint config")
	}
	if ckpt.Config.NEmbd%ckpt.Config.NHead != 0 {
		return TrainingCheckpoint{}, fmt.Errorf("invalid checkpoint: n_embd must be divisible by n_head")
	}
	return ckpt, nil
}
// BuildGPT returns a forward-pass closure over the given weights.
// The closure embeds one token at one position, runs it through
// nLayer pre-norm transformer blocks using a KV cache, and returns
// the lm_head logits for the next token.
//
// keys/values are per-layer caches of previously computed key/value
// vectors (one [][]*Value slice per layer). The closure appends this
// step's k/v before attending, so attention covers positions
// 0..posID inclusive — the caller owns and must reset the caches.
func BuildGPT(state map[string][][]*Value, nLayer, nEmbd, nHead int) func(tokenID, posID int, keys, values [][][]*Value) []*Value {
	headDim := nEmbd / nHead // per-head width; LoadCheckpoint guarantees divisibility
	return func(tokenID, posID int, keys, values [][][]*Value) []*Value {
		// Token + positional embedding lookup, summed elementwise.
		tokEmb := state["wte"][tokenID]
		posEmb := state["wpe"][posID]
		x := make([]*Value, len(tokEmb))
		for i := range tokEmb {
			x[i] = Add(tokEmb[i], posEmb[i])
		}
		x = rmsnorm(x)
		for li := 0; li < nLayer; li++ {
			// --- Attention sublayer (norm before attention, residual after) ---
			xResidual := x
			x = rmsnorm(x)
			q := linear(x, state[fmt.Sprintf("layer%d.attn_wq", li)])
			k := linear(x, state[fmt.Sprintf("layer%d.attn_wk", li)])
			v := linear(x, state[fmt.Sprintf("layer%d.attn_wv", li)])
			// Extend this layer's KV cache with the current position.
			keys[li] = append(keys[li], k)
			values[li] = append(values[li], v)
			xAttn := make([]*Value, 0, nEmbd)
			for h := 0; h < nHead; h++ {
				hs := h * headDim // offset of head h's slice within the embedding
				qH := q[hs : hs+headDim]
				kH := make([][]*Value, len(keys[li]))
				vH := make([][]*Value, len(values[li]))
				for t := 0; t < len(keys[li]); t++ {
					kH[t] = keys[li][t][hs : hs+headDim]
					vH[t] = values[li][t][hs : hs+headDim]
				}
				// Scaled dot-product score against every cached position.
				attnLogits := make([]*Value, len(kH))
				for t := 0; t < len(kH); t++ {
					score := V(0)
					for j := 0; j < headDim; j++ {
						score = Add(score, Mul(qH[j], kH[t][j]))
					}
					attnLogits[t] = Div(score, V(math.Sqrt(float64(headDim))))
				}
				attnWeights := softmax(attnLogits)
				// Attention-weighted sum of the cached value vectors.
				headOut := make([]*Value, headDim)
				for j := 0; j < headDim; j++ {
					s := V(0)
					for t := 0; t < len(vH); t++ {
						s = Add(s, Mul(attnWeights[t], vH[t][j]))
					}
					headOut[j] = s
				}
				xAttn = append(xAttn, headOut...)
			}
			// Output projection of concatenated heads + residual connection.
			x = linear(xAttn, state[fmt.Sprintf("layer%d.attn_wo", li)])
			for i := range x {
				x[i] = Add(x[i], xResidual[i])
			}
			// --- MLP sublayer (norm, fc1, ReLU, fc2, residual) ---
			xResidual = x
			x = rmsnorm(x)
			x = linear(x, state[fmt.Sprintf("layer%d.mlp_fc1", li)])
			for i := range x {
				x[i] = ReLU(x[i])
			}
			x = linear(x, state[fmt.Sprintf("layer%d.mlp_fc2", li)])
			for i := range x {
				x[i] = Add(x[i], xResidual[i])
			}
		}
		// Project the final hidden state to vocabulary logits.
		return linear(x, state["lm_head"])
	}
}
// Sampling functions

// SampleWeighted draws a random index with probability proportional
// to its weight. Falls back to the last index if rounding prevents
// the cumulative scan from triggering.
func SampleWeighted(weights []float64) int {
	var total float64
	for _, w := range weights {
		total += w
	}
	target := rand.Float64() * total
	var cumulative float64
	for i, w := range weights {
		cumulative += w
		if target <= cumulative {
			return i
		}
	}
	return len(weights) - 1
}
// SoftmaxFloat converts raw float logits into a probability
// distribution, shifting by the maximum logit first so the
// exponentials cannot overflow.
func SoftmaxFloat(logits []float64) []float64 {
	peak := -math.MaxFloat64
	for _, l := range logits {
		peak = math.Max(peak, l)
	}
	probs := make([]float64, len(logits))
	var total float64
	for i, l := range logits {
		e := math.Exp(l - peak)
		probs[i] = e
		total += e
	}
	for i := range probs {
		probs[i] /= total
	}
	return probs
}
// NextTokenWeights turns model logits into sampling weights:
// a repetition penalty is applied to recently seen token IDs,
// logits are scaled by temperature, softmaxed, and then optionally
// filtered by top-k and nucleus (top-p) truncation.
func NextTokenWeights(logits []*Value, temperature float64, topK int, topP float64, recent map[int]bool, repetitionPenalty float64) []float64 {
	scaled := make([]float64, len(logits))
	for i, v := range logits {
		x := v.Data
		if recent[i] {
			// Discourage repeats: shrink positive logits, push
			// negative ones further down.
			if x >= 0 {
				x /= repetitionPenalty
			} else {
				x *= repetitionPenalty
			}
		}
		scaled[i] = x / temperature
	}
	w := SoftmaxFloat(scaled)
	if topK > 0 {
		w = ApplyTopK(w, topK)
	}
	if topP > 0 && topP < 1.0 {
		w = ApplyTopP(w, topP)
	}
	return w
}
// ApplyTopK zeroes out all but the k largest weights, keeping the
// survivors at their original positions and magnitudes. The result
// is not renormalized. If k covers the whole slice, the input is
// returned unchanged.
func ApplyTopK(weights []float64, k int) []float64 {
	if k >= len(weights) {
		return weights
	}
	order := make([]int, len(weights))
	for i := range order {
		order[i] = i
	}
	// Rank indices by descending weight.
	sort.Slice(order, func(a, b int) bool { return weights[order[a]] > weights[order[b]] })
	filtered := make([]float64, len(weights))
	for i := 0; i < k; i++ {
		filtered[order[i]] = weights[order[i]]
	}
	return filtered
}
// ApplyTopP implements nucleus sampling: keep the smallest set of
// highest weights whose cumulative mass reaches p, zeroing out the
// rest. The result is not renormalized.
func ApplyTopP(weights []float64, p float64) []float64 {
	order := make([]int, len(weights))
	for i := range order {
		order[i] = i
	}
	// Rank indices by descending weight.
	sort.Slice(order, func(a, b int) bool { return weights[order[a]] > weights[order[b]] })
	kept := make([]float64, len(weights))
	var mass float64
	for _, idx := range order {
		mass += weights[idx]
		kept[idx] = weights[idx]
		if mass >= p {
			break
		}
	}
	return kept
}