GunaKoppula committed on
Commit
a856b9f
·
1 Parent(s): f173057

Delete bigram.py

Browse files
Files changed (1) hide show
  1. bigram.py +0 -122
bigram.py DELETED
@@ -1,122 +0,0 @@
1
- import torch
2
- import torch.nn as nn
3
- from torch.nn import functional as F
4
-
5
# ---- hyperparameters ------------------------------------------------------
# Batch geometry
batch_size = 32  # number of independent sequences processed in parallel
block_size = 8   # maximum context length used for predictions

# Optimisation
max_iters = 3000      # number of training-loop iterations
learning_rate = 1e-2  # learning rate passed to the optimizer

# Evaluation
eval_interval = 300  # losses are re-estimated whenever the step is a multiple of this
eval_iters = 200     # number of random batches averaged per split when estimating loss

# Prefer the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# ----------------------------------------------------------------------------

# Fixed seed so that runs are repeatable
torch.manual_seed(1337)
16
-
17
# Training corpus (tinyshakespeare). Download it next to this script first:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, in
# sorted order so that the character <-> id assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string *s*.

    Raises KeyError if *s* contains a character that is not in the corpus.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled out by the iterable of integer ids *l*.

    Raises KeyError if an id is not part of the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# Train and validation splits: the first 90% of the encoded corpus is used
# for training, the remainder for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]
35
-
36
- # data loading
37
def get_batch(split):
    """Draw a random mini-batch of input/target windows from one data split.

    ``split == 'train'`` samples from ``train_data``; any other value samples
    from ``val_data``. ``batch_size`` start offsets are drawn at random, and
    for each offset the input is the ``block_size`` tokens starting there
    while the target is the same window shifted one position to the right.

    Returns a pair ``(x, y)`` of stacked windows, both moved to ``device``.
    """
    source = train_data if split == 'train' else val_data
    # Upper bound leaves room for the target window, which ends one token
    # later than the input window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y
45
-
46
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with
    ``get_batch`` and their losses averaged. The model is put in eval mode
    for the duration of the measurement and switched back to train mode
    before returning. Gradient tracking is disabled by the decorator.

    Returns a dict mapping ``'train'`` and ``'val'`` to the mean loss
    (a scalar tensor) of that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[k] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages
59
-
60
- # super simple bigram model
61
class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape (vocab_size, vocab_size): looking up a token id
    yields the row of logits over the next token directly.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the next-token logits for token i
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        idx and targets are (B, T) integer tensors. Without targets the
        result is ``(logits, None)`` with logits of shape (B, T, C). With
        targets, the logits are flattened to (B*T, C) and returned together
        with their cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        # cross_entropy is applied to 2-D logits and 1-D targets, so fold
        # the batch and time dimensions together.
        batch, steps, channels = logits.shape
        flat_logits = logits.view(batch * steps, channels)
        flat_targets = targets.view(batch * steps)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Extend the (B, T) context idx by max_new_tokens sampled tokens.

        Each round runs the model on the sequence so far, turns the logits
        of the last time step into probabilities, samples one token per row
        and appends it. Returns the (B, T + max_new_tokens) sequence.
        """
        for _round in range(max_new_tokens):
            logits, _ = self(idx)
            # Only the final position predicts the token to be appended
            probs = F.softmax(logits[:, -1, :], dim=-1)  # (B, C)
            sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, sampled), dim=1)  # (B, T+1)
        return idx
97
-
98
# Build the model and place it on the selected device. ``m`` refers to the
# module returned by ``.to(device)`` and is used for sampling below.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop. The counter is called ``step`` so that it does not shadow
# the builtin ``iter``.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear stale gradients before backpropagating
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate 500 new tokens from the trained model, starting from a single
# token with id 0, and print the decoded text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))