LuisDarioHinojosa commited on
Commit
25c2e0c
·
1 Parent(s): ade2dee

initial commit

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ gpt_weights_trained_100_epochs.pth filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from timeit import default_timer as timer
4
+
5
+ # import the langauge model and the dataset
6
+ from assets.gpt_dataset_mk_II import GptDatasetMKII
7
+ from assets.gpt_architecture_mk_VI import BigramLanguageModelMKVI
8
+
9
+ # hardcode device with cpu
10
+ device = "cpu"
11
+
12
+ # model hyperparameters
13
+ BLOCK_SIZE = 256 # max sequence length for the context and target samples
14
+ EMBEDDING_DIMENTION = 384 # number of features that will be extracted from the tokens to create numeric representations of them
15
+ HEAD_SIZE = 32 # number of dimentions in the self attention mechanism
16
+ NUM_HEADS = 12 # number of heads that will be used to instance the multihead self attention heads.
17
+ DROPOUT_RATE = 0.2 # dropout rate for architecture
18
+ NUM_BLOCKS = 12 # number of encoder blocks that will be used
19
+
20
+ # instance the model and the dataset to make predictions
21
+ train_dataset = GptDatasetMKII("assets/training_data.txt",block_size = BLOCK_SIZE,tokenization_mode="shifted")
22
+
23
+ # instance the model
24
+ model = BigramLanguageModelMKVI(
25
+ vocab_size=train_dataset.vocab_size,
26
+ embedding_dimention=EMBEDDING_DIMENTION,
27
+ block_size = BLOCK_SIZE,
28
+ num_heads = NUM_HEADS,
29
+ head_dropout= DROPOUT_RATE,
30
+ device = device,
31
+ num_blocks = NUM_BLOCKS
32
+ ).to(device)
33
+
34
+ # load the state dictionary
35
+ model.load_state_dict(torch.load("gpt_weights_trained_100_epochs.pth", map_location=torch.device('cpu')))
36
+
37
+ # gradio function
38
+ def generate_output(length):
39
+ start_time = timer()
40
+ output_sequence = train_dataset.decode(model.generate(context=torch.zeros((1,1),dtype = torch.long).to(device),max_new_tokens=int(length))[0].tolist())
41
+ end_time = timer()
42
+ total_time = end_time - start_time
43
+ return output_sequence,total_time
44
+
45
+ # instance gradio applications
46
+ title = "Shakespeare Text Generation"
47
+ description = "Model that generates text in the style of the writter William Shakespeare."
48
+ article = "The model is based on the transformer architecture originally published in \"[Attention Is All You Need](https://arxiv.org/abs/1706.03762) \" paper. It was trained on a dataset cotaining all the plays from William Shakespeare, and was implemented on Pytorch from scratch by myself. It is still imperfect, but i will update it as i work on it. The purpose was for me to get acquainted with transformers and sequence models."
49
+
50
+ # instance interface
51
+ demo = gr.Interface(
52
+ fn = generate_output,
53
+ inputs = [gr.Number(value = 50,label = "Sequence Length",info = "Length of the sample sequence you wish to generate.")],
54
+ outputs = [gr.TextArea(lines = 5,label="Sequence Output"),gr.Number(label = "Execution Time (seconds)")],
55
+ title = title,
56
+ article = article,
57
+ description = description
58
+ )
59
+
60
+ # launch interface
61
+ demo.launch()
assets/gpt_architecture_mk_VI.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ torch.manual_seed(4444)
6
+
7
+ class FeedForward(nn.Module):
8
+ def __init__(self,embedding_dimention,dropout):
9
+ super().__init__()
10
+ self.embedding_dimention = embedding_dimention
11
+ self.dropout = dropout
12
+ self.net = nn.Sequential(
13
+ nn.Linear(self.embedding_dimention, 4 * self.embedding_dimention),
14
+ nn.ReLU(),
15
+ nn.Linear(4 * self.embedding_dimention, self.embedding_dimention),
16
+ nn.Dropout(self.dropout)
17
+ )
18
+ def forward(self,x):
19
+ return self.net(x)
20
+
21
+ # attention mechanism for a single head
22
+ class Head(nn.Module):
23
+ def __init__(self,head_size,block_size,embedding_dimention,dropout):
24
+ super().__init__()
25
+ # instance hyperparameters
26
+ self.head_size = head_size
27
+ self.block_size = block_size
28
+ self.embedding_dimention = embedding_dimention
29
+ self.dropout = dropout
30
+
31
+ # instance layers for single self attention head
32
+ self.key = nn.Linear(self.embedding_dimention, self.head_size, bias=False)
33
+ self.query = nn.Linear(self.embedding_dimention, self.head_size, bias=False)
34
+ self.value = nn.Linear(self.embedding_dimention, self.head_size, bias=False)
35
+ self.register_buffer('tril', torch.tril(torch.ones(self.block_size, self.block_size)))
36
+ self.dropout_layer = nn.Dropout(self.dropout)
37
+
38
+ def forward(self,x):
39
+ batch,timesteps,channels = x.shape
40
+ k = self.key(x)
41
+ q = self.query(x)
42
+
43
+ # compute the attention scores
44
+ wei = q @ k.transpose(-2,-1) * channels ** -0.5 # dot product normalization and normalize to prevent explosion
45
+ wei = wei.masked_fill(self.tril[:timesteps,:timesteps] == 0,float("-inf")) # only include the previous tokes to average
46
+ wei = F.softmax(wei,dim = -1 ) # normalize to 1
47
+ wei = self.dropout_layer(wei)
48
+ # add the value
49
+ v = self.value(x)
50
+ out = wei @ v
51
+
52
+ return out
53
+
54
+ # attention mechanism for multiple heads (may head layers placed in papalel)
55
+ class MulheadSelfAttention(nn.Module):
56
+ def __init__(self,num_heads,head_size,block_size,embedding_dimention,dropout):
57
+ super().__init__()
58
+ # instance hyperparameters
59
+ self.num_heads = num_heads
60
+ self.head_size = head_size
61
+ self.block_size = block_size
62
+ self.embedding_dimention = embedding_dimention
63
+ self.dropout = dropout
64
+ # instance
65
+ self.heads = nn.ModuleList([Head(self.head_size,self.block_size,self.embedding_dimention,self.dropout) for _ in range(self.num_heads)])
66
+ self.projection = nn.Linear(self.embedding_dimention,self.embedding_dimention)
67
+ self.dropout_layer = nn.Dropout(self.dropout)
68
+ def forward(self,x):
69
+ out = torch.cat([h(x) for h in self.heads],dim = -1)
70
+ out = self.dropout_layer(self.projection(x))
71
+ return x
72
+
73
+
74
+ class Block(nn.Module):
75
+ def __init__(self,embedding_dimention,num_heads,block_size,dropout):
76
+ super().__init__()
77
+ # instance parameters
78
+ self.num_heads = num_heads
79
+ self.embedding_dimention = embedding_dimention
80
+ self.head_size = self.embedding_dimention // self.num_heads
81
+ self.block_size = block_size
82
+ self.dropout = dropout
83
+ self.layer_norm_1 = nn.LayerNorm(self.embedding_dimention)
84
+ self.layer_norm_2 = nn.LayerNorm(self.embedding_dimention)
85
+
86
+ # layers
87
+ self.sa_heads = MulheadSelfAttention(self.num_heads,self.head_size,self.block_size,self.embedding_dimention,self.dropout)
88
+ self.feedfwrd = FeedForward(self.embedding_dimention,self.dropout)
89
+
90
+ def forward(self,x):
91
+ x = x + self.sa_heads(self.layer_norm_1(x))
92
+ x = x + self.feedfwrd(self.layer_norm_2(x))
93
+
94
+ return x
95
+
96
+ class BigramLanguageModelMKVI(nn.Module):
97
+ def __init__(self,vocab_size,embedding_dimention,block_size,num_heads,head_dropout,device,num_blocks):
98
+ super().__init__()
99
+ self.vocab_size = vocab_size
100
+ self.block_size = block_size
101
+ self.embedding_dimention = embedding_dimention
102
+ #self.head_size = head_size
103
+ self.head_dropout = head_dropout
104
+ self.num_heads = num_heads
105
+ self.num_blocks = num_blocks
106
+ self.device = device
107
+
108
+ # embedding matrix for each of the tokens
109
+ self.token_embedding = nn.Embedding(self.vocab_size,self.embedding_dimention)
110
+ self.position_embedding = nn.Embedding(self.block_size,self.embedding_dimention)
111
+ # This are replaced by the block
112
+ #self.sa_heads = MulheadSelfAttention(self.num_heads,self.embedding_dimention // self.num_heads,self.block_size,self.embedding_dimention,self.head_dropout)
113
+ #self.feedfwrd = FeedForward(self.embedding_dimention)
114
+ self.blocks = nn.Sequential(*
115
+ [
116
+ Block(self.embedding_dimention,self.num_heads,self.block_size,self.head_dropout) for _ in range(self.num_blocks)
117
+ ]
118
+ )
119
+
120
+ self.layer_norm = nn.LayerNorm(self.embedding_dimention)
121
+ self.lm_head = nn.Linear(in_features=self.embedding_dimention,out_features=self.vocab_size)
122
+
123
+ def forward(self,context,targets = None):
124
+ batch,timesteps = context.shape
125
+ # get the logits in shape (BATCH,TIMESTEPS,CHANNELS)
126
+ token_embedding = self.token_embedding(context)
127
+ pos_embedding = self.position_embedding(torch.arange(timesteps,device = self.device))
128
+ x = token_embedding + pos_embedding
129
+ #x = self.sa_heads(x)
130
+ #x = self.feedfwrd(x)
131
+ x = self.blocks(x)
132
+ self.layer_norm(x)
133
+ logits = self.lm_head(x)
134
+ if targets is None:
135
+ loss = None
136
+ else:
137
+ batch,timesteps,channels = logits.shape
138
+ logits = logits.view(batch*timesteps,channels)
139
+ targets = targets.view(batch*timesteps)
140
+ loss = F.cross_entropy(logits,targets)
141
+ return logits,loss
142
+
143
+ def generate(self,context,max_new_tokens):
144
+ for _ in range(max_new_tokens):
145
+ # cut down block size
146
+ context_condition = context[:,-self.block_size:]
147
+ logits,loss = self(context_condition)
148
+ # focus only on the last timestep
149
+ logits = logits[:,-1,:]
150
+ # convert logits intro pobability distribution
151
+ probs = F.softmax(logits,dim=-1)
152
+ # sample from the distribution
153
+ indx_next = torch.multinomial(probs,num_samples = 1)
154
+ context = torch.cat([context,indx_next],dim = 1)
155
+ return context
156
+
assets/gpt_dataset_mk_II.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.data import Dataset
4
+
5
+ torch.manual_seed(4444)
6
+
7
+ """
8
+ This one addresses a problem where the construction of the features and labels as tensors fails due to a lack
9
+ of samples given the specified sequence block size. The tensors will be constructed by eliminating the last incomplete sequence
10
+ """
11
+
12
+ class GptDatasetMKII(Dataset):
13
+ """
14
+ **THIS CLASS DOES NOT SUPPORT TOKEN PADDING***
15
+ Inputs:
16
+ target_dir: the directory containing the text corpus
17
+ encoding: the encoding that will be used to create the dataset
18
+ sequence_size: the length the the sequence the model will be fed at each batch iteration
19
+ tokenization_mode: can be either "uniform" or "shifted"
20
+ -> uniform: x will be a tensor of dimentions [block_size] and y will be a tensor of dimentions [1]
21
+ where y contains the index of the token that goes after the last token of x
22
+ -> shifted: both x and y will be tensors of dimentions [block_size], but y is shifted one position
23
+ to the right. This means the "i" element of y is the index of the token that goes after the end
24
+ of sequence x[:i]
25
+ """
26
+ def __init__(self,target_dir,encoding = "utf-8",block_size = 8,tokenization_mode = "uniform"):
27
+ self.text_path = target_dir # directory containing the text corpus
28
+ self.encoding = encoding # encoding used to read the text
29
+ self.block_size = block_size # length the sequence to tokenize and parse the samples
30
+ self.token_mode = tokenization_mode # can be "uniform" or "shifted"
31
+ # retrieve the text corpus from the target directory
32
+ with open(target_dir,"r",encoding=self.encoding) as f:
33
+ self.raw_text = f.read() # raw text
34
+ f.close()
35
+ self.corpus_size = len(self.raw_text)
36
+ self.vocab = sorted(list(set(self.raw_text))) # all the characters in the vocab
37
+ self.vocab_size = len(self.vocab) # length of the vocab
38
+ self.sample_2_index = {ch:i for i,ch in enumerate(self.vocab)} # convert vocab samples to indices
39
+ self.index_2_sample = {i:ch for i,ch in enumerate(self.vocab)} # convert an index to a vocab sample
40
+ self.encode = lambda s: [self.sample_2_index[c] for c in s]
41
+ self.decode = lambda l: "".join([self.index_2_sample[i] for i in l])
42
+ if self.token_mode == "uniform":
43
+ self.uniform_tokenization_mode()
44
+ else:
45
+ self.shifted_tokenization_mode()
46
+
47
+ def uniform_tokenization_mode(self):
48
+ text_encoded = self.encode(self.raw_text)
49
+ dataset = list()
50
+ labels = list()
51
+ for i in range(0,self.corpus_size,self.block_size):
52
+ if(len(text_encoded[i:i+self.block_size]) < self.block_size):
53
+ break
54
+ dataset.append(text_encoded[i:i+self.block_size])
55
+ labels.append(text_encoded[i+self.block_size])
56
+ try:
57
+ self.x = torch.tensor(dataset,dtype = torch.long)
58
+ self.y = torch.tensor(labels,dtype = torch.long)
59
+ except:
60
+ dataset = dataset[:-1]
61
+ labels = dataset[:-1]
62
+ self.x = torch.tensor(dataset,dtype = torch.long)
63
+ self.y = torch.tensor(labels,dtype = torch.long)
64
+
65
+ def shifted_tokenization_mode(self):
66
+ text_encoded = self.encode(self.raw_text)
67
+ dataset = list()
68
+ labels = list()
69
+ for i in range(0,self.corpus_size,self.block_size):
70
+ if(len(text_encoded[i:i+self.block_size]) < self.block_size):
71
+ break
72
+ dataset.append(text_encoded[i:i+self.block_size])
73
+ labels.append(text_encoded[i+1:i+self.block_size+1])
74
+ try:
75
+ self.x = torch.tensor(dataset,dtype = torch.long)
76
+ self.y = torch.tensor(labels,dtype = torch.long)
77
+ except:
78
+ dataset = dataset[:-1]
79
+ labels = dataset[:-1]
80
+ self.x = torch.tensor(dataset,dtype = torch.long)
81
+ self.y = torch.tensor(labels,dtype = torch.long)
82
+
83
+ def __len__(self):
84
+ return len(self.x)
85
+
86
+ def __getitem__(self,index):
87
+ return self.x[index],self.y[index]
88
+
assets/training_data.txt ADDED
The diff for this file is too large to render. See raw diff
 
gpt_weights_trained_100_epochs.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29f7a980b1ece9b170b34304d3239dfe1fd8a76e7881d8161291e54076fdb5e0
3
+ size 123747781
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch==2.0.0
2
+ gradio==3.28.1