LuisDarioHinojosa commited on
Commit
25c2e0c
·
1 Parent(s): ade2dee

initial commit

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ gpt_weights_trained_100_epochs.pth filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from timeit import default_timer as timer
4
+
5
+ # import the langauge model and the dataset
6
+ from assets.gpt_dataset_mk_II import GptDatasetMKII
7
+ from assets.gpt_architecture_mk_VI import BigramLanguageModelMKVI
8
+
9
+ # hardcode device with cpu
10
+ device = "cpu"
11
+
12
+ # model hyperparameters
13
+ BLOCK_SIZE = 256 # max sequence length for the context and target samples
14
+ EMBEDDING_DIMENTION = 384 # number of features that will be extracted from the tokens to create numeric representations of them
15
+ HEAD_SIZE = 32 # number of dimentions in the self attention mechanism
16
+ NUM_HEADS = 12 # number of heads that will be used to instance the multihead self attention heads.
17
+ DROPOUT_RATE = 0.2 # dropout rate for architecture
18
+ NUM_BLOCKS = 12 # number of encoder blocks that will be used
19
+
20
+ # instance the model and the dataset to make predictions
21
+ train_dataset = GptDatasetMKII("assets/training_data.txt",block_size = BLOCK_SIZE,tokenization_mode="shifted")
22
+
23
+ # instance the model
24
+ model = BigramLanguageModelMKVI(
25
+ vocab_size=train_dataset.vocab_size,
26
+ embedding_dimention=EMBEDDING_DIMENTION,
27
+ block_size = BLOCK_SIZE,
28
+ num_heads = NUM_HEADS,
29
+ head_dropout= DROPOUT_RATE,
30
+ device = device,
31
+ num_blocks = NUM_BLOCKS
32
+ ).to(device)
33
+
34
+ # load the state dictionary
35
+ model.load_state_dict(torch.load("gpt_weights_trained_100_epochs.pth", map_location=torch.device('cpu')))
36
+
37
+ # gradio function
38
+ def generate_output(length):
39
+ start_time = timer()
40
+ output_sequence = train_dataset.decode(model.generate(context=torch.zeros((1,1),dtype = torch.long).to(device),max_new_tokens=int(length))[0].tolist())
41
+ end_time = timer()
42
+ total_time = end_time - start_time
43
+ return output_sequence,total_time
44
+
45
+ # instance gradio applications
46
+ title = "Shakespeare Text Generation"
47
+ description = "Model that generates text in the style of the writter William Shakespeare."
48
+ article = "The model is based on the transformer architecture originally published in \"[Attention Is All You Need](https://arxiv.org/abs/1706.03762) \" paper. It was trained on a dataset cotaining all the plays from William Shakespeare, and was implemented on Pytorch from scratch by myself. It is still imperfect, but i will update it as i work on it. The purpose was for me to get acquainted with transformers and sequence models."
49
+
50
+ # instance interface
51
+ demo = gr.Interface(
52
+ fn = generate_output,
53
+ inputs = [gr.Number(value = 50,label = "Sequence Length",info = "Length of the sample sequence you wish to generate.")],
54
+ outputs = [gr.TextArea(lines = 5,label="Sequence Output"),gr.Number(label = "Execution Time (seconds)")],
55
+ title = title,
56
+ article = article,
57
+ description = description
58
+ )
59
+
60
+ # launch interface
61
+ demo.launch()
assets/gpt_architecture_mk_VI.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ torch.manual_seed(4444)
6
+
7
+ class FeedForward(nn.Module):
8
+ def __init__(self,embedding_dimention,dropout):
9
+ super().__init__()
10
+ self.embedding_dimention = embedding_dimention
11
+ self.dropout = dropout
12
+ self.net = nn.Sequential(
13
+ nn.Linear(self.embedding_dimention, 4 * self.embedding_dimention),
14
+ nn.ReLU(),
15
+ nn.Linear(4 * self.embedding_dimention, self.embedding_dimention),
16
+ nn.Dropout(self.dropout)
17
+ )
18
+ def forward(self,x):
19
+ return self.net(x)
20
+
21
+ # attention mechanism for a single head
22
+ class Head(nn.Module):
23
+ def __init__(self,head_size,block_size,embedding_dimention,dropout):
24
+ super().__init__()
25
+ # instance hyperparameters
26
+ self.head_size = head_size
27
+ self.block_size = block_size
28
+ self.embedding_dimention = embedding_dimention
29
+ self.dropout = dropout
30
+
31
+ # instance layers for single self attention head
32
+ self.key = nn.Linear(self.embedding_dimention, self.head_size, bias=False)
33
+ self.query = nn.Linear(self.embedding_dimention, self.head_size, bias=False)
34
+ self.value = nn.Linear(self.embedding_dimention, self.head_size, bias=False)
35
+ self.register_buffer('tril', torch.tril(torch.ones(self.block_size, self.block_size)))
36
+ self.dropout_layer = nn.Dropout(self.dropout)
37
+
38
+ def forward(self,x):
39
+ batch,timesteps,channels = x.shape
40
+ k = self.key(x)
41
+ q = self.query(x)
42
+
43
+ # compute the attention scores
44
+ wei = q @ k.transpose(-2,-1) * channels ** -0.5 # dot product normalization and normalize to prevent explosion
45
+ wei = wei.masked_fill(self.tril[:timesteps,:timesteps] == 0,float("-inf")) # only include the previous tokes to average
46
+ wei = F.softmax(wei,dim = -1 ) # normalize to 1
47
+ wei = self.dropout_layer(wei)
48
+ # add the value
49
+ v = self.value(x)
50
+ out = wei @ v
51
+
52
+ return out
53
+
54
+ # attention mechanism for multiple heads (may head layers placed in papalel)
55
+ class MulheadSelfAttention(nn.Module):
56
+ def __init__(self,num_heads,head_size,block_size,embedding_dimention,dropout):
57
+ super().__init__()
58
+ # instance hyperparameters
59
+ self.num_heads = num_heads
60
+ self.head_size = head_size
61
+ self.block_size = block_size
62
+ self.embedding_dimention = embedding_dimention
63
+ self.dropout = dropout
64
+ # instance
65
+ self.heads = nn.ModuleList([Head(self.head_size,self.block_size,self.embedding_dimention,self.dropout) for _ in range(self.num_heads)])
66
+ self.projection = nn.Linear(self.embedding_dimention,self.embedding_dimention)
67
+ self.dropout_layer = nn.Dropout(self.dropout)
68
+ def forward(self,x):
69
+ out = torch.cat([h(x) for h in self.heads],dim = -1)
70
+ out = self.dropout_layer(self.projection(x))
71
+ return x
72
+
73
+
74
+ class Block(nn.Module):
75
+ def __init__(self,embedding_dimention,num_heads,block_size,dropout):
76
+ super().__init__()
77
+ # instance parameters
78
+ self.num_heads = num_heads
79
+ self.embedding_dimention = embedding_dimention
80
+ self.head_size = self.embedding_dimention // self.num_heads
81
+ self.block_size = block_size
82
+ self.dropout = dropout
83
+ self.layer_norm_1 = nn.LayerNorm(self.embedding_dimention)
84
+ self.layer_norm_2 = nn.LayerNorm(self.embedding_dimention)
85
+
86
+ # layers
87
+ self.sa_heads = MulheadSelfAttention(self.num_heads,self.head_size,self.block_size,self.embedding_dimention,self.dropout)
88
+ self.feedfwrd = FeedForward(self.embedding_dimention,self.dropout)
89
+
90
+ def forward(self,x):
91
+ x = x + self.sa_heads(self.layer_norm_1(x))
92
+ x = x + self.feedfwrd(self.layer_norm_2(x))
93
+
94
+ return x
95
+
96
+ class BigramLanguageModelMKVI(nn.Module):
97
+ def __init__(self,vocab_size,embedding_dimention,block_size,num_heads,head_dropout,device,num_blocks):
98
+ super().__init__()
99
+ self.vocab_size = vocab_size
100
+ self.block_size = block_size
101
+ self.embedding_dimention = embedding_dimention
102
+ #self.head_size = head_size
103
+ self.head_dropout = head_dropout
104
+ self.num_heads = num_heads
105
+ self.num_blocks = num_blocks
106
+ self.device = device
107
+
108
+ # embedding matrix for each of the tokens
109
+ self.token_embedding = nn.Embedding(self.vocab_size,self.embedding_dimention)
110
+ self.position_embedding = nn.Embedding(self.block_size,self.embedding_dimention)
111
+ # This are replaced by the block
112
+ #self.sa_heads = MulheadSelfAttention(self.num_heads,self.embedding_dimention // self.num_heads,self.block_size,self.embedding_dimention,self.head_dropout)
113
+ #self.feedfwrd = FeedForward(self.embedding_dimention)
114
+ self.blocks = nn.Sequential(*
115
+ [
116
+ Block(self.embedding_dimention,self.num_heads,self.block_size,self.head_dropout) for _ in range(self.num_blocks)
117
+ ]
118
+ )
119
+
120
+ self.layer_norm = nn.LayerNorm(self.embedding_dimention)
121
+ self.lm_head = nn.Linear(in_features=self.embedding_dimention,out_features=self.vocab_size)
122
+
123
+ def forward(self,context,targets = None):
124
+ batch,timesteps = context.shape
125
+ # get the logits in shape (BATCH,TIMESTEPS,CHANNELS)
126
+ token_embedding = self.token_embedding(context)
127
+ pos_embedding = self.position_embedding(torch.arange(timesteps,device = self.device))
128
+ x = token_embedding + pos_embedding
129
+ #x = self.sa_heads(x)
130
+ #x = self.feedfwrd(x)
131
+ x = self.blocks(x)
132
+ self.layer_norm(x)
133
+ logits = self.lm_head(x)
134
+ if targets is None:
135
+ loss = None
136
+ else:
137
+ batch,timesteps,channels = logits.shape
138
+ logits = logits.view(batch*timesteps,channels)
139
+ targets = targets.view(batch*timesteps)
140
+ loss = F.cross_entropy(logits,targets)
141
+ return logits,loss
142
+
143
+ def generate(self,context,max_new_tokens):
144
+ for _ in range(max_new_tokens):
145
+ # cut down block size
146
+ context_condition = context[:,-self.block_size:]
147
+ logits,loss = self(context_condition)
148
+ # focus only on the last timestep
149
+ logits = logits[:,-1,:]
150
+ # convert logits intro pobability distribution
151
+ probs = F.softmax(logits,dim=-1)
152
+ # sample from the distribution
153
+ indx_next = torch.multinomial(probs,num_samples = 1)
154
+ context = torch.cat([context,indx_next],dim = 1)
155
+ return context
156
+
assets/gpt_dataset_mk_II.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.utils.data import Dataset
4
+
5
+ torch.manual_seed(4444)
6
+
7
+ """
8
+ This one addresses a problem where the construction of the features and labels as tensors fails due to a lack
9
+ of samples given the specified sequence block size. The tensors will be constructed by eliminating the last incomplete sequence
10
+ """
11
+
12
+ class GptDatasetMKII(Dataset):
13
+ """
14
+ **THIS CLASS DOES NOT SUPPORT TOKEN PADDING***
15
+ Inputs:
16
+ target_dir: the directory containing the text corpus
17
+ encoding: the encoding that will be used to create the dataset
18
+ sequence_size: the length the the sequence the model will be fed at each batch iteration
19
+ tokenization_mode: can be either "uniform" or "shifted"
20
+ -> uniform: x will be a tensor of dimentions [block_size] and y will be a tensor of dimentions [1]
21
+ where y contains the index of the token that goes after the last token of x
22
+ -> shifted: both x and y will be tensors of dimentions [block_size], but y is shifted one position
23
+ to the right. This means the "i" element of y is the index of the token that goes after the end
24
+ of sequence x[:i]
25
+ """
26
+ def __init__(self,target_dir,encoding = "utf-8",block_size = 8,tokenization_mode = "uniform"):
27
+ self.text_path = target_dir # directory containing the text corpus
28
+ self.encoding = encoding # encoding used to read the text
29
+ self.block_size = block_size # length the sequence to tokenize and parse the samples
30
+ self.token_mode = tokenization_mode # can be "uniform" or "shifted"
31
+ # retrieve the text corpus from the target directory
32
+ with open(target_dir,"r",encoding=self.encoding) as f:
33
+ self.raw_text = f.read() # raw text
34
+ f.close()
35
+ self.corpus_size = len(self.raw_text)
36
+ self.vocab = sorted(list(set(self.raw_text))) # all the characters in the vocab
37
+ self.vocab_size = len(self.vocab) # length of the vocab
38
+ self.sample_2_index = {ch:i for i,ch in enumerate(self.vocab)} # convert vocab samples to indices
39
+ self.index_2_sample = {i:ch for i,ch in enumerate(self.vocab)} # convert an index to a vocab sample
40
+ self.encode = lambda s: [self.sample_2_index[c] for c in s]
41
+ self.decode = lambda l: "".join([self.index_2_sample[i] for i in l])
42
+ if self.token_mode == "uniform":
43
+ self.uniform_tokenization_mode()
44
+ else:
45
+ self.shifted_tokenization_mode()
46
+
47
+ def uniform_tokenization_mode(self):
48
+ text_encoded = self.encode(self.raw_text)
49
+ dataset = list()
50
+ labels = list()
51
+ for i in range(0,self.corpus_size,self.block_size):
52
+ if(len(text_encoded[i:i+self.block_size]) < self.block_size):
53
+ break
54
+ dataset.append(text_encoded[i:i+self.block_size])
55
+ labels.append(text_encoded[i+self.block_size])
56
+ try:
57
+ self.x = torch.tensor(dataset,dtype = torch.long)
58
+ self.y = torch.tensor(labels,dtype = torch.long)
59
+ except:
60
+ dataset = dataset[:-1]
61
+ labels = dataset[:-1]
62
+ self.x = torch.tensor(dataset,dtype = torch.long)
63
+ self.y = torch.tensor(labels,dtype = torch.long)
64
+
65
+ def shifted_tokenization_mode(self):
66
+ text_encoded = self.encode(self.raw_text)
67
+ dataset = list()
68
+ labels = list()
69
+ for i in range(0,self.corpus_size,self.block_size):
70
+ if(len(text_encoded[i:i+self.block_size]) < self.block_size):
71
+ break
72
+ dataset.append(text_encoded[i:i+self.block_size])
73
+ labels.append(text_encoded[i+1:i+self.block_size+1])
74
+ try:
75
+ self.x = torch.tensor(dataset,dtype = torch.long)
76
+ self.y = torch.tensor(labels,dtype = torch.long)
77
+ except:
78
+ dataset = dataset[:-1]
79
+ labels = dataset[:-1]
80
+ self.x = torch.tensor(dataset,dtype = torch.long)
81
+ self.y = torch.tensor(labels,dtype = torch.long)
82
+
83
+ def __len__(self):
84
+ return len(self.x)
85
+
86
+ def __getitem__(self,index):
87
+ return self.x[index],self.y[index]
88
+
assets/training_data.txt ADDED
The diff for this file is too large to render. See raw diff
 
gpt_weights_trained_100_epochs.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29f7a980b1ece9b170b34304d3239dfe1fd8a76e7881d8161291e54076fdb5e0
3
+ size 123747781
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ torch==2.0.0
2
+ gradio==3.28.1