Spaces:

celalkartoglu
/

llm-from-scratch

Sleeping

App Files Files Community

Celal Şamil Kartoğlu committed on Jul 24, 2025

Commit

14f592d

1 Parent(s): e95ef11

app is completed

Browse files

Files changed (25) hide show

.gradio/certificate.pem +31 -0
app.py +189 -60
requirements.txt +4 -1
v1/__init__.py +0 -0
v1/__pycache__/__init__.cpython-311.pyc +0 -0
v1/__pycache__/usta_decoder_block.cpython-311.pyc +0 -0
v1/__pycache__/usta_embedding.cpython-311.pyc +0 -0
v1/__pycache__/usta_layer_norm.cpython-311.pyc +0 -0
v1/__pycache__/usta_mlp.cpython-311.pyc +0 -0
v1/__pycache__/usta_model.cpython-311.pyc +0 -0
v1/__pycache__/usta_multi_head_attention.cpython-311.pyc +0 -0
v1/__pycache__/usta_tokenizer.cpython-311.pyc +0 -0
v1/tokenizer.json +66 -0
v1/u_model.pth +3 -0
v1/u_model_4000.pth +3 -0
v1/usta_causal_attention.py +39 -0
v1/usta_decoder_block.py +28 -0
v1/usta_embedding.py +51 -0
v1/usta_layer_norm.py +16 -0
v1/usta_mlp.py +34 -0
v1/usta_model.py +38 -0
v1/usta_multi_head_attention.py +22 -0
v1/usta_multi_head_attention_old.py +22 -0
v1/usta_self_attention.py +30 -0
v1/usta_tokenizer.py +50 -0

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

app.py CHANGED Viewed

@@ -1,64 +1,193 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
 if __name__ == "__main__":
-    demo.launch()

+import os
+import torch
 import gradio as gr
+from v1.usta_model import UstaModel
+from v1.usta_tokenizer import UstaTokenizer
+# Module-level state shared by the Gradio callbacks below; the load_model* helpers assign over these placeholders.
+model,tokenizer,model_status = None,None,"Not Loaded"
+def load_model(custom_model_path=None):
+    """Build the tokenizer and UstaModel, then load weights from a checkpoint.
+    Args:
+        custom_model_path: Optional path to a state-dict checkpoint. When it is
+            empty or does not exist on disk, the bundled
+            "v1/u_model_4000.pth" checkpoint is loaded instead.
+    Returns:
+        A (model, tokenizer, status_message) tuple. On any failure the first
+        two items are None and the message is "Error Loading Model"; this
+        function does not propagate exceptions.
+    """
+    try:
+        u_tokenizer = UstaTokenizer("v1/tokenizer.json")
+        vocab_size = len(u_tokenizer.vocab)
+        print(f"Tokenizer loaded successfully, vocab size : {vocab_size}")
+        # load_state_dict is strict, so these sizes have to agree with the checkpoint.
+        model = UstaModel(context_length=32,vocab_size=vocab_size,embedding_dim=12,num_heads=4,num_layers=8)
+        if custom_model_path and os.path.exists(custom_model_path):
+            checkpoint_path = custom_model_path
+        else:
+            checkpoint_path = "v1/u_model_4000.pth"
+        # map_location keeps GPU-saved checkpoints loadable on a CPU-only host.
+        # weights_only=True refuses arbitrary pickled objects, which matters
+        # because the checkpoint may come from a user-supplied URL or upload.
+        state_dict = torch.load(checkpoint_path,map_location="cpu",weights_only=True)
+        model.load_state_dict(state_dict)
+        model.eval()
+        print(f"Model loaded successfully vocab size: {vocab_size}")
+        return model,u_tokenizer,"Model Loaded Successfully !"
+    except Exception as e:
+        print(f"Error loading model : {e}")
+        return None,None,"Error Loading Model"
+# Load the bundled checkpoint once at import time so the UI starts with its status known.
+try:
+    _startup_result = load_model()
+except Exception as e:
+    print(f"Error loading model: {e}")
+    _startup_result = (None,None,"Error Loading Model")
+model,tokenizer,model_status = _startup_result
+print(f"Model status: {model_status}")
+if model is not None:
+    print("Model loaded successfully")
+def chat_with_model(message,chat_history,max_new_tokens=20):
+    """Generate a reply to `message` and append the exchange to the chat history.
+    Args:
+        message: User text, encoded with the module-level tokenizer.
+        chat_history: List of [user_message, bot_response] pairs; None is
+            treated as an empty history.
+        max_new_tokens: Requested upper bound on the number of generated tokens.
+    Returns:
+        (chat_history, textbox_value). The second item is "" once a reply was
+        appended, or "Error generating response" when generation raised.
+    """
+    context_length = 32  # same value load_model passes to UstaModel
+    max_prompt_tokens = 25  # leaves room for at least 7 generated tokens
+    chat_history = [] if chat_history is None else chat_history
+    if model is None or tokenizer is None:
+        # Both globals are None when loading failed; answer in the chat instead of crashing.
+        chat_history.append([message,"The model is not loaded, please load a model first."])
+        return chat_history,""
+    try:
+        tokens = tokenizer.encode(message)
+        # Keep only the most recent tokens so the prompt fits in the context window.
+        if len(tokens) > max_prompt_tokens:
+            tokens = tokens[-max_prompt_tokens:]
+        prompt_length = len(tokens)
+        # Coerce to int so range() inside generate() accepts it even if the UI passes a float.
+        actual_max_tokens = min(int(max_new_tokens),context_length-prompt_length)
+        with torch.no_grad():
+            generated_tokens = model.generate(tokens,max_new_tokens=actual_max_tokens)
+        # generate() returns prompt + continuation; decode only the continuation.
+        response = tokenizer.decode(generated_tokens[prompt_length:])
+        response = response.replace("<pad>","").replace("<unk>","").strip()
+        if not response:
+            response = "I'm sorry, I don't know the answer to that question."
+        chat_history.append([message,response])
+        return chat_history,""
+    except Exception as e:
+        print(f"Error generating response: {e}")
+        return chat_history, "Error generating response"
+def load_model_from_url(custom_model_url):
+    """Download a checkpoint from `custom_model_url` and make it the active model.
+    The module-level model/tokenizer/model_status are replaced only when the
+    downloaded checkpoint loads successfully; otherwise the current model stays
+    active.
+    Args:
+        custom_model_url: http(s) URL of a state-dict checkpoint.
+    Returns:
+        A status string for the UI: "Model loaded successfully from URL" or
+        "Error loading model from URL".
+    """
+    global model,tokenizer,model_status
+    temp_file = None
+    try:
+        import tempfile
+        import requests
+        url = (custom_model_url or "").strip()
+        # NOTE(review): this fetches a user-supplied URL from the server. Only
+        # http(s) is accepted here, but internal/private hosts are not filtered.
+        if not url.lower().startswith(("http://","https://")):
+            print(f"Error loading model from URL:unsupported URL {url!r}")
+            return "Error loading model from URL"
+        headers = {
+            "Accept": "application/octet-stream",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
+        }
+        # The timeout keeps a stalled download from blocking the callback forever.
+        response = requests.get(url,headers=headers,timeout=30)
+        response.raise_for_status()
+        # A unique temp file avoids collisions between concurrent requests.
+        with tempfile.NamedTemporaryFile(suffix=".pth",delete=False) as f:
+            temp_file = f.name
+            f.write(response.content)
+        new_model,new_tokenizer,new_status = load_model(temp_file)
+        if new_model is None:
+            # load_model reports failure through its return value, not by raising.
+            return "Error loading model from URL"
+        model,tokenizer,model_status = new_model,new_tokenizer,new_status
+        return  "Model loaded successfully from URL"
+    except Exception as e:
+        print(f"Error loading model from URL:{e}")
+        return "Error loading model from URL"
+    finally:
+        # Remove the download on success and on failure alike.
+        if temp_file and os.path.exists(temp_file):
+            os.remove(temp_file)
+def load_model_from_file(model_file):
+    """Load an uploaded checkpoint and make it the active model.
+    The module-level model/tokenizer/model_status are replaced only when the
+    checkpoint loads successfully; otherwise the current model stays active.
+    Args:
+        model_file: Value of the gr.File component: an object exposing the
+            path as `.name`, a plain path string, or None when nothing was
+            uploaded.
+    Returns:
+        A status string for the UI: "Model loaded successfully from file" or
+        "Error loading model from file".
+    """
+    global model,tokenizer,model_status
+    try:
+        if model_file is None:
+            print("Error loading model from file:no file was uploaded")
+            return "Error loading model from file"
+        file_path = getattr(model_file,"name",model_file)
+        print(f"Loading model from file : {file_path}")
+        # load_model falls back to the bundled checkpoint for a missing path,
+        # which would be reported as a successful custom load.
+        if not os.path.exists(file_path):
+            print(f"Error loading model from file:{file_path} does not exist")
+            return "Error loading model from file"
+        new_model,new_tokenizer,new_status = load_model(file_path)
+        if new_model is None:
+            # load_model reports failure through its return value, not by raising.
+            return "Error loading model from file"
+        model,tokenizer,model_status = new_model,new_tokenizer,new_status
+        return "Model loaded successfully from file"
+    except Exception as e:
+        print(f"Error loading model from file:{e}")
+        return "Error loading model from file"
+with gr.Blocks(title="🤖 Usta Model Chat") as demo:
+    # Layout, top to bottom: header, chat area, generation settings,
+    # custom-model loaders (URL and file upload), and a read-only status line.
+    gr.Markdown("# 🤖 Usta Model Chat")
+    gr.Markdown("Chat with a custom transformer language model built from scratch! This model specializes in geographical knowledge.")
+    chatbot = gr.Chatbot(height=400)
+    msg = gr.Textbox(placeholder="Ask about countries, capitals, or cities...",label="Message")
+    # simple chat interface
+    with gr.Row():
+        send_button = gr.Button("Send",variant="primary")
+        clear_button = gr.Button("Clear")
+    max_new_tokens = gr.Slider(
+        minimum = 1,
+        maximum = 30,
+        value=20,
+        step=1,
+        label="Max New Tokens",
+        info="The maximum number of new tokens to generate in response to each user message."
+    )
+    gr.Markdown("## 🤖 Load Custom Model")
+    with gr.Row():
+        custom_model_url = gr.Textbox(
+            placeholder="https://github.com/malibayram/llm-from-scratch/raw/main/u_model_4000.pth",
+            label = "Custom Model Path",
+            scale=4
+        )
+        load_url_button = gr.Button("Load Model",variant="primary",scale=1)
+    with gr.Row():
+        model_file = gr.File(
+            label = "Custom Model File",
+            file_types = [".pth",".pt",".bin"],
+        )
+        load_file_button = gr.Button("Load Model",variant="primary")
+    status = gr.Textbox(
+        label = "Model Status",
+        value=model_status,
+        interactive=False,
+        scale=4
+    )
+    def send_message(message,chat_history,max_new_tokens):
+        """Forward a non-blank message to chat_with_model; blank input only clears the box."""
+        if not message.strip():
+            return chat_history, ""
+        return chat_with_model(message,chat_history,max_new_tokens)
+    # The Send button and pressing Enter in the textbox trigger the same handler.
+    send_button.click(
+        send_message,
+        inputs=[msg,chatbot,max_new_tokens],
+        outputs=[chatbot,msg]
+    )
+    msg.submit(
+        send_message,
+        inputs=[msg,chatbot,max_new_tokens],
+        outputs=[chatbot,msg]
+    )
+    # Returning None empties the chatbot. The fourth positional parameter of
+    # click() is api_name, so no component may be passed there.
+    clear_button.click(lambda:None,None,chatbot,queue=False)
+    load_url_button.click(
+        load_model_from_url,
+        inputs=[custom_model_url],
+        outputs=[status]
+    )
+    load_file_button.click(
+        load_model_from_file,
+        inputs=[model_file],
+        outputs=[status]
+    )
 if __name__ == "__main__":
+    demo.launch(share=True)

requirements.txt CHANGED Viewed

	@@ -1 +1,4 @@
1	- ~~huggingface_hub~~==0.25.2

+gradio==5.33.1
+torch==2.7.1
+requests==2.32.3
+pydantic==2.9.2

v1/__init__.py ADDED Viewed

File without changes

v1/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (165 Bytes). View file

v1/__pycache__/usta_decoder_block.cpython-311.pyc ADDED Viewed

Binary file (1.95 kB). View file

v1/__pycache__/usta_embedding.cpython-311.pyc ADDED Viewed

Binary file (2.69 kB). View file

v1/__pycache__/usta_layer_norm.cpython-311.pyc ADDED Viewed

Binary file (1.48 kB). View file

v1/__pycache__/usta_mlp.cpython-311.pyc ADDED Viewed

Binary file (2.58 kB). View file

v1/__pycache__/usta_model.cpython-311.pyc ADDED Viewed

Binary file (3.03 kB). View file

v1/__pycache__/usta_multi_head_attention.cpython-311.pyc ADDED Viewed

Binary file (2.01 kB). View file

v1/__pycache__/usta_tokenizer.cpython-311.pyc ADDED Viewed

Binary file (3.3 kB). View file

v1/tokenizer.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "the": 0,
+  "capital": 1,
+  "of": 2,
+  "united": 3,
+  "state": 4,
+  "is": 5,
+  "not": 6,
+  "london": 7,
+  "france": 8,
+  "paris": 9,
+  "and": 10,
+  "berlin": 11,
+  "germany": 12,
+  "rome": 13,
+  "in": 14,
+  "italy": 15,
+  "madrid": 16,
+  "spain": 17,
+  "lisbon": 18,
+  "portugal": 19,
+  "kingdom": 20,
+  "washington": 21,
+  "although": 22,
+  "these": 23,
+  "place": 24,
+  "are": 25,
+  "often": 26,
+  "mention": 27,
+  "together": 28,
+  "each": 29,
+  "country": 30,
+  "has": 31,
+  "its": 32,
+  "own": 33,
+  "identity": 34,
+  "any": 35,
+  "european": 36,
+  "city": 37,
+  "remain": 38,
+  "important": 39,
+  "with": 40,
+  "a": 41,
+  "rich": 42,
+  "history": 43,
+  "culture": 44,
+  "europe": 45,
+  "made": 46,
+  "many": 47,
+  "unique": 48,
+  "world": 49,
+  "while": 50,
+  "known": 51,
+  "for": 52,
+  "art": 53,
+  "fashion": 54,
+  "famous": 55,
+  "they": 56,
+  "ed": 57,
+  "s": 58,
+  ".": 59,
+  ",": 60,
+  " ": 61,
+  "<unk>": 62,
+  "<pad>": 63
+}

v1/u_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7043bb10712bcfa0b6489c906c15a5d528cb2bfdbd3076c9f12de3dcaab95dbf
+size 97223

v1/u_model_4000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7648bd04ddae2c9817df032ae3c30dac7c89cea31ea199ce423aab420e01f4b8
+size 96055

v1/usta_causal_attention.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import torch
+import torch.nn as nn
+class UstaCausalAttention(nn.Module):
+    """Single-head scaled dot-product attention with a causal (lower-triangular) mask.
+    `output_dim` is accepted but unused: queries, keys and values all keep
+    `embedding_dim` features.
+    """
+    def __init__(self,embedding_dim,output_dim,context_length,dropout_rate=0.0):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.context_length = context_length
+        self.q_weights = nn.Linear(embedding_dim,embedding_dim,bias=False)
+        self.k_weights = nn.Linear(embedding_dim,embedding_dim,bias=False)
+        self.v_weights = nn.Linear(embedding_dim,embedding_dim,bias=False)
+        self.dropout = nn.Dropout(dropout_rate)
+        # Ones on and below the diagonal mark the positions a token may attend to.
+        allowed = torch.tril(torch.ones(context_length,context_length))
+        self.register_buffer('mask',allowed)
+    def forward(self,x):
+        """Attend over `x`, a 2-D (tokens, embedding_dim) tensor, and return the context vectors."""
+        # Length is read before truncation; slicing the mask clamps it to context_length anyway.
+        sequence_length = x.shape[0]
+        window = x[:self.context_length]
+        queries = self.q_weights(window)
+        keys = self.k_weights(window)
+        values = self.v_weights(window)
+        visible = self.mask.bool()[:sequence_length, :sequence_length]
+        scores = (queries @ keys.T).masked_fill(~visible, -torch.inf)
+        scale = keys.shape[1] ** 0.5
+        weights = self.dropout(torch.softmax(scores / scale, dim=1))
+        return weights @ values

v1/usta_decoder_block.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import torch
+import torch.nn as nn
+from .usta_multi_head_attention import UstaMultiHeadAttention
+from .usta_layer_norm import UstaLayerNorm
+from .usta_mlp import UstaMLP
+class UstaDecoderBlock(nn.Module):
+    """One decoder layer: masked multi-head attention followed by a gated MLP.
+    NOTE(review): each skip connection carries the *normalized* input and the
+    sub-layer output is normalized with the same norm before the addition.
+    This differs from the usual pre-/post-norm layouts; it is kept as-is
+    because changing it would alter the outputs of already-trained weights.
+    """
+    def __init__(self,embedding_dim,num_heads,context_length):
+        super().__init__()
+        self.self_attention = UstaMultiHeadAttention(embedding_dim, embedding_dim, context_length, num_heads, dropout_rate=0.5)
+        self.norm1 = UstaLayerNorm(embedding_dim)
+        self.mlp = UstaMLP(embedding_dim,embedding_dim)
+        self.norm2 = UstaLayerNorm(embedding_dim)
+    def forward(self,x):
+        # Attention sub-layer.
+        skip = self.norm1(x)
+        x = self.norm1(self.self_attention(x)) + skip
+        # Feed-forward sub-layer.
+        skip = self.norm2(x)
+        return self.norm2(self.mlp(x)) + skip

v1/usta_embedding.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import torch
+import torch.nn as nn
+def get_rotary_position_encoding(input:torch.Tensor,base=10000,device="cpu"):
+    """Rotate each token vector by a position-dependent angle (rotary encoding).
+    The feature axis is split into a first and a second half; feature i of the
+    first half is paired with feature i of the second half and the pair is
+    rotated by the angle position * base ** (-i / dimension).
+    Args:
+        input: 2-D tensor of shape (context_length, dimension); dimension must be even.
+        base: Base of the geometric frequency progression.
+        device: Device on which the position/frequency tables are created.
+    Returns:
+        A tensor with the same shape as `input` holding the rotated features.
+    Raises:
+        AssertionError: If `dimension` is odd.
+    """
+    context_length,dimension = input.shape
+    assert dimension % 2 == 0, "dimension must be even"
+    half_dimension = dimension // 2
+    freqs_indices = torch.arange(0,half_dimension,device=device,dtype=torch.float32)
+    freqs = 1.0 / (base ** (freqs_indices / dimension))
+    positions = torch.arange(0,context_length,device=device,dtype=torch.float32).unsqueeze(1)
+    # (context_length, 1) * (half_dimension,) broadcasts to (context_length, half_dimension).
+    angles = positions * freqs
+    sin_angles = torch.sin(angles)
+    cos_angles = torch.cos(angles)
+    first_half = input[:,:half_dimension]
+    second_half = input[:,half_dimension:]
+    first_rotated = first_half * cos_angles - second_half * sin_angles
+    second_rotated = first_half * sin_angles + second_half * cos_angles
+    # Each rotated half must land in its own columns; concatenating guarantees
+    # that no column of the result is left uninitialized.
+    return torch.cat((first_rotated,second_rotated),dim=1)
+class UstaEmbedding(nn.Module):
+    """Token-embedding lookup followed by rotary position encoding."""
+    def __init__(self,vocab_size,embedding_dim,context_length):
+        super().__init__()
+        # context_length is accepted but not used here: positions are injected
+        # by rotating the embeddings rather than through a learned position table.
+        self.embedding = nn.Embedding(vocab_size,embedding_dim)
+        self.get_pos = get_rotary_position_encoding
+    def forward(self,x):
+        # Look up what each token means on its own ...
+        token_vectors = self.embedding(x)
+        # ... then encode where it sits in the sequence.
+        return self.get_pos(token_vectors)

v1/usta_layer_norm.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import torch
+import torch.nn as nn
+class UstaLayerNorm(nn.Module):
+    """Layer normalization over the last (feature) axis with a learned scale and no bias.
+    Args:
+        embedding_dim: Size of the feature axis being normalized.
+        eps: Small constant added to the variance for numerical stability.
+    """
+    def __init__(self,embedding_dim,eps=1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(embedding_dim))
+    def forward(self,x):
+        """Return `x` normalized to zero mean and unit variance per token, times `weight`."""
+        mean = x.mean(dim=-1,keepdim=True)
+        # The variance must use the same axis as the mean. A global variance
+        # would mix statistics across tokens, including future positions.
+        variance = x.var(dim=-1,keepdim=True,unbiased=False)
+        normalized_x = (x - mean) / torch.sqrt(variance + self.eps)
+        return self.weight * normalized_x

v1/usta_mlp.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class GELU(nn.Module):
+    """Tanh approximation of the Gaussian Error Linear Unit activation."""
+    def __init__(self):
+        super().__init__()
+    def forward(self,x):
+        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
+        scale = torch.sqrt(torch.tensor(2/torch.pi))
+        inner = scale*(x+0.044715*torch.pow(x,3))
+        return 0.5 *  x * (1+torch.tanh(inner))
+class UstaMLP(nn.Module):
+    """Gated feed-forward block: down_proj(GELU(gate_proj(x)) * up_proj(x))."""
+    def __init__(self,embedding_dim,hidden_dim):
+        super().__init__()
+        self.gate_proj = nn.Linear(embedding_dim,hidden_dim)
+        self.up_proj = nn.Linear(embedding_dim,hidden_dim)
+        self.down_proj = nn.Linear(hidden_dim,embedding_dim)
+        self.gelu = GELU()
+    def forward(self,x):
+        # The activated gate scales the "up" projection element-wise.
+        activated_gate = self.gelu(self.gate_proj(x))
+        return self.down_proj(activated_gate*self.up_proj(x))

v1/usta_model.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import torch
+import torch.nn as nn
+from .usta_decoder_block import UstaDecoderBlock
+from .usta_embedding import UstaEmbedding
+class UstaModel(nn.Module):
+    """Decoder-only language model: embedding, stacked decoder blocks, vocabulary head.
+    Args:
+        vocab_size: Number of token ids.
+        embedding_dim: Width of the token vectors.
+        num_heads: Attention heads per decoder block.
+        context_length: Maximum number of tokens the blocks attend over.
+        num_layers: Number of decoder blocks.
+    """
+    def __init__(self,vocab_size,embedding_dim,num_heads,context_length,num_layers):
+        super().__init__()
+        # Remembered so generate() can stop at the context window.
+        self.context_length = context_length
+        self.embedding = UstaEmbedding(vocab_size,embedding_dim,context_length)
+        self.layers = nn.Sequential(*[UstaDecoderBlock(embedding_dim,num_heads,context_length) for _ in range(num_layers)])
+        self.lm_head = nn.Linear(embedding_dim,vocab_size)
+    def forward(self,x:torch.Tensor):
+        """Map a 1-D tensor of token ids to per-position logits over the vocabulary."""
+        x = self.embedding(x) # dictionary meaning of the tokens (words)
+        x = self.layers(x)
+        x = self.lm_head(x)
+        return x
+    def generate(self,x:torch.Tensor,max_new_tokens:int,end_token_id:int=59): #top_k,top_p temperature
+        """Greedily extend the prompt `x` one token at a time.
+        Args:
+            x: 1-D tensor of prompt token ids.
+            max_new_tokens: Upper bound on the number of tokens to append.
+            end_token_id: Generation stops right after this id is produced
+                (59 is "." in the bundled vocabulary).
+        Returns:
+            A list of ints: the prompt ids followed by the generated ids.
+        """
+        tokens = x.tolist()
+        for _ in range(max_new_tokens):
+            out = self.forward(x)
+            # out has one row of logits per position; the last row predicts the
+            # next token. Argmax of the logits equals argmax of their softmax.
+            next_token = int(torch.argmax(out[-1],dim=-1))
+            tokens.append(next_token)
+            if next_token == end_token_id or len(tokens) > self.context_length:
+                break
+            x = torch.tensor(tokens,device=x.device)
+        return tokens

v1/usta_multi_head_attention.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+import torch.nn as nn
+class UstaMultiHeadAttention(nn.Module):
+  """Causal multi-head self-attention built on nn.MultiheadAttention plus an output projection."""
+  def __init__(self, embedding_dim, output_dim, context_length, num_heads, dropout_rate = 0.0):
+    super().__init__()
+    self.context_length = context_length
+    self.multi_head_attention = nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_rate)
+    self.projection = nn.Linear(embedding_dim, output_dim)
+    # True strictly above the diagonal: position i may not attend to any j > i.
+    future_positions = torch.ones(context_length, context_length).triu(diagonal=1).bool()
+    self.register_buffer("mask", future_positions)
+  def forward(self, x):
+    # Length is read before truncation; slicing the mask clamps it to context_length anyway.
+    sequence_length = x.shape[0]
+    window = x[:self.context_length]
+    causal_mask = self.mask[:sequence_length, :sequence_length]  # type: ignore
+    attended, _ = self.multi_head_attention(window, window, window, attn_mask=causal_mask)
+    return self.projection(attended)

v1/usta_multi_head_attention_old.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+import torch.nn as nn
+from .usta_causal_attention import UstaCausalAttention
+class UstaMultiHeadAttention(torch.nn.Module):
+    """Hand-rolled multi-head attention: independent causal heads, concatenated and projected.
+    Args:
+        embedding_dim: Feature size of the input tokens (and of each head's output).
+        output_dim: Feature size produced by the final projection.
+        context_length: Maximum sequence length handled by each head.
+        num_heads: Number of independent UstaCausalAttention heads.
+        dropout_rate: Dropout applied to each head's attention weights.
+    """
+    def __init__(self,embedding_dim,output_dim,context_length,num_heads,dropout_rate=0.0):
+        super().__init__()
+        self.heads = nn.ModuleList([UstaCausalAttention(embedding_dim,output_dim,context_length,dropout_rate) for _ in range(num_heads)])
+        # Every head returns embedding_dim features and forward() concatenates
+        # them, so the projection input is num_heads * embedding_dim wide.
+        self.projection = nn.Linear(embedding_dim*num_heads,output_dim)
+    def forward(self,x):
+        """Run every head on `x`, concatenate along the feature axis and project."""
+        attention_out = torch.cat([head(x) for head in self.heads],dim=1)
+        return self.projection(attention_out)

v1/usta_self_attention.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import torch
+import torch.nn as nn
+class UstaSelfAttention(nn.Module):
+    """Unmasked single-head scaled dot-product self-attention.
+    `output_dim` is accepted but unused: queries, keys and values all keep
+    `embedding_dim` features.
+    """
+    def __init__(self,embedding_dim,output_dim):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.q_weights = nn.Linear(embedding_dim,embedding_dim,bias=False)
+        self.k_weights = nn.Linear(embedding_dim,embedding_dim,bias=False)
+        self.v_weights = nn.Linear(embedding_dim,embedding_dim,bias=False)
+    def forward(self,x):
+        """Return one context vector per row of `x`, a 2-D (tokens, embedding_dim) tensor."""
+        queries = self.q_weights(x)
+        keys = self.k_weights(x)
+        values = self.v_weights(x)
+        # Scale by sqrt(key width) before the softmax over each query's row.
+        scale = keys.shape[-1]**0.5
+        weights = torch.softmax((queries @ keys.T)/scale,dim=1)
+        return weights @ values

v1/usta_tokenizer.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import json
+import torch
+class UstaTokenizer:
+  """Greedy longest-match sub-word tokenizer backed by a JSON vocabulary file.
+  The vocabulary maps token strings to integer ids and must contain " " and
+  "<unk>" for inputs that need them.
+  """
+  def __init__(self, vocab_file):
+    """Load the token -> id mapping from `vocab_file` and build the inverse mapping."""
+    with open(vocab_file, "r", encoding="utf-8") as f:
+      self.vocab = json.load(f)
+    self.reverse_vocab = {v: k for k, v in self.vocab.items()}
+  def encode(self, text):
+    """Convert `text` into a 1-D int64 tensor of token ids.
+    Words are split on whitespace and separated by the " " token. Each word is
+    consumed left to right, always taking the longest vocabulary entry that
+    matches at the current position; a character with no match becomes "<unk>".
+    Empty or whitespace-only text yields an empty tensor.
+    """
+    tokens = []
+    for word in text.split():
+      i = 0
+      # example: states
+      # state => 4
+      # s => 58
+      while i < len(word):
+        for j in range(len(word), i, -1):
+          # Compare against None: 0 is a valid token id.
+          token_id = self.vocab.get(word[i:j])
+          if token_id is not None:
+            tokens.append(token_id)
+            i = j
+            break
+        else:
+          # No vocabulary entry starts at position i.
+          tokens.append(self.vocab["<unk>"])
+          i += 1
+      tokens.append(self.vocab[" "])
+    # Drop the separator added after the last word unless the text itself ends
+    # with a space; nothing to drop when the text contained no words.
+    if tokens and not text.endswith(" "):
+      tokens.pop()
+    # An explicit dtype keeps an empty result integer-typed.
+    return torch.tensor(tokens, dtype=torch.long)
+  def tokenize(self, text):
+    """Return the token strings that `encode` produces for `text`."""
+    return [self.reverse_vocab[token_id] for token_id in self.encode(text).tolist()]
+  def decode(self, ids):
+    """Concatenate the token strings for `ids` (a list of ints or a 1-D tensor)."""
+    # int() lets tensor elements be used as dictionary keys.
+    return "".join(self.reverse_vocab[int(token_id)] for token_id in ids)