Celal Şamil Kartoğlu commited on
Commit
14f592d
·
1 Parent(s): e95ef11

app is completed

Browse files
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
app.py CHANGED
@@ -1,64 +1,193 @@
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  if __name__ == "__main__":
64
- demo.launch()
 
 
1
+ import os
2
+ import torch
3
  import gradio as gr
4
+ from v1.usta_model import UstaModel
5
+ from v1.usta_tokenizer import UstaTokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
 
8
# Module-level state shared by the Gradio callbacks; replaced when a model is (re)loaded.
model, tokenizer, model_status = None, None, "Not Loaded"


def load_model(custom_model_path=None):
    """Build the tokenizer and an UstaModel, then load checkpoint weights into it.

    Args:
        custom_model_path: Optional path to a state-dict checkpoint. When it is
            falsy or does not exist on disk, "v1/u_model_4000.pth" is loaded
            instead.

    Returns:
        A ``(model, tokenizer, status_message)`` tuple. On any failure the
        error is printed and ``(None, None, "Error Loading Model")`` is
        returned; no exception escapes.
    """
    try:
        u_tokenizer = UstaTokenizer("v1/tokenizer.json")
        vocab_size = len(u_tokenizer.vocab)
        print(f"Tokenizer loaded successfully, vocab size : {vocab_size}")

        u_model = UstaModel(
            context_length=32,
            vocab_size=vocab_size,
            embedding_dim=12,
            num_heads=4,
            num_layers=8,
        )

        if custom_model_path and os.path.exists(custom_model_path):
            checkpoint_path = custom_model_path
        else:
            checkpoint_path = "v1/u_model_4000.pth"

        # map_location="cpu": a checkpoint saved from a GPU still loads on a
        # CPU-only host. weights_only=True: checkpoints may come from user
        # uploads or URLs, so refuse to unpickle arbitrary objects.
        state_dict = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
        u_model.load_state_dict(state_dict)
        u_model.eval()
        print(f"Model loaded successfully vocab size: {vocab_size}")
        return u_model, u_tokenizer, "Model Loaded Successfully !"
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of crashing.
        print(f"Error loading model : {e}")
        return None, None, "Error Loading Model"
33
+
34
+
35
# Load the default checkpoint once at import time so the UI can show its status.
# load_model() catches its own exceptions and returns
# (None, None, "Error Loading Model") on failure, so wrapping this call in a
# second try/except would only add an unreachable branch.
model, tokenizer, model_status = load_model()

print(f"Model status: {model_status}")

if model is not None:
    print("Model loaded successfully")
46
+
47
+
48
+
49
def chat_with_model(message, chat_history, max_new_tokens=20):
    """Generate a reply for ``message`` and append the exchange to the chat history.

    Args:
        message: The user's text.
        chat_history: List of ``[user, bot]`` pairs; it is appended to in place.
            ``None`` (e.g. after the chat was cleared) is treated as empty.
        max_new_tokens: Upper bound on generated tokens; cast to ``int`` and
            further capped so prompt plus reply fit in the 32-token window.

    Returns:
        ``(chat_history, "")`` - the updated history and an empty string that
        clears the message input box. Failures are printed and shown as a chat
        reply rather than written into the input box.
    """
    context_length = 32      # matches the context_length load_model() passes to UstaModel
    max_prompt_tokens = 25   # leaves room in the window for generated tokens

    history = chat_history if chat_history is not None else []

    try:
        if model is None or tokenizer is None:
            raise RuntimeError("model is not loaded")

        tokens = tokenizer.encode(message)
        if len(tokens) > max_prompt_tokens:
            # Keep only the most recent part of an over-long prompt.
            tokens = tokens[-max_prompt_tokens:]

        budget = min(int(max_new_tokens), context_length - len(tokens))
        with torch.no_grad():
            generated_tokens = model.generate(tokens, max_new_tokens=budget)

        response = tokenizer.decode(generated_tokens)
        original_message = tokenizer.decode(tokens.tolist())
        # generate() returns prompt + continuation; show only the continuation.
        if response.startswith(original_message):
            response = response[len(original_message):]
        response = response.replace("<pad>", "").replace("<unk>", "").strip()
        if not response:
            response = "I'm sorry, I don't know the answer to that question."
    except Exception as e:
        print(f"Error generating response: {e}")
        response = "Error generating response"

    history.append([message, response])
    return history, ""
73
+
74
+
75
def load_model_from_url(custom_model_url):
    """Download a checkpoint from a URL and make it the active model.

    The module-level ``model``, ``tokenizer`` and ``model_status`` are only
    replaced when the download and load both succeed, so a bad URL or an
    invalid file leaves the currently loaded model in place.

    Args:
        custom_model_url: URL of a state-dict checkpoint file.

    Returns:
        A status string for the UI: "Model loaded successfully from URL" or
        "Error loading model from URL".
    """
    global model, tokenizer, model_status

    url = (custom_model_url or "").strip()
    if not url:
        print("Error loading model from URL:no URL was provided")
        return "Error loading model from URL"

    temp_file = None
    try:
        import tempfile

        import requests

        headers = {
            "Accept": "application/octet-stream",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
        }
        # Send the headers and bound the wait so a stalled server cannot hang the UI.
        response = requests.get(url, headers=headers, timeout=60)
        response.raise_for_status()

        # A unique temp file avoids clashes between concurrent requests.
        with tempfile.NamedTemporaryFile(suffix=".pth", delete=False) as f:
            temp_file = f.name
            f.write(response.content)

        new_model, new_tokenizer, new_status = load_model(temp_file)
        if new_model is None:
            # load_model() signals failure through its return value.
            return "Error loading model from URL"

        model, tokenizer, model_status = new_model, new_tokenizer, new_status
        return "Model loaded successfully from URL"
    except Exception as e:
        print(f"Error loading model from URL:{e}")
        return "Error loading model from URL"
    finally:
        # Remove the download whether or not loading succeeded.
        if temp_file and os.path.exists(temp_file):
            os.remove(temp_file)
96
+
97
+
98
def load_model_from_file(model_file):
    """Load an uploaded checkpoint file and make it the active model.

    The module-level ``model``, ``tokenizer`` and ``model_status`` are only
    replaced when loading succeeds.

    Args:
        model_file: The uploaded file: either an object exposing the path as
            ``.name`` or the path string itself. ``None`` means nothing was
            uploaded.

    Returns:
        A status string for the UI: "Model loaded successfully from file" or
        "Error loading model from file".
    """
    global model, tokenizer, model_status

    try:
        if model_file is None:
            raise ValueError("no model file was provided")

        path = getattr(model_file, "name", model_file)
        print(f"Loading model from file : {path}")

        # load_model() silently falls back to the default checkpoint when the
        # path is missing; treat that as an error instead of reporting success.
        if not os.path.exists(path):
            raise FileNotFoundError(path)

        new_model, new_tokenizer, new_status = load_model(path)
        if new_model is None:
            # load_model() signals failure through its return value.
            return "Error loading model from file"

        model, tokenizer, model_status = new_model, new_tokenizer, new_status
        return "Model loaded successfully from file"
    except Exception as e:
        print(f"Error loading model from file:{e}")
        return "Error loading model from file"
108
+
109
+
110
+
111
+
112
# Gradio UI: a chat panel plus controls for swapping in a custom checkpoint.
with gr.Blocks(title="🤖 Usta Model Chat") as demo:
    gr.Markdown("# 🤖 Usta Model Chat")
    gr.Markdown("Chat with a custom transformer language model built from scratch! This model specializes in geographical knowledge.")
    chatbot = gr.Chatbot(height=400)
    msg = gr.Textbox(placeholder="Ask about countries, capitals, or cities...", label="Message")

    with gr.Row():
        send_button = gr.Button("Send", variant="primary")
        clear_button = gr.Button("Clear")

    max_new_tokens = gr.Slider(
        minimum=1,
        maximum=30,
        value=20,
        step=1,
        label="Max New Tokens",
        info="The maximum number of new tokens to generate in response to each user message.",
    )

    gr.Markdown("## 🤖 Load Custom Model")
    with gr.Row():
        custom_model_url = gr.Textbox(
            placeholder="https://github.com/malibayram/llm-from-scratch/raw/main/u_model_4000.pth",
            label="Custom Model Path",
            scale=4,
        )
        load_url_button = gr.Button("Load Model", variant="primary", scale=1)

    with gr.Row():
        model_file = gr.File(
            label="Custom Model File",
            file_types=[".pth", ".pt", ".bin"],
        )
        load_file_button = gr.Button("Load Model", variant="primary")

    status = gr.Textbox(
        label="Model Status",
        value=model_status,
        interactive=False,
        scale=4,
    )

    def send_message(message, chat_history, max_new_tokens):
        """Ignore blank messages; otherwise delegate to chat_with_model()."""
        if not message.strip():
            return chat_history, ""

        return chat_with_model(message, chat_history, max_new_tokens)

    # The Send button and pressing Enter in the textbox trigger the same handler.
    send_button.click(
        send_message,
        inputs=[msg, chatbot, max_new_tokens],
        outputs=[chatbot, msg],
    )

    msg.submit(
        send_message,
        inputs=[msg, chatbot, max_new_tokens],
        outputs=[chatbot, msg],
    )

    # Clearing only resets the chat. The status box must not be passed here:
    # the fourth positional argument of click() is api_name, not a component.
    clear_button.click(lambda: None, inputs=None, outputs=chatbot)

    load_url_button.click(
        load_model_from_url,
        inputs=[custom_model_url],
        outputs=[status],
    )

    load_file_button.click(
        load_model_from_file,
        inputs=[model_file],
        outputs=[status],
    )
190
+
191
  if __name__ == "__main__":
192
+ demo.launch(share=True)
193
+
requirements.txt CHANGED
@@ -1 +1,4 @@
1
- huggingface_hub==0.25.2
 
 
 
 
1
+ gradio==5.33.1
2
+ torch==2.7.1
3
+ requests==2.32.3
4
+ pydantic==2.9.2
v1/__init__.py ADDED
File without changes
v1/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (165 Bytes). View file
 
v1/__pycache__/usta_decoder_block.cpython-311.pyc ADDED
Binary file (1.95 kB). View file
 
v1/__pycache__/usta_embedding.cpython-311.pyc ADDED
Binary file (2.69 kB). View file
 
v1/__pycache__/usta_layer_norm.cpython-311.pyc ADDED
Binary file (1.48 kB). View file
 
v1/__pycache__/usta_mlp.cpython-311.pyc ADDED
Binary file (2.58 kB). View file
 
v1/__pycache__/usta_model.cpython-311.pyc ADDED
Binary file (3.03 kB). View file
 
v1/__pycache__/usta_multi_head_attention.cpython-311.pyc ADDED
Binary file (2.01 kB). View file
 
v1/__pycache__/usta_tokenizer.cpython-311.pyc ADDED
Binary file (3.3 kB). View file
 
v1/tokenizer.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "the": 0,
3
+ "capital": 1,
4
+ "of": 2,
5
+ "united": 3,
6
+ "state": 4,
7
+ "is": 5,
8
+ "not": 6,
9
+ "london": 7,
10
+ "france": 8,
11
+ "paris": 9,
12
+ "and": 10,
13
+ "berlin": 11,
14
+ "germany": 12,
15
+ "rome": 13,
16
+ "in": 14,
17
+ "italy": 15,
18
+ "madrid": 16,
19
+ "spain": 17,
20
+ "lisbon": 18,
21
+ "portugal": 19,
22
+ "kingdom": 20,
23
+ "washington": 21,
24
+ "although": 22,
25
+ "these": 23,
26
+ "place": 24,
27
+ "are": 25,
28
+ "often": 26,
29
+ "mention": 27,
30
+ "together": 28,
31
+ "each": 29,
32
+ "country": 30,
33
+ "has": 31,
34
+ "its": 32,
35
+ "own": 33,
36
+ "identity": 34,
37
+ "any": 35,
38
+ "european": 36,
39
+ "city": 37,
40
+ "remain": 38,
41
+ "important": 39,
42
+ "with": 40,
43
+ "a": 41,
44
+ "rich": 42,
45
+ "history": 43,
46
+ "culture": 44,
47
+ "europe": 45,
48
+ "made": 46,
49
+ "many": 47,
50
+ "unique": 48,
51
+ "world": 49,
52
+ "while": 50,
53
+ "known": 51,
54
+ "for": 52,
55
+ "art": 53,
56
+ "fashion": 54,
57
+ "famous": 55,
58
+ "they": 56,
59
+ "ed": 57,
60
+ "s": 58,
61
+ ".": 59,
62
+ ",": 60,
63
+ " ": 61,
64
+ "<unk>": 62,
65
+ "<pad>": 63
66
+ }
v1/u_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7043bb10712bcfa0b6489c906c15a5d528cb2bfdbd3076c9f12de3dcaab95dbf
3
+ size 97223
v1/u_model_4000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7648bd04ddae2c9817df032ae3c30dac7c89cea31ea199ce423aab420e01f4b8
3
+ size 96055
v1/usta_causal_attention.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+
6
class UstaCausalAttention(nn.Module):
    """Single-head scaled dot-product attention with a causal (lower-triangular) mask.

    ``forward`` indexes with ``x.shape[0]`` and uses ``k.T``, so it operates on
    a 2-D ``(tokens, embedding_dim)`` input. ``output_dim`` is accepted but not
    used by this class.
    """

    def __init__(self, embedding_dim, output_dim, context_length, dropout_rate=0.0):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.q_weights = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.k_weights = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.v_weights = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.dropout = nn.Dropout(dropout_rate)
        # Ones on and below the diagonal mark the positions a token may attend to.
        allowed = torch.tril(torch.ones(context_length, context_length))
        self.register_buffer('mask', allowed)
        self.context_length = context_length

    def forward(self, x):
        """Return the causally-masked attention output for ``x``."""
        requested_tokens = x.shape[0]
        # Inputs longer than the context window are truncated to it.
        x = x[:self.context_length]

        queries = self.q_weights(x)
        keys = self.k_weights(x)
        values = self.v_weights(x)

        scores = queries @ keys.T
        # Future positions (above the diagonal) get -inf so softmax gives them zero weight.
        blocked = ~self.mask.bool()[:requested_tokens, :requested_tokens]
        scores = scores.masked_fill_(blocked, -torch.inf)

        scale = keys.shape[1] ** 0.5
        weights = torch.softmax(scores / scale, dim=1)
        weights = self.dropout(weights)
        return weights @ values
34
+
35
+
36
+
37
+
38
+
39
+
v1/usta_decoder_block.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .usta_multi_head_attention import UstaMultiHeadAttention
5
+ from .usta_layer_norm import UstaLayerNorm
6
+ from .usta_mlp import UstaMLP
7
+
8
+
9
class UstaDecoderBlock(nn.Module):
    """One decoder layer: multi-head self-attention followed by a gated MLP.

    Args:
        embedding_dim: Width of the token representations.
        num_heads: Number of attention heads.
        context_length: Maximum sequence length, forwarded to the attention layer.
        dropout_rate: Dropout passed to the attention layer. Defaults to 0.5,
            the value that was previously hard-coded.
    """

    def __init__(self, embedding_dim, num_heads, context_length, dropout_rate=0.5):
        super().__init__()
        self.self_attention = UstaMultiHeadAttention(
            embedding_dim, embedding_dim, context_length, num_heads, dropout_rate=dropout_rate
        )
        self.norm1 = UstaLayerNorm(embedding_dim)
        self.mlp = UstaMLP(embedding_dim, embedding_dim)
        self.norm2 = UstaLayerNorm(embedding_dim)

    def forward(self, x):
        """Apply the attention and MLP sub-layers, each with a residual connection."""
        # NOTE(review): the residual is the *normalised* input, the sub-layer
        # receives the un-normalised input, and the same norm is applied again
        # to the sub-layer output. This differs from the usual pre-/post-norm
        # layouts; it is kept as-is because changing it would alter the output
        # of already-trained weights. Confirm it is intentional.
        res = self.norm1(x)
        x = self.self_attention(x)
        x = self.norm1(x)
        x = x + res

        res = self.norm2(x)
        x = self.mlp(x)
        x = self.norm2(x)
        x = x + res
        return x
28
+
v1/usta_embedding.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
def get_rotary_position_encoding(input: torch.Tensor, base=10000, device="cpu"):
    """Rotate each row of ``input`` by a position-dependent angle (rotary encoding).

    The feature dimension is split into a first and a second half; element
    ``i`` of the first half is paired with element ``i`` of the second half
    and the pair is rotated by ``position * base ** (-i / dimension)``.

    Args:
        input: 2-D tensor of shape ``(context_length, dimension)``;
            ``dimension`` must be even.
        base: Base of the geometric frequency progression.
        device: Device on which the position and frequency tensors are created.

    Returns:
        A new tensor with the same shape and dtype as ``input``.

    Raises:
        AssertionError: If ``dimension`` is odd.
    """
    context_length, dimension = input.shape

    assert dimension % 2 == 0, "dimension must be even"

    half_dimension = dimension // 2

    freqs_indices = torch.arange(0, half_dimension, device=device, dtype=torch.float32)
    freqs = 1.0 / (base ** (freqs_indices / dimension))

    positions = torch.arange(0, context_length, device=device, dtype=torch.float32).unsqueeze(1)
    angles = positions * freqs  # (context_length, half_dimension)

    sin_angles = torch.sin(angles)
    cos_angles = torch.cos(angles)

    first_half = input[:, :half_dimension]
    second_half = input[:, half_dimension:]

    first_rotated = first_half * cos_angles - second_half * sin_angles
    second_rotated = first_half * sin_angles + second_half * cos_angles

    # Concatenate both halves. The previous code wrote the second half with
    # the slice [:, :dimension//2:], which addresses the FIRST half again, so
    # the real second half was left as uninitialised memory.
    # NOTE(review): weights trained against the old behaviour may need retraining.
    return torch.cat([first_rotated, second_rotated], dim=1).to(input.dtype)
34
+
35
+
36
class UstaEmbedding(nn.Module):
    """Token embedding followed by rotary position encoding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        context_length: Accepted for interface compatibility; not used here
            because the rotary encoding has no learned position table.
    """

    def __init__(self, vocab_size, embedding_dim, context_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.get_pos = get_rotary_position_encoding

    def forward(self, x):
        """Look up token embeddings and rotate them according to position."""
        x = self.embedding(x)  # dictionary meaning of the tokens (words)
        # Build the position tensors on the same device as the embeddings so
        # the encoding also works when the module is not on the CPU.
        x = self.get_pos(x, device=x.device)  # meaning of the tokens according to their position
        return x
v1/usta_layer_norm.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
class UstaLayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable scale and no bias.

    Args:
        embedding_dim: Size of the last dimension being normalised.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, embedding_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(embedding_dim))

    def forward(self, x):
        """Normalise every vector along the last dimension independently."""
        mean = x.mean(dim=-1, keepdim=True)
        # Per-vector, biased variance. The previous ``x.var()`` computed one
        # unbiased variance over the whole tensor, so each token's
        # normalisation depended on all other tokens in the sequence.
        # NOTE(review): weights trained against the old behaviour may need retraining.
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized_x = (x - mean) / torch.sqrt(variance + self.eps)
        return self.weight * normalized_x
v1/usta_mlp.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+
6
class GELU(nn.Module):
    """Tanh approximation of the GELU activation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``.
    """

    # Computed once as a plain float instead of allocating a tensor on every
    # forward call.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Apply the activation element-wise."""
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
16
+
17
+
18
class UstaMLP(nn.Module):
    """Gated feed-forward block: ``down_proj(gelu(gate_proj(x)) * up_proj(x))``.

    Args:
        embedding_dim: Input and output width.
        hidden_dim: Width of the gate and up projections.
    """

    def __init__(self, embedding_dim, hidden_dim):
        super().__init__()

        self.gate_proj = nn.Linear(embedding_dim, hidden_dim)
        self.up_proj = nn.Linear(embedding_dim, hidden_dim)
        self.down_proj = nn.Linear(hidden_dim, embedding_dim)
        self.gelu = GELU()

    def forward(self, x):
        """Return the gated projection of ``x`` mapped back to ``embedding_dim``."""
        activated_gate = self.gelu(self.gate_proj(x))
        # The activated gate scales the "up" branch element-wise.
        gated = activated_gate * self.up_proj(x)
        return self.down_proj(gated)
34
+
v1/usta_model.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .usta_decoder_block import UstaDecoderBlock
5
+ from .usta_embedding import UstaEmbedding
6
+
7
+
8
+
9
+
10
class UstaModel(nn.Module):
    """Decoder-only language model: embedding, stacked decoder blocks, LM head.

    Args:
        vocab_size: Size of the token vocabulary (also the output width).
        embedding_dim: Width of the token representations.
        num_heads: Attention heads per decoder block.
        context_length: Maximum number of tokens the model attends over.
        num_layers: Number of decoder blocks.
    """

    def __init__(self, vocab_size, embedding_dim, num_heads, context_length, num_layers):
        super().__init__()

        # Plain attribute (not part of the state dict); generate() uses it as its length limit.
        self.context_length = context_length
        self.embedding = UstaEmbedding(vocab_size, embedding_dim, context_length)
        self.layers = nn.Sequential(
            *[UstaDecoderBlock(embedding_dim, num_heads, context_length) for _ in range(num_layers)]
        )
        self.lm_head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x: torch.Tensor):
        """Map a 1-D tensor of token ids to one row of vocabulary logits per token."""
        x = self.embedding(x)  # dictionary meaning of the tokens (words)
        x = self.layers(x)
        x = self.lm_head(x)
        return x

    def generate(self, x: torch.Tensor, max_new_tokens: int, eos_token_id: int = 59):
        """Greedily extend the prompt ``x`` one token at a time.

        Args:
            x: 1-D tensor of prompt token ids.
            max_new_tokens: Maximum number of tokens to append.
            eos_token_id: Token id that ends generation. Defaults to 59, the
                id the previous implementation hard-coded as end-of-sentence.

        Returns:
            A list of ints: the prompt ids followed by the generated ids.
            Generation stops early at ``eos_token_id`` or once the sequence
            grows beyond ``context_length``.
        """
        tokens = x.detach().cpu().tolist()

        for _ in range(max_new_tokens):
            out = self.forward(x)
            # forward() returns (tokens, vocab) logits, so the next-token
            # distribution is the LAST ROW. The previous ``out[:, -1]`` took
            # the last vocabulary column and so returned a position index
            # instead of a token id. Softmax is monotonic, so argmax on the
            # logits picks the same token.
            next_token = int(torch.argmax(out[-1], dim=-1))
            tokens.append(next_token)
            if next_token == eos_token_id or len(tokens) > self.context_length:
                break
            x = torch.tensor(tokens, device=x.device)

        return tokens
v1/usta_multi_head_attention.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
class UstaMultiHeadAttention(nn.Module):
    """Causal multi-head self-attention built on ``nn.MultiheadAttention``.

    Args:
        embedding_dim: Width of the input vectors.
        output_dim: Width produced by the final linear projection.
        context_length: Longest sequence attended over; longer inputs are truncated.
        num_heads: Number of attention heads.
        dropout_rate: Dropout applied inside the attention module.
    """

    def __init__(self, embedding_dim, output_dim, context_length, num_heads, dropout_rate = 0.0):
        super().__init__()

        self.context_length = context_length

        self.multi_head_attention = nn.MultiheadAttention(embedding_dim, num_heads, dropout=dropout_rate)
        self.projection = nn.Linear(embedding_dim, output_dim)

        # True strictly above the diagonal marks future positions, which
        # nn.MultiheadAttention treats as "not allowed to attend".
        future_positions = ~torch.tril(torch.ones(context_length, context_length)).bool()
        self.register_buffer("mask", future_positions)

    def forward(self, x):
        """Attend causally over ``x`` (indexed along dim 0) and project the result."""
        requested_tokens = x.shape[0]
        x = x[:self.context_length]
        causal_mask = self.mask[:requested_tokens, :requested_tokens]  # type: ignore
        attended, _ = self.multi_head_attention(x, x, x, attn_mask=causal_mask)
        return self.projection(attended)
v1/usta_multi_head_attention_old.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+ from .usta_causal_attention import UstaCausalAttention
6
+
7
+
8
+
9
class UstaMultiHeadAttention(torch.nn.Module):
    """Multi-head attention assembled from independent UstaCausalAttention heads.

    Every head is constructed with the full ``embedding_dim``; the head outputs
    are concatenated along the feature dimension and projected to ``output_dim``.

    Args:
        embedding_dim: Width of the input vectors.
        output_dim: Width produced by the final projection.
        context_length: Maximum sequence length, forwarded to every head.
        num_heads: Number of attention heads.
        dropout_rate: Dropout forwarded to every head.
    """

    def __init__(self, embedding_dim, output_dim, context_length, num_heads, dropout_rate=0.0):
        super().__init__()
        self.heads = nn.ModuleList(
            [
                UstaCausalAttention(embedding_dim, output_dim, context_length, dropout_rate)
                for _ in range(num_heads)
            ]
        )
        # The concatenated head outputs are num_heads * embedding_dim wide.
        # The projection previously expected only embedding_dim, which raised
        # a shape error whenever num_heads > 1.
        self.projection = nn.Linear(embedding_dim * num_heads, output_dim)

    def forward(self, x):
        """Run every head on ``x``, concatenate along dim 1, and project."""
        attention_out = torch.cat([head(x) for head in self.heads], dim=1)
        return self.projection(attention_out)
22
+
v1/usta_self_attention.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
+
6
class UstaSelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention without masking.

    ``forward`` uses ``k.T`` and softmax over dim 1, so it operates on a 2-D
    ``(tokens, embedding_dim)`` input. ``output_dim`` is accepted but not used
    by this class.
    """

    def __init__(self, embedding_dim, output_dim):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.q_weights = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.k_weights = nn.Linear(embedding_dim, embedding_dim, bias=False)
        self.v_weights = nn.Linear(embedding_dim, embedding_dim, bias=False)

    def forward(self, x):
        """Return, for every token, the attention-weighted mix of all value vectors."""
        queries = self.q_weights(x)
        keys = self.k_weights(x)
        values = self.v_weights(x)

        # Scale by sqrt(key width) before the softmax.
        scale = keys.shape[-1] ** 0.5
        weights = torch.softmax((queries @ keys.T) / scale, dim=1)
        return weights @ values
25
+
26
+
27
+
28
+
29
+
30
+
v1/usta_tokenizer.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import torch
4
+
5
+
6
class UstaTokenizer:
    """Greedy longest-match sub-word tokenizer backed by a JSON vocabulary.

    The vocabulary file maps token strings to integer ids and must contain
    the entries ``" "`` and ``"<unk>"``, which ``encode`` looks up directly.
    """

    def __init__(self, vocab_file):
        """Load the ``{token: id}`` mapping from ``vocab_file`` and build its inverse."""
        # Explicit encoding so loading does not depend on the platform default.
        with open(vocab_file, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
            self.reverse_vocab = {v: k for k, v in self.vocab.items()}

    def encode(self, text):
        """Convert ``text`` to a 1-D ``torch.long`` tensor of token ids.

        The text is split on whitespace. Each word is consumed left to right,
        always taking the longest vocabulary entry that matches at the current
        position (e.g. "states" -> "state" + "s"); a character with no match
        becomes ``<unk>``. A space token separates words. Empty or
        whitespace-only text yields an empty tensor.
        """
        tokens = []

        for word in text.split():
            i = 0
            while i < len(word):
                # Try the longest candidate first, shrinking from the right.
                for j in range(len(word), i, -1):
                    sub_word = word[i:j]
                    if sub_word in self.vocab:
                        tokens.append(self.vocab[sub_word])
                        i = j
                        break
                else:
                    # Nothing in the vocabulary starts here: emit <unk>, skip one char.
                    tokens.append(self.vocab["<unk>"])
                    i += 1
            tokens.append(self.vocab[" "])

        # Drop the separator added after the last word unless the text really
        # ends with a space. The ``tokens`` guard prevents popping from an
        # empty list when the text contains no words.
        if tokens and not text.endswith(" "):
            tokens.pop()
        # Explicit dtype: an empty list would otherwise become a float tensor.
        return torch.tensor(tokens, dtype=torch.long)

    def tokenize(self, text):
        """Return the token strings that ``encode(text)`` produces."""
        return [self.reverse_vocab[token_id] for token_id in self.encode(text).tolist()]

    def decode(self, ids):
        """Join the token strings for ``ids`` (a list of ints or a 1-D tensor)."""
        # int() lets tensor elements be used as dictionary keys as well.
        return "".join(self.reverse_vocab[int(token_id)] for token_id in ids)