FarhanAK128 committed
Commit 53ed65f · verified · 1 Parent(s): a800925

Initial upload from local save

Files changed (4)
  1. __init__.py +0 -0
  2. config.json +20 -0
  3. model.safetensors +3 -0
  4. model_class.py +187 -0
__init__.py ADDED
(empty file)
config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "architectures": [
+     "TicketGPT"
+   ],
+   "classes": 8,
+   "context_length": 1024,
+   "drop_rate": 0.1,
+   "dtype": "float32",
+   "emb_dim": 768,
+   "model_type": "ticket_gpt",
+   "n_heads": 12,
+   "n_layers": 12,
+   "qkv_bias": true,
+   "transformers_version": "4.57.3",
+   "vocab_size": 50257,
+   "auto_map": {
+     "AutoConfig": "model_class.TicketGPTConfig",
+     "AutoModel": "model_class.TicketGPT"
+   }
+ }
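The auto_map entries above point the Transformers Auto classes at the custom code in model_class.py, so the repository can be loaded without importing that file by hand. A minimal load sketch, assuming a placeholder repo id (the actual repository name is not shown in this commit):

from transformers import AutoConfig, AutoModel

repo_id = "FarhanAK128/ticket-gpt"  # placeholder repo id, not confirmed by this commit

# trust_remote_code=True is required because config.json maps AutoConfig / AutoModel
# onto the TicketGPTConfig / TicketGPT classes defined in model_class.py
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)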
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62a4f5687aafc68d9848f3ab22948c6119de9e3810a5df1d943a1f63114e3698
+ size 548136408
model_class.py ADDED
@@ -0,0 +1,187 @@
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel, PretrainedConfig
+ # from huggingface_hub import PyTorchModelHubMixin
+
+
+ class MultiheadAttention(nn.Module):
+     def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
+         super().__init__()
+
+         self.d_out = d_out
+         self.num_heads = num_heads
+         self.head_dim = d_out // num_heads
+
+         # learnable projections for queries, keys and values
+         self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
+         self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
+         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
+
+         self.out_proj = nn.Linear(d_out, d_out)
+         self.dropout = nn.Dropout(dropout)
+         self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
+
+
+     def forward(self, x):
+         b, num_tokens, d_in = x.shape
+
+         # project the inputs into query, key and value tensors
+         keys = self.W_key(x)
+         queries = self.W_query(x)
+         values = self.W_value(x)
+
+         # split the embedding dimension into (num_heads, head_dim)
+         keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
+         queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
+         values = values.view(b, num_tokens, self.num_heads, self.head_dim)
+
+         # move the head dimension forward: (b, num_heads, num_tokens, head_dim)
+         keys = keys.transpose(1, 2)
+         queries = queries.transpose(1, 2)
+         values = values.transpose(1, 2)
+
+         # unnormalised attention scores per head
+         attn_scores = queries @ keys.transpose(2, 3)
+
+         # causal mask: tokens may not attend to future positions
+         mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
+         attn_scores.masked_fill_(mask_bool, -torch.inf)
+
+         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
+         attn_weights = self.dropout(attn_weights)
+
+         # weighted sum of the values, then move heads back next to the token dimension
+         ctx_vec = (attn_weights @ values).transpose(1, 2)
+
+         # merge the heads and apply the output projection
+         ctx_vec = ctx_vec.contiguous().view(b, num_tokens, self.d_out)
+         ctx_vec = self.out_proj(ctx_vec)
+
+         return ctx_vec
+
+ #==========================================================================
+
+
+ class LayerNorm(nn.Module):
+     def __init__(self, emb_dim):
+         super().__init__()
+         self.eps = 1e-5
+         self.scale = nn.Parameter(torch.ones(emb_dim))
+         self.shift = nn.Parameter(torch.zeros(emb_dim))
+
+     def forward(self, x):
+         mean = x.mean(dim=-1, keepdim=True)
+         var = x.var(dim=-1, keepdim=True, unbiased=False)
+         norm_x = (x - mean) / torch.sqrt(var + self.eps)
+         return self.scale * norm_x + self.shift
+
+ #==========================================================================
+
+
+ class GeLU(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x):
+         return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))
+
+ #==========================================================================
+
+
+ class FeedForward(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.layers = nn.Sequential(
+             nn.Linear(cfg.emb_dim, 4 * cfg.emb_dim),
+             GeLU(),
+             nn.Linear(4 * cfg.emb_dim, cfg.emb_dim)
+         )
+     def forward(self, x):
+         return self.layers(x)
+
+ #==========================================================================
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, cfg):
+         super().__init__()
+         self.att = MultiheadAttention(
+             d_in=cfg.emb_dim,
+             d_out=cfg.emb_dim,
+             context_length=cfg.context_length,
+             dropout=cfg.drop_rate,
+             num_heads=cfg.n_heads,
+             qkv_bias=cfg.qkv_bias
+         )
+         self.ff = FeedForward(cfg)
+         self.norm1 = LayerNorm(cfg.emb_dim)
+         self.norm2 = LayerNorm(cfg.emb_dim)
+         self.drop_shortcut = nn.Dropout(cfg.drop_rate)
+
+     def forward(self, x):
+         # attention sub-layer with pre-norm and residual connection
+         shortcut = x
+         x = self.norm1(x)
+         x = self.att(x)
+         x = self.drop_shortcut(x)
+         x = x + shortcut
+
+         # feed-forward sub-layer with pre-norm and residual connection
+         shortcut = x
+         x = self.norm2(x)
+         x = self.ff(x)
+         x = self.drop_shortcut(x)
+         x = x + shortcut
+
+         return x
+
+ #=======================================================================
+
+ # "config": {
+ #     "classes": 8,
+ #     "context_length": 1024,
+ #     "drop_rate": 0.1,
+ #     "emb_dim": 768,
+ #     "n_heads": 12,
+ #     "n_layers": 12,
+ #     "qkv_bias": true,
+ #     "vocab_size": 50257
+ # },
+
+ class TicketGPTConfig(PretrainedConfig):
+     model_type = "ticket_gpt"  # unique identifier for the AutoClass mapping
+     def __init__(self, classes=8, context_length=1024, drop_rate=0.1, emb_dim=768, n_heads=12, n_layers=12, qkv_bias=True, vocab_size=50257, **kwargs):
+         super().__init__(**kwargs)
+         self.classes = classes
+         self.context_length = context_length
+         self.drop_rate = drop_rate
+         self.emb_dim = emb_dim
+         self.n_heads = n_heads
+         self.n_layers = n_layers
+         self.qkv_bias = qkv_bias
+         self.vocab_size = vocab_size
+
+ class TicketGPT(PreTrainedModel):
+     config_class = TicketGPTConfig
+     def __init__(self, config):
+         super().__init__(config)
+         self.tok_emb = nn.Embedding(config.vocab_size, config.emb_dim)
+         self.pos_emb = nn.Embedding(config.context_length, config.emb_dim)
+         self.drop_emb = nn.Dropout(config.drop_rate)
+
+         self.trf_blocks = nn.Sequential(
+             *[TransformerBlock(config) for _ in range(config.n_layers)]
+         )
+
+         self.final_norm = LayerNorm(config.emb_dim)
+         self.out_head = nn.Linear(config.emb_dim, config.classes, bias=True)
+
+     def forward(self, x):
+         batch_size, seq_len = x.shape
+         tok_embeddings = self.tok_emb(x)  # [batch_size, seq_len, emb_dim]
+         pos_embeddings = self.pos_emb(torch.arange(seq_len, device=x.device))  # [seq_len, emb_dim]
+         x = tok_embeddings + pos_embeddings  # broadcasts to [batch_size, seq_len, emb_dim]
+         x = self.drop_emb(x)
+         x = self.trf_blocks(x)
+         x = self.final_norm(x)
+         logits = self.out_head(x)  # [batch_size, seq_len, classes]
+         return logits
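For reference, a minimal inference sketch against the uploaded weights. It assumes the GPT-2 BPE tokenizer (implied by vocab_size 50257) and last-token pooling over the per-token logits; neither choice is confirmed by this commit, and the repo id is a placeholder.

import torch
from transformers import AutoModel, AutoTokenizer

repo_id = "FarhanAK128/ticket-gpt"  # placeholder repo id, not confirmed by this commit
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # assumption: GPT-2 BPE vocabulary

model.eval()
ids = tokenizer("My laptop cannot connect to the office VPN.", return_tensors="pt").input_ids
with torch.no_grad():
    logits = model(ids)                        # [1, seq_len, classes] per TicketGPT.forward
pred = logits[:, -1, :].argmax(dim=-1).item()  # assumption: pool on the last token
print(pred)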