FarhanAK128 committed on
Commit
91a975f
verified · 1 Parent(s): 53ed65f

Update model_class.py

Files changed (1)
  1. model_class.py +210 -187
model_class.py CHANGED
@@ -1,187 +1,210 @@
-import torch
-import torch.nn as nn
-from transformers import PreTrainedModel, PretrainedConfig
-# from huggingface_hub import PyTorchModelHubMixin
-
-
-class MultiheadAttention(nn.Module):
-    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
-        super().__init__()
-
-        self.d_out = d_out
-        self.num_heads = num_heads
-        self.head_dim = d_out // num_heads
-
-        #step 3
-        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
-        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
-        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
-
-        self.out_proj = nn.Linear(d_out, d_out)
-        self.dropout = nn.Dropout(dropout)
-        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
-
-
-    def forward(self, x):
-        b, num_tokens, d_in = x.shape
-
-        #step 4
-        keys = self.W_key(x)
-        queries = self.W_query(x)
-        values = self.W_value(x)
-
-        #step 5
-        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
-        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
-        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
-
-        #step 6
-        keys = keys.transpose(1, 2)
-        queries = queries.transpose(1, 2)
-        values = values.transpose(1, 2)
-
-        #step 7
-        attn_scores = queries @ keys.transpose(2, 3)
-
-        #step 8
-        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
-        attn_scores.masked_fill_(mask_bool, -torch.inf)
-
-        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
-        attn_weights = self.dropout(attn_weights)
-
-        #step 9 - 11
-        ctx_vec = (attn_weights @ values).transpose(1, 2)
-
-        #step 12
-        ctx_vec = ctx_vec.contiguous().view(b, num_tokens, self.d_out)
-        ctx_vec = self.out_proj(ctx_vec)
-
-        return ctx_vec
-
-#==========================================================================
-
-
-class LayerNorm(nn.Module):
-    def __init__(self, emb_dim):
-        super().__init__()
-        self.eps = 1e-5
-        self.scale = nn.Parameter(torch.ones(emb_dim))
-        self.shift = nn.Parameter(torch.zeros(emb_dim))
-
-    def forward(self, x):
-        mean = x.mean(dim=-1, keepdim=True)
-        var = x.var(dim=-1, keepdim=True, unbiased=False)
-        norm_x = (x - mean) / torch.sqrt(var + self.eps)
-        return self.scale * norm_x + self.shift
-
-#==========================================================================
-
-
-class GeLU(nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x):
-        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))
-
-#==========================================================================
-
-
-class FeedForward(nn.Module):
-    def __init__(self, cfg):
-        super().__init__()
-        self.layers = nn.Sequential(
-            nn.Linear(cfg.emb_dim, 4*cfg.emb_dim),
-            GeLU(),
-            nn.Linear(4*cfg.emb_dim, cfg.emb_dim)
-        )
-    def forward(self, x):
-        return self.layers(x)
-
-#==========================================================================
-
-class TransformerBlock(nn.Module):
-    def __init__(self, cfg):
-        super().__init__()
-        self.att = MultiheadAttention(
-            d_in = cfg.emb_dim,
-            d_out = cfg.emb_dim,
-            context_length = cfg.context_length,
-            dropout = cfg.drop_rate,
-            num_heads = cfg.n_heads,
-            qkv_bias = cfg.qkv_bias
-        )
-        self.ff = FeedForward(cfg)
-        self.norm1 = LayerNorm(cfg.emb_dim)
-        self.norm2 = LayerNorm(cfg.emb_dim)
-        self.drop_shortcut = nn.Dropout(cfg.drop_rate)
-
-    def forward(self, x):
-        shortcut = x
-        x = self.norm1(x)
-        x = self.att(x)
-        x = self.drop_shortcut(x)
-        x = x + shortcut
-
-        shortcut = x
-        x = self.norm2(x)
-        x = self.ff(x)
-        x = self.drop_shortcut(x)
-        x = x + shortcut
-
-        return x
-
-#=======================================================================
-
-# "config": {
-#     "classes": 8,
-#     "context_length": 1024,
-#     "drop_rate": 0.1,
-#     "emb_dim": 768,
-#     "n_heads": 12,
-#     "n_layers": 12,
-#     "qkv_bias": true,
-#     "vocab_size": 50257
-# },
-
-class TicketGPTConfig(PretrainedConfig):
-    model_type = "ticket_gpt"  # Unique identifier for the AutoClass
-    def __init__(self, classes=8, context_length=1024, drop_rate=0.1, emb_dim=768, n_heads=12, n_layers=12, qkv_bias=True, vocab_size=50257, **kwargs):
-        super().__init__(**kwargs)
-        self.classes = classes
-        self.context_length = context_length
-        self.drop_rate = drop_rate
-        self.emb_dim = emb_dim
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.qkv_bias = qkv_bias
-        self.vocab_size = vocab_size
-
-class TicketGPT(
-    PreTrainedModel,
-):
-    config_class = TicketGPTConfig
-    def __init__(self, config):
-        super().__init__(config)
-        self.tok_emb = nn.Embedding(config.vocab_size, config.emb_dim)
-        self.pos_emb = nn.Embedding(config.context_length, config.emb_dim)
-        self.drop_emb = nn.Dropout(config.drop_rate)
-
-        self.trf_blocks = nn.Sequential(
-            *[TransformerBlock(config) for _ in range(config.n_layers)]
-        )
-
-        self.final_norm = LayerNorm(config.emb_dim)
-        self.out_head = nn.Linear(config.emb_dim, config.classes, bias=True)
-
-    def forward(self, x):
-        batch_size, seq_len = x.shape
-        tok_embeddings = self.tok_emb(x)  #[2,4,768]
-        pos_embeddings = self.pos_emb(torch.arange(seq_len, device=x.device))  #[2,4,768]
-        x = tok_embeddings + pos_embeddings  #[2,4,768]
-        x = self.drop_emb(x)
-        x = self.trf_blocks(x)
-        x = self.final_norm(x)
-        logits = self.out_head(x)  #[2,4,50257]
-        return logits
+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel, PretrainedConfig
+# from huggingface_hub import PyTorchModelHubMixin
+
+
+class MultiheadAttention(nn.Module):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
+        super().__init__()
+
+        self.d_out = d_out
+        self.num_heads = num_heads
+        self.head_dim = d_out // num_heads
+
+        #step 3
+        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
+        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
+        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
+
+        self.out_proj = nn.Linear(d_out, d_out)
+        self.dropout = nn.Dropout(dropout)
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
+
+
+    def forward(self, x):
+        b, num_tokens, d_in = x.shape
+
+        #step 4
+        keys = self.W_key(x)
+        queries = self.W_query(x)
+        values = self.W_value(x)
+
+        #step 5
+        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
+        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)
+        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
+
+        #step 6
+        keys = keys.transpose(1, 2)
+        queries = queries.transpose(1, 2)
+        values = values.transpose(1, 2)
+
+        #step 7
+        attn_scores = queries @ keys.transpose(2, 3)
+
+        #step 8
+        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
+        attn_scores.masked_fill_(mask_bool, -torch.inf)
+
+        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
+        attn_weights = self.dropout(attn_weights)
+
+        #step 9 - 11
+        ctx_vec = (attn_weights @ values).transpose(1, 2)
+
+        #step 12
+        ctx_vec = ctx_vec.contiguous().view(b, num_tokens, self.d_out)
+        ctx_vec = self.out_proj(ctx_vec)
+
+        return ctx_vec
+
+#==========================================================================
+
+
+class LayerNorm(nn.Module):
+    def __init__(self, emb_dim):
+        super().__init__()
+        self.eps = 1e-5
+        self.scale = nn.Parameter(torch.ones(emb_dim))
+        self.shift = nn.Parameter(torch.zeros(emb_dim))
+
+    def forward(self, x):
+        mean = x.mean(dim=-1, keepdim=True)
+        var = x.var(dim=-1, keepdim=True, unbiased=False)
+        norm_x = (x - mean) / torch.sqrt(var + self.eps)
+        return self.scale * norm_x + self.shift
+
+#==========================================================================
+
+
+class GeLU(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))
+
+#==========================================================================
+
+
+class FeedForward(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Linear(cfg.emb_dim, 4*cfg.emb_dim),
+            GeLU(),
+            nn.Linear(4*cfg.emb_dim, cfg.emb_dim)
+        )
+    def forward(self, x):
+        return self.layers(x)
+
+#==========================================================================
+
+class TransformerBlock(nn.Module):
+    def __init__(self, cfg):
+        super().__init__()
+        self.att = MultiheadAttention(
+            d_in = cfg.emb_dim,
+            d_out = cfg.emb_dim,
+            context_length = cfg.context_length,
+            dropout = cfg.drop_rate,
+            num_heads = cfg.n_heads,
+            qkv_bias = cfg.qkv_bias
+        )
+        self.ff = FeedForward(cfg)
+        self.norm1 = LayerNorm(cfg.emb_dim)
+        self.norm2 = LayerNorm(cfg.emb_dim)
+        self.drop_shortcut = nn.Dropout(cfg.drop_rate)
+
+    def forward(self, x):
+        shortcut = x
+        x = self.norm1(x)
+        x = self.att(x)
+        x = self.drop_shortcut(x)
+        x = x + shortcut
+
+        shortcut = x
+        x = self.norm2(x)
+        x = self.ff(x)
+        x = self.drop_shortcut(x)
+        x = x + shortcut
+
+        return x
+
+#=======================================================================
+
+class TicketGPTConfig(PretrainedConfig):
+    model_type = "ticket_gpt"  # Unique identifier for the AutoClass
+    def __init__(self, classes=8, context_length=1024, drop_rate=0.1, emb_dim=768, n_heads=12, n_layers=12, qkv_bias=True, vocab_size=50257, **kwargs):
+        super().__init__(**kwargs)
+        self.classes = classes
+        self.context_length = context_length
+        self.drop_rate = drop_rate
+        self.emb_dim = emb_dim
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.qkv_bias = qkv_bias
+        self.vocab_size = vocab_size
+
+class TicketGPT(
+    PreTrainedModel,
+):
+    config_class = TicketGPTConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.tok_emb = nn.Embedding(config.vocab_size, config.emb_dim)
+        self.pos_emb = nn.Embedding(config.context_length, config.emb_dim)
+        self.drop_emb = nn.Dropout(config.drop_rate)
+
+        self.trf_blocks = nn.Sequential(
+            *[TransformerBlock(config) for _ in range(config.n_layers)]
+        )
+
+        self.final_norm = LayerNorm(config.emb_dim)
+        self.out_head = nn.Linear(config.emb_dim, config.classes, bias=True)
+
+    def forward(self, x):
+        batch_size, seq_len = x.shape
+        tok_embeddings = self.tok_emb(x)  #[2,4,768]
+        pos_embeddings = self.pos_emb(torch.arange(seq_len, device=x.device))  #[2,4,768]
+        x = tok_embeddings + pos_embeddings  #[2,4,768]
+        x = self.drop_emb(x)
+        x = self.trf_blocks(x)
+        x = self.final_norm(x)
+        logits = self.out_head(x)  #[2,4,8] -> (batch, seq_len, classes)
+        return logits
+
+def classify_review(text, model, tokenizer, max_length=None, pad_token_id=50256):
+    lookup = {
+        0: "Hardware",
+        1: "HR Support",
+        2: "Access",
+        3: "Miscellaneous",
+        4: "Storage",
+        5: "Purchase",
+        6: "Internal Project",
+        7: "Administrative rights"
+    }
+
+    current_device = next(model.parameters()).device
+    model.eval()
+
+    # Prepare inputs to the model
+    input_ids = tokenizer.encode(text)
+    supported_context_length = model.pos_emb.weight.shape[0]
+    if max_length is None:
+        max_length = supported_context_length
+
+    # Truncate sequences if they are too long
+    input_ids = input_ids[:min(max_length, supported_context_length)]
+
+    # Pad sequences to the longest sequence
+    input_ids += [pad_token_id] * (max_length - len(input_ids))
+    input_tensor = torch.tensor(input_ids, device=current_device).unsqueeze(0)  # add batch dimension
+
+    # Model inference
+    with torch.no_grad():
+        logits = model(input_tensor)[:, -1, :]  # Logits of the last output token
+    predicted_label = torch.argmax(logits, dim=-1).item()
+
+    # Return the classified result
+    return lookup[predicted_label]
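
For context, here is a minimal usage sketch of the committed file. It assumes the GPT-2 byte-pair tokenizer from tiktoken (consistent with vocab_size=50257 and pad_token_id=50256 above); the checkpoint path "./ticket_gpt_checkpoint", the example ticket text, and max_length=256 are illustrative placeholders, not part of this commit.

import tiktoken
from model_class import TicketGPT, classify_review

# Hypothetical local directory produced by model.save_pretrained(...);
# substitute the actual Hub repo id or local path.
model = TicketGPT.from_pretrained("./ticket_gpt_checkpoint")

# Assumed tokenizer: GPT-2 BPE, matching the 50257-token vocabulary.
tokenizer = tiktoken.get_encoding("gpt2")

label = classify_review(
    "My laptop won't boot after the latest update.",  # placeholder ticket text
    model,
    tokenizer,
    max_length=256,  # illustrative; must not exceed context_length=1024
)
print(label)  # one of the eight ticket categories, e.g. "Hardware"

Because classify_review reads only the logits of the last token, padding with pad_token_id after the real tokens keeps the prediction consistent with how the classification head was trained on fixed-length sequences.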