DornierDo17 committed
Commit 0c8750c · 1 Parent(s): 97eed42

first commit

Files changed (6)
  1. MLMHead.py +16 -0
  2. RoBERTaModule.py +54 -0
  3. app.py +32 -0
  4. model.py +29 -0
  5. requirements.txt +5 -0
  6. utils.py +87 -0
MLMHead.py ADDED
@@ -0,0 +1,16 @@
+ import torch.nn as nn
+
+
+ class MLMHead(nn.Module):
+     def __init__(self, d_model=256):
+         super().__init__()
+         self.lin = nn.Linear(d_model, d_model, bias=False)
+         self.gelu = nn.GELU()
+         self.norm = nn.LayerNorm(d_model)
+
+     def forward(self, x):
+         x = self.lin(x)
+         x = self.gelu(x)
+         x = self.norm(x)
+
+         return x
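A quick shape check for the head above (an illustrative sketch, not part of this commit): MLMHead maps hidden states of shape (batch, seq_len, d_model) to the same shape; the projection to vocabulary logits happens later, in model.py, via weight tying.

import torch
from MLMHead import MLMHead

head = MLMHead(d_model=256)
hidden = torch.randn(2, 128, 256)  # (batch, seq_len, d_model)
out = head(hidden)                 # Linear -> GELU -> LayerNorm, shape unchanged
print(out.shape)                   # torch.Size([2, 128, 256])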
RoBERTaModule.py ADDED
@@ -0,0 +1,54 @@
+ import copy
+ import torch
+ import torch.nn.functional as F
+ from model import RoBERTa
+ from torch import nn
+ from torch.amp import GradScaler, autocast
+ from torch.utils.tensorboard import SummaryWriter
+ from tqdm import tqdm
+ from transformers import get_cosine_schedule_with_warmup
+ from transformers import RobertaTokenizerFast
+
+
+ class RoBERTaModule(nn.Module):
+     def __init__(self):
+         super().__init__()
+         self.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+         self.model = RoBERTa(vocab_size=self.tokenizer.vocab_size, padding_idx=self.tokenizer.pad_token_id)
+
+     def forward(self, x, attn_mask):
+         return self.model(x, attn_mask)
+
+     def inference(self, sentence):
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.model.to(device)
+         self.model.eval()
+
+         tokenizer = self.tokenizer
+         input_ids = tokenizer.encode(sentence)
+         input_ids_tensor = torch.tensor([input_ids]).to(device)
+         attention_mask = (input_ids_tensor != tokenizer.pad_token_id).long()  # 1 for real tokens, 0 for padding
+
+         mask_token_id = tokenizer.mask_token_id
+         mask_indices = [i for i, token in enumerate(input_ids) if token == mask_token_id]
+         if not mask_indices:
+             return "No <mask> token found"
+
+         with torch.no_grad():
+             logits = self.model(input_ids_tensor, attention_mask)  # (1, seq_len, vocab_size)
+
+         predicted_tokens = []
+         for idx in mask_indices:
+             pred_token_id = logits[0, idx].argmax().item()
+             predicted_tokens.append(tokenizer.decode([pred_token_id]))
+
+         return predicted_tokens if len(predicted_tokens) > 1 else predicted_tokens[0]
+
+     def load_checkpoint(self, path="finishedBest10.pt"):
+         checkpoint = torch.load(path, map_location=torch.device("cpu"))
+         self.model.load_state_dict(checkpoint["model_state_dict"])
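load_checkpoint above expects a dict with a "model_state_dict" entry. A minimal sketch of producing and consuming a compatible checkpoint (illustrative only; the training script itself is not part of this commit):

import torch
from RoBERTaModule import RoBERTaModule

module = RoBERTaModule()
# ... train module.model here ...
torch.save({"model_state_dict": module.model.state_dict()}, "finishedBest10.pt")

# Later, for inference:
module.load_checkpoint("finishedBest10.pt")
print(module.inference("The capital of France is <mask>."))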
app.py ADDED
@@ -0,0 +1,32 @@
+ import gradio as gr
+ from RoBERTaModule import RoBERTaModule
+ from transformers import RobertaTokenizerFast
+ from huggingface_hub import hf_hub_download
+
+
+ MODEL_REPO_ID = "DornierDo17/RoBERTa_17.7M"
+ WEIGHTS_FILE = "finishedBest10.pt"
+
+ weight_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=WEIGHTS_FILE)
+
+ model = RoBERTaModule()
+
+ model.load_checkpoint(path=weight_path)
+ tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+
+
+ def predict(sentence):
+     try:
+         result = model.inference(sentence)
+         return result
+     except Exception as e:
+         return str(e)
+
+
+ gr.Interface(
+     fn=predict,
+     inputs=gr.Textbox(label="Enter sentence with <mask>"),
+     outputs=gr.Textbox(label="Predicted token(s)"),
+     title="RoBERTa MLM Inference",
+ ).launch()
model.py ADDED
@@ -0,0 +1,29 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from MLMHead import MLMHead
+ from utils import TransformerBlock
+
+
+ class RoBERTa(nn.Module):
+     def __init__(self, vocab_size, padding_idx, max_sequence_length=128, d_model=256, layers=6):
+         super().__init__()
+         self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
+         self.pos_emb = nn.Embedding(max_sequence_length, d_model)
+         self.trf_block = nn.Sequential(*[TransformerBlock(d_model=d_model) for _ in range(layers)])
+         self.mlmHead = MLMHead(d_model)
+
+     def forward(self, x, attn_mask):
+         batch_size, seq_len = x.shape
+         tok_emb = self.tok_emb(x)
+         pos_emb = self.pos_emb(torch.arange(seq_len, device=x.device)).unsqueeze(0)
+         x = tok_emb + pos_emb
+
+         for block in self.trf_block:
+             x = block(x, attn_mask)
+
+         x = self.mlmHead(x)
+         # Weight tying: reuse the token embedding matrix as the output projection instead of creating a new one.
+         x = F.linear(x, self.tok_emb.weight)
+
+         return x
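The F.linear call above is the weight-tying step: it computes x @ tok_emb.weight.T, so the (vocab_size, d_model) embedding matrix doubles as the output projection and no separate vocabulary-size layer is allocated. A standalone sketch of the equivalence (illustrative only; 50265 is the roberta-base vocabulary size):

import torch
import torch.nn as nn
import torch.nn.functional as F

emb = nn.Embedding(num_embeddings=50265, embedding_dim=256)  # (vocab_size, d_model)
hidden = torch.randn(2, 128, 256)                            # (batch, seq_len, d_model)
logits = F.linear(hidden, emb.weight)                        # (2, 128, 50265)
assert torch.allclose(logits, hidden @ emb.weight.T)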
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio==5.35.0
+ huggingface_hub==0.33.0
+ torch==2.5.1
+ tqdm==4.67.1
+ transformers==4.44.1
utils.py ADDED
@@ -0,0 +1,87 @@
+ import torch
+ import torch.nn as nn
+ import math
+
+
+ class MultiHeadAttention(nn.Module):
+     def __init__(self, d_model=256, num_heads=8):
+         super().__init__()
+         self.d_model = d_model
+         self.num_heads = num_heads
+
+         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+
+         self.d_k = d_model // num_heads
+
+         self.W_q = nn.Linear(d_model, d_model, bias=False)
+         self.W_k = nn.Linear(d_model, d_model, bias=False)
+         self.W_v = nn.Linear(d_model, d_model, bias=False)
+
+         self.projection = nn.Linear(d_model, d_model, bias=False)
+         self.dropout = nn.Dropout(0.1)
+
+     def forward(self, x, attention_mask=None):
+         batch_size, seq_length, d_model = x.shape
+         Q = self.W_q(x)  # (batch_size, seq_length, d_model)
+         K = self.W_k(x)
+         V = self.W_v(x)
+
+         # Split into heads: (batch_size, num_heads, seq_length, d_k)
+         Q = Q.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+         K = K.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+         V = V.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+
+         attention_scores = Q @ K.transpose(2, 3)
+
+         if attention_mask is not None:
+             # Broadcast the padding mask so the model cannot attend to PAD tokens.
+             mask = attention_mask.unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_length)
+             mask = mask.to(attention_scores.device)
+             attention_scores = attention_scores.masked_fill(mask == 0, float("-inf"))
+
+         attention_weights = torch.softmax(attention_scores / math.sqrt(self.d_k), dim=-1)
+         attention_weights = self.dropout(attention_weights)
+
+         final_weights = attention_weights @ V  # (batch_size, num_heads, seq_length, d_k)
+         final_weights = final_weights.transpose(1, 2).contiguous().view(batch_size, seq_length, d_model)
+
+         out_projection = self.projection(final_weights)
+
+         return out_projection
+
+
+ class FeedForward(nn.Module):
+     def __init__(self, d_model=256):
+         super().__init__()
+         self.projection = nn.Sequential(
+             nn.Linear(d_model, d_model * 4),
+             nn.GELU(),
+             nn.Dropout(0.1),
+             nn.Linear(d_model * 4, d_model),
+         )
+
+     def forward(self, x):
+         return self.projection(x)
+
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, d_model=256):
+         super().__init__()
+         # Pass d_model through so non-default sizes stay consistent across sub-layers.
+         self.attn = MultiHeadAttention(d_model=d_model)
+         self.ffn = FeedForward(d_model=d_model)
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+
+     def forward(self, x, attn_mask):
+         # Pre-norm residual attention sub-layer.
+         residual = x
+         x = self.norm1(x)
+         x = self.attn(x, attn_mask)
+         x = x + residual
+
+         # Pre-norm residual feed-forward sub-layer.
+         residual = x
+         x = self.norm2(x)
+         x = self.ffn(x)
+         x = x + residual
+
+         return x
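How the padding mask in MultiHeadAttention broadcasts (a small illustrative sketch, not part of the commit): a (batch, seq_len) mask of ones and zeros is reshaped to (batch, 1, 1, seq_len), so masked_fill pushes the scores toward every PAD key position to -inf across all heads and query positions, and softmax then assigns those positions zero attention weight.

import torch

scores = torch.zeros(2, 8, 5, 5)                  # (batch, num_heads, seq_len, seq_len)
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])  # 0 marks PAD positions
mask = attention_mask.unsqueeze(1).unsqueeze(2)   # (batch, 1, 1, seq_len)
scores = scores.masked_fill(mask == 0, float("-inf"))
weights = torch.softmax(scores, dim=-1)
print(weights[0, 0, 0])                           # the last two (PAD) positions get weight 0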