Commit 0c8750c
Parent(s): 97eed42
first commit

Files changed:
- MLMHead.py +17 -0
- RoBERTaModule.py +45 -0
- app.py +28 -0
- model.py +30 -0
- requirements.txt +5 -0
- utils.py +86 -0
MLMHead.py
ADDED
@@ -0,0 +1,17 @@
+import torch.nn as nn
+
+
+class MLMHead(nn.Module):
+    """Linear -> GELU -> LayerNorm transform applied to the encoder's hidden states."""
+    def __init__(self, d_model=256):
+        super().__init__()
+        self.lin = nn.Linear(d_model, d_model, bias=False)
+        self.gelu = nn.GELU()
+        self.norm = nn.LayerNorm(d_model)
+
+    def forward(self, x):
+        x = self.lin(x)
+        x = self.gelu(x)
+        x = self.norm(x)
+
+        return x
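A quick shape check for the head above (a minimal sketch, assuming the default d_model=256): the head keeps the hidden dimension unchanged; the projection to vocabulary logits happens later, in model.py, via weight tying.

    import torch
    from MLMHead import MLMHead

    head = MLMHead(d_model=256)
    hidden = torch.randn(2, 128, 256)  # (batch, seq_len, d_model)
    out = head(hidden)
    assert out.shape == (2, 128, 256)  # same shape; vocab logits come from the tied embedding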
RoBERTaModule.py
ADDED
@@ -0,0 +1,45 @@
+import torch
+from model import RoBERTa
+from torch import nn
+from transformers import RobertaTokenizerFast
+
+
+class RoBERTaModule(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        self.model = RoBERTa(vocab_size=self.tokenizer.vocab_size, padding_idx=self.tokenizer.pad_token_id)
+
+    def forward(self, x, attn_mask):
+        return self.model(x, attn_mask)
+
+    def inference(self, sentence):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model.to(device)
+        self.model.eval()
+
+        tokenizer = self.tokenizer
+        input_ids = tokenizer.encode(sentence)
+        input_ids_tensor = torch.tensor([input_ids]).to(device)  # batch of one
+        attention_mask = (input_ids_tensor != tokenizer.pad_token_id).long()
+
+        # Locate every <mask> position in the encoded input.
+        mask_token_id = tokenizer.mask_token_id
+        mask_indices = [i for i, token in enumerate(input_ids) if token == mask_token_id]
+        if not mask_indices:
+            return "No <mask> token found"
+
+        with torch.no_grad():
+            logits = self.model(input_ids_tensor, attention_mask)
+
+        # Greedy decoding: take the highest-scoring token at each masked position.
+        predicted_tokens = []
+        for idx in mask_indices:
+            pred_token_id = logits[0, idx].argmax().item()
+            predicted_tokens.append(tokenizer.decode([pred_token_id]))
+
+        return predicted_tokens if len(predicted_tokens) > 1 else predicted_tokens[0]
+
+    def load_checkpoint(self, path="finishedBest10.pt"):
+        checkpoint = torch.load(path, map_location=torch.device("cpu"))
+        self.model.load_state_dict(checkpoint["model_state_dict"])
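A hedged usage sketch for the module above (assumes the checkpoint file is available locally; in the Space itself, app.py downloads it from the Hub first). Note that inference returns a single string for one mask and a list of strings for several:

    from RoBERTaModule import RoBERTaModule

    module = RoBERTaModule()
    module.load_checkpoint(path="finishedBest10.pt")  # local path assumed
    print(module.inference("The capital of France is <mask>."))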
app.py
ADDED
@@ -0,0 +1,28 @@
+import gradio as gr
+from RoBERTaModule import RoBERTaModule
+from huggingface_hub import hf_hub_download
+
+
+MODEL_REPO_ID = "DornierDo17/RoBERTa_17.7M"
+WEIGHTS_FILE = "finishedBest10.pt"
+
+# Fetch the trained weights from the Hub, then load them into the model.
+weight_path = hf_hub_download(repo_id=MODEL_REPO_ID, filename=WEIGHTS_FILE)
+
+model = RoBERTaModule()
+model.load_checkpoint(path=weight_path)
+
+
+def predict(sentence):
+    try:
+        return model.inference(sentence)
+    except Exception as e:
+        return str(e)
+
+
+gr.Interface(
+    fn=predict,
+    inputs=gr.Textbox(label="Enter sentence with <mask>"),
+    outputs=gr.Textbox(label="Predicted token(s)"),
+    title="RoBERTa MLM Inference",
+).launch()
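Because predict is a plain function, the Space logic can be smoke-tested without the Gradio UI (a minimal sketch; assumes the module-level setup in app.py has already run, i.e. the weights were downloaded and loaded):

    print(predict("I <mask> pizza."))  # prints the predicted token, or the error string on failure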
model.py
ADDED
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from MLMHead import MLMHead
+from utils import TransformerBlock
+
+
+class RoBERTa(nn.Module):
+    def __init__(self, vocab_size, padding_idx, max_sequence_length=128, d_model=256, layers=6):
+        super().__init__()
+        self.tok_emb = nn.Embedding(vocab_size, d_model, padding_idx=padding_idx)
+        self.pos_emb = nn.Embedding(max_sequence_length, d_model)
+        self.trf_block = nn.ModuleList([TransformerBlock(d_model=d_model) for _ in range(layers)])
+        self.mlmHead = MLMHead(d_model)
+
+    def forward(self, x, attn_mask):
+        batch_size, seq_len = x.shape
+        tok_emb = self.tok_emb(x)
+        pos_emb = self.pos_emb(torch.arange(seq_len, device=x.device)).unsqueeze(0)
+        x = tok_emb + pos_emb  # learned absolute position embeddings added to token embeddings
+
+        for block in self.trf_block:
+            x = block(x, attn_mask)
+
+        x = self.mlmHead(x)
+        # Weight tying: reuse the token embedding matrix as the output projection
+        # instead of allocating a separate vocab-sized decoder, saving parameters.
+        x = F.linear(x, self.tok_emb.weight)
+
+        return x
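The tied projection is just F.linear against the (vocab_size, d_model) embedding matrix, so the output dimension is the vocabulary size. A small shape check (a sketch using the defaults above; 50265 and pad id 1 are the roberta-base tokenizer's values):

    import torch
    from model import RoBERTa

    model = RoBERTa(vocab_size=50265, padding_idx=1)
    ids = torch.randint(0, 50265, (2, 16))
    mask = torch.ones(2, 16, dtype=torch.long)
    logits = model(ids, mask)
    assert logits.shape == (2, 16, 50265)  # (batch, seq_len, vocab_size)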
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+gradio==5.35.0
+huggingface_hub==0.33.0
+torch==2.5.1
+tqdm==4.67.1
+transformers==4.44.1
utils.py
ADDED
@@ -0,0 +1,86 @@
+import torch
+import torch.nn as nn
+import math
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, d_model=256, num_heads=8):
+        super().__init__()
+        self.d_model = d_model
+        self.num_heads = num_heads
+
+        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+
+        self.d_k = d_model // num_heads
+
+        self.W_q = nn.Linear(d_model, d_model, bias=False)
+        self.W_k = nn.Linear(d_model, d_model, bias=False)
+        self.W_v = nn.Linear(d_model, d_model, bias=False)
+
+        self.projection = nn.Linear(d_model, d_model, bias=False)
+        self.dropout = nn.Dropout(0.1)
+
+    def forward(self, x, attention_mask=None):
+        batch_size, seq_length, d_model = x.shape
+        Q = self.W_q(x)  # (batch_size, seq_length, d_model)
+        K = self.W_k(x)
+        V = self.W_v(x)
+
+        # Split heads: (batch_size, num_heads, seq_length, d_k)
+        Q = Q.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+        K = K.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+        V = V.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
+
+        attention_scores = Q @ K.transpose(2, 3)
+
+        if attention_mask is not None:
+            # Broadcast the padding mask so the model cannot attend to PAD tokens.
+            mask = attention_mask.unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, seq_length)
+            mask = mask.to(attention_scores.device)
+            attention_scores = attention_scores.masked_fill(mask == 0, float("-inf"))
+
+        attention_weights = torch.softmax(attention_scores / math.sqrt(self.d_k), dim=-1)
+        attention_weights = self.dropout(attention_weights)
+
+        final_weights = attention_weights @ V  # (batch_size, num_heads, seq_length, d_k)
+        final_weights = final_weights.transpose(1, 2).contiguous().view(batch_size, seq_length, d_model)
+
+        return self.projection(final_weights)
+
+
+class FeedForward(nn.Module):
+    def __init__(self, d_model=256):
+        super().__init__()
+        self.projection = nn.Sequential(
+            nn.Linear(d_model, d_model * 4),
+            nn.GELU(),
+            nn.Dropout(0.1),
+            nn.Linear(d_model * 4, d_model),
+        )
+
+    def forward(self, x):
+        return self.projection(x)
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, d_model=256):
+        super().__init__()
+        self.attn = MultiHeadAttention(d_model=d_model)
+        self.ffn = FeedForward(d_model=d_model)
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+
+    def forward(self, x, attn_mask):
+        # Pre-norm residual attention sublayer.
+        residual = x
+        x = self.norm1(x)
+        x = self.attn(x, attn_mask)
+        x = x + residual
+
+        # Pre-norm residual feed-forward sublayer.
+        residual = x
+        x = self.norm2(x)
+        x = self.ffn(x)
+        x = x + residual
+
+        return x
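To see the padding mask at work in the block above, a minimal sketch (toy sizes assumed): key positions where the mask is 0 get -inf scores, so they receive zero attention weight after the softmax, and the output shape is preserved.

    import torch
    from utils import TransformerBlock

    block = TransformerBlock(d_model=256)
    x = torch.randn(2, 10, 256)
    attn_mask = torch.ones(2, 10, dtype=torch.long)
    attn_mask[:, 7:] = 0  # treat the last three positions as PAD
    out = block(x, attn_mask)
    assert out.shape == (2, 10, 256)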