Spaces:
Running
Running
| import os | |
| import torch | |
| import torch.nn as nn | |
| from core.device import DEVICE | |
| from language.embeddings import EmbeddingLayer | |
| from language.encoder import SentenceEncoder | |
| from language.tokenizer import SimpleTokenizer | |
# Directory (relative to the working directory) that holds the trained
# model weight files loaded by ProgrammingModel.load_models().
ARTIFACTS_DIR = "artifacts"
class ProgrammingModel(nn.Module):
    """Binary classifier that labels text as programming-related or not.

    Rebuilds the exact embedding -> encoder -> linear-head architecture used
    at training time, loads the trained weights from ``artifacts_dir``, and
    runs in inference mode on ``DEVICE``.
    """

    def __init__(self, tokenizer: SimpleTokenizer, artifacts_dir: str = ARTIFACTS_DIR):
        """Build the model, load trained weights, and switch to eval mode.

        Args:
            tokenizer: Tokenizer whose vocabulary sizes the embedding layer;
                must expose ``vocab`` (mapping) and ``PAD_TOKEN``.
            artifacts_dir: Directory holding the saved ``state_dict`` files.
                Defaults to the module-level ``ARTIFACTS_DIR``, so existing
                callers are unaffected.
        """
        super().__init__()
        self.tokenizer = tokenizer
        self.artifacts_dir = artifacts_dir
        # Cache the padding index once; it is needed both by the embedding
        # layer and by the attention mask built in forward().
        self.pad_index = tokenizer.vocab[tokenizer.PAD_TOKEN]
        # EXACT SAME ARCHITECTURE AS TRAINING
        self.embedder = EmbeddingLayer(
            len(tokenizer.vocab),
            pad_index=self.pad_index,
        )
        self.encoder = SentenceEncoder()
        self.classifier = nn.Linear(
            self.encoder.projection.out_features,
            2,  # two classes: index 0 = non_programming, index 1 = programming
        )
        self.load_models()
        self.to(DEVICE)
        self.eval()  # inference only: freeze dropout/normalization behavior

    def _load_state(self, module: nn.Module, filename: str) -> None:
        """Load a saved state_dict from ``artifacts_dir`` into ``module``.

        ``weights_only=True`` restricts unpickling to tensor data, guarding
        against arbitrary-code execution from a tampered checkpoint file
        (state_dicts contain only tensors, so this is safe here).
        """
        path = os.path.join(self.artifacts_dir, filename)
        module.load_state_dict(
            torch.load(path, map_location=DEVICE, weights_only=True)
        )

    def load_models(self):
        """Load the trained weights for all three sub-modules."""
        self._load_state(self.embedder, "programming_embedding.pt")
        self._load_state(self.encoder, "programming_encoder.pt")
        self._load_state(self.classifier, "programming_classifier.pt")

    def forward(self, token_ids):
        """Return class logits of shape (batch, 2) for a batch of token ids."""
        embeddings = self.embedder(token_ids)
        # Mask out padding positions so they do not contribute to the
        # sentence representation produced by the encoder.
        attention_mask = (token_ids != self.pad_index).long()
        sentence_vec = self.encoder(embeddings, attention_mask=attention_mask)
        return self.classifier(sentence_vec)

    def predict(self, text: str):
        """Classify ``text`` and return a label/confidence dict.

        Args:
            text: Raw input string; tokenized with ``self.tokenizer``.

        Returns:
            dict with ``"label"`` ("programming" or "non_programming") and
            ``"confidence"`` (softmax probability of the chosen class).
        """
        token_ids = torch.tensor(
            [self.tokenizer.encode(text)],
            dtype=torch.long,
        ).to(DEVICE)
        with torch.no_grad():
            # Call the module itself (not .forward) so nn.Module hooks run.
            logits = self(token_ids)
            probs = torch.softmax(logits, dim=-1)
            label_idx = torch.argmax(probs, dim=-1).item()
        return {
            "label": "programming" if label_idx == 1 else "non_programming",
            "confidence": probs[0][label_idx].item(),
        }