import os
from typing import Any, Dict, List, Optional

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

from app.core.config import settings


class SentimentService:
    """
    A service for loading the sentiment analysis model and performing predictions.
    """

    def __init__(self) -> None:
        """
        Initialize the service by loading the sentiment analysis model and tokenizer.
        """
        # Prefer the GPU when one is available; otherwise fall back to the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == "cuda":
            print(f"GPU found: {torch.cuda.get_device_name(0)}. Loading model onto GPU.")
        else:
            print("GPU not found. Loading model onto CPU.")

        # Resolve the bundled model directory relative to this file.
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        MODEL_DIR = os.path.join(
            BASE_DIR, "models", "twitter-roberta-base-sentiment-latest"
        )
        print(f"Loading sentiment model from: {MODEL_DIR}")

        if not os.path.exists(MODEL_DIR):
            raise FileNotFoundError(f"Model folder not found: {MODEL_DIR}")
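
        # Expected on-disk layout, inferred from the path logic above (an
        # assumption; adjust if the model snapshot lives elsewhere):
        #   <package root>/
        #       models/
        #           twitter-roberta-base-sentiment-latest/
        #               config.json, tokenizer files, model weights, ...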

        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
        self.config = AutoConfig.from_pretrained(MODEL_DIR)
        self.model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(
            self.device
        )

        # Inference only: disable dropout and other training-time behaviour.
        self.model.eval()
        print("Sentiment model loaded successfully.")

    def _preprocess_text(self, text: str) -> str:
        """
        Replace @user mentions and http links with placeholders.
        """
        if not isinstance(text, str):
            return ""
        new_text = []
        for t in text.split(" "):
            t = "@user" if t.startswith("@") and len(t) > 1 else t
            t = "http" if t.startswith("http") else t
            new_text.append(t)
        return " ".join(new_text)

    def predict(self, texts: List[str]) -> List[Dict[str, Any]]:
        """
        Predict sentiment for a batch of texts, splitting into sub-batches
        for efficiency on CPU.
        """
        preprocessed_texts = [self._preprocess_text(text) for text in texts]

        # Skip empty strings, but remember their positions so the output can
        # be re-aligned with the input order afterwards.
        non_empty_texts_with_indices = [
            (i, text) for i, text in enumerate(preprocessed_texts) if text.strip()
        ]
        if not non_empty_texts_with_indices:
            # No non-empty inputs: return the neutral default for every entry
            # so the output length always matches the input length.
            return [{"label": "neutral", "score": 1.0} for _ in texts]

        indices, texts_to_predict = zip(*non_empty_texts_with_indices)

        batch_size = settings.INFERENCE_BATCH_SIZE

        predictions = []
        for start in range(0, len(texts_to_predict), batch_size):
            sub_texts = texts_to_predict[start : start + batch_size]

            # Tokenize the sub-batch and move the tensors to the model's device.
            encoded_inputs = self.tokenizer(
                list(sub_texts),
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**encoded_inputs)
                logits = outputs.logits.detach().cpu().numpy()

            # Release the batch tensors promptly to keep GPU memory flat.
            del encoded_inputs, outputs
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Convert logits to probabilities and keep the top class per text,
            # e.g. probs of [0.05, 0.15, 0.80] yield id2label[2] with score 0.8.
            probs = softmax(logits, axis=1)
            for prob in probs:
                max_idx = int(np.argmax(prob))
                predictions.append(
                    {
                        "label": self.config.id2label[max_idx],
                        "score": float(prob[max_idx]),
                    }
                )

        # Re-insert each prediction at its original position; entries that were
        # empty after preprocessing fall back to a neutral default.
        final_results: List[Optional[Dict[str, Any]]] = [None] * len(texts)
        for original_index, prediction in zip(indices, predictions):
            final_results[original_index] = prediction

        default_prediction = {"label": "neutral", "score": 1.0}
        return [res if res is not None else default_prediction for res in final_results]
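

if __name__ == "__main__":
    # Minimal smoke test, assuming the module is run somewhere app.core.config
    # resolves and the local model folder described above is present.
    service = SentimentService()
    sample = ["@someone I love this!", "https://example.com is down again", ""]
    for text, result in zip(sample, service.predict(sample)):
        print(f"{text!r} -> {result}")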