import torch import re import logging from typing import List, Dict, Tuple from functools import lru_cache from lime.lime_text import LimeTextExplainer from config import config from models import ModelManager from utils import handle_errors logger = logging.getLogger(__name__) class TextProcessor: """Optimized text processing""" @staticmethod @lru_cache(maxsize=config.CACHE_SIZE) def clean_text(text: str) -> Tuple[str, ...]: """Single-pass text cleaning""" words = re.findall(r'\b\w{3,}\b', text.lower()) return tuple(w for w in words if w not in config.STOP_WORDS) class SentimentEngine: """Streamlined sentiment analysis engine with LIME and SHAP""" def __init__(self): self.model_manager = ModelManager() self.lime_explainer = LimeTextExplainer(class_names=['Negative', 'Positive']) self.shap_explainer = None def predict_proba(self, texts): """Prediction function for LIME""" if isinstance(texts, str): texts = [texts] inputs = self.model_manager.tokenizer( texts, return_tensors="pt", padding=True, truncation=True, max_length=config.MAX_TEXT_LENGTH ).to(self.model_manager.device) with torch.no_grad(): outputs = self.model_manager.model(**inputs) probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy() return probs @handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0}) def analyze_single_fast(self, text: str) -> Dict: """Fast single text analysis without keyword extraction""" if not text.strip(): raise ValueError("Empty text") probs = self.predict_proba([text])[0] sentiment = "Positive" if probs[1] > probs[0] else "Negative" return { 'sentiment': sentiment, 'confidence': float(probs.max()), 'pos_prob': float(probs[1]), 'neg_prob': float(probs[0]) } def extract_key_words_lime(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]: """Advanced keyword extraction using LIME""" try: explanation = self.lime_explainer.explain_instance( text, self.predict_proba, num_features=top_k, num_samples=200 ) word_scores = [] for word, score in explanation.as_list(): if len(word.strip()) >= config.MIN_WORD_LENGTH: word_scores.append((word.strip().lower(), abs(score))) word_scores.sort(key=lambda x: x[1], reverse=True) return word_scores[:top_k] except Exception as e: logger.error(f"LIME extraction failed: {e}") return [] def extract_key_words_shap(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]: """Advanced keyword extraction using SHAP""" try: # Simple SHAP implementation using model predictions words = text.split() word_scores = [] # Get baseline prediction baseline_prob = self.predict_proba([text])[0][1] # Positive probability # Calculate importance by removing each word for i, word in enumerate(words): # Create text without this word modified_words = words[:i] + words[i+1:] modified_text = ' '.join(modified_words) if modified_text.strip(): modified_prob = self.predict_proba([modified_text])[0][1] importance = abs(baseline_prob - modified_prob) clean_word = re.sub(r'[^\w]', '', word.lower()) if len(clean_word) >= config.MIN_WORD_LENGTH: word_scores.append((clean_word, importance)) # Remove duplicates and sort unique_scores = {} for word, score in word_scores: if word in unique_scores: unique_scores[word] = max(unique_scores[word], score) else: unique_scores[word] = score sorted_scores = sorted(unique_scores.items(), key=lambda x: x[1], reverse=True) return sorted_scores[:top_k] except Exception as e: logger.error(f"SHAP extraction failed: {e}") return [] def create_heatmap_html(self, text: str, word_scores: Dict[str, float]) -> str: """Create HTML heatmap visualization""" words = text.split() html_parts = ['