Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -17,7 +17,8 @@ from dataclasses import dataclass
|
|
| 17 |
from typing import List, Dict, Optional, Tuple, Any, Callable
|
| 18 |
from contextlib import contextmanager
|
| 19 |
import gc
|
| 20 |
-
import pandas as pd
|
|
|
|
| 21 |
|
| 22 |
@dataclass
|
| 23 |
class Config:
|
|
@@ -155,99 +156,109 @@ class HistoryManager:
|
|
| 155 |
|
| 156 |
# Core Analysis Engine
|
| 157 |
class SentimentEngine:
|
| 158 |
-
"""Streamlined sentiment analysis with
|
| 159 |
def __init__(self):
|
| 160 |
self.model_manager = ModelManager()
|
|
|
|
| 161 |
|
| 162 |
-
def
|
| 163 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
try:
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
)
|
| 169 |
-
|
| 170 |
-
# Get model outputs with attention weights
|
| 171 |
-
with torch.no_grad():
|
| 172 |
-
outputs = self.model_manager.model(**inputs, output_attentions=True)
|
| 173 |
-
attention = outputs.attentions # Tuple of attention tensors for each layer
|
| 174 |
-
|
| 175 |
-
# Use the last layer's attention, average over all heads
|
| 176 |
-
last_attention = attention[-1] # Shape: [batch_size, num_heads, seq_len, seq_len]
|
| 177 |
-
avg_attention = last_attention.mean(dim=1) # Average over heads: [batch_size, seq_len, seq_len]
|
| 178 |
-
|
| 179 |
-
# Focus on attention to [CLS] token (index 0) as it represents the whole sequence
|
| 180 |
-
cls_attention = avg_attention[0, 0, :] # Attention from CLS to all tokens
|
| 181 |
-
|
| 182 |
-
# Get tokens and their attention scores
|
| 183 |
-
tokens = self.model_manager.tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
|
| 184 |
-
attention_scores = cls_attention.cpu().numpy()
|
| 185 |
-
|
| 186 |
-
# Filter out special tokens and combine subword tokens
|
| 187 |
-
word_scores = {}
|
| 188 |
-
current_word = ""
|
| 189 |
-
current_score = 0.0
|
| 190 |
-
|
| 191 |
-
for i, (token, score) in enumerate(zip(tokens, attention_scores)):
|
| 192 |
-
if token in ['[CLS]', '[SEP]', '[PAD]']:
|
| 193 |
-
continue
|
| 194 |
-
|
| 195 |
-
if token.startswith('##'):
|
| 196 |
-
# Subword token, add to current word
|
| 197 |
-
current_word += token[2:]
|
| 198 |
-
current_score = max(current_score, score) # Take max attention
|
| 199 |
-
else:
|
| 200 |
-
# New word, save previous if exists
|
| 201 |
-
if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
|
| 202 |
-
word_scores[current_word.lower()] = current_score
|
| 203 |
-
|
| 204 |
-
current_word = token
|
| 205 |
-
current_score = score
|
| 206 |
-
|
| 207 |
-
# Don't forget the last word
|
| 208 |
-
if current_word and len(current_word) >= config.MIN_WORD_LENGTH:
|
| 209 |
-
word_scores[current_word.lower()] = current_score
|
| 210 |
|
| 211 |
-
#
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
if
|
| 215 |
-
|
| 216 |
|
| 217 |
-
# Sort by
|
| 218 |
-
|
| 219 |
-
return
|
| 220 |
|
| 221 |
except Exception as e:
|
| 222 |
-
logger.error(f"
|
| 223 |
return []
|
| 224 |
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
def analyze_single(self, text: str) -> Dict:
|
| 227 |
-
"""Analyze single text with
|
| 228 |
if not text.strip():
|
| 229 |
raise ValueError("Empty text")
|
| 230 |
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
truncation=True, max_length=config.MAX_TEXT_LENGTH
|
| 234 |
-
).to(self.model_manager.device)
|
| 235 |
-
|
| 236 |
-
with torch.no_grad():
|
| 237 |
-
outputs = self.model_manager.model(**inputs)
|
| 238 |
-
probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
|
| 239 |
-
|
| 240 |
sentiment = "Positive" if probs[1] > probs[0] else "Negative"
|
| 241 |
|
| 242 |
-
# Extract key
|
| 243 |
-
key_words = self.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
return {
|
| 246 |
'sentiment': sentiment,
|
| 247 |
'confidence': float(probs.max()),
|
| 248 |
'pos_prob': float(probs[1]),
|
| 249 |
'neg_prob': float(probs[0]),
|
| 250 |
-
'key_words': key_words
|
|
|
|
| 251 |
}
|
| 252 |
|
| 253 |
@handle_errors(default_return=[])
|
|
@@ -585,11 +596,11 @@ class SentimentApp:
|
|
| 585 |
]
|
| 586 |
|
| 587 |
|
| 588 |
-
@handle_errors(default_return=("Please enter text", None, None, None, None))
|
| 589 |
def analyze_single(self, text: str, theme: str = 'default'):
|
| 590 |
-
"""Single text analysis with
|
| 591 |
if not text.strip():
|
| 592 |
-
return "Please enter text", None, None, None, None
|
| 593 |
|
| 594 |
result = self.engine.analyze_single(text)
|
| 595 |
|
|
@@ -614,7 +625,8 @@ class SentimentApp:
|
|
| 614 |
result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
|
| 615 |
f"Key Words: {key_words_str}")
|
| 616 |
|
| 617 |
-
|
|
|
|
| 618 |
|
| 619 |
@handle_errors(default_return=None)
|
| 620 |
def analyze_batch(self, reviews: str, progress=None):
|
|
@@ -706,6 +718,7 @@ def create_interface():
|
|
| 706 |
|
| 707 |
with gr.Column():
|
| 708 |
result_output = gr.Textbox(label="Result", lines=3)
|
|
|
|
| 709 |
|
| 710 |
with gr.Row():
|
| 711 |
prob_plot = gr.Plot(label="Probabilities")
|
|
@@ -749,7 +762,7 @@ def create_interface():
|
|
| 749 |
analyze_btn.click(
|
| 750 |
app.analyze_single,
|
| 751 |
inputs=[text_input, theme_selector],
|
| 752 |
-
outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot]
|
| 753 |
)
|
| 754 |
|
| 755 |
load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
|
|
|
|
| 17 |
from typing import List, Dict, Optional, Tuple, Any, Callable
|
| 18 |
from contextlib import contextmanager
|
| 19 |
import gc
|
| 20 |
+
import pandas as pd
|
| 21 |
+
from lime.lime_text import LimeTextExplainer # Added LIME import
|
| 22 |
|
| 23 |
@dataclass
|
| 24 |
class Config:
|
|
|
|
| 156 |
|
| 157 |
# Core Analysis Engine
|
| 158 |
class SentimentEngine:
|
| 159 |
+
"""Streamlined sentiment analysis with LIME-based keyword extraction"""
|
| 160 |
def __init__(self):
    """Set up the shared model manager and the LIME text explainer.

    Class index order is Negative (0), Positive (1), matching the
    softmax column order produced by predict_proba.
    """
    # ModelManager is defined elsewhere in this file; it owns the
    # tokenizer, model, and device selection.
    self.model_manager = ModelManager()
    # LIME explainer reused across calls; class_names must stay aligned
    # with the logits ordering above.
    self.lime_explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])
|
| 163 |
|
| 164 |
+
def predict_proba(self, texts):
    """Return class probabilities for a batch of texts (LIME-compatible).

    Accepts either a single string or a sequence of strings and always
    tokenizes a list, so the result is a numpy array shaped
    [n_texts, n_classes] — the contract LIME expects from its
    prediction callback.
    """
    # LIME may hand us a bare string; normalize to a one-element batch.
    batch = [texts] if isinstance(texts, str) else texts

    encoded = self.model_manager.tokenizer(
        batch,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=config.MAX_TEXT_LENGTH,
    ).to(self.model_manager.device)

    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        logits = self.model_manager.model(**encoded).logits
        probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()

    return probabilities
|
| 179 |
+
|
| 180 |
+
def extract_key_words_lime(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
    """Extract the most influential words for the prediction using LIME.

    Returns up to ``top_k`` ``(word, score)`` pairs sorted by absolute
    importance, strongest first. Scores keep their sign: positive values
    push the prediction toward the Positive class, negative toward
    Negative. (Previously the sign was discarded with ``abs()``, which
    made the negative/red branch of ``create_heatmap_html`` unreachable
    — every highlighted word rendered green.)

    Args:
        text: The input text to explain.
        top_k: Maximum number of key words to return.

    Returns:
        List of (lower-cased word, signed importance) tuples; empty list
        on failure.
    """
    try:
        # Perturbation-based explanation; num_samples=100 trades some
        # stability for speed.
        explanation = self.lime_explainer.explain_instance(
            text, self.predict_proba, num_features=top_k, num_samples=100
        )

        # Keep the signed contribution so downstream visualization can
        # distinguish positive from negative evidence.
        word_scores = []
        for word, score in explanation.as_list():
            cleaned = word.strip()
            if len(cleaned) >= config.MIN_WORD_LENGTH:
                word_scores.append((cleaned.lower(), float(score)))

        # Rank by magnitude of influence, strongest first.
        word_scores.sort(key=lambda pair: abs(pair[1]), reverse=True)
        return word_scores[:top_k]

    except Exception as e:
        # LIME can fail on very short or degenerate inputs; degrade
        # gracefully rather than breaking the whole analysis.
        logger.error(f"LIME extraction failed: {e}")
        return []
|
| 201 |
|
| 202 |
+
def create_heatmap_html(self, text: str, word_scores: Dict[str, float]) -> str:
    """Render *text* as an HTML snippet with per-word highlighting.

    Each whitespace-separated token is wrapped in a <span>. Tokens whose
    cleaned (punctuation-stripped, lower-cased) form appears in
    ``word_scores`` are tinted green (score > 0) or red (score < 0),
    scaled by the score's relative magnitude; everything else stays
    transparent. The raw score is exposed in the span's title tooltip.
    """
    # Normalization bounds for the color intensity; fall back to zeros
    # when there are no scores at all.
    if word_scores:
        max_score = max(abs(s) for s in word_scores.values())
        min_score = min(word_scores.values())
    else:
        max_score = min_score = 0

    pieces = ['<div style="font-family: Arial; font-size: 16px; line-height: 1.6;">']

    for token in text.split():
        # Match against the score table using only word characters.
        key = re.sub(r'[^\w]', '', token.lower())
        score = word_scores.get(key, 0)

        if score > 0:
            # Positive evidence -> green tint scaled by relative magnitude.
            level = min(255, int(180 * (score / max_score) if max_score > 0 else 0))
            color = f"rgba(0, {level}, 0, 0.3)"
        elif score < 0:
            # Negative evidence -> red tint.
            level = min(255, int(180 * (abs(score) / abs(min_score)) if min_score < 0 else 0))
            color = f"rgba({level}, 0, 0, 0.3)"
        else:
            # Unknown or zero-importance word: no highlighting.
            color = "transparent"

        pieces.append(
            f'<span style="background-color: {color}; padding: 2px; margin: 1px; '
            f'border-radius: 3px;" title="Score: {score:.3f}">{token}</span> '
        )

    pieces.append('</div>')
    return ''.join(pieces)
|
| 237 |
+
|
| 238 |
+
@handle_errors(default_return={'sentiment': 'Unknown', 'confidence': 0.0,
                               'pos_prob': 0.0, 'neg_prob': 0.0,
                               'key_words': [], 'heatmap_html': ''})
def analyze_single(self, text: str) -> Dict:
    """Classify *text* and attach a LIME explanation.

    Returns a dict with the sentiment label, confidence, both class
    probabilities, the top LIME key words, and an HTML word-importance
    heatmap. The decorator's fallback dict now carries every key the
    success path produces ('pos_prob'/'neg_prob' were previously
    missing), so callers can index the result unconditionally even when
    analysis fails.

    Raises:
        ValueError: if *text* is empty/whitespace-only (intercepted by
            @handle_errors, which returns the fallback dict instead).
    """
    if not text.strip():
        raise ValueError("Empty text")

    # Class probabilities for the single text: index 0 = Negative, 1 = Positive.
    probs = self.predict_proba([text])[0]
    sentiment = "Positive" if probs[1] > probs[0] else "Negative"

    # LIME keyword importances, plus the word-level heatmap built from them.
    key_words = self.extract_key_words_lime(text)
    heatmap_html = self.create_heatmap_html(text, dict(key_words))

    return {
        'sentiment': sentiment,
        'confidence': float(probs.max()),
        'pos_prob': float(probs[1]),
        'neg_prob': float(probs[0]),
        'key_words': key_words,
        'heatmap_html': heatmap_html
    }
|
| 263 |
|
| 264 |
@handle_errors(default_return=[])
|
|
|
|
| 596 |
]
|
| 597 |
|
| 598 |
|
| 599 |
+
@handle_errors(default_return=("Please enter text", None, None, None, None, None))
|
| 600 |
def analyze_single(self, text: str, theme: str = 'default'):
|
| 601 |
+
"""Single text analysis with LIME explanation and heatmap"""
|
| 602 |
if not text.strip():
|
| 603 |
+
return "Please enter text", None, None, None, None, None
|
| 604 |
|
| 605 |
result = self.engine.analyze_single(text)
|
| 606 |
|
|
|
|
| 625 |
result_text = (f"Sentiment: {result['sentiment']} (Confidence: {result['confidence']:.3f})\n"
|
| 626 |
f"Key Words: {key_words_str}")
|
| 627 |
|
| 628 |
+
# Return heatmap HTML as additional output
|
| 629 |
+
return result_text, prob_plot, gauge_plot, cloud_plot, keyword_plot, result['heatmap_html']
|
| 630 |
|
| 631 |
@handle_errors(default_return=None)
|
| 632 |
def analyze_batch(self, reviews: str, progress=None):
|
|
|
|
| 718 |
|
| 719 |
with gr.Column():
|
| 720 |
result_output = gr.Textbox(label="Result", lines=3)
|
| 721 |
+
heatmap_output = gr.HTML(label="Word Importance Heatmap")
|
| 722 |
|
| 723 |
with gr.Row():
|
| 724 |
prob_plot = gr.Plot(label="Probabilities")
|
|
|
|
| 762 |
analyze_btn.click(
|
| 763 |
app.analyze_single,
|
| 764 |
inputs=[text_input, theme_selector],
|
| 765 |
+
outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot, keyword_plot, heatmap_output]
|
| 766 |
)
|
| 767 |
|
| 768 |
load_btn.click(app.data_handler.process_file, inputs=file_upload, outputs=batch_input)
|