Spaces:

entropy25
/

sentiment-analysis

Sleeping

App Files Files Community

entropy25 committed on Jun 20, 2025

Commit

35dca1f

verified ·

1 Parent(s): e57599e

Update app.py

Browse files

Files changed (1) hide show

app.py +258 -110

app.py CHANGED Viewed

@@ -8,6 +8,20 @@ from collections import Counter, defaultdict
 from sklearn.feature_extraction.text import TfidfVectorizer
 import networkx as nx
 import re
 # Load model
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -15,18 +29,23 @@ tokenizer = BertTokenizer.from_pretrained("entropy25/sentimentanalysis")
 model = BertForSequenceClassification.from_pretrained("entropy25/sentimentanalysis")
 model.to(device)
-# Global storage
 history = []
 def clean_text(text):
     """Simple text preprocessing"""
     text = re.sub(r'[^\w\s]', '', text.lower())
     words = text.split()
-    # Simple stopwords
     stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
     return [w for w in words if w not in stopwords and len(w) > 2]
-def analyze_text(text):
     """Core sentiment analysis"""
     if not text.strip():
         return "Please enter text", None, None, None
@@ -39,30 +58,33 @@ def analyze_text(text):
         conf = probs.max()
         sentiment = "Positive" if pred == 1 else "Negative"
-    # Store in history
     history.append({
         'text': text[:100],
         'full_text': text,
         'sentiment': sentiment,
         'confidence': conf,
         'pos_prob': probs[1],
-        'neg_prob': probs[0]
     })
     result = f"Sentiment: {sentiment} (Confidence: {conf:.3f})"
     # Generate plots
-    prob_plot = plot_probs(probs)
-    gauge_plot = plot_gauge(conf, sentiment)
-    cloud_plot = plot_wordcloud(text, sentiment)
     return result, prob_plot, gauge_plot, cloud_plot
-def plot_probs(probs):
     """Probability bar chart"""
     fig, ax = plt.subplots(figsize=(8, 5))
     labels = ["Negative", "Positive"]
-    colors = ['#ff6b6b', '#4ecdc4']
     bars = ax.bar(labels, probs, color=colors, alpha=0.8)
     ax.set_title("Sentiment Probabilities", fontweight='bold')
@@ -76,7 +98,7 @@ def plot_probs(probs):
     plt.tight_layout()
     return fig
-def plot_gauge(conf, sentiment):
     """Confidence gauge"""
     fig, ax = plt.subplots(figsize=(8, 6))
@@ -102,28 +124,25 @@ def plot_gauge(conf, sentiment):
     plt.tight_layout()
     return fig
-def plot_wordcloud(text, sentiment):
     """Word cloud visualization"""
     if len(text.split()) < 3:
         return None
-    try:
-        colormap = 'Greens' if sentiment == 'Positive' else 'Reds'
-        wc = WordCloud(width=800, height=400, background_color='white',
-                      colormap=colormap, max_words=30).generate(text)
-        fig, ax = plt.subplots(figsize=(10, 5))
-        ax.imshow(wc, interpolation='bilinear')
-        ax.axis('off')
-        ax.set_title(f'{sentiment} Word Cloud', fontweight='bold')
-        plt.tight_layout()
-        return fig
-    except:
-        return None
-def batch_analysis(reviews):
-    """Analyze multiple reviews"""
     if not reviews.strip():
         return None
@@ -131,8 +150,16 @@ def batch_analysis(reviews):
     if len(texts) < 2:
         return None
     results = []
-    for text in texts:
         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
         with torch.no_grad():
             outputs = model(**inputs)
@@ -155,9 +182,12 @@ def batch_analysis(reviews):
             'sentiment': sentiment,
             'confidence': conf,
             'pos_prob': probs[1],
-            'neg_prob': probs[0]
         })
     # Create visualization
     fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
@@ -199,6 +229,59 @@ def batch_analysis(reviews):
     plt.tight_layout()
     return fig
 def keyword_heatmap():
     """Keyword sentiment heatmap"""
     if len(history) < 3:
@@ -335,55 +418,51 @@ def tfidf_analysis():
     if len(pos_texts) < 2 or len(neg_texts) < 2:
         return None
-    try:
-        # Positive TF-IDF
-        vectorizer_pos = TfidfVectorizer(max_features=50, ngram_range=(1, 2))
-        pos_tfidf = vectorizer_pos.fit_transform(pos_texts)
-        pos_features = vectorizer_pos.get_feature_names_out()
-        pos_scores = pos_tfidf.sum(axis=0).A1
-        # Negative TF-IDF
-        vectorizer_neg = TfidfVectorizer(max_features=50, ngram_range=(1, 2))
-        neg_tfidf = vectorizer_neg.fit_transform(neg_texts)
-        neg_features = vectorizer_neg.get_feature_names_out()
-        neg_scores = neg_tfidf.sum(axis=0).A1
-        # Top 10 features
-        pos_top_idx = np.argsort(pos_scores)[-10:][::-1]
-        neg_top_idx = np.argsort(neg_scores)[-10:][::-1]
-        pos_words = [pos_features[i] for i in pos_top_idx]
-        pos_vals = [pos_scores[i] for i in pos_top_idx]
-        neg_words = [neg_features[i] for i in neg_top_idx]
-        neg_vals = [neg_scores[i] for i in neg_top_idx]
-        # Plot
-        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
-        # Positive
-        bars1 = ax1.barh(pos_words, pos_vals, color='#4ecdc4', alpha=0.8)
-        ax1.set_title('Positive Keywords (TF-IDF)', fontweight='bold')
-        ax1.set_xlabel('TF-IDF Score')
-        for bar, score in zip(bars1, pos_vals):
-            ax1.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
-                    f'{score:.3f}', va='center', fontsize=9)
-        # Negative
-        bars2 = ax2.barh(neg_words, neg_vals, color='#ff6b6b', alpha=0.8)
-        ax2.set_title('Negative Keywords (TF-IDF)', fontweight='bold')
-        ax2.set_xlabel('TF-IDF Score')
-        for bar, score in zip(bars2, neg_vals):
-            ax2.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
-                    f'{score:.3f}', va='center', fontsize=9)
-        plt.tight_layout()
-        return fig
-    except:
-        return None
 def plot_history():
     """Analysis history visualization"""
@@ -414,10 +493,30 @@ def plot_history():
     plt.tight_layout()
     return fig
 # Gradio Interface
 with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
-    gr.Markdown("# 🎬 Movie Sentiment Analyzer")
-    gr.Markdown("Advanced sentiment analysis with comprehensive visualizations")
     with gr.Tab("Single Analysis"):
         with gr.Row():
@@ -427,38 +526,56 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo
                     placeholder="Enter your movie review here...",
                     lines=5
                 )
-                analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
-                gr.Examples([
-                    ["The cinematography was stunning, but the plot felt predictable."],
-                    ["A masterpiece! Amazing performances and direction."],
-                    ["Boring movie with terrible acting and weak plot."],
-                    ["Great special effects but cheesy dialogue."],
-                    ["Incredible ending that left me speechless!"]
-                ], inputs=text_input)
             with gr.Column():
-                result_output = gr.Textbox(label="Result", lines=2)
         with gr.Row():
-            prob_plot = gr.Plot(label="Probabilities")
             gauge_plot = gr.Plot(label="Confidence Gauge")
-        wordcloud_plot = gr.Plot(label="Word Cloud")
     with gr.Tab("Batch Analysis"):
         gr.Markdown("### Multiple Reviews Analysis")
-        batch_input = gr.Textbox(
-            label="Reviews (one per line)",
-            placeholder="Review 1...\nReview 2...\nReview 3...",
-            lines=8
-        )
-        batch_btn = gr.Button("Analyze Batch", variant="primary")
-        batch_plot = gr.Plot(label="Batch Results")
     with gr.Tab("Advanced Analytics"):
         gr.Markdown("### Advanced Visualizations")
         with gr.Row():
             heatmap_btn = gr.Button("Keyword Heatmap", variant="primary")
@@ -467,30 +584,61 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo
         heatmap_plot = gr.Plot(label="Keyword Sentiment Heatmap")
         network_plot = gr.Plot(label="Word Co-occurrence Network")
-        tfidf_plot = gr.Plot(label="TF-IDF Keywords")
-        gr.Markdown("**Status:** All features implemented")
-    with gr.Tab("History"):
-        gr.Markdown("### Analysis History")
         with gr.Row():
-            refresh_btn = gr.Button("Refresh", variant="secondary")
             clear_btn = gr.Button("Clear History", variant="stop")
-        history_plot = gr.Plot(label="Historical Trends")
     # Event handlers
-    analyze_btn.click(analyze_text, inputs=text_input,
-                     outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot])
-    batch_btn.click(batch_analysis, inputs=batch_input, outputs=batch_plot)
     heatmap_btn.click(keyword_heatmap, outputs=heatmap_plot)
     network_btn.click(cooccurrence_network, outputs=network_plot)
     tfidf_btn.click(tfidf_analysis, outputs=tfidf_plot)
     refresh_btn.click(plot_history, outputs=history_plot)
-    clear_btn.click(lambda: history.clear(), outputs=None)
 demo.launch(share=True)

 from sklearn.feature_extraction.text import TfidfVectorizer
 import networkx as nx
 import re
+import json
+import csv
+import io
+from datetime import datetime
+# Configuration
+MAX_HISTORY_SIZE = 1000
+BATCH_SIZE_LIMIT = 50
+THEMES = {
+    'default': {'pos': '#4ecdc4', 'neg': '#ff6b6b'},
+    'ocean': {'pos': '#0077be', 'neg': '#ff6b35'},
+    'forest': {'pos': '#228b22', 'neg': '#dc143c'},
+    'sunset': {'pos': '#ff8c00', 'neg': '#8b0000'}
+}
 # Load model
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = BertForSequenceClassification.from_pretrained("entropy25/sentimentanalysis")
 model.to(device)
+# Global storage with size limit
 history = []
def manage_history_size():
    """Trim the global ``history`` list to at most ``MAX_HISTORY_SIZE`` entries.

    The oldest entries (front of the list) are discarded first. The list is
    trimmed in place, so every existing reference to it sees the same,
    shortened contents.
    """
    overflow = len(history) - MAX_HISTORY_SIZE
    if overflow > 0:
        # Counting the overflow instead of slicing with ``[-MAX_HISTORY_SIZE:]``
        # keeps a limit of 0 meaningful (``-0`` would select the whole list).
        del history[:overflow]
 def clean_text(text):
     """Simple text preprocessing"""
     text = re.sub(r'[^\w\s]', '', text.lower())
     words = text.split()
     stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should', 'this', 'that', 'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them'}
     return [w for w in words if w not in stopwords and len(w) > 2]
+def analyze_text(text, theme='default'):
     """Core sentiment analysis"""
     if not text.strip():
         return "Please enter text", None, None, None
         conf = probs.max()
         sentiment = "Positive" if pred == 1 else "Negative"
+    # Store in history with timestamp
     history.append({
         'text': text[:100],
         'full_text': text,
         'sentiment': sentiment,
         'confidence': conf,
         'pos_prob': probs[1],
+        'neg_prob': probs[0],
+        'timestamp': datetime.now().isoformat()
     })
+    manage_history_size()
     result = f"Sentiment: {sentiment} (Confidence: {conf:.3f})"
     # Generate plots
+    prob_plot = plot_probs(probs, theme)
+    gauge_plot = plot_gauge(conf, sentiment, theme)
+    cloud_plot = plot_wordcloud(text, sentiment, theme)
     return result, prob_plot, gauge_plot, cloud_plot
+def plot_probs(probs, theme='default'):
     """Probability bar chart"""
     fig, ax = plt.subplots(figsize=(8, 5))
     labels = ["Negative", "Positive"]
+    colors = [THEMES[theme]['neg'], THEMES[theme]['pos']]
     bars = ax.bar(labels, probs, color=colors, alpha=0.8)
     ax.set_title("Sentiment Probabilities", fontweight='bold')
     plt.tight_layout()
     return fig
+def plot_gauge(conf, sentiment, theme='default'):
     """Confidence gauge"""
     fig, ax = plt.subplots(figsize=(8, 6))
     plt.tight_layout()
     return fig
def plot_wordcloud(text, sentiment, theme='default'):
    """Render a word cloud for ``text``, coloured by ``sentiment``.

    Args:
        text: Review text to visualise.
        sentiment: ``'Positive'`` selects the ``'Greens'`` colormap; any other
            value selects ``'Reds'``.
        theme: Accepted so the signature matches the other plot helpers; the
            colormap here depends only on ``sentiment``.

    Returns:
        A matplotlib figure, or ``None`` when ``text`` has fewer than three
        whitespace-separated tokens or no word cloud can be built from it.
    """
    if len(text.split()) < 3:
        return None

    colormap = 'Greens' if sentiment == 'Positive' else 'Reds'
    cloud = WordCloud(width=800, height=400, background_color='white',
                      colormap=colormap, max_words=30)
    try:
        cloud.generate(text)
    except ValueError:
        # WordCloud raises ValueError when no drawable words remain (for
        # example, text made only of its stopwords); report "no plot" instead
        # of failing the whole analysis.
        return None

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'{sentiment} Word Cloud', fontweight='bold')
    plt.tight_layout()
    return fig
+def batch_analysis(reviews, progress=gr.Progress()):
+    """Analyze multiple reviews with progress tracking"""
     if not reviews.strip():
         return None
     if len(texts) < 2:
         return None
+    # Apply batch size limit
+    if len(texts) > BATCH_SIZE_LIMIT:
+        texts = texts[:BATCH_SIZE_LIMIT]
     results = []
+    for i, text in enumerate(texts):
+        progress((i + 1) / len(texts), f"Processing review {i + 1}/{len(texts)}")
+        # Process in smaller GPU batches
         inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
         with torch.no_grad():
             outputs = model(**inputs)
             'sentiment': sentiment,
             'confidence': conf,
             'pos_prob': probs[1],
+            'neg_prob': probs[0],
+            'timestamp': datetime.now().isoformat()
         })
+    manage_history_size()
     # Create visualization
     fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
     plt.tight_layout()
     return fig
def process_uploaded_file(file):
    """Turn an uploaded CSV/TXT file into newline-separated review text.

    Args:
        file: ``None``, the raw uploaded ``bytes``, a filesystem path, or a
            file-like object exposing ``read()`` (and optionally ``name``).

    Returns:
        ``""`` when nothing was uploaded. For a ``.csv`` upload whose content
        contains commas: one review per line, taken from the first column
        whose header contains "review" (falling back to the first column);
        the first row is always treated as a header and skipped. In every
        other case the decoded file content is returned unchanged.
    """
    if file is None:
        return ""

    # Normalise the different upload shapes into (payload, file name).
    if isinstance(file, (bytes, bytearray)):
        # NOTE: raw bytes carry no file name, so they are treated as plain
        # text; CSV column extraction needs a path or a named file object.
        payload, name = bytes(file), ""
    elif isinstance(file, str):
        name = file
        with open(file, "rb") as handle:
            payload = handle.read()
    else:
        name = getattr(file, "name", "") or ""
        payload = file.read()

    if isinstance(payload, (bytes, bytearray)):
        # "utf-8-sig" strips a leading BOM; undecodable bytes are replaced
        # rather than aborting the upload.
        content = payload.decode("utf-8-sig", errors="replace")
    else:
        content = payload

    if str(name).lower().endswith(".csv") and "," in content:
        # Parse from a stream so quoted fields containing commas or line
        # breaks are handled by the csv module.
        reader = csv.reader(io.StringIO(content, newline=""))
        headers = next(reader, None)
        review_idx = 0
        if headers:
            review_idx = next(
                (i for i, h in enumerate(headers) if "review" in h.lower()),
                0,
            )
        # Flatten line breaks inside a cell: the result is one review per line.
        reviews = [
            " ".join(row[review_idx].splitlines())
            for row in reader
            if len(row) > review_idx
        ]
        return "\n".join(reviews)

    # Plain text (or a CSV without commas): hand the content back as is.
    return content
def export_history_csv():
    """Serialise the global ``history`` list to CSV text.

    Returns:
        ``None`` when the history is empty; otherwise a string holding a
        header row followed by one row per history entry.

    NOTE(review): the click handler routes this return value to a ``gr.File``
    output, which presumably expects a file path rather than CSV text -
    confirm against the Gradio version in use.
    """
    if not history:
        return None

    columns = ('timestamp', 'text', 'sentiment', 'confidence', 'pos_prob', 'neg_prob')
    buffer = io.StringIO()
    sheet = csv.writer(buffer)
    sheet.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Positive_Prob', 'Negative_Prob'])
    sheet.writerows([record[key] for key in columns] for record in history)
    return buffer.getvalue()
def export_history_json():
    """Serialise the global ``history`` list to a JSON string.

    Returns:
        ``None`` when the history is empty; otherwise the history entries as
        an indented JSON array.

    Raises:
        TypeError: if an entry holds a value that is neither JSON-native nor
            convertible through an ``item()`` method.
    """
    if not history:
        return None

    def _to_builtin(value):
        # History stores confidences/probabilities taken straight from the
        # model output (array scalars), which ``json`` cannot encode; such
        # scalars expose ``item()`` to obtain the plain Python number.
        item = getattr(value, "item", None)
        if callable(item):
            return item()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    return json.dumps(history, indent=2, default=_to_builtin)
 def keyword_heatmap():
     """Keyword sentiment heatmap"""
     if len(history) < 3:
     if len(pos_texts) < 2 or len(neg_texts) < 2:
         return None
+    # Positive TF-IDF
+    vectorizer_pos = TfidfVectorizer(max_features=50, ngram_range=(1, 2))
+    pos_tfidf = vectorizer_pos.fit_transform(pos_texts)
+    pos_features = vectorizer_pos.get_feature_names_out()
+    pos_scores = pos_tfidf.sum(axis=0).A1
+    # Negative TF-IDF
+    vectorizer_neg = TfidfVectorizer(max_features=50, ngram_range=(1, 2))
+    neg_tfidf = vectorizer_neg.fit_transform(neg_texts)
+    neg_features = vectorizer_neg.get_feature_names_out()
+    neg_scores = neg_tfidf.sum(axis=0).A1
+    # Top 10 features
+    pos_top_idx = np.argsort(pos_scores)[-10:][::-1]
+    neg_top_idx = np.argsort(neg_scores)[-10:][::-1]
+    pos_words = [pos_features[i] for i in pos_top_idx]
+    pos_vals = [pos_scores[i] for i in pos_top_idx]
+    neg_words = [neg_features[i] for i in neg_top_idx]
+    neg_vals = [neg_scores[i] for i in neg_top_idx]
+    # Plot
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
+    # Positive
+    bars1 = ax1.barh(pos_words, pos_vals, color='#4ecdc4', alpha=0.8)
+    ax1.set_title('Positive Keywords (TF-IDF)', fontweight='bold')
+    ax1.set_xlabel('TF-IDF Score')
+    for bar, score in zip(bars1, pos_vals):
+        ax1.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
+                f'{score:.3f}', va='center', fontsize=9)
+    # Negative
+    bars2 = ax2.barh(neg_words, neg_vals, color='#ff6b6b', alpha=0.8)
+    ax2.set_title('Negative Keywords (TF-IDF)', fontweight='bold')
+    ax2.set_xlabel('TF-IDF Score')
+    for bar, score in zip(bars2, neg_vals):
+        ax2.text(bar.get_width() + 0.001, bar.get_y() + bar.get_height()/2,
+                f'{score:.3f}', va='center', fontsize=9)
+    plt.tight_layout()
+    return fig
 def plot_history():
     """Analysis history visualization"""
     plt.tight_layout()
     return fig
def clear_history():
    """Empty the global ``history`` list in place and return a status message."""
    # In-place deletion mutates the existing list, so no rebinding is needed.
    del history[:]
    return "History cleared successfully"
+# Enhanced example data
+EXAMPLE_REVIEWS = [
+    ["The cinematography was stunning, but the plot felt predictable and the dialogue was weak."],
+    ["A masterpiece of filmmaking! Amazing performances, brilliant direction, and unforgettable moments."],
+    ["Boring movie with terrible acting, weak plot, and poor character development throughout."],
+    ["Great special effects and action sequences, but the story was confusing and hard to follow."],
+    ["Incredible ending that left me speechless! One of the best films I've ever seen."],
+    ["The movie started strong but became repetitive and lost my interest halfway through."],
+    ["Outstanding soundtrack and beautiful visuals, though the pacing was somewhat slow."],
+    ["Disappointing sequel that failed to capture the magic of the original film."],
+    ["Brilliant writing and exceptional acting make this a must-watch drama."],
+    ["Generic blockbuster with predictable twists and forgettable characters."]
+]
 # Gradio Interface
 with gr.Blocks(theme=gr.themes.Soft(), title="Movie Sentiment Analyzer") as demo:
+    gr.Markdown("# 🎬 Enhanced Movie Sentiment Analyzer")
+    gr.Markdown("Advanced sentiment analysis with comprehensive visualizations and data export capabilities")
     with gr.Tab("Single Analysis"):
         with gr.Row():
                     placeholder="Enter your movie review here...",
                     lines=5
                 )
+                with gr.Row():
+                    analyze_btn = gr.Button("Analyze", variant="primary", size="lg")
+                    theme_selector = gr.Dropdown(
+                        choices=list(THEMES.keys()),
+                        value="default",
+                        label="Color Theme"
+                    )
+                gr.Examples(
+                    examples=EXAMPLE_REVIEWS,
+                    inputs=text_input,
+                    label="Example Reviews"
+                )
             with gr.Column():
+                result_output = gr.Textbox(label="Analysis Result", lines=2)
         with gr.Row():
+            prob_plot = gr.Plot(label="Sentiment Probabilities")
             gauge_plot = gr.Plot(label="Confidence Gauge")
+        wordcloud_plot = gr.Plot(label="Word Cloud Visualization")
     with gr.Tab("Batch Analysis"):
         gr.Markdown("### Multiple Reviews Analysis")
+        gr.Markdown(f"**Note:** Limited to {BATCH_SIZE_LIMIT} reviews per batch for optimal performance")
+        with gr.Row():
+            with gr.Column():
+                file_upload = gr.File(
+                    label="Upload CSV/TXT File",
+                    file_types=[".csv", ".txt"],
+                    type="binary"
+                )
+                batch_input = gr.Textbox(
+                    label="Reviews (one per line)",
+                    placeholder="Review 1...\nReview 2...\nReview 3...",
+                    lines=8
+                )
+            with gr.Column():
+                load_file_btn = gr.Button("Load File", variant="secondary")
+                batch_btn = gr.Button("Analyze Batch", variant="primary")
+        batch_plot = gr.Plot(label="Batch Analysis Results")
     with gr.Tab("Advanced Analytics"):
         gr.Markdown("### Advanced Visualizations")
+        gr.Markdown("**Requirements:** Minimum analysis history needed for each visualization")
         with gr.Row():
             heatmap_btn = gr.Button("Keyword Heatmap", variant="primary")
         heatmap_plot = gr.Plot(label="Keyword Sentiment Heatmap")
         network_plot = gr.Plot(label="Word Co-occurrence Network")
+        tfidf_plot = gr.Plot(label="TF-IDF Keywords Comparison")
+    with gr.Tab("History & Export"):
+        gr.Markdown("### Analysis History & Data Export")
         with gr.Row():
+            refresh_btn = gr.Button("Refresh History", variant="secondary")
             clear_btn = gr.Button("Clear History", variant="stop")
+        with gr.Row():
+            export_csv_btn = gr.Button("Export CSV", variant="secondary")
+            export_json_btn = gr.Button("Export JSON", variant="secondary")
+        with gr.Row():
+            csv_download = gr.File(label="CSV Download", visible=False)
+            json_download = gr.File(label="JSON Download", visible=False)
+        history_status = gr.Textbox(label="Status", interactive=False)
+        history_plot = gr.Plot(label="Historical Analysis Trends")
     # Event handlers
+    analyze_btn.click(
+        analyze_text,
+        inputs=[text_input, theme_selector],
+        outputs=[result_output, prob_plot, gauge_plot, wordcloud_plot]
+    )
+    load_file_btn.click(
+        process_uploaded_file,
+        inputs=file_upload,
+        outputs=batch_input
+    )
+    batch_btn.click(
+        batch_analysis,
+        inputs=batch_input,
+        outputs=batch_plot
+    )
     heatmap_btn.click(keyword_heatmap, outputs=heatmap_plot)
     network_btn.click(cooccurrence_network, outputs=network_plot)
     tfidf_btn.click(tfidf_analysis, outputs=tfidf_plot)
     refresh_btn.click(plot_history, outputs=history_plot)
+    clear_btn.click(clear_history, outputs=history_status)
+    export_csv_btn.click(
+        export_history_csv,
+        outputs=gr.File(label="history.csv")
+    )
+    export_json_btn.click(
+        export_history_json,
+        outputs=gr.File(label="history.json")
+    )
 demo.launch(share=True)