Spaces:

kambris
/

LLMLPSentiment

Sleeping

App Files Files Community

kambris committed on Dec 11, 2025

Commit

8043d18

verified ·

1 Parent(s): 8814bfe

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -204

app.py CHANGED Viewed

@@ -4,6 +4,15 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 # Initialize the sentiment analysis pipeline
 sentiment_pipeline = pipeline(
@@ -14,8 +23,20 @@ sentiment_pipeline = pipeline(
 # Store the analyzed dataframe globally
 analyzed_df = None
 def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
-    """Analyze sentiment for multiple TXT files or a single CSV file"""
     global analyzed_df
     try:
@@ -81,11 +102,15 @@ def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
         df['sentiment_label'] = [r['label'] for r in results]
         df['sentiment_score'] = [r['score'] for r in results]
         analyzed_df = df
-        # Get all column names except sentiment columns for filter options
-        filter_columns = [col for col in df.columns if col not in ['sentiment_label', 'sentiment_score']]
         # Create initial summary with file breakdown if multiple TXT files
         if 'file_name' in df.columns:
@@ -103,8 +128,34 @@ def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
                 gr.update(choices=[], value=None))
     except Exception as e:
         return f"Error: {str(e)}", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[])
 def get_filter_values(filter_column):
     """Get unique values for the selected filter column"""
     global analyzed_df
@@ -139,7 +190,8 @@ def compare_groups(filter_column, group1_value, group2_value):
     # Create comparison visualizations
     fig_pie = create_comparison_pie(df1, df2, group1_value, group2_value)
     fig_bar = create_comparison_bar(df1, df2, group1_value, group2_value)
-    fig_hist = create_comparison_histogram(df1, df2, group1_value, group2_value)
     # Create comparison summary
     summary = create_comparison_summary(df1, df2, group1_value, group2_value)
@@ -151,7 +203,8 @@ def compare_groups(filter_column, group1_value, group2_value):
     df2_display['comparison_group'] = group2_value
     combined_df = pd.concat([df1_display, df2_display])
-    return summary, combined_df, fig_pie, fig_bar, fig_hist
 def create_comparison_pie(df1, df2, label1, label2):
     """Create side-by-side pie charts"""
@@ -207,234 +260,104 @@ def create_comparison_bar(df1, df2, label1, label2):
         name=label2,
         x=sentiments,
         y=[counts2.get(s, 0) for s in sentiments],
-        marker_color='#f59e0b',
         text=[f"{counts2.get(s, 0):.1f}%" for s in sentiments],
         textposition='auto'
     ))
-    fig.update_layout(
-        title='Sentiment Percentage Comparison',
-        xaxis_title='Sentiment',
-        yaxis_title='Percentage (%)',
-        barmode='group',
-        height=400
-    )
     return fig
-def create_comparison_histogram(df1, df2, label1, label2):
-    """Create overlaid histograms of confidence scores"""
     fig = go.Figure()
-    fig.add_trace(go.Histogram(
-        x=df1['sentiment_score'],
         name=label1,
-        opacity=0.6,
-        marker_color='#3b82f6',
-        nbinsx=30
     ))
-    fig.add_trace(go.Histogram(
-        x=df2['sentiment_score'],
         name=label2,
-        opacity=0.6,
-        marker_color='#f59e0b',
-        nbinsx=30
     ))
-    fig.update_layout(
-        title='Confidence Score Distribution Comparison',
-        xaxis_title='Confidence Score',
-        yaxis_title='Count',
-        barmode='overlay',
-        height=400
-    )
     return fig
-def create_comparison_summary(df1, df2, label1, label2):
-    """Create detailed comparison summary"""
-    total1 = len(df1)
-    total2 = len(df2)
-    counts1 = df1['sentiment_label'].value_counts()
-    counts2 = df2['sentiment_label'].value_counts()
-    pos1 = counts1.get('POSITIVE', 0) / total1 * 100
-    neg1 = counts1.get('NEGATIVE', 0) / total1 * 100
-    pos2 = counts2.get('POSITIVE', 0) / total2 * 100
-    neg2 = counts2.get('NEGATIVE', 0) / total2 * 100
-    avg1 = df1['sentiment_score'].mean()
-    avg2 = df2['sentiment_score'].mean()
-    summary = f"""
-📊 GROUP COMPARISON SUMMARY
-{'='*50}
-GROUP 1: {label1}
-{'='*50}
-Total Responses: {total1}
-Positive: {counts1.get('POSITIVE', 0)} ({pos1:.1f}%)
-Negative: {counts1.get('NEGATIVE', 0)} ({neg1:.1f}%)
-Average Confidence: {avg1:.3f}
-{'='*50}
-GROUP 2: {label2}
-{'='*50}
-Total Responses: {total2}
-Positive: {counts2.get('POSITIVE', 0)} ({pos2:.1f}%)
-Negative: {counts2.get('NEGATIVE', 0)} ({neg2:.1f}%)
-Average Confidence: {avg2:.3f}
-{'='*50}
-DIFFERENCE ANALYSIS
-{'='*50}
-Positive Sentiment Difference: {pos1 - pos2:+.1f} percentage points
-({label1} {'more' if pos1 > pos2 else 'less'} positive than {label2})
-Confidence Score Difference: {avg1 - avg2:+.3f}
-({label1} {'higher' if avg1 > avg2 else 'lower'} confidence than {label2})
-    """
-    return summary
-def create_summary(df, title):
-    """Create text summary of results"""
-    total = len(df)
-    sentiment_counts = df['sentiment_label'].value_counts()
-    avg_score = df['sentiment_score'].mean()
-    summary = f"""
-📊 {title} (Total: {total} rows)
-Sentiment Breakdown:
-{sentiment_counts.to_string()}
-Average Confidence Score: {avg_score:.3f}
-Sentiment Percentages:
-{(sentiment_counts / total * 100).round(2).to_string()}%
-    """
-    return summary
-# Create Gradio interface
-with gr.Blocks(title="Sentiment Comparison Tool", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📊 Sentiment Analysis: Multi-File Comparison")
-    gr.Markdown("Upload 2-5 TXT files to compare OR upload a single CSV file")
-    with gr.Row():
-        with gr.Column(scale=1):
-            gr.Markdown("### Step 1: Upload & Analyze")
-            gr.Markdown("**Upload Multiple TXT Files (2-5) OR Single CSV:**")
-            file1 = gr.File(label="File 1 (Required)", file_types=[".csv", ".txt"])
-            file2 = gr.File(label="File 2 (Optional)", file_types=[".txt"])
-            file3 = gr.File(label="File 3 (Optional)", file_types=[".txt"])
-            file4 = gr.File(label="File 4 (Optional)", file_types=[".txt"])
-            file5 = gr.File(label="File 5 (Optional)", file_types=[".txt"])
-            column_input = gr.Textbox(
-                label="Column to Analyze (CSV only)",
-                placeholder="e.g., 'review_text'",
-                value="text"
-            )
-            analyze_btn = gr.Button("🔍 Analyze Sentiment", variant="primary", size="lg")
-            gr.Markdown("### Step 2: Compare Groups")
-            filter_column = gr.Dropdown(
-                label="Compare by Column",
-                choices=[],
-                interactive=True,
-                info="Select 'file_name' to compare TXT files"
-            )
-            with gr.Row():
-                group1_value = gr.Dropdown(
-                    label="Group 1",
-                    choices=[],
-                    interactive=True
-                )
-                group2_value = gr.Dropdown(
-                    label="Group 2",
-                    choices=[],
-                    interactive=True
-                )
-            compare_btn = gr.Button("⚖️ Compare Groups", variant="secondary", size="lg")
-        with gr.Column(scale=2):
-            summary_output = gr.Textbox(label="Comparison Summary", lines=20)
-    with gr.Row():
-        plot_pie = gr.Plot(label="Side-by-Side Distribution")
-    with gr.Row():
-        with gr.Column():
-            plot_bar = gr.Plot(label="Percentage Comparison")
-        with gr.Column():
-            plot_hist = gr.Plot(label="Confidence Score Distribution")
-    with gr.Row():
-        output_df = gr.Dataframe(label="All Data", max_height=400)
-    # Connect events
-    analyze_btn.click(
         fn=analyze_sentiment_files,
-        inputs=[file1, file2, file3, file4, file5, column_input],
-        outputs=[summary_output, output_df, plot_pie, plot_bar, plot_hist,
-                filter_column, group1_value, group2_value]
     )
-    filter_column.change(
         fn=get_filter_values,
-        inputs=[filter_column],
-        outputs=[group1_value, group2_value]
     )
-    compare_btn.click(
         fn=compare_groups,
-        inputs=[filter_column, group1_value, group2_value],
-        outputs=[summary_output, output_df, plot_pie, plot_bar, plot_hist]
     )
-    gr.Markdown("""
-    ### 💡 How to use:
-    **Option A: Multiple TXT Files (2-5 files)**
-    1. Upload 2-5 TXT files (one per upload slot)
-    2. Click "Analyze Sentiment" to process all files
-    3. Select "file_name" as the comparison column
-    4. Choose two files to compare (e.g., "File 1" vs "File 2")
-    5. Click "Compare Groups" to see side-by-side comparison
-    **Option B: Single CSV File**
-    1. Upload one CSV file with text column and grouping columns
-    2. Specify which column contains the text to analyze
-    3. Click "Analyze Sentiment"
-    4. Select any column to compare groups (e.g., language, category)
-    5. Choose two values to compare
-    ### 📂 File Format Details:
-    - **TXT files**: Each line is analyzed separately; files are labeled as "File 1", "File 2", etc.
-    - **CSV files**: Specify text column; can compare based on any categorical column
-    ### 📈 Comparison Features:
-    - Side-by-side pie charts showing sentiment distribution
-    - Grouped bar chart comparing positive/negative percentages
-    - Overlaid histogram comparing confidence score distributions
-    - Detailed statistical summary with difference analysis
-    - Full data table with all analyzed text and sentiment scores
-    ### 🎯 Example Use Cases:
-    - Compare sentiment across different text documents
-    - Analyze reviews from different sources
-    - Compare sentiment: Arab responses vs Chinese responses
-    - Analyze: Product A reviews vs Product B reviews
-    - Compare: Pre-intervention vs Post-intervention feedback
-    """)
 if __name__ == "__main__":
-    demo.launch(share=True)

 import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
+import spacy
+# Bind the small English spaCy pipeline to `nlp`. If loading raises OSError,
+# print a hint, download the model, and load it a second time.
+_SPACY_MODEL_NAME = "en_core_web_sm"
+try:
+    nlp = spacy.load(_SPACY_MODEL_NAME)
+except OSError:
+    print("Downloading spaCy model 'en_core_web_sm'. Please run 'python -m spacy download en_core_web_sm' if this fails repeatedly.")
+    spacy.cli.download(_SPACY_MODEL_NAME)
+    nlp = spacy.load(_SPACY_MODEL_NAME)
 # Initialize the sentiment analysis pipeline
 sentiment_pipeline = pipeline(
 # Store the analyzed dataframe globally
 analyzed_df = None
+# --- Function: Detect Passive Voice using spaCy ---
+def is_passive(text):
+    """Return True when spaCy's dependency parse marks *text* as passive voice.
+
+    The check is a heuristic: the text counts as passive when any token is a
+    passive auxiliary (dependency label ``auxpass``) whose head is a
+    past-participle verb (``pos_ == 'VERB'`` and ``tag_ == 'VBN'``).
+
+    Args:
+        text: The sentence or line to inspect. Values that are not strings
+            (for example ``None`` or a float NaN) and blank strings are
+            treated as "not passive" instead of being passed to the parser.
+
+    Returns:
+        bool: True if at least one passive construction is found, else False.
+    """
+    # Guard first so the module-level `nlp` pipeline only ever sees real text.
+    if not isinstance(text, str) or not text.strip():
+        return False
+    return any(
+        token.dep_ == 'auxpass'
+        and token.head.pos_ == 'VERB'
+        and token.head.tag_ == 'VBN'
+        for token in nlp(text)
+    )
 def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
+    """Analyze sentiment and active/passive voice for multiple TXT files or a single CSV file"""
     global analyzed_df
     try:
         df['sentiment_label'] = [r['label'] for r in results]
         df['sentiment_score'] = [r['score'] for r in results]
+        # --- New Analysis: Active/Passive Voice ---
+        df['is_passive'] = df[column_name].apply(is_passive)
+        df['voice_label'] = df['is_passive'].apply(lambda x: 'PASSIVE' if x else 'ACTIVE')
         analyzed_df = df
+        # Get all column names except sentiment/voice columns for filter options
+        filter_columns = [col for col in df.columns if col not in ['sentiment_label', 'sentiment_score', 'is_passive', 'voice_label']]
         # Create initial summary with file breakdown if multiple TXT files
         if 'file_name' in df.columns:
                 gr.update(choices=[], value=None))
     except Exception as e:
+        import traceback
+        traceback.print_exc()
         return f"Error: {str(e)}", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[])
+# --- Summary Functions (Updated to include passive voice) ---
+def create_summary(df, title):
+    """Build a plain-text summary of sentiment and voice statistics.
+
+    Args:
+        df: DataFrame with a ``sentiment_label`` column (percentages are taken
+            from rows labelled ``'POSITIVE'`` and ``'NEGATIVE'``) and a boolean
+            ``is_passive`` column.
+        title: Name of the data set or group, shown in the header line.
+
+    Returns:
+        str: Multi-line summary with the row count and the positive/negative
+        and passive/active percentages, each formatted to one decimal place.
+    """
+    total_lines = len(df)
+    if total_lines:
+        label_pct = df['sentiment_label'].value_counts(normalize=True) * 100
+        positive_pct = label_pct.get('POSITIVE', 0.0)
+        # Read NEGATIVE directly instead of using 100 - positive, so rows with
+        # any other label are not reported as negative.
+        negative_pct = label_pct.get('NEGATIVE', 0.0)
+        # Mean of True/False gives the proportion of True.
+        passive_pct = df['is_passive'].mean() * 100
+        active_pct = 100 - passive_pct
+    else:
+        # An empty frame has no proportions: report zeros rather than "nan%"
+        # for voice and a spurious 100% negative sentiment.
+        positive_pct = negative_pct = passive_pct = active_pct = 0.0
+    return (f"--- Summary for {title} ---\n"
+            f"Total Lines Analyzed: {total_lines}\n"
+            f"Positive Sentiment: {positive_pct:.1f}%\n"
+            f"Negative Sentiment: {negative_pct:.1f}%\n"
+            f"**Passive Voice Sentences: {passive_pct:.1f}%**\n"
+            f"**Active Voice Sentences: {active_pct:.1f}%**\n"
+            f"---------------------------------")
+def create_comparison_summary(df1, df2, label1, label2):
+    """Return a headline followed by the per-group summaries of both groups.
+
+    Args:
+        df1: Analyzed rows of the first group.
+        df2: Analyzed rows of the second group.
+        label1: Display name of the first group.
+        label2: Display name of the second group.
+
+    Returns:
+        str: The headline and the two ``create_summary`` texts, each separated
+        by one blank line.
+    """
+    headline = f"📊 COMPARISON SUMMARY: {label1} vs {label2}"
+    group_sections = [
+        create_summary(df1, label1),
+        create_summary(df2, label2),
+    ]
+    return "\n\n".join([headline, *group_sections])
 def get_filter_values(filter_column):
     """Get unique values for the selected filter column"""
     global analyzed_df
     # Create comparison visualizations
     fig_pie = create_comparison_pie(df1, df2, group1_value, group2_value)
     fig_bar = create_comparison_bar(df1, df2, group1_value, group2_value)
+    # Using the new voice bar chart instead of a generic histogram
+    fig_voice_bar = create_comparison_voice_bar(df1, df2, group1_value, group2_value)
     # Create comparison summary
     summary = create_comparison_summary(df1, df2, group1_value, group2_value)
     df2_display['comparison_group'] = group2_value
     combined_df = pd.concat([df1_display, df2_display])
+    return summary, combined_df, fig_pie, fig_bar, fig_voice_bar
 def create_comparison_pie(df1, df2, label1, label2):
     """Create side-by-side pie charts"""
         name=label2,
         x=sentiments,
         y=[counts2.get(s, 0) for s in sentiments],
+        marker_color='#ef4444',
         text=[f"{counts2.get(s, 0):.1f}%" for s in sentiments],
         textposition='auto'
     ))
+    fig.update_layout(title_text='Sentiment Percentage Comparison', barmode='group', height=400)
     return fig
+# --- New Function: Create Voice Comparison Bar Chart ---
+def create_comparison_voice_bar(df1, df2, label1, label2):
+    """Build a grouped bar chart of active vs. passive voice shares per group.
+
+    Args:
+        df1: Analyzed rows of the first group; must have a ``voice_label`` column.
+        df2: Analyzed rows of the second group; must have a ``voice_label`` column.
+        label1: Legend name for the first group's bars.
+        label2: Legend name for the second group's bars.
+
+    Returns:
+        The Plotly figure with one bar trace per group, in percent.
+    """
+    voices = ['ACTIVE', 'PASSIVE']
+    # Percentage share of each voice label, computed for both groups up front.
+    shares = [
+        frame['voice_label'].value_counts(normalize=True) * 100
+        for frame in (df1, df2)
+    ]
+    legend_names = (label1, label2)
+    bar_colours = ('#10b981', '#fbbf24')
     fig = go.Figure()
+    for share, legend_name, colour in zip(shares, legend_names, bar_colours):
+        # A voice missing from a group is drawn as a 0% bar.
+        heights = [share.get(voice, 0) for voice in voices]
+        fig.add_trace(go.Bar(
+            name=legend_name,
+            x=voices,
+            y=heights,
+            marker_color=colour,
+            text=[f"{height:.1f}%" for height in heights],
+            textposition='auto'
+        ))
+    fig.update_layout(title_text='Active vs. Passive Voice Percentage Comparison', barmode='group', height=400)
     return fig
+# --- Gradio UI Setup ---
+# Two-tab Gradio app: the first tab uploads and analyzes files, the second
+# compares two groups of the analyzed rows.
+with gr.Blocks(title="Sentiment & Voice Analyzer") as demo:
+    gr.Markdown("# Advanced Text Analyzer: Sentiment, Active vs. Passive Voice")
+    # --- Tab 1: file upload and analysis ---
+    with gr.Tab("Analyze Files"):
+        # Five upload slots, passed to the analyze handler as file1..file5.
+        with gr.Row():
+            file_input1 = gr.File(label="Upload TXT/CSV File 1")
+            file_input2 = gr.File(label="Upload TXT File 2 (Optional)")
+            file_input3 = gr.File(label="Upload TXT File 3 (Optional)")
+            file_input4 = gr.File(label="Upload TXT File 4 (Optional)")
+            file_input5 = gr.File(label="Upload TXT File 5 (Optional)")
+        # Name of the text column; the default value is "text".
+        csv_column_name = gr.Textbox(label="If CSV, specify text column name", value="text")
+        analyze_button = gr.Button("Analyze Texts", variant="primary")
+        summary_output = gr.Textbox(label="Analysis Summary", lines=10)
+        dataframe_output = gr.DataFrame(label="Detailed Analysis Results")
+    # --- Tab 2: group comparison ---
+    with gr.Tab("Compare Groups"):
+        gr.Markdown("Select a column to filter by (e.g., 'file_name' for TXT uploads) and compare two values.")
+        # All three dropdowns start with no choices.
+        with gr.Row():
+            filter_col_dropdown = gr.Dropdown(label="Select Filter Column", choices=[])
+            group1_dropdown = gr.Dropdown(label="Group 1 Value", choices=[])
+            group2_dropdown = gr.Dropdown(label="Group 2 Value", choices=[])
+        compare_button = gr.Button("Compare Groups", variant="primary")
+        comparison_summary_output = gr.Textbox(label="Comparison Summary", lines=15)
+        comparison_dataframe_output = gr.DataFrame(label="Comparison Data Results")
+        # Updated output slots for the new voice bar chart
+        comparison_pie_chart = gr.Plot(label="Sentiment Distribution Pie Chart")
+        comparison_bar_chart = gr.Plot(label="Sentiment Percentage Bar Chart")
+        comparison_voice_bar_chart = gr.Plot(label="Active/Passive Voice Bar Chart")
+    # --- Event Handlers ---
+    # Analyze: eight outputs -- the tab-1 summary and table, the three plots
+    # on the comparison tab, and the three comparison dropdowns.
+    analyze_button.click(
         fn=analyze_sentiment_files,
+        inputs=[file_input1, file_input2, file_input3, file_input4, file_input5, csv_column_name],
+        outputs=[summary_output, dataframe_output, comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart, filter_col_dropdown, group1_dropdown, group2_dropdown]
     )
+    # Changing the filter column updates both group dropdowns.
+    filter_col_dropdown.change(
         fn=get_filter_values,
+        inputs=[filter_col_dropdown],
+        outputs=[group1_dropdown, group2_dropdown]
     )
+    # Compare: five outputs -- comparison summary, combined table, and the
+    # pie, sentiment bar and voice bar plots.
+    compare_button.click(
         fn=compare_groups,
+        inputs=[filter_col_dropdown, group1_dropdown, group2_dropdown],
+        outputs=[comparison_summary_output, comparison_dataframe_output, comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart]
     )
+# Launch the app only when this file is run as a script.
 if __name__ == "__main__":
+    demo.launch()