Spaces:

kambris
/

LLMLPSentiment

Sleeping

App Files Files Community

kambris committed on Dec 11, 2025

Commit

0be81c2

verified ·

1 Parent(s): 6c8a927

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -149

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import spacy
 # Load the English spaCy model (lightweight, 'sm' for small)
 try:
@@ -20,40 +21,29 @@ sentiment_pipeline = pipeline(
     model="distilbert-base-uncased-finetuned-sst-2-english"
 )
-# Store the analyzed dataframe globally
-analyzed_df = None
 # --- Function: Detect Passive Voice using spaCy ---
 def is_passive(text):
     """Checks if a sentence is passive using spaCy's dependency parser."""
     doc = nlp(text)
-    # A simple heuristic check for passive voice structure
-    # Look for a form of 'be' (auxpass) followed by a past participle (VERB/VBN)
     for token in doc:
         if token.dep_ == 'auxpass' and token.head.pos_ == 'VERB' and token.head.tag_ == 'VBN':
             return True
     return False
 def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
     """Analyze sentiment and active/passive voice for multiple TXT files or a single CSV file"""
-    global analyzed_df
     try:
-        # Collect all uploaded files
         files = [f for f in [file1, file2, file3, file4, file5] if f is not None]
         if not files:
-            return ("Please upload at least one file",
-                    None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]))
-        # Check if we have TXT files or CSV
         file_paths = [f.name for f in files]
         if all(path.endswith('.txt') for path in file_paths):
-            # Handle multiple TXT files
             all_data = []
             for i, file in enumerate(files, 1):
                 try:
                     with open(file.name, 'r', encoding='utf-8') as f:
@@ -61,58 +51,32 @@ def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
                 except:
                     with open(file.name, 'r', encoding='latin-1') as f:
                         lines = f.readlines()
                 texts = [line.strip() for line in lines if line.strip()]
-                if not texts:
-                    continue
-                # Create dataframe for this file
-                file_df = pd.DataFrame({
-                    'text': texts,
-                    'line_number': range(1, len(texts) + 1),
-                    'file_name': f'File {i}',
-                    'source_file': file.name.split('/')[-1].split('\\')[-1]
-                })
                 all_data.append(file_df)
             if not all_data:
-                return ("Error: No valid text found in uploaded files",
-                        None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]))
-            # Combine all files
             df = pd.concat(all_data, ignore_index=True)
             column_name = 'text'
         elif len(files) == 1 and file_paths[0].endswith('.csv'):
-            # Handle single CSV file
             df = pd.read_csv(file_paths[0])
             if column_name not in df.columns:
-                return (f"Error: Column '{column_name}' not found. Available columns: {', '.join(df.columns)}",
-                        None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]))
         else:
-            return ("Error: Either upload multiple TXT files OR a single CSV file (not both)",
-                    None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]))
-        # Analyze sentiment
         texts = df[column_name].fillna("").astype(str).tolist()
         results = sentiment_pipeline(texts, truncation=True, max_length=512)
         df['sentiment_label'] = [r['label'] for r in results]
         df['sentiment_score'] = [r['score'] for r in results]
-        # --- New Analysis: Active/Passive Voice ---
         df['is_passive'] = df[column_name].apply(is_passive)
         df['voice_label'] = df['is_passive'].apply(lambda x: 'PASSIVE' if x else 'ACTIVE')
-        analyzed_df = df
         # Get all column names except sentiment/voice columns for filter options
         filter_columns = [col for col in df.columns if col not in ['sentiment_label', 'sentiment_score', 'is_passive', 'voice_label']]
-        # Create initial summary with file breakdown if multiple TXT files
         if 'file_name' in df.columns:
             file_summary = "\n\n📁 FILES UPLOADED:\n"
             for fname in df['file_name'].unique():
@@ -122,24 +86,23 @@ def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
         else:
             summary = create_summary(df, "All Data")
         return (summary, df, None, None, None,
                 gr.update(choices=filter_columns, value='file_name' if 'file_name' in filter_columns else None),
                 gr.update(choices=[], value=None),
-                gr.update(choices=[], value=None))
     except Exception as e:
-        import traceback
         traceback.print_exc()
-        return f"Error: {str(e)}", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[])
-# --- Summary Functions (Updated to include passive voice) ---
 def create_summary(df, title):
-    """Generates a summary string including sentiment and voice stats."""
     total_lines = len(df)
     positive_pct = (df['sentiment_label'].value_counts(normalize=True).get('POSITIVE', 0) * 100)
-    passive_pct = (df['is_passive'].mean() * 100) # Mean of True/False gives proportion of True
     summary = (f"--- Summary for {title} ---\n"
                f"Total Lines Analyzed: {total_lines}\n"
                f"Positive Sentiment: {positive_pct:.1f}%\n"
@@ -150,53 +113,43 @@ def create_summary(df, title):
     return summary
 def create_comparison_summary(df1, df2, label1, label2):
-    """Generates a comparison summary string."""
     summary = f"📊 COMPARISON SUMMARY: {label1} vs {label2}\n\n"
     summary += create_summary(df1, label1) + "\n\n"
     summary += create_summary(df2, label2)
     return summary
-def get_filter_values(filter_column):
-    """Get unique values for the selected filter column"""
-    global analyzed_df
-    if analyzed_df is None or filter_column is None:
         return gr.update(choices=[]), gr.update(choices=[])
-    unique_values = analyzed_df[filter_column].dropna().unique().tolist()
     unique_values = [str(v) for v in unique_values][:100]
     return gr.update(choices=unique_values, value=None), gr.update(choices=unique_values, value=None)
-def compare_groups(filter_column, group1_value, group2_value):
-    """Compare two groups side by side"""
-    global analyzed_df
-    if analyzed_df is None:
         return "Please analyze sentiment first", None, None, None, None
     if not filter_column or not group1_value or not group2_value:
         return "Please select a filter column and both group values", None, None, None, None
-    df = analyzed_df.copy()
-    # Filter data for each group
     df1 = df[df[filter_column].astype(str) == group1_value]
     df2 = df[df[filter_column].astype(str) == group2_value]
     if len(df1) == 0 or len(df2) == 0:
         return "One or both groups have no data", None, None, None, None
-    # Create comparison visualizations
     fig_pie = create_comparison_pie(df1, df2, group1_value, group2_value)
     fig_bar = create_comparison_bar(df1, df2, group1_value, group2_value)
-    # Using the new voice bar chart instead of a generic histogram
     fig_voice_bar = create_comparison_voice_bar(df1, df2, group1_value, group2_value)
-    # Create comparison summary
     summary = create_comparison_summary(df1, df2, group1_value, group2_value)
-    # Combine dataframes with group labels
     df1_display = df1.copy()
     df1_display['comparison_group'] = group1_value
     df2_display = df2.copy()
@@ -207,98 +160,35 @@ def compare_groups(filter_column, group1_value, group2_value):
 def create_comparison_pie(df1, df2, label1, label2):
-    """Create side-by-side pie charts"""
-    fig = make_subplots(
-        rows=1, cols=2,
-        specs=[[{'type':'pie'}, {'type':'pie'}]],
-        subplot_titles=(f'{label1}', f'{label2}')
-    )
-    # Group 1
     counts1 = df1['sentiment_label'].value_counts()
-    fig.add_trace(go.Pie(
-        labels=counts1.index,
-        values=counts1.values,
-        name=label1,
-        marker_colors=['#10b981' if x=='POSITIVE' else '#ef4444' for x in counts1.index],
-        textinfo='percent+label+value'
-    ), row=1, col=1)
-    # Group 2
     counts2 = df2['sentiment_label'].value_counts()
-    fig.add_trace(go.Pie(
-        labels=counts2.index,
-        values=counts2.values,
-        name=label2,
-        marker_colors=['#10b981' if x=='POSITIVE' else '#ef4444' for x in counts2.index],
-        textinfo='percent+label+value'
-    ), row=1, col=2)
     fig.update_layout(title_text='Sentiment Distribution Comparison', height=400)
     return fig
 def create_comparison_bar(df1, df2, label1, label2):
-    """Create grouped bar chart comparing sentiment percentages"""
     counts1 = df1['sentiment_label'].value_counts(normalize=True) * 100
     counts2 = df2['sentiment_label'].value_counts(normalize=True) * 100
     sentiments = ['POSITIVE', 'NEGATIVE']
     fig = go.Figure()
-    fig.add_trace(go.Bar(
-        name=label1,
-        x=sentiments,
-        y=[counts1.get(s, 0) for s in sentiments],
-        marker_color='#3b82f6',
-        text=[f"{counts1.get(s, 0):.1f}%" for s in sentiments],
-        textposition='auto'
-    ))
-    fig.add_trace(go.Bar(
-        name=label2,
-        x=sentiments,
-        y=[counts2.get(s, 0) for s in sentiments],
-        marker_color='#ef4444',
-        text=[f"{counts2.get(s, 0):.1f}%" for s in sentiments],
-        textposition='auto'
-    ))
     fig.update_layout(title_text='Sentiment Percentage Comparison', barmode='group', height=400)
     return fig
-# --- New Function: Create Voice Comparison Bar Chart ---
 def create_comparison_voice_bar(df1, df2, label1, label2):
-    """Create grouped bar chart comparing active vs passive voice percentages"""
     counts1 = df1['voice_label'].value_counts(normalize=True) * 100
     counts2 = df2['voice_label'].value_counts(normalize=True) * 100
     voices = ['ACTIVE', 'PASSIVE']
     fig = go.Figure()
-    fig.add_trace(go.Bar(
-        name=label1,
-        x=voices,
-        y=[counts1.get(s, 0) for s in voices],
-        marker_color='#10b981',
-        text=[f"{counts1.get(s, 0):.1f}%" for s in voices],
-        textposition='auto'
-    ))
-    fig.add_trace(go.Bar(
-        name=label2,
-        x=voices,
-        y=[counts2.get(s, 0) for s in voices],
-        marker_color='#fbbf24',
-        text=[f"{counts2.get(s, 0):.1f}%" for s in voices],
-        textposition='auto'
-    ))
     fig.update_layout(title_text='Active vs. Passive Voice Percentage Comparison', barmode='group', height=400)
     return fig
@@ -306,6 +196,8 @@ def create_comparison_voice_bar(df1, df2, label1, label2):
 with gr.Blocks(title="Sentiment & Voice Analyzer") as demo:
     gr.Markdown("# Advanced Text Analyzer: Sentiment, Active vs. Passive Voice")
     with gr.Tab("Analyze Files"):
         with gr.Row():
@@ -333,29 +225,33 @@ with gr.Blocks(title="Sentiment & Voice Analyzer") as demo:
         comparison_summary_output = gr.Textbox(label="Comparison Summary", lines=15)
         comparison_dataframe_output = gr.DataFrame(label="Comparison Data Results")
-        # Updated output slots for the new voice bar chart
         comparison_pie_chart = gr.Plot(label="Sentiment Distribution Pie Chart")
         comparison_bar_chart = gr.Plot(label="Sentiment Percentage Bar Chart")
         comparison_voice_bar_chart = gr.Plot(label="Active/Passive Voice Bar Chart")
     # --- Event Handlers ---
     analyze_button.click(
         fn=analyze_sentiment_files,
         inputs=[file_input1, file_input2, file_input3, file_input4, file_input5, csv_column_name],
-        outputs=[summary_output, dataframe_output, comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart, filter_col_dropdown, group1_dropdown, group2_dropdown]
     )
     filter_col_dropdown.change(
         fn=get_filter_values,
-        inputs=[filter_col_dropdown],
         outputs=[group1_dropdown, group2_dropdown]
     )
     compare_button.click(
         fn=compare_groups,
-        inputs=[filter_col_dropdown, group1_dropdown, group2_dropdown],
         outputs=[comparison_summary_output, comparison_dataframe_output, comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart]
     )

 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import spacy
+import traceback # Added for better error tracing
 # Load the English spaCy model (lightweight, 'sm' for small)
 try:
     model="distilbert-base-uncased-finetuned-sst-2-english"
 )
 # --- Function: Detect Passive Voice using spaCy ---
 def is_passive(text):
     """Return True if ``text`` contains a passive-voice construction.

     The text is parsed with the module-level spaCy pipeline ``nlp``. It is
     reported as passive when any token carries the ``auxpass`` dependency
     label and its head is a verb tagged ``VBN``.

     Args:
         text: The sentence or line to inspect. Non-string values (for
             example ``None`` or a NaN float from a missing DataFrame cell)
             and blank strings are treated as not passive.

     Returns:
         bool: True when an ``auxpass`` token attached to a ``VBN`` verb is
         found, otherwise False.
     """
     # This function is applied to a raw DataFrame column, so missing cells
     # can arrive as NaN/None; guard here instead of letting the parser raise.
     if not isinstance(text, str) or not text.strip():
         return False
     return any(
         token.dep_ == 'auxpass'
         and token.head.pos_ == 'VERB'
         and token.head.tag_ == 'VBN'
         for token in nlp(text)
     )
 def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
     """Analyze sentiment and active/passive voice for multiple TXT files or a single CSV file"""
+    # analyzed_df is no longer global, it's returned by this function
     try:
         files = [f for f in [file1, file2, file3, file4, file5] if f is not None]
         if not files:
+            return ("Please upload at least one file", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), None)
         file_paths = [f.name for f in files]
         if all(path.endswith('.txt') for path in file_paths):
             all_data = []
             for i, file in enumerate(files, 1):
                 try:
                     with open(file.name, 'r', encoding='utf-8') as f:
                 except:
                     with open(file.name, 'r', encoding='latin-1') as f:
                         lines = f.readlines()
                 texts = [line.strip() for line in lines if line.strip()]
+                if not texts: continue
+                file_df = pd.DataFrame({'text': texts, 'line_number': range(1, len(texts) + 1), 'file_name': f'File {i}', 'source_file': file.name.split('/')[-1].split('\\')[-1]})
                 all_data.append(file_df)
             if not all_data:
+                return ("Error: No valid text found in uploaded files", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), None)
             df = pd.concat(all_data, ignore_index=True)
             column_name = 'text'
         elif len(files) == 1 and file_paths[0].endswith('.csv'):
             df = pd.read_csv(file_paths[0])
             if column_name not in df.columns:
+                return (f"Error: Column '{column_name}' not found. Available columns: {', '.join(df.columns)}", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), None)
         else:
+            return ("Error: Either upload multiple TXT files OR a single CSV file (not both)", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), None)
+        # Analyze sentiment & voice
         texts = df[column_name].fillna("").astype(str).tolist()
         results = sentiment_pipeline(texts, truncation=True, max_length=512)
         df['sentiment_label'] = [r['label'] for r in results]
         df['sentiment_score'] = [r['score'] for r in results]
         df['is_passive'] = df[column_name].apply(is_passive)
         df['voice_label'] = df['is_passive'].apply(lambda x: 'PASSIVE' if x else 'ACTIVE')
         # Get all column names except sentiment/voice columns for filter options
         filter_columns = [col for col in df.columns if col not in ['sentiment_label', 'sentiment_score', 'is_passive', 'voice_label']]
         if 'file_name' in df.columns:
             file_summary = "\n\n📁 FILES UPLOADED:\n"
             for fname in df['file_name'].unique():
         else:
             summary = create_summary(df, "All Data")
+        # Return the DF as the new state value
         return (summary, df, None, None, None,
                 gr.update(choices=filter_columns, value='file_name' if 'file_name' in filter_columns else None),
                 gr.update(choices=[], value=None),
+                gr.update(choices=[], value=None),
+                df) # Return DF for the gr.State component
     except Exception as e:
         traceback.print_exc()
+        return f"Error: {str(e)}", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]), None
+# --- Summary Functions ---
 def create_summary(df, title):
     total_lines = len(df)
     positive_pct = (df['sentiment_label'].value_counts(normalize=True).get('POSITIVE', 0) * 100)
+    passive_pct = (df['is_passive'].mean() * 100)
     summary = (f"--- Summary for {title} ---\n"
                f"Total Lines Analyzed: {total_lines}\n"
                f"Positive Sentiment: {positive_pct:.1f}%\n"
     return summary
 def create_comparison_summary(df1, df2, label1, label2):
     """Build the text report comparing two groups.

     The report is a header naming both labels, followed by the
     ``create_summary`` output for each group, separated by blank lines.
     """
     header = f"📊 COMPARISON SUMMARY: {label1} vs {label2}\n\n"
     # The tuple is evaluated left to right, so group 1 is summarized first.
     sections = (create_summary(df1, label1), create_summary(df2, label2))
     return header + "\n\n".join(sections)
+def get_filter_values(filter_column, current_df_state):
+    """Get unique values for the selected filter column using the state DF"""
+    if current_df_state is None or filter_column is None:
         return gr.update(choices=[]), gr.update(choices=[])
+    unique_values = current_df_state[filter_column].dropna().unique().tolist()
     unique_values = [str(v) for v in unique_values][:100]
     return gr.update(choices=unique_values, value=None), gr.update(choices=unique_values, value=None)
+def compare_groups(filter_column, group1_value, group2_value, current_df_state):
+    """Compare two groups side by side using the state DF"""
+    if current_df_state is None:
         return "Please analyze sentiment first", None, None, None, None
     if not filter_column or not group1_value or not group2_value:
         return "Please select a filter column and both group values", None, None, None, None
+    df = current_df_state.copy()
     df1 = df[df[filter_column].astype(str) == group1_value]
     df2 = df[df[filter_column].astype(str) == group2_value]
     if len(df1) == 0 or len(df2) == 0:
         return "One or both groups have no data", None, None, None, None
     fig_pie = create_comparison_pie(df1, df2, group1_value, group2_value)
     fig_bar = create_comparison_bar(df1, df2, group1_value, group2_value)
     fig_voice_bar = create_comparison_voice_bar(df1, df2, group1_value, group2_value)
     summary = create_comparison_summary(df1, df2, group1_value, group2_value)
     df1_display = df1.copy()
     df1_display['comparison_group'] = group1_value
     df2_display = df2.copy()
 def create_comparison_pie(df1, df2, label1, label2):
+    # Side-by-side pie charts of sentiment label counts, one per group.
+    fig = make_subplots(rows=1, cols=2, specs=[[{'type':'pie'}, {'type':'pie'}]], subplot_titles=(f'{label1}', f'{label2}'))
     counts1 = df1['sentiment_label'].value_counts()
+    fig.add_trace(go.Pie(labels=counts1.index, values=counts1.values, name=label1, marker_colors=['#10b981' if x=='POSITIVE' else '#ef4444' for x in counts1.index], textinfo='percent+label+value'), row=1, col=1)
     counts2 = df2['sentiment_label'].value_counts()
+    fig.add_trace(go.Pie(labels=counts2.index, values=counts2.values, name=label2, marker_colors=['#10b981' if x=='POSITIVE' else '#ef4444' for x in counts2.index], textinfo='percent+label+value'), row=1, col=2)
     fig.update_layout(title_text='Sentiment Distribution Comparison', height=400)
     return fig
 def create_comparison_bar(df1, df2, label1, label2):
+    # Grouped bars of POSITIVE/NEGATIVE percentages for each group.
     counts1 = df1['sentiment_label'].value_counts(normalize=True) * 100
     counts2 = df2['sentiment_label'].value_counts(normalize=True) * 100
     sentiments = ['POSITIVE', 'NEGATIVE']
     fig = go.Figure()
+    fig.add_trace(go.Bar(name=label1, x=sentiments, y=[counts1.get(s, 0) for s in sentiments], marker_color='#3b82f6', text=[f"{counts1.get(s, 0):.1f}%" for s in sentiments], textposition='auto'))
+    fig.add_trace(go.Bar(name=label2, x=sentiments, y=[counts2.get(s, 0) for s in sentiments], marker_color='#ef4444', text=[f"{counts2.get(s, 0):.1f}%" for s in sentiments], textposition='auto'))
     fig.update_layout(title_text='Sentiment Percentage Comparison', barmode='group', height=400)
     return fig
 def create_comparison_voice_bar(df1, df2, label1, label2):
+    # Grouped bars of ACTIVE/PASSIVE percentages for each group.
     counts1 = df1['voice_label'].value_counts(normalize=True) * 100
     counts2 = df2['voice_label'].value_counts(normalize=True) * 100
     voices = ['ACTIVE', 'PASSIVE']
     fig = go.Figure()
+    fig.add_trace(go.Bar(name=label1, x=voices, y=[counts1.get(s, 0) for s in voices], marker_color='#10b981', text=[f"{counts1.get(s, 0):.1f}%" for s in voices], textposition='auto'))
+    fig.add_trace(go.Bar(name=label2, x=voices, y=[counts2.get(s, 0) for s in voices], marker_color='#fbbf24', text=[f"{counts2.get(s, 0):.1f}%" for s in voices], textposition='auto'))
     fig.update_layout(title_text='Active vs. Passive Voice Percentage Comparison', barmode='group', height=400)
     return fig
 with gr.Blocks(title="Sentiment & Voice Analyzer") as demo:
     gr.Markdown("# Advanced Text Analyzer: Sentiment, Active vs. Passive Voice")
+    # Define the state component to persist data across calls
+    analyzed_df_state = gr.State(value=None)
     with gr.Tab("Analyze Files"):
         with gr.Row():
         comparison_summary_output = gr.Textbox(label="Comparison Summary", lines=15)
         comparison_dataframe_output = gr.DataFrame(label="Comparison Data Results")
         comparison_pie_chart = gr.Plot(label="Sentiment Distribution Pie Chart")
         comparison_bar_chart = gr.Plot(label="Sentiment Percentage Bar Chart")
         comparison_voice_bar_chart = gr.Plot(label="Active/Passive Voice Bar Chart")
     # --- Event Handlers ---
     analyze_button.click(
         fn=analyze_sentiment_files,
         inputs=[file_input1, file_input2, file_input3, file_input4, file_input5, csv_column_name],
+        outputs=[
+            summary_output, dataframe_output, comparison_pie_chart, comparison_bar_chart,
+            comparison_voice_bar_chart, filter_col_dropdown, group1_dropdown, group2_dropdown,
+            analyzed_df_state # IMPORTANT: Update the State variable with the new DF
+        ]
     )
+    # Pass the state DF to the value-getting function
     filter_col_dropdown.change(
         fn=get_filter_values,
+        inputs=[filter_col_dropdown, analyzed_df_state],
         outputs=[group1_dropdown, group2_dropdown]
     )
+    # Pass the state DF to the comparison function
     compare_button.click(
         fn=compare_groups,
+        inputs=[filter_col_dropdown, group1_dropdown, group2_dropdown, analyzed_df_state],
         outputs=[comparison_summary_output, comparison_dataframe_output, comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart]
     )