# Spaces: Sleeping  (status text captured from the hosting page; not part of the program)
| import gradio as gr | |
| from transformers import pipeline | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| import spacy | |
| import traceback # Added for better error tracing | |
# Load the English spaCy model (lightweight, 'sm' for small)
def _load_spacy_model():
    """Return the loaded ``en_core_web_sm`` pipeline, downloading it once if loading fails."""
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        # spacy.load raised OSError: tell the operator, fetch the package, retry once.
        print("Downloading spaCy model 'en_core_web_sm'. Please run 'python -m spacy download en_core_web_sm' if this fails repeatedly.")
        spacy.cli.download("en_core_web_sm")
        return spacy.load("en_core_web_sm")


nlp = _load_spacy_model()

# Initialize the sentiment analysis pipeline
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
# --- Function: Detect Passive Voice using spaCy ---
def is_passive(text):
    """Report whether spaCy's dependency parse of *text* shows a passive construction.

    The text is parsed with the module-level ``nlp`` pipeline. It is treated
    as passive as soon as one token carries the ``auxpass`` dependency label
    and attaches to a head whose coarse part of speech is ``VERB`` and whose
    fine-grained tag is ``VBN``.

    Returns:
        True if at least one such token exists, otherwise False.
    """
    parsed = nlp(text)
    return any(
        tok.dep_ == 'auxpass'
        and tok.head.pos_ == 'VERB'
        and tok.head.tag_ == 'VBN'
        for tok in parsed
    )
def _analysis_error(message):
    """Build the nine-slot handler result for a failed analysis: message first, everything else cleared."""
    return (
        message, None, None, None, None,
        gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[]),
        None,
    )


def _upload_path(upload):
    """Return an upload's filesystem path, accepting objects with ``.name`` or plain path strings."""
    return str(getattr(upload, "name", upload))


def _read_text_lines(path):
    """Return the stripped, non-blank lines of a text file (UTF-8 first, Latin-1 as fallback)."""
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            raw_lines = handle.readlines()
    except UnicodeDecodeError:
        # Only a decoding failure justifies the retry; other errors
        # (missing file, permissions) propagate to the caller.
        with open(path, 'r', encoding='latin-1') as handle:
            raw_lines = handle.readlines()
    return [line.strip() for line in raw_lines if line.strip()]


def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
    """Analyze sentiment and active/passive voice for multiple TXT files or a single CSV file.

    Accepted inputs are either one or more ``.txt`` uploads (each non-blank
    line becomes one row) or exactly one ``.csv`` upload whose text lives in
    ``column_name``. Extension checks are case-insensitive.

    Args:
        file1, file2, file3, file4, file5: Uploaded files or None. Each may be
            an object exposing its path as ``.name`` or a plain path string.
        column_name: CSV column holding the text. Ignored for TXT uploads,
            where the column is always ``'text'``.

    Returns:
        A 9-tuple matching the ``outputs`` list wired to the analyze button:
        (summary text, result DataFrame, three cleared plot slots, update for
        the filter-column dropdown, two updates clearing the group dropdowns,
        DataFrame to store in state). On any failure the first element is an
        error message and the data slots are None.
    """
    try:
        uploads = [f for f in (file1, file2, file3, file4, file5) if f is not None]
        if not uploads:
            return _analysis_error("Please upload at least one file")

        paths = [_upload_path(upload) for upload in uploads]
        lowered = [path.lower() for path in paths]

        if all(path.endswith('.txt') for path in lowered):
            frames = []
            for index, path in enumerate(paths, 1):
                texts = _read_text_lines(path)
                if not texts:
                    continue
                frames.append(pd.DataFrame({
                    'text': texts,
                    'line_number': range(1, len(texts) + 1),
                    'file_name': f'File {index}',
                    # Keep only the base name, whichever path separator is used.
                    'source_file': path.split('/')[-1].split('\\')[-1],
                }))
            if not frames:
                return _analysis_error("Error: No valid text found in uploaded files")
            df = pd.concat(frames, ignore_index=True)
            column_name = 'text'
        elif len(paths) == 1 and lowered[0].endswith('.csv'):
            df = pd.read_csv(paths[0])
            if column_name not in df.columns:
                # Column labels are not guaranteed to be strings; stringify before joining.
                available = ', '.join(map(str, df.columns))
                return _analysis_error(f"Error: Column '{column_name}' not found. Available columns: {available}")
        else:
            return _analysis_error("Error: Either upload multiple TXT files OR a single CSV file (not both)")

        # Analyze sentiment & voice on the same sanitized strings, so missing or
        # non-string cells never reach the spaCy parser.
        texts = df[column_name].fillna("").astype(str).tolist()
        if not texts:
            return _analysis_error(f"Error: No rows found in column '{column_name}'")

        results = sentiment_pipeline(texts, truncation=True, max_length=512)
        df['sentiment_label'] = [r['label'] for r in results]
        df['sentiment_score'] = [r['score'] for r in results]
        df['is_passive'] = [is_passive(text) for text in texts]
        df['voice_label'] = ['PASSIVE' if flag else 'ACTIVE' for flag in df['is_passive']]

        # Every column except the computed sentiment/voice ones is offered as a filter.
        computed = ('sentiment_label', 'sentiment_score', 'is_passive', 'voice_label')
        filter_columns = [col for col in df.columns if col not in computed]

        summary = create_summary(df, "All Data")
        if 'file_name' in df.columns:
            per_file = "".join(
                f" - {fname}: {(df['file_name'] == fname).sum()} lines\n"
                for fname in df['file_name'].unique()
            )
            summary += "\n\n📁 FILES UPLOADED:\n" + per_file

        # The DataFrame is returned twice: once for display, once as the new state value.
        return (
            summary, df, None, None, None,
            gr.update(choices=filter_columns, value='file_name' if 'file_name' in filter_columns else None),
            gr.update(choices=[], value=None),
            gr.update(choices=[], value=None),
            df,
        )
    except Exception as e:
        # Top-level UI boundary: log the traceback and surface the message to the user.
        traceback.print_exc()
        return _analysis_error(f"Error: {str(e)}")
# --- Summary Functions ---
def create_summary(df, title):
    """Format a plain-text sentiment and voice summary for an analyzed DataFrame.

    Args:
        df: DataFrame with a ``sentiment_label`` column (rows equal to
            ``'POSITIVE'`` count as positive) and an ``is_passive`` column
            whose mean is the passive share.
        title: Text placed in the header line of the summary.

    Returns:
        A multi-line string with the row count and the positive/negative and
        passive/active percentages. An empty DataFrame reports 0 lines and
        0.0% for every percentage instead of "nan%".
    """
    total_lines = len(df)
    if total_lines == 0:
        # No rows: the ratios are undefined, so report zeros rather than NaN
        # or a misleading "100% negative".
        positive_pct = negative_pct = passive_pct = active_pct = 0.0
    else:
        positive_pct = df['sentiment_label'].value_counts(normalize=True).get('POSITIVE', 0) * 100
        # Everything that is not POSITIVE is reported as negative.
        negative_pct = 100 - positive_pct
        passive_pct = df['is_passive'].mean() * 100
        active_pct = 100 - passive_pct

    summary = (f"--- Summary for {title} ---\n"
               f"Total Lines Analyzed: {total_lines}\n"
               f"Positive Sentiment: {positive_pct:.1f}%\n"
               f"Negative Sentiment: {negative_pct:.1f}%\n"
               f"**Passive Voice Sentences: {passive_pct:.1f}%**\n"
               f"**Active Voice Sentences: {active_pct:.1f}%**\n"
               f"---------------------------------")
    return summary
def create_comparison_summary(df1, df2, label1, label2):
    """Return a comparison header followed by the individual summaries of both groups.

    Each group's section is produced by ``create_summary``; the two sections
    are separated by a blank line.
    """
    pieces = [
        f"📊 COMPARISON SUMMARY: {label1} vs {label2}\n\n",
        create_summary(df1, label1),
        "\n\n",
        create_summary(df2, label2),
    ]
    return "".join(pieces)
def get_filter_values(filter_column, current_df_state):
    """Get unique values for the selected filter column using the state DF.

    Args:
        filter_column: Column name chosen in the filter dropdown (may be
            None or empty when nothing is selected).
        current_df_state: The analyzed DataFrame held in state, or None if
            no analysis has run yet.

    Returns:
        Two dropdown updates (group 1, group 2) offering the same choices:
        up to 100 distinct non-null values of the column, as strings, with
        the selection cleared. Both updates carry empty choices when there is
        no data, no selection, or the column is not present in the DataFrame.
    """
    if (
        current_df_state is None
        or not filter_column
        # A name that is not a column of the stored frame would raise KeyError below.
        or filter_column not in current_df_state.columns
    ):
        return gr.update(choices=[]), gr.update(choices=[])

    distinct = current_df_state[filter_column].dropna().unique().tolist()
    # Cap the list so the dropdowns stay usable on high-cardinality columns.
    choices = [str(value) for value in distinct[:100]]
    return gr.update(choices=choices, value=None), gr.update(choices=choices, value=None)
def compare_groups(filter_column, group1_value, group2_value, current_df_state):
    """Compare two groups side by side using the state DF.

    Rows are assigned to a group when the string form of their
    ``filter_column`` value equals the selected group value.

    Returns:
        A 5-tuple ``(summary, combined_df, pie_fig, bar_fig, voice_bar_fig)``.
        When a precondition fails, the first element is a message and the
        other four are None.
    """
    blanks = (None, None, None, None)

    if current_df_state is None:
        return ("Please analyze sentiment first", *blanks)
    if not (filter_column and group1_value and group2_value):
        return ("Please select a filter column and both group values", *blanks)

    frame = current_df_state.copy()
    # Dropdown values are strings, so compare against the stringified column.
    keys = frame[filter_column].astype(str)
    first_group = frame[keys == group1_value]
    second_group = frame[keys == group2_value]
    if first_group.empty or second_group.empty:
        return ("One or both groups have no data", *blanks)

    fig_pie = create_comparison_pie(first_group, second_group, group1_value, group2_value)
    fig_bar = create_comparison_bar(first_group, second_group, group1_value, group2_value)
    fig_voice_bar = create_comparison_voice_bar(first_group, second_group, group1_value, group2_value)
    summary = create_comparison_summary(first_group, second_group, group1_value, group2_value)

    # Tag each row with the group it belongs to before stacking the two frames.
    combined_df = pd.concat([
        first_group.assign(comparison_group=group1_value),
        second_group.assign(comparison_group=group2_value),
    ])
    return summary, combined_df, fig_pie, fig_bar, fig_voice_bar
def create_comparison_pie(df1, df2, label1, label2):
    """Build a one-row, two-column figure with a sentiment pie chart per group.

    Each pie shows the counts of ``sentiment_label`` values for one group.
    ``POSITIVE`` slices are green; every other label is red.
    """
    fig = make_subplots(
        rows=1,
        cols=2,
        specs=[[{'type': 'pie'}, {'type': 'pie'}]],
        subplot_titles=(f'{label1}', f'{label2}'),
    )
    for position, (frame, label) in enumerate(((df1, label1), (df2, label2)), start=1):
        counts = frame['sentiment_label'].value_counts()
        slice_colors = ['#10b981' if name == 'POSITIVE' else '#ef4444' for name in counts.index]
        pie = go.Pie(
            labels=counts.index,
            values=counts.values,
            name=label,
            marker_colors=slice_colors,
            textinfo='percent+label+value',
        )
        fig.add_trace(pie, row=1, col=position)
    fig.update_layout(title_text='Sentiment Distribution Comparison', height=400)
    return fig
def create_comparison_bar(df1, df2, label1, label2):
    """Build a grouped bar chart of POSITIVE/NEGATIVE percentages for two groups.

    Percentages are the normalized ``sentiment_label`` counts of each group;
    a label absent from a group is plotted as 0.
    """
    sentiments = ['POSITIVE', 'NEGATIVE']
    fig = go.Figure()
    for frame, label, color in ((df1, label1, '#3b82f6'), (df2, label2, '#ef4444')):
        shares = frame['sentiment_label'].value_counts(normalize=True) * 100
        heights = [shares.get(s, 0) for s in sentiments]
        fig.add_trace(go.Bar(
            name=label,
            x=sentiments,
            y=heights,
            marker_color=color,
            text=[f"{height:.1f}%" for height in heights],
            textposition='auto',
        ))
    fig.update_layout(title_text='Sentiment Percentage Comparison', barmode='group', height=400)
    return fig
def create_comparison_voice_bar(df1, df2, label1, label2):
    """Build a grouped bar chart of ACTIVE/PASSIVE percentages for two groups.

    Percentages are the normalized ``voice_label`` counts of each group;
    a label absent from a group is plotted as 0.
    """
    voices = ['ACTIVE', 'PASSIVE']
    fig = go.Figure()
    for frame, label, color in ((df1, label1, '#10b981'), (df2, label2, '#fbbf24')):
        shares = frame['voice_label'].value_counts(normalize=True) * 100
        heights = [shares.get(s, 0) for s in voices]
        fig.add_trace(go.Bar(
            name=label,
            x=voices,
            y=heights,
            marker_color=color,
            text=[f"{height:.1f}%" for height in heights],
            textposition='auto',
        ))
    fig.update_layout(title_text='Active vs. Passive Voice Percentage Comparison', barmode='group', height=400)
    return fig
# --- Gradio UI Setup ---
# NOTE(review): the captured source had lost its indentation; the nesting below
# (which components sit inside each Tab/Row) is reconstructed from statement
# order. Confirm against the deployed layout.
with gr.Blocks(title="Sentiment & Voice Analyzer") as demo:
    gr.Markdown("# Advanced Text Analyzer: Sentiment, Active vs. Passive Voice")

    # State component that carries the analyzed DataFrame between handler calls.
    analyzed_df_state = gr.State(value=None)

    with gr.Tab("Analyze Files"):
        with gr.Row():
            file_input1 = gr.File(label="Upload TXT/CSV File 1")
            # Slots 2-5 share one label pattern and are created in order.
            file_input2, file_input3, file_input4, file_input5 = (
                gr.File(label=f"Upload TXT File {slot} (Optional)") for slot in range(2, 6)
            )
        csv_column_name = gr.Textbox(label="If CSV, specify text column name", value="text")
        analyze_button = gr.Button("Analyze Texts", variant="primary")
        summary_output = gr.Textbox(label="Analysis Summary", lines=10)
        dataframe_output = gr.DataFrame(label="Detailed Analysis Results")

    with gr.Tab("Compare Groups"):
        gr.Markdown("Select a column to filter by (e.g., 'file_name' for TXT uploads) and compare two values.")
        with gr.Row():
            filter_col_dropdown = gr.Dropdown(label="Select Filter Column", choices=[])
            group1_dropdown = gr.Dropdown(label="Group 1 Value", choices=[])
            group2_dropdown = gr.Dropdown(label="Group 2 Value", choices=[])
        compare_button = gr.Button("Compare Groups", variant="primary")
        comparison_summary_output = gr.Textbox(label="Comparison Summary", lines=15)
        comparison_dataframe_output = gr.DataFrame(label="Comparison Data Results")
        comparison_pie_chart = gr.Plot(label="Sentiment Distribution Pie Chart")
        comparison_bar_chart = gr.Plot(label="Sentiment Percentage Bar Chart")
        comparison_voice_bar_chart = gr.Plot(label="Active/Passive Voice Bar Chart")

    # --- Event Handlers ---
    file_inputs = [file_input1, file_input2, file_input3, file_input4, file_input5]
    group_dropdowns = [group1_dropdown, group2_dropdown]
    comparison_charts = [comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart]

    # Analysis fills the summary and table, clears the comparison charts,
    # resets the three dropdowns, and stores the new DataFrame in state.
    analyze_button.click(
        fn=analyze_sentiment_files,
        inputs=file_inputs + [csv_column_name],
        outputs=(
            [summary_output, dataframe_output]
            + comparison_charts
            + [filter_col_dropdown]
            + group_dropdowns
            + [analyzed_df_state]
        ),
    )

    # Choosing a filter column repopulates both group dropdowns from the state DF.
    filter_col_dropdown.change(
        fn=get_filter_values,
        inputs=[filter_col_dropdown, analyzed_df_state],
        outputs=group_dropdowns,
    )

    # Comparison reads the state DF and fills the summary, table, and charts.
    compare_button.click(
        fn=compare_groups,
        inputs=[filter_col_dropdown] + group_dropdowns + [analyzed_df_state],
        outputs=[comparison_summary_output, comparison_dataframe_output] + comparison_charts,
    )

if __name__ == "__main__":
    demo.launch()