kambris committed on
Commit
8043d18
·
verified ·
1 Parent(s): 8814bfe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -204
app.py CHANGED
@@ -4,6 +4,15 @@ import pandas as pd
4
  import plotly.express as px
5
  import plotly.graph_objects as go
6
  from plotly.subplots import make_subplots
 
 
 
 
 
 
 
 
 
7
 
8
  # Initialize the sentiment analysis pipeline
9
  sentiment_pipeline = pipeline(
@@ -14,8 +23,20 @@ sentiment_pipeline = pipeline(
14
  # Store the analyzed dataframe globally
15
  analyzed_df = None
16
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
18
- """Analyze sentiment for multiple TXT files or a single CSV file"""
19
  global analyzed_df
20
 
21
  try:
@@ -81,11 +102,15 @@ def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
81
 
82
  df['sentiment_label'] = [r['label'] for r in results]
83
  df['sentiment_score'] = [r['score'] for r in results]
84
-
 
 
 
 
85
  analyzed_df = df
86
 
87
- # Get all column names except sentiment columns for filter options
88
- filter_columns = [col for col in df.columns if col not in ['sentiment_label', 'sentiment_score']]
89
 
90
  # Create initial summary with file breakdown if multiple TXT files
91
  if 'file_name' in df.columns:
@@ -103,8 +128,34 @@ def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
103
  gr.update(choices=[], value=None))
104
 
105
  except Exception as e:
 
 
106
  return f"Error: {str(e)}", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[])
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  def get_filter_values(filter_column):
109
  """Get unique values for the selected filter column"""
110
  global analyzed_df
@@ -139,7 +190,8 @@ def compare_groups(filter_column, group1_value, group2_value):
139
  # Create comparison visualizations
140
  fig_pie = create_comparison_pie(df1, df2, group1_value, group2_value)
141
  fig_bar = create_comparison_bar(df1, df2, group1_value, group2_value)
142
- fig_hist = create_comparison_histogram(df1, df2, group1_value, group2_value)
 
143
 
144
  # Create comparison summary
145
  summary = create_comparison_summary(df1, df2, group1_value, group2_value)
@@ -151,7 +203,8 @@ def compare_groups(filter_column, group1_value, group2_value):
151
  df2_display['comparison_group'] = group2_value
152
  combined_df = pd.concat([df1_display, df2_display])
153
 
154
- return summary, combined_df, fig_pie, fig_bar, fig_hist
 
155
 
156
  def create_comparison_pie(df1, df2, label1, label2):
157
  """Create side-by-side pie charts"""
@@ -207,234 +260,104 @@ def create_comparison_bar(df1, df2, label1, label2):
207
  name=label2,
208
  x=sentiments,
209
  y=[counts2.get(s, 0) for s in sentiments],
210
- marker_color='#f59e0b',
211
  text=[f"{counts2.get(s, 0):.1f}%" for s in sentiments],
212
  textposition='auto'
213
  ))
214
 
215
- fig.update_layout(
216
- title='Sentiment Percentage Comparison',
217
- xaxis_title='Sentiment',
218
- yaxis_title='Percentage (%)',
219
- barmode='group',
220
- height=400
221
- )
222
 
223
  return fig
224
 
225
- def create_comparison_histogram(df1, df2, label1, label2):
226
- """Create overlaid histograms of confidence scores"""
 
 
 
 
 
 
227
  fig = go.Figure()
228
 
229
- fig.add_trace(go.Histogram(
230
- x=df1['sentiment_score'],
231
  name=label1,
232
- opacity=0.6,
233
- marker_color='#3b82f6',
234
- nbinsx=30
 
 
235
  ))
236
 
237
- fig.add_trace(go.Histogram(
238
- x=df2['sentiment_score'],
239
  name=label2,
240
- opacity=0.6,
241
- marker_color='#f59e0b',
242
- nbinsx=30
 
 
243
  ))
244
 
245
- fig.update_layout(
246
- title='Confidence Score Distribution Comparison',
247
- xaxis_title='Confidence Score',
248
- yaxis_title='Count',
249
- barmode='overlay',
250
- height=400
251
- )
252
 
253
  return fig
254
 
255
- def create_comparison_summary(df1, df2, label1, label2):
256
- """Create detailed comparison summary"""
257
- total1 = len(df1)
258
- total2 = len(df2)
259
-
260
- counts1 = df1['sentiment_label'].value_counts()
261
- counts2 = df2['sentiment_label'].value_counts()
262
-
263
- pos1 = counts1.get('POSITIVE', 0) / total1 * 100
264
- neg1 = counts1.get('NEGATIVE', 0) / total1 * 100
265
-
266
- pos2 = counts2.get('POSITIVE', 0) / total2 * 100
267
- neg2 = counts2.get('NEGATIVE', 0) / total2 * 100
268
-
269
- avg1 = df1['sentiment_score'].mean()
270
- avg2 = df2['sentiment_score'].mean()
271
-
272
- summary = f"""
273
- 📊 GROUP COMPARISON SUMMARY
274
-
275
- {'='*50}
276
- GROUP 1: {label1}
277
- {'='*50}
278
- Total Responses: {total1}
279
- Positive: {counts1.get('POSITIVE', 0)} ({pos1:.1f}%)
280
- Negative: {counts1.get('NEGATIVE', 0)} ({neg1:.1f}%)
281
- Average Confidence: {avg1:.3f}
282
 
283
- {'='*50}
284
- GROUP 2: {label2}
285
- {'='*50}
286
- Total Responses: {total2}
287
- Positive: {counts2.get('POSITIVE', 0)} ({pos2:.1f}%)
288
- Negative: {counts2.get('NEGATIVE', 0)} ({neg2:.1f}%)
289
- Average Confidence: {avg2:.3f}
290
 
291
- {'='*50}
292
- DIFFERENCE ANALYSIS
293
- {'='*50}
294
- Positive Sentiment Difference: {pos1 - pos2:+.1f} percentage points
295
- ({label1} {'more' if pos1 > pos2 else 'less'} positive than {label2})
296
 
297
- Confidence Score Difference: {avg1 - avg2:+.3f}
298
- ({label1} {'higher' if avg1 > avg2 else 'lower'} confidence than {label2})
299
- """
300
-
301
- return summary
302
-
303
- def create_summary(df, title):
304
- """Create text summary of results"""
305
- total = len(df)
306
- sentiment_counts = df['sentiment_label'].value_counts()
307
- avg_score = df['sentiment_score'].mean()
308
-
309
- summary = f"""
310
- 📊 {title} (Total: {total} rows)
311
 
312
- Sentiment Breakdown:
313
- {sentiment_counts.to_string()}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
- Average Confidence Score: {avg_score:.3f}
316
 
317
- Sentiment Percentages:
318
- {(sentiment_counts / total * 100).round(2).to_string()}%
319
- """
320
-
321
- return summary
322
 
323
- # Create Gradio interface
324
- with gr.Blocks(title="Sentiment Comparison Tool", theme=gr.themes.Soft()) as demo:
325
- gr.Markdown("# 📊 Sentiment Analysis: Multi-File Comparison")
326
- gr.Markdown("Upload 2-5 TXT files to compare OR upload a single CSV file")
327
-
328
- with gr.Row():
329
- with gr.Column(scale=1):
330
- gr.Markdown("### Step 1: Upload & Analyze")
331
- gr.Markdown("**Upload Multiple TXT Files (2-5) OR Single CSV:**")
332
-
333
- file1 = gr.File(label="File 1 (Required)", file_types=[".csv", ".txt"])
334
- file2 = gr.File(label="File 2 (Optional)", file_types=[".txt"])
335
- file3 = gr.File(label="File 3 (Optional)", file_types=[".txt"])
336
- file4 = gr.File(label="File 4 (Optional)", file_types=[".txt"])
337
- file5 = gr.File(label="File 5 (Optional)", file_types=[".txt"])
338
-
339
- column_input = gr.Textbox(
340
- label="Column to Analyze (CSV only)",
341
- placeholder="e.g., 'review_text'",
342
- value="text"
343
- )
344
- analyze_btn = gr.Button("🔍 Analyze Sentiment", variant="primary", size="lg")
345
-
346
- gr.Markdown("### Step 2: Compare Groups")
347
- filter_column = gr.Dropdown(
348
- label="Compare by Column",
349
- choices=[],
350
- interactive=True,
351
- info="Select 'file_name' to compare TXT files"
352
- )
353
-
354
- with gr.Row():
355
- group1_value = gr.Dropdown(
356
- label="Group 1",
357
- choices=[],
358
- interactive=True
359
- )
360
- group2_value = gr.Dropdown(
361
- label="Group 2",
362
- choices=[],
363
- interactive=True
364
- )
365
-
366
- compare_btn = gr.Button("⚖️ Compare Groups", variant="secondary", size="lg")
367
-
368
- with gr.Column(scale=2):
369
- summary_output = gr.Textbox(label="Comparison Summary", lines=20)
370
-
371
- with gr.Row():
372
- plot_pie = gr.Plot(label="Side-by-Side Distribution")
373
-
374
- with gr.Row():
375
- with gr.Column():
376
- plot_bar = gr.Plot(label="Percentage Comparison")
377
- with gr.Column():
378
- plot_hist = gr.Plot(label="Confidence Score Distribution")
379
-
380
- with gr.Row():
381
- output_df = gr.Dataframe(label="All Data", max_height=400)
382
-
383
- # Connect events
384
- analyze_btn.click(
385
  fn=analyze_sentiment_files,
386
- inputs=[file1, file2, file3, file4, file5, column_input],
387
- outputs=[summary_output, output_df, plot_pie, plot_bar, plot_hist,
388
- filter_column, group1_value, group2_value]
389
  )
390
-
391
- filter_column.change(
392
  fn=get_filter_values,
393
- inputs=[filter_column],
394
- outputs=[group1_value, group2_value]
395
  )
396
-
397
- compare_btn.click(
398
  fn=compare_groups,
399
- inputs=[filter_column, group1_value, group2_value],
400
- outputs=[summary_output, output_df, plot_pie, plot_bar, plot_hist]
401
  )
402
-
403
- gr.Markdown("""
404
- ### 💡 How to use:
405
-
406
- **Option A: Multiple TXT Files (2-5 files)**
407
- 1. Upload 2-5 TXT files (one per upload slot)
408
- 2. Click "Analyze Sentiment" to process all files
409
- 3. Select "file_name" as the comparison column
410
- 4. Choose two files to compare (e.g., "File 1" vs "File 2")
411
- 5. Click "Compare Groups" to see side-by-side comparison
412
-
413
- **Option B: Single CSV File**
414
- 1. Upload one CSV file with text column and grouping columns
415
- 2. Specify which column contains the text to analyze
416
- 3. Click "Analyze Sentiment"
417
- 4. Select any column to compare groups (e.g., language, category)
418
- 5. Choose two values to compare
419
-
420
- ### 📂 File Format Details:
421
- - **TXT files**: Each line is analyzed separately; files are labeled as "File 1", "File 2", etc.
422
- - **CSV files**: Specify text column; can compare based on any categorical column
423
-
424
- ### 📈 Comparison Features:
425
- - Side-by-side pie charts showing sentiment distribution
426
- - Grouped bar chart comparing positive/negative percentages
427
- - Overlaid histogram comparing confidence score distributions
428
- - Detailed statistical summary with difference analysis
429
- - Full data table with all analyzed text and sentiment scores
430
-
431
- ### 🎯 Example Use Cases:
432
- - Compare sentiment across different text documents
433
- - Analyze reviews from different sources
434
- - Compare sentiment: Arab responses vs Chinese responses
435
- - Analyze: Product A reviews vs Product B reviews
436
- - Compare: Pre-intervention vs Post-intervention feedback
437
- """)
438
 
439
  if __name__ == "__main__":
440
- demo.launch(share=True)
 
4
  import plotly.express as px
5
  import plotly.graph_objects as go
6
  from plotly.subplots import make_subplots
7
+ import spacy
8
+
9
+ # Load the English spaCy model (lightweight, 'sm' for small)
10
+ try:
11
+ nlp = spacy.load("en_core_web_sm")
12
+ except OSError:
13
+ print("Downloading spaCy model 'en_core_web_sm'. Please run 'python -m spacy download en_core_web_sm' if this fails repeatedly.")
14
+ spacy.cli.download("en_core_web_sm")
15
+ nlp = spacy.load("en_core_web_sm")
16
 
17
  # Initialize the sentiment analysis pipeline
18
  sentiment_pipeline = pipeline(
 
23
  # Store the analyzed dataframe globally
24
  analyzed_df = None
25
 
26
+ # --- Function: Detect Passive Voice using spaCy ---
27
+ def is_passive(text):
28
+ """Checks if a sentence is passive using spaCy's dependency parser."""
29
+ doc = nlp(text)
30
+ # A simple heuristic check for passive voice structure
31
+ # Look for a form of 'be' (auxpass) followed by a past participle (VERB/VBN)
32
+ for token in doc:
33
+ if token.dep_ == 'auxpass' and token.head.pos_ == 'VERB' and token.head.tag_ == 'VBN':
34
+ return True
35
+ return False
36
+
37
+
38
  def analyze_sentiment_files(file1, file2, file3, file4, file5, column_name):
39
+ """Analyze sentiment and active/passive voice for multiple TXT files or a single CSV file"""
40
  global analyzed_df
41
 
42
  try:
 
102
 
103
  df['sentiment_label'] = [r['label'] for r in results]
104
  df['sentiment_score'] = [r['score'] for r in results]
105
+
106
+ # --- New Analysis: Active/Passive Voice ---
107
+ df['is_passive'] = df[column_name].apply(is_passive)
108
+ df['voice_label'] = df['is_passive'].apply(lambda x: 'PASSIVE' if x else 'ACTIVE')
109
+
110
  analyzed_df = df
111
 
112
+ # Get all column names except sentiment/voice columns for filter options
113
+ filter_columns = [col for col in df.columns if col not in ['sentiment_label', 'sentiment_score', 'is_passive', 'voice_label']]
114
 
115
  # Create initial summary with file breakdown if multiple TXT files
116
  if 'file_name' in df.columns:
 
128
  gr.update(choices=[], value=None))
129
 
130
  except Exception as e:
131
+ import traceback
132
+ traceback.print_exc()
133
  return f"Error: {str(e)}", None, None, None, None, gr.update(choices=[]), gr.update(choices=[]), gr.update(choices=[])
134
 
135
+ # --- Summary Functions (Updated to include passive voice) ---
136
+
137
+ def create_summary(df, title):
138
+ """Generates a summary string including sentiment and voice stats."""
139
+ total_lines = len(df)
140
+ positive_pct = (df['sentiment_label'].value_counts(normalize=True).get('POSITIVE', 0) * 100)
141
+ passive_pct = (df['is_passive'].mean() * 100) # Mean of True/False gives proportion of True
142
+
143
+ summary = (f"--- Summary for {title} ---\n"
144
+ f"Total Lines Analyzed: {total_lines}\n"
145
+ f"Positive Sentiment: {positive_pct:.1f}%\n"
146
+ f"Negative Sentiment: {(100 - positive_pct):.1f}%\n"
147
+ f"**Passive Voice Sentences: {passive_pct:.1f}%**\n"
148
+ f"**Active Voice Sentences: {(100 - passive_pct):.1f}%**\n"
149
+ f"---------------------------------")
150
+ return summary
151
+
152
+ def create_comparison_summary(df1, df2, label1, label2):
153
+ """Generates a comparison summary string."""
154
+ summary = f"📊 COMPARISON SUMMARY: {label1} vs {label2}\n\n"
155
+ summary += create_summary(df1, label1) + "\n\n"
156
+ summary += create_summary(df2, label2)
157
+ return summary
158
+
159
  def get_filter_values(filter_column):
160
  """Get unique values for the selected filter column"""
161
  global analyzed_df
 
190
  # Create comparison visualizations
191
  fig_pie = create_comparison_pie(df1, df2, group1_value, group2_value)
192
  fig_bar = create_comparison_bar(df1, df2, group1_value, group2_value)
193
+ # Using the new voice bar chart instead of a generic histogram
194
+ fig_voice_bar = create_comparison_voice_bar(df1, df2, group1_value, group2_value)
195
 
196
  # Create comparison summary
197
  summary = create_comparison_summary(df1, df2, group1_value, group2_value)
 
203
  df2_display['comparison_group'] = group2_value
204
  combined_df = pd.concat([df1_display, df2_display])
205
 
206
+ return summary, combined_df, fig_pie, fig_bar, fig_voice_bar
207
+
208
 
209
  def create_comparison_pie(df1, df2, label1, label2):
210
  """Create side-by-side pie charts"""
 
260
  name=label2,
261
  x=sentiments,
262
  y=[counts2.get(s, 0) for s in sentiments],
263
+ marker_color='#ef4444',
264
  text=[f"{counts2.get(s, 0):.1f}%" for s in sentiments],
265
  textposition='auto'
266
  ))
267
 
268
+ fig.update_layout(title_text='Sentiment Percentage Comparison', barmode='group', height=400)
 
 
 
 
 
 
269
 
270
  return fig
271
 
272
+ # --- New Function: Create Voice Comparison Bar Chart ---
273
+ def create_comparison_voice_bar(df1, df2, label1, label2):
274
+ """Create grouped bar chart comparing active vs passive voice percentages"""
275
+ counts1 = df1['voice_label'].value_counts(normalize=True) * 100
276
+ counts2 = df2['voice_label'].value_counts(normalize=True) * 100
277
+
278
+ voices = ['ACTIVE', 'PASSIVE']
279
+
280
  fig = go.Figure()
281
 
282
+ fig.add_trace(go.Bar(
 
283
  name=label1,
284
+ x=voices,
285
+ y=[counts1.get(s, 0) for s in voices],
286
+ marker_color='#10b981',
287
+ text=[f"{counts1.get(s, 0):.1f}%" for s in voices],
288
+ textposition='auto'
289
  ))
290
 
291
+ fig.add_trace(go.Bar(
 
292
  name=label2,
293
+ x=voices,
294
+ y=[counts2.get(s, 0) for s in voices],
295
+ marker_color='#fbbf24',
296
+ text=[f"{counts2.get(s, 0):.1f}%" for s in voices],
297
+ textposition='auto'
298
  ))
299
 
300
+ fig.update_layout(title_text='Active vs. Passive Voice Percentage Comparison', barmode='group', height=400)
 
 
 
 
 
 
301
 
302
  return fig
303
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
+ # --- Gradio UI Setup ---
 
 
 
 
 
 
306
 
307
+ with gr.Blocks(title="Sentiment & Voice Analyzer") as demo:
308
+ gr.Markdown("# Advanced Text Analyzer: Sentiment, Active vs. Passive Voice")
 
 
 
309
 
310
+ with gr.Tab("Analyze Files"):
311
+ with gr.Row():
312
+ file_input1 = gr.File(label="Upload TXT/CSV File 1")
313
+ file_input2 = gr.File(label="Upload TXT File 2 (Optional)")
314
+ file_input3 = gr.File(label="Upload TXT File 3 (Optional)")
315
+ file_input4 = gr.File(label="Upload TXT File 4 (Optional)")
316
+ file_input5 = gr.File(label="Upload TXT File 5 (Optional)")
317
+
318
+ csv_column_name = gr.Textbox(label="If CSV, specify text column name", value="text")
319
+ analyze_button = gr.Button("Analyze Texts", variant="primary")
320
+
321
+ summary_output = gr.Textbox(label="Analysis Summary", lines=10)
322
+ dataframe_output = gr.DataFrame(label="Detailed Analysis Results")
 
323
 
324
+ with gr.Tab("Compare Groups"):
325
+ gr.Markdown("Select a column to filter by (e.g., 'file_name' for TXT uploads) and compare two values.")
326
+ with gr.Row():
327
+ filter_col_dropdown = gr.Dropdown(label="Select Filter Column", choices=[])
328
+ group1_dropdown = gr.Dropdown(label="Group 1 Value", choices=[])
329
+ group2_dropdown = gr.Dropdown(label="Group 2 Value", choices=[])
330
+
331
+ compare_button = gr.Button("Compare Groups", variant="primary")
332
+
333
+ comparison_summary_output = gr.Textbox(label="Comparison Summary", lines=15)
334
+ comparison_dataframe_output = gr.DataFrame(label="Comparison Data Results")
335
+
336
+ # Updated output slots for the new voice bar chart
337
+ comparison_pie_chart = gr.Plot(label="Sentiment Distribution Pie Chart")
338
+ comparison_bar_chart = gr.Plot(label="Sentiment Percentage Bar Chart")
339
+ comparison_voice_bar_chart = gr.Plot(label="Active/Passive Voice Bar Chart")
340
 
 
341
 
342
+ # --- Event Handlers ---
 
 
 
 
343
 
344
+ analyze_button.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
  fn=analyze_sentiment_files,
346
+ inputs=[file_input1, file_input2, file_input3, file_input4, file_input5, csv_column_name],
347
+ outputs=[summary_output, dataframe_output, comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart, filter_col_dropdown, group1_dropdown, group2_dropdown]
 
348
  )
349
+
350
+ filter_col_dropdown.change(
351
  fn=get_filter_values,
352
+ inputs=[filter_col_dropdown],
353
+ outputs=[group1_dropdown, group2_dropdown]
354
  )
355
+
356
+ compare_button.click(
357
  fn=compare_groups,
358
+ inputs=[filter_col_dropdown, group1_dropdown, group2_dropdown],
359
+ outputs=[comparison_summary_output, comparison_dataframe_output, comparison_pie_chart, comparison_bar_chart, comparison_voice_bar_chart]
360
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
361
 
362
  if __name__ == "__main__":
363
+ demo.launch()