popaaln committed on
Commit
5ca0cd3
·
verified ·
1 Parent(s): bc31020

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +499 -0
app.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from itertools import combinations
6
+ import re
7
+ from base import BaseMetric
8
+ from relaxed_entity_extraction import RelaxedThresholdStringEntityMetric
9
+
10
def parse_labels(label_str):
    """Turn one raw ``label`` cell into a list of label strings.

    Args:
        label_str: The cell value. May be a string, a missing value
            (``None``/NaN), a non-string scalar, or an already-parsed
            list/tuple of labels.

    Returns:
        A list of strings:

        * ``[]`` for missing values and blank strings;
        * the quoted items for a list-like string such as
          ``"['a', \"b\"]"`` (single- or double-quoted items);
        * a one-element list holding the trimmed text otherwise.
    """
    # Already a sequence: pd.isna() would return an array here and make the
    # truth test below ambiguous, so handle it before the missing-value check.
    if isinstance(label_str, (list, tuple)):
        return [str(item) for item in label_str]

    if pd.isna(label_str):
        return []

    # Non-string scalars (e.g. a numeric label) have no .startswith(), so
    # normalise to text first. Trimming lets padded list strings be detected.
    text = str(label_str).strip()
    if not text:
        return []

    if text.startswith('[') and text.endswith(']'):
        # Each match is a (single_quoted, double_quoted) pair; exactly one
        # group is populated, or both are empty for an empty quoted item.
        matches = re.findall(r"'([^']*)'|\"([^\"]*)\"", text)
        return [single or double for single, double in matches]

    return [text]
17
+
18
_UNOBSERVABLE_MARKER = 'UNOBSERVABLE'
_OMNISCAN_SOURCE = 'omniscan'
_GERMAN_INGREDIENTS_TASK = 'ingredients-german'
_GERMAN_ALLERGENS_TASK = 'iallergens-german'


def _is_unobservable(label):
    """Return True when *label* contains the UNOBSERVABLE marker (case-insensitive)."""
    return _UNOBSERVABLE_MARKER in str(label).upper()


def _partition_labels(raw_labels):
    """Parse raw label cells and split the results into (observable, unobservable)."""
    observable, unobservable = [], []
    for raw in raw_labels:
        for label in parse_labels(raw):
            if _is_unobservable(label):
                unobservable.append(label)
            else:
                observable.append(label)
    return observable, unobservable


def _assess_task(task_data, sources, capture_limit):
    """Decide whether one ASIN/task is covered; return (covered, unobservable_labels)."""
    unobservable = []

    # Omniscan is checked first and is limited to the earliest N captures.
    if _OMNISCAN_SOURCE in sources:
        captures = task_data[task_data['source_type'] == _OMNISCAN_SOURCE]
        if not captures.empty:
            if 'timestamp' in captures.columns:
                captures = captures.sort_values('timestamp')
            seen, hidden = _partition_labels(captures['label'].head(capture_limit))
            if seen:
                return True, []
            unobservable.extend(hidden)

    # Remaining sources are tried in the caller's order; the first one that
    # yields an observable label settles the task.
    for source in sources:
        if source == _OMNISCAN_SOURCE:
            continue
        rows = task_data[task_data['source_type'] == source]
        if rows.empty:
            continue
        seen, hidden = _partition_labels(rows['label'])
        if seen:
            return True, []
        unobservable.extend(hidden)

    return False, unobservable


def _german_allergen_exemption_applies(asin_data, task_coverage):
    """Return True when the uncovered German allergen task should count as covered.

    Applies only when 'ingredients-german' is covered, 'iallergens-german'
    is not, and every allergen label for the ASIN is exactly "UNOBSERVABLE".
    """
    if not task_coverage.get(_GERMAN_INGREDIENTS_TASK, False):
        return False
    if task_coverage.get(_GERMAN_ALLERGENS_TASK, True):
        return False
    # NOTE(review): this inspects allergen rows from every source, not only
    # the sources/captures selected for the coverage check - confirm intent.
    allergen_rows = asin_data[asin_data['task'] == _GERMAN_ALLERGENS_TASK]
    labels = [label for raw in allergen_rows['label'] for label in parse_labels(raw)]
    return bool(labels) and all(
        str(label).upper() == _UNOBSERVABLE_MARKER for label in labels
    )


def _pairwise_consistency_pct(labels, metric):
    """Return the percentage of label pairs the metric matches, or None if < 2 labels."""
    if len(labels) < 2:
        return None
    consistent = 0
    total = 0
    for left, right in combinations(labels, 2):
        total += 1
        try:
            if metric.evaluate([left], [right]).get('tps', []):
                consistent += 1
        except Exception:
            # Deliberate best-effort: a pair the metric cannot evaluate is
            # counted as inconsistent instead of aborting the whole analysis.
            pass
    return consistent / total * 100


def analyze_coverage(df, sources, omniscan_sets=1, selected_tasks=None):
    """Compute per-ASIN coverage and per-task label consistency.

    A task is covered for an ASIN when at least one selected source provides
    a label that does not contain "UNOBSERVABLE". An ASIN is covered only
    when every task it has data for is covered.

    Args:
        df: DataFrame with at least the columns ``asin``, ``task``,
            ``source_type`` and ``label``; an optional ``timestamp`` column
            orders omniscan captures.
        sources: Source names to consider, in priority order for the
            non-omniscan sources.
        omniscan_sets: Maximum number of earliest omniscan captures to use
            per ASIN/task. Converted to ``int``; negative values act as 0.
        selected_tasks: Tasks to analyse. When empty or ``None``, every task
            present in ``df`` is analysed.

    Returns:
        A ``(results, consistency_stats)`` tuple.

        ``results`` maps each ASIN to a dict with ``covered`` (bool),
        ``task_coverage`` (task -> bool) and ``unobservable_labels`` (the
        unobservable labels collected from the ASIN's uncovered tasks).
        Rows whose ASIN is missing are ignored.

        ``consistency_stats`` maps each task to ``consistent_pct``,
        ``inconsistent_pct`` and ``num_asins``: the mean, over ASINs with at
        least two observable labels, of the share of label pairs that
        ``RelaxedThresholdStringEntityMetric`` reports a true positive for.
    """
    if selected_tasks:
        df = df[df['task'].isin(selected_tasks)]
        tasks_to_process = list(selected_tasks)
    else:
        tasks_to_process = df['task'].unique().tolist()

    string_metric = RelaxedThresholdStringEntityMetric()
    # UI sliders may deliver floats, which DataFrame.head() rejects.
    capture_limit = max(int(omniscan_sets), 0)

    results = {}
    pct_by_task = {}  # task -> list of per-ASIN consistency percentages

    # One grouped pass instead of re-scanning the whole frame per ASIN.
    for asin, asin_data in df.groupby('asin', sort=False):
        task_coverage = {}
        unobservable_by_task = {}

        for task in tasks_to_process:
            task_data = asin_data[asin_data['task'] == task]
            if task_data.empty:
                continue

            covered, unobservable = _assess_task(task_data, sources, capture_limit)
            task_coverage[task] = covered
            unobservable_by_task[task] = unobservable

            # Consistency uses every observable label from the selected sources.
            # NOTE(review): all omniscan rows are used here, regardless of
            # omniscan_sets - confirm whether the capture limit should apply.
            observable = []
            for source in sources:
                source_labels = task_data.loc[task_data['source_type'] == source, 'label']
                observable.extend(_partition_labels(source_labels)[0])
            pct = _pairwise_consistency_pct(observable, string_metric)
            if pct is not None:
                pct_by_task.setdefault(task, []).append(pct)

        if _german_allergen_exemption_applies(asin_data, task_coverage):
            task_coverage[_GERMAN_ALLERGENS_TASK] = True

        results[asin] = {
            # Derived after the exemption, so the exemption cannot mark an
            # ASIN covered while some other task is still uncovered.
            'covered': bool(task_coverage) and all(task_coverage.values()),
            'task_coverage': task_coverage,
            'unobservable_labels': [
                label
                for task, labels in unobservable_by_task.items()
                if not task_coverage[task]
                for label in labels
            ],
        }

    consistency_stats = {}
    for task, percentages in pct_by_task.items():
        average = sum(percentages) / len(percentages)
        consistency_stats[task] = {
            'consistent_pct': average,
            'inconsistent_pct': 100 - average,
            'num_asins': len(percentages),
        }

    return results, consistency_stats
188
+
189
def create_analysis(csv_file, marketing, omniscan, pics, detailed_page, omniscan_sets, task_checkboxes):
    """Run the main coverage analysis and build its pie chart and summary.

    Args:
        csv_file: Uploaded CSV, either an object exposing ``.name`` (the
            file path) or the path itself; ``None`` when nothing is uploaded.
        marketing, omniscan, pics, detailed_page: Whether each source is
            selected. A source is used only if it also occurs in the CSV's
            ``source_type`` column.
        omniscan_sets: Maximum number of omniscan captures to consider.
        task_checkboxes: Selected task names.

    Returns:
        ``(figure, markdown)``. ``figure`` is ``None`` and ``markdown`` holds
        a prompt when the upload, task selection or source selection is
        missing.
    """
    if csv_file is None:
        return None, "Please upload a CSV file"

    # Accept both a file wrapper with .name and a plain path string.
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))

    selected_tasks = list(task_checkboxes) if task_checkboxes else []
    if not selected_tasks:
        return None, "Please select at least one task"

    available_sources = set(df['source_type'].unique())
    requested = (
        ('marketing', marketing),
        ('omniscan', omniscan),
        ('pics', pics),
        ('detailed_page', detailed_page),
    )
    sources = [name for name, wanted in requested if wanted and name in available_sources]
    if not sources:
        return None, "Please select at least one available source"

    results, consistency_stats = analyze_coverage(
        df, sources, int(omniscan_sets or 1), selected_tasks
    )

    total_asins = len(results)
    covered_asins = sum(1 for outcome in results.values() if outcome['covered'])
    uncovered_asins = total_asins - covered_asins
    asin_coverage_rate = covered_asins / total_asins if total_asins > 0 else 0
    uncovered_rate = uncovered_asins / total_asins if total_asins > 0 else 0

    # Only uncovered ASINs contribute to the unobservable breakdown.
    all_unobservable = [
        label
        for outcome in results.values()
        if not outcome['covered']
        for label in outcome['unobservable_labels']
    ]

    if all_unobservable:
        unobservable_counts = pd.Series(all_unobservable).value_counts()
        fig = px.pie(
            values=unobservable_counts.values,
            names=unobservable_counts.index,
            title=f"Unobservable Issues from {uncovered_asins} Uncovered ASINs ({uncovered_rate:.1%} of total)",
        )
    elif uncovered_asins:
        # Uncovered ASINs without unobservable labels: do not claim that
        # everything is covered.
        fig = px.pie(
            values=[covered_asins, uncovered_asins],
            names=['Covered', 'Uncovered'],
            title=f"ASIN Coverage: {asin_coverage_rate:.1%}",
        )
    else:
        fig = px.pie(
            values=[1],
            names=['All Covered'],
            title=f"ASIN Coverage: {asin_coverage_rate:.1%}",
        )

    consistency_text = ""
    if consistency_stats:
        rows = "".join(
            f"{task:<25} ✅ {task_stats['consistent_pct']:5.1f}% consistent | "
            f"❗ {task_stats['inconsistent_pct']:5.1f}% inconsistent\n"
            for task, task_stats in consistency_stats.items()
        )
        consistency_text = (
            "\n\n## 🎯 **Extraction Consistency Analysis**\n```\n" + rows + "```\n"
        )

    summary = (
        f"## 📊 **ASIN Coverage: {covered_asins}/{total_asins} ASINs "
        f"({asin_coverage_rate:.1%})**{consistency_text}"
    )
    return fig, summary
253
+
254
def create_source_coverage_analysis(csv_file, marketing, omniscan, pics, detailed_page, task_checkboxes):
    """Compare ASIN coverage across every combination of the selected sources.

    Coverage is computed with ``analyze_coverage`` for each non-empty
    combination of selected sources (singles first, then pairs, and so on),
    always with a single omniscan capture.

    Args:
        csv_file: Uploaded CSV, either an object exposing ``.name`` (the
            file path) or the path itself; ``None`` when nothing is uploaded.
        marketing, omniscan, pics, detailed_page: Whether each source is
            selected. A source is used only if it also occurs in the CSV's
            ``source_type`` column.
        task_checkboxes: Selected task names.

    Returns:
        ``(figure, markdown)``: a radar chart of covered-ASIN counts per
        combination plus a text table. ``figure`` is ``None`` and
        ``markdown`` holds a prompt when a required selection is missing.
    """
    if csv_file is None:
        return None, "Please upload a CSV file"

    # Accept both a file wrapper with .name and a plain path string.
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))

    selected_tasks = list(task_checkboxes) if task_checkboxes else []
    if not selected_tasks:
        return None, "Please select at least one task"

    available_sources = set(df['source_type'].unique())
    requested = (
        ('marketing', marketing),
        ('omniscan', omniscan),
        ('pics', pics),
        ('detailed_page', detailed_page),
    )
    selected_sources = [name for name, wanted in requested if wanted and name in available_sources]
    if not selected_sources:
        return None, "Please select at least one available source"

    # One loop over combination sizes replaces the separate single/pair/3+ loops.
    coverage_data = []  # (combination tuple, covered ASIN count)
    for size in range(1, len(selected_sources) + 1):
        for combo in combinations(selected_sources, size):
            results, _ = analyze_coverage(df, list(combo), 1, selected_tasks)
            covered_asins = sum(1 for outcome in results.values() if outcome['covered'])
            coverage_data.append((combo, covered_asins))

    chart_labels = ["<br>".join(combo) for combo, _ in coverage_data]
    values = [covered for _, covered in coverage_data]

    # Use the same task-filtered ASIN population the coverage counts come
    # from, so the percentages agree with the main analysis.
    total_asins = df.loc[df['task'].isin(selected_tasks), 'asin'].nunique()
    text_labels = [
        f"{value} ({(value / total_asins * 100) if total_asins else 0:.1f}%)"
        for value in values
    ]

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=chart_labels,
        fill='toself',
        name='ASIN Coverage',
        line_color='rgb(0, 123, 255)',
        fillcolor='rgba(0, 123, 255, 0.3)',
        text=text_labels,
        textposition='top right',
        mode='markers+text+lines',
    ))

    # Keep a non-degenerate radial range even when nothing is covered.
    radial_max = max(values) * 1.1 if any(values) else 1
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=False,  # Hide radial axis values
                range=[0, radial_max],
            )
        ),
        title='ASIN Coverage by Source Combination (Spider Chart)',
        height=600,
        showlegend=True,
    )

    # The table sits inside a code fence, where "<br>" would be shown
    # literally, so combinations are joined with " + " here.
    rows = "".join(
        f"{' + '.join(combo):<30}: {covered} ASINs\n" for combo, covered in coverage_data
    )
    stats_text = "## 📊 **Source Coverage Statistics**\n```\n" + rows + "```"

    return fig, stats_text
347
+
348
def create_omniscan_capture_analysis(csv_file, task_checkboxes):
    """Chart how omniscan-only coverage changes with the number of captures.

    Runs ``analyze_coverage`` with only the omniscan source for 1..N
    captures, where N is the largest number of omniscan rows any single
    ASIN/task pair has among the selected tasks, capped at 10.

    Args:
        csv_file: Uploaded CSV, either an object exposing ``.name`` (the
            file path) or the path itself; ``None`` when nothing is uploaded.
        task_checkboxes: Selected task names.

    Returns:
        ``(figure, markdown)``: a line chart of coverage percentage per
        capture count plus a text table with the gain over one capture.
        ``figure`` is ``None`` and ``markdown`` holds a message when input
        is missing or there is no omniscan data to analyse.
    """
    if csv_file is None:
        return None, "Please upload a CSV file"

    # Accept both a file wrapper with .name and a plain path string.
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))

    selected_tasks = list(task_checkboxes) if task_checkboxes else []
    if not selected_tasks:
        return None, "Please select at least one task"

    if 'omniscan' not in df['source_type'].values:
        return None, "No omniscan data found in the dataset"

    # analyze_coverage limits captures per ASIN *and* task, so count rows at
    # that granularity; counting per ASIN across tasks overstates the maximum.
    omniscan_rows = df[(df['source_type'] == 'omniscan') & df['task'].isin(selected_tasks)]
    captures_per_pair = omniscan_rows.groupby(['asin', 'task']).size()
    if captures_per_pair.empty:
        return None, "No omniscan data found for the selected tasks"
    max_captures = int(captures_per_pair.max())

    capture_data = []  # (capture count, covered ASINs, coverage %)
    for num_captures in range(1, min(max_captures, 10) + 1):  # Limit to 10 captures max
        results, _ = analyze_coverage(df, ['omniscan'], num_captures, selected_tasks)
        covered_asins = sum(1 for outcome in results.values() if outcome['covered'])
        total_asins = len(results)
        coverage_pct = (covered_asins / total_asins * 100) if total_asins > 0 else 0
        capture_data.append((num_captures, covered_asins, coverage_pct))

    capture_counts = [row[0] for row in capture_data]
    covered_counts = [row[1] for row in capture_data]
    percentages = [row[2] for row in capture_data]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=capture_counts,
        y=percentages,
        # 'text' must be part of the mode for the labels (and textposition)
        # to be drawn on the chart rather than only on hover.
        mode='lines+markers+text',
        name='Coverage %',
        line=dict(color='rgb(0, 123, 255)', width=3),
        marker=dict(size=8),
        text=[f"{count} ASINs ({pct:.1f}%)" for count, pct in zip(covered_counts, percentages)],
        textposition='top center',
    ))
    fig.update_layout(
        title='Coverage Gains by Number of Omniscan Captures',
        xaxis_title='Number of Omniscan Captures',
        yaxis_title='Coverage Percentage (%)',
        height=500,
        showlegend=False,
    )

    baseline_pct = capture_data[0][2]
    rows = "".join(
        f"{num} capture(s): {count:3d} ASINs ({pct:5.1f}%) "
        f"[+{(pct - baseline_pct if num > 1 else 0):4.1f}% gain]\n"
        for num, count, pct in capture_data
    )
    stats_text = "## 📈 **Omniscan Capture Analysis**\n```\n" + rows + "```"

    return fig, stats_text
408
+
409
def update_source_buttons(csv_file):
    """Refresh the source checkboxes, capture slider and task list for a CSV.

    Args:
        csv_file: Uploaded CSV, either an object exposing ``.name`` (the
            file path) or the path itself; ``None`` when the upload was
            cleared.

    Returns:
        A 6-tuple of component updates in the order: marketing, omniscan,
        pics and detailed-page checkboxes, the omniscan slider, and the task
        checkbox group. With no file, every control is made non-interactive
        and the task list is emptied. Otherwise each source checkbox is
        interactive only if that source occurs in ``source_type``, and all
        selections are reset.
    """
    if csv_file is None:
        return (
            gr.Checkbox(interactive=False),
            gr.Checkbox(interactive=False),
            gr.Checkbox(interactive=False),
            gr.Checkbox(interactive=False),
            gr.Slider(interactive=False),
            gr.CheckboxGroup(choices=[], interactive=False),
        )

    # Accept both a file wrapper with .name and a plain path string.
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))
    available_sources = set(df['source_type'].unique())
    # Drop missing task values: sorting NaN together with strings raises.
    available_tasks = sorted(df['task'].dropna().unique().tolist())

    omniscan_available = 'omniscan' in available_sources

    # Captures are limited per ASIN/task pair during analysis, so the slider
    # maximum is the largest omniscan row count of any such pair.
    max_omniscan = 1
    if omniscan_available:
        captures_per_pair = (
            df[df['source_type'] == 'omniscan'].groupby(['asin', 'task']).size()
        )
        if not captures_per_pair.empty:
            max_omniscan = int(captures_per_pair.max())  # plain int, not numpy

    return (
        gr.Checkbox(interactive='marketing' in available_sources, value=False),
        gr.Checkbox(interactive=omniscan_available, value=False),
        gr.Checkbox(interactive='pics' in available_sources, value=False),
        gr.Checkbox(interactive='detailed_page' in available_sources, value=False),
        gr.Slider(minimum=1, maximum=min(max_omniscan, 10), value=1, step=1,
                  interactive=omniscan_available),
        gr.CheckboxGroup(choices=available_tasks, value=[], interactive=True),
    )
435
+
436
# UI layout: inputs (sources, tasks, omniscan settings) in the left column,
# the three analyses with their outputs in the right column.
with gr.Blocks() as demo:
    gr.Markdown("# Omniscan Multi-Capture Multi-Source Analysis Tool")

    csv_input = gr.File(label="Upload CSV file", file_types=[".csv"])

    with gr.Row():
        with gr.Column():
            # Controls start disabled; update_source_buttons enables them
            # once a CSV has been uploaded.
            gr.Markdown("### 📊 Data Sources")
            marketing_cb = gr.Checkbox(label="Marketing", interactive=False)
            omniscan_cb = gr.Checkbox(label="Omniscan", interactive=False)
            pics_cb = gr.Checkbox(label="PICS", interactive=False)
            detailed_page_cb = gr.Checkbox(label="Detailed Page Text", interactive=False)

            gr.Markdown("### 🏷️ Task Selection")
            task_checkboxes = gr.CheckboxGroup(label="Select Tasks", choices=[], interactive=False)

            gr.Markdown("### ⚙️ Omniscan Settings")
            omniscan_sets = gr.Slider(label="Max Omniscan Image Sets", minimum=1, maximum=10,
                                      value=1, step=1, interactive=False)

        with gr.Column():
            analyze_btn = gr.Button("📈 Analyze Coverage")
            stats_output = gr.Markdown(label="Statistics")
            plot_output = gr.Plot()

            gr.Markdown("---")
            source_coverage_btn = gr.Button("🔍 Analyze Source Coverage")
            source_stats_output = gr.Markdown(label="Source Coverage Statistics")
            source_plot_output = gr.Plot()

            gr.Markdown("---")
            omniscan_capture_btn = gr.Button("📈 Analyze Omniscan Captures")
            omniscan_capture_stats_output = gr.Markdown(label="Omniscan Capture Statistics")
            omniscan_capture_plot_output = gr.Plot()

    # Update source availability when CSV is uploaded
    csv_input.change(
        update_source_buttons,
        inputs=csv_input,
        outputs=[marketing_cb, omniscan_cb, pics_cb, detailed_page_cb, omniscan_sets, task_checkboxes],
    )

    # Run analysis
    analyze_btn.click(
        create_analysis,
        inputs=[csv_input, marketing_cb, omniscan_cb, pics_cb, detailed_page_cb, omniscan_sets, task_checkboxes],
        outputs=[plot_output, stats_output],
    )

    # Run source coverage analysis
    source_coverage_btn.click(
        create_source_coverage_analysis,
        inputs=[csv_input, marketing_cb, omniscan_cb, pics_cb, detailed_page_cb, task_checkboxes],
        outputs=[source_plot_output, source_stats_output],
    )

    # Run omniscan capture analysis
    omniscan_capture_btn.click(
        create_omniscan_capture_analysis,
        inputs=[csv_input, task_checkboxes],
        outputs=[omniscan_capture_plot_output, omniscan_capture_stats_output],
    )

# Start the server only when run as a script, so importing this module
# (e.g. to reuse the analysis functions) does not launch the app.
if __name__ == "__main__":
    demo.launch()