Spaces:

popaaln
/

myspace

Runtime error

File size: 16,887 Bytes

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from itertools import combinations
import re

def parse_labels(label_str):
    """Split one raw ``label`` cell into a list of label strings.

    Args:
        label_str: Cell value from the ``label`` column. May be a missing
            value (``None``/NaN), a plain label, a stringified list such as
            ``"['A', \"B\"]"``, or a non-string scalar (e.g. a number that
            ``read_csv`` parsed from the column).

    Returns:
        A list of labels. Missing or whitespace-only cells give ``[]``; a
        bracketed cell gives every single- or double-quoted item inside it;
        anything else gives a one-element list with the stripped text.
    """
    if pd.isna(label_str):
        return []

    # Non-string scalars have no .startswith(); normalise to text first and
    # drop surrounding whitespace so " ['A'] " and " UNOBSERVABLE " behave
    # like their trimmed forms.
    text = str(label_str).strip()
    if not text:
        # A blank cell carries no label, same as a missing one.
        return []

    if text.startswith('[') and text.endswith(']'):
        # Each match is a (single-quoted, double-quoted) pair; exactly one
        # side is populated for any given item.
        matches = re.findall(r"'([^']*)'|\"([^\"]*)\"", text)
        return [single or double for single, double in matches]

    return [text]

def _collect_labels(label_cells):
    """Parse every raw ``label`` cell and return all labels as one flat list."""
    collected = []
    for cell in label_cells:
        collected.extend(parse_labels(cell))
    return collected


def _has_observable_label(labels):
    """Return True if any label lacks 'UNOBSERVABLE' (case-insensitive)."""
    return any('UNOBSERVABLE' not in label.upper() for label in labels)


def analyze_coverage(df, sources, omniscan_sets=1, selected_tasks=None):
    """Compute per-ASIN task coverage for the given data sources.

    A task is covered for an ASIN when at least one of the considered rows
    carries a label that does not contain 'UNOBSERVABLE'. An ASIN is covered
    only when every task that has rows for it is covered.

    Args:
        df: DataFrame with columns ``asin``, ``task``, ``source_type`` and
            ``label``; an optional ``timestamp`` column orders omniscan rows.
        sources: Source type names to consider. ``'omniscan'`` is evaluated
            first, the remaining sources in the given order.
        omniscan_sets: Number of omniscan rows to use per ASIN/task (the
            earliest ones when ``timestamp`` is present).
        selected_tasks: Tasks to restrict the analysis to. When empty or
            ``None`` every task present in ``df`` is analysed.

    Returns:
        Dict keyed by ASIN. Each value is a dict with ``covered`` (bool),
        ``task_coverage`` (task -> bool) and ``unobservable_labels`` (labels
        gathered from the tasks that ended up uncovered).
    """
    # DataFrame.head() requires an integer count; coerce in case a caller
    # passes something like 2.0.
    capture_limit = int(omniscan_sets)

    # Filter by selected tasks if provided
    if selected_tasks:
        df = df[df['task'].isin(selected_tasks)]
        tasks_to_process = selected_tasks
    else:
        tasks_to_process = df['task'].unique().tolist()

    use_omniscan = 'omniscan' in sources
    other_sources = [source for source in sources if source != 'omniscan']

    results = {}

    for asin in df['asin'].unique():
        asin_data = df[df['asin'] == asin]

        task_coverage = {}
        # Unobservable labels are kept per task so that a task which is later
        # waived can have its labels dropped again.
        unobservable_by_task = {}

        for task in tasks_to_process:
            task_data = asin_data[asin_data['task'] == task]
            if task_data.empty:
                continue

            task_covered = False
            task_unobservable = []

            # Omniscan: only the first `capture_limit` rows count.
            if use_omniscan:
                omniscan_data = task_data[task_data['source_type'] == 'omniscan']
                if not omniscan_data.empty:
                    if 'timestamp' in omniscan_data.columns:
                        omniscan_data = omniscan_data.sort_values('timestamp')
                    labels = _collect_labels(omniscan_data.head(capture_limit)['label'])
                    if _has_observable_label(labels):
                        task_covered = True
                    else:
                        # No observable label, so every label here is an
                        # unobservable one.
                        task_unobservable.extend(labels)

            # Remaining sources are tried in order until one covers the task.
            if not task_covered:
                for source in other_sources:
                    source_data = task_data[task_data['source_type'] == source]
                    if source_data.empty:
                        continue
                    labels = _collect_labels(source_data['label'])
                    if _has_observable_label(labels):
                        task_covered = True
                        break
                    task_unobservable.extend(labels)

            task_coverage[task] = task_covered
            if not task_covered:
                unobservable_by_task[task] = task_unobservable

        # Custom rule for German ingredients/allergens: when
        # ingredients-german is covered and every iallergens-german label of
        # this ASIN is exactly "UNOBSERVABLE", the allergen task is waived.
        # NOTE(review): the check looks at all rows of the task for this ASIN,
        # not only the selected sources/captures - confirm this is intended.
        if (task_coverage.get('ingredients-german')
                and task_coverage.get('iallergens-german') is False):
            allergen_labels = _collect_labels(
                asin_data.loc[asin_data['task'] == 'iallergens-german', 'label'])
            if (allergen_labels
                    and all(label.upper() == 'UNOBSERVABLE' for label in allergen_labels)):
                task_coverage['iallergens-german'] = True
                unobservable_by_task.pop('iallergens-german', None)

        # ASIN is covered only if ALL tasks are covered. This is evaluated
        # after the waiver so the waiver cannot hide other uncovered tasks.
        asin_covered = bool(task_coverage) and all(task_coverage.values())

        results[asin] = {
            'covered': asin_covered,
            'task_coverage': task_coverage,
            'unobservable_labels': [
                label
                for labels in unobservable_by_task.values()
                for label in labels
            ],
        }

    return results

def create_analysis(csv_file, marketing, omniscan, pics, detailed_page, omniscan_sets, task_checkboxes):
    """Run the main coverage analysis and build its pie chart.

    Args:
        csv_file: Uploaded CSV; either an object exposing the file path as
            ``.name`` or a plain path string. ``None`` when nothing is uploaded.
        marketing, omniscan, pics, detailed_page: Booleans selecting the
            source types of the same name.
        omniscan_sets: Number of omniscan captures passed to
            ``analyze_coverage``.
        task_checkboxes: List of selected task names.

    Returns:
        ``(figure, markdown)``. ``figure`` is ``None`` and ``markdown`` is a
        prompt message when the inputs are incomplete.
    """
    if csv_file is None:
        return None, "Please upload a CSV file"

    # Fall back to the value itself when it has no ``.name`` (plain path).
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))

    # Get selected tasks
    selected_tasks = task_checkboxes if task_checkboxes else []
    if not selected_tasks:
        return None, "Please select at least one task"

    # Keep only the ticked sources that actually occur in the file,
    # in a fixed order.
    available_sources = set(df['source_type'].unique())
    requested = (
        ('marketing', marketing),
        ('omniscan', omniscan),
        ('pics', pics),
        ('detailed_page', detailed_page),
    )
    sources = [name for name, enabled in requested
               if enabled and name in available_sources]

    if not sources:
        return None, "Please select at least one available source"

    # Analyze coverage
    results = analyze_coverage(df, sources, omniscan_sets, selected_tasks)

    # Calculate coverage statistics
    total_asins = len(results)
    covered_asins = sum(1 for r in results.values() if r['covered'])
    uncovered_asins = total_asins - covered_asins
    asin_coverage_rate = covered_asins / total_asins if total_asins > 0 else 0
    uncovered_rate = uncovered_asins / total_asins if total_asins > 0 else 0

    # Collect unobservable labels only from uncovered ASINs
    all_unobservable = [
        label
        for result in results.values()
        if not result['covered']
        for label in result['unobservable_labels']
    ]

    if all_unobservable:
        # Pie chart of the unobservable issues
        unobservable_counts = pd.Series(all_unobservable).value_counts()
        fig = px.pie(values=unobservable_counts.values, names=unobservable_counts.index,
                     title=f"Unobservable Issues from {uncovered_asins} Uncovered ASINs ({uncovered_rate:.1%} of total)")
    elif uncovered_asins:
        # ASINs can be uncovered without any unobservable label (e.g. no rows
        # in the selected sources); do not present that as "All Covered".
        fig = px.pie(values=[covered_asins, uncovered_asins], names=['Covered', 'Uncovered'],
                     title=f"ASIN Coverage: {asin_coverage_rate:.1%}")
    else:
        fig = px.pie(values=[1], names=['All Covered'],
                     title=f"ASIN Coverage: {asin_coverage_rate:.1%}")

    stats = f"## 📊 **ASIN Coverage: {covered_asins}/{total_asins} ASINs ({asin_coverage_rate:.1%})**"
    return fig, stats

def create_source_coverage_analysis(csv_file, marketing, omniscan, pics, detailed_page, task_checkboxes):
    """Compare ASIN coverage across every combination of the selected sources.

    Each non-empty combination of the selected sources is analysed with
    ``analyze_coverage`` using a single omniscan capture, and the number of
    covered ASINs is plotted on a radar chart.

    Args:
        csv_file: Uploaded CSV; either an object exposing the file path as
            ``.name`` or a plain path string. ``None`` when nothing is uploaded.
        marketing, omniscan, pics, detailed_page: Booleans selecting the
            source types of the same name.
        task_checkboxes: List of selected task names.

    Returns:
        ``(figure, markdown)``. ``figure`` is ``None`` and ``markdown`` is a
        prompt message when the inputs are incomplete.
    """
    if csv_file is None:
        return None, "Please upload a CSV file"

    # Fall back to the value itself when it has no ``.name`` (plain path).
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))

    # Get selected tasks
    selected_tasks = task_checkboxes if task_checkboxes else []
    if not selected_tasks:
        return None, "Please select at least one task"

    # Keep only the ticked sources that actually occur in the file,
    # in a fixed order.
    available_sources = set(df['source_type'].unique())
    requested = (
        ('marketing', marketing),
        ('omniscan', omniscan),
        ('pics', pics),
        ('detailed_page', detailed_page),
    )
    selected_sources = [name for name, enabled in requested
                        if enabled and name in available_sources]

    if not selected_sources:
        return None, "Please select at least one available source"

    # Coverage for every combination, smallest combinations first
    # (single sources, then pairs, then larger sets).
    coverage_data = []
    total_asins = 0
    for size in range(1, len(selected_sources) + 1):
        for combo in combinations(selected_sources, size):
            results = analyze_coverage(df, list(combo), 1, selected_tasks)
            covered_asins = sum(1 for res in results.values() if res['covered'])
            coverage_data.append(("<br>".join(combo), covered_asins))
            # The analysed ASIN set depends only on the task filter, so it is
            # the same for every combination. Use it as the percentage base
            # instead of counting ASINs that have none of the selected tasks.
            total_asins = len(results)

    labels, values = zip(*coverage_data)

    # Text labels with value and percentage (guarding an empty ASIN set)
    text_labels = [
        f"{value} ({(value / total_asins * 100) if total_asins else 0:.1f}%)"
        for value in values
    ]

    # Leave headroom above the largest value; avoid a degenerate [0, 0] axis.
    radial_max = max(values) * 1.1 if max(values) > 0 else 1

    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=labels,
        fill='toself',
        name='ASIN Coverage',
        line_color='rgb(0, 123, 255)',
        fillcolor='rgba(0, 123, 255, 0.3)',
        text=text_labels,
        textposition='top right',
        mode='markers+text+lines'
    ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=False,  # Hide radial axis values
                range=[0, radial_max]
            )
        ),
        title='ASIN Coverage by Source Combination (Spider Chart)',
        height=600,
        showlegend=True
    )

    # Statistics text. The "<br>" separators are only meaningful on the chart
    # axis; inside a fenced block they would be shown literally.
    stats_lines = [
        f"{label.replace('<br>', ' + '):<30}: {value} ASINs\n"
        for label, value in coverage_data
    ]
    stats_text = "## 📊 **Source Coverage Statistics**\n```\n" + "".join(stats_lines) + "```"

    return fig, stats_text

def create_omniscan_capture_analysis(csv_file, task_checkboxes):
    """Plot how omniscan-only coverage changes with the number of captures.

    ``analyze_coverage`` is run with only the ``'omniscan'`` source for
    1..N captures, where N is the largest number of omniscan rows any single
    ASIN/task pair has among the selected tasks, capped at 10.

    Args:
        csv_file: Uploaded CSV; either an object exposing the file path as
            ``.name`` or a plain path string. ``None`` when nothing is uploaded.
        task_checkboxes: List of selected task names.

    Returns:
        ``(figure, markdown)``. ``figure`` is ``None`` and ``markdown`` is a
        message when the inputs are incomplete or no omniscan data applies.
    """
    if csv_file is None:
        return None, "Please upload a CSV file"

    # Fall back to the value itself when it has no ``.name`` (plain path).
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))

    # Get selected tasks
    selected_tasks = task_checkboxes if task_checkboxes else []
    if not selected_tasks:
        return None, "Please select at least one task"

    # Check if omniscan data exists
    if 'omniscan' not in df['source_type'].values:
        return None, "No omniscan data found in the dataset"

    # analyze_coverage limits captures per ASIN *and* task, so the useful
    # maximum is the largest per-(asin, task) row count within the selected
    # tasks - not the per-ASIN total across all tasks.
    omniscan_rows = df[(df['source_type'] == 'omniscan') & df['task'].isin(selected_tasks)]
    captures_per_task = omniscan_rows.groupby(['asin', 'task']).size()
    if captures_per_task.empty:
        return None, "No omniscan data found for the selected tasks"
    max_captures = int(captures_per_task.max())

    # Analyze coverage for different numbers of omniscan captures
    capture_data = []
    for num_captures in range(1, min(max_captures, 10) + 1):  # Limit to 10 captures max
        results = analyze_coverage(df, ['omniscan'], num_captures, selected_tasks)
        covered_asins = sum(1 for r in results.values() if r['covered'])
        total_asins = len(results)
        coverage_pct = (covered_asins / total_asins * 100) if total_asins > 0 else 0
        capture_data.append((num_captures, covered_asins, coverage_pct))

    # Create line chart
    captures, counts, percentages = zip(*capture_data)

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=captures,
        y=percentages,
        mode='lines+markers',
        name='Coverage %',
        line=dict(color='rgb(0, 123, 255)', width=3),
        marker=dict(size=8),
        text=[f"{count} ASINs ({pct:.1f}%)" for count, pct in zip(counts, percentages)],
        textposition='top center'
    ))

    fig.update_layout(
        title='Coverage Gains by Number of Omniscan Captures',
        xaxis_title='Number of Omniscan Captures',
        yaxis_title='Coverage Percentage (%)',
        height=500,
        showlegend=False
    )

    # Statistics text; the gain is measured against the single-capture run.
    baseline_pct = capture_data[0][2]
    stats_text = "## 📈 **Omniscan Capture Analysis**\n```\n"
    for n_captures, count, pct in capture_data:
        gain = pct - baseline_pct if n_captures > 1 else 0
        stats_text += f"{n_captures} capture(s): {count:3d} ASINs ({pct:5.1f}%) [+{gain:4.1f}% gain]\n"
    stats_text += "```"

    return fig, stats_text

def update_source_buttons(csv_file):
    """Refresh the source, slider and task controls for a newly chosen CSV.

    Args:
        csv_file: Uploaded CSV; either an object exposing the file path as
            ``.name`` or a plain path string. ``None`` when the upload was
            cleared.

    Returns:
        A 6-tuple of component updates: four source checkboxes (marketing,
        omniscan, pics, detailed_page), the omniscan slider and the task
        checkbox group. Without a file everything is disabled; with a file
        each source checkbox is enabled only if that source type occurs in
        the data, and all selections are reset.
    """
    if csv_file is None:
        return (gr.Checkbox(interactive=False), gr.Checkbox(interactive=False),
                gr.Checkbox(interactive=False), gr.Checkbox(interactive=False),
                gr.Slider(interactive=False), gr.CheckboxGroup(choices=[], interactive=False))

    # Fall back to the value itself when it has no ``.name`` (plain path).
    df = pd.read_csv(getattr(csv_file, 'name', csv_file))
    available_sources = set(df['source_type'].unique())
    # Drop missing task cells: NaN cannot be sorted together with strings.
    available_tasks = sorted(df['task'].dropna().unique().tolist())

    marketing_available = 'marketing' in available_sources
    omniscan_available = 'omniscan' in available_sources
    pics_available = 'pics' in available_sources
    detailed_page_available = 'detailed_page' in available_sources

    # Slider maximum: the coverage analysis limits omniscan captures per ASIN
    # and task, so count rows per (asin, task) rather than per ASIN.
    max_omniscan = 1
    if omniscan_available:
        captures_per_task = (
            df[df['source_type'] == 'omniscan'].groupby(['asin', 'task']).size()
        )
        if not captures_per_task.empty:
            # Plain int instead of a numpy integer for the component config.
            max_omniscan = int(captures_per_task.max())

    return (gr.Checkbox(interactive=marketing_available, value=False),
            gr.Checkbox(interactive=omniscan_available, value=False),
            gr.Checkbox(interactive=pics_available, value=False),
            gr.Checkbox(interactive=detailed_page_available, value=False),
            gr.Slider(minimum=1, maximum=min(max_omniscan, 10), value=1, step=1, interactive=omniscan_available),
            gr.CheckboxGroup(choices=available_tasks, value=[], interactive=True))

with gr.Blocks() as demo:
    gr.Markdown("# Omniscan Multi-Capture Multi-Source Analysis Tool")

    csv_input = gr.File(label="Upload CSV file", file_types=[".csv"])

    with gr.Row():
        # Left column: every input control. All start disabled until a CSV
        # has been uploaded.
        with gr.Column():
            gr.Markdown("### 📊 Data Sources")
            marketing_cb = gr.Checkbox(label="Marketing", interactive=False)
            omniscan_cb = gr.Checkbox(label="Omniscan", interactive=False)
            pics_cb = gr.Checkbox(label="PICS", interactive=False)
            detailed_page_cb = gr.Checkbox(label="Detailed Page Text", interactive=False)

            gr.Markdown("### 🏷️ Task Selection")
            task_checkboxes = gr.CheckboxGroup(
                label="Select Tasks",
                choices=[],
                interactive=False,
            )

            gr.Markdown("### ⚙️ Omniscan Settings")
            omniscan_sets = gr.Slider(
                label="Max Omniscan Image Sets",
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                interactive=False,
            )

        # Right column: one button + statistics + plot per analysis.
        with gr.Column():
            analyze_btn = gr.Button("📈 Analyze Coverage")
            stats_output = gr.Markdown(label="Statistics")
            plot_output = gr.Plot()

            gr.Markdown("---")
            source_coverage_btn = gr.Button("🔍 Analyze Source Coverage")
            source_stats_output = gr.Markdown(label="Source Coverage Statistics")
            source_plot_output = gr.Plot()

            gr.Markdown("---")
            omniscan_capture_btn = gr.Button("📈 Analyze Omniscan Captures")
            omniscan_capture_stats_output = gr.Markdown(label="Omniscan Capture Statistics")
            omniscan_capture_plot_output = gr.Plot()

    # The four source toggles are passed to several callbacks in this order.
    source_toggles = [marketing_cb, omniscan_cb, pics_cb, detailed_page_cb]

    # Uploading (or clearing) the CSV refreshes which controls are usable.
    csv_input.change(
        fn=update_source_buttons,
        inputs=csv_input,
        outputs=source_toggles + [omniscan_sets, task_checkboxes],
    )

    # Main coverage analysis.
    analyze_btn.click(
        fn=create_analysis,
        inputs=[csv_input] + source_toggles + [omniscan_sets, task_checkboxes],
        outputs=[plot_output, stats_output],
    )

    # Coverage per source combination.
    source_coverage_btn.click(
        fn=create_source_coverage_analysis,
        inputs=[csv_input] + source_toggles + [task_checkboxes],
        outputs=[source_plot_output, source_stats_output],
    )

    # Coverage as a function of the number of omniscan captures.
    omniscan_capture_btn.click(
        fn=create_omniscan_capture_analysis,
        inputs=[csv_input, task_checkboxes],
        outputs=[omniscan_capture_plot_output, omniscan_capture_stats_output],
    )

#demo.launch()