import pandas as pd
import gradio as gr
import plotly.express as px
from typing import Dict
from pathlib import Path

from config import METADATA_COLUMNS, DATA_FOLDER
from data_loader import load_csv_from_folder, get_available_datasets

# In-memory store: "data" -> aggregate metrics frame,
# "responses" -> dict of per-domain response frames.
DB: Dict[str, pd.DataFrame] = {}

# Filter dropdown columns for the Intra-Domain tab. Hoisted to module level
# so get_domain_state and plot_metrics_on_x_axis cannot drift apart (they
# previously each carried their own copy of this list).
FILTER_COLUMNS = ['reranker_model', 'chunking_strategy', 'summarization',
                  'repacking', 'gpt_label']


# --- 1. DATA PROCESSING FUNCTIONS ---

def analyze_domain_configs(df_subset):
    """Separates configuration columns into constants and variables for a domain.

    Args:
        df_subset: Rows of the aggregate metrics frame for a single domain.

    Returns:
        Tuple (constants, variables): `constants` maps each column holding a
        single value to that value; `variables` lists columns with more than
        one distinct value within the subset.
    """
    actual_cols = [c for c in df_subset.columns if c not in METADATA_COLUMNS]
    # Exclude any column containing 'failed' in the name
    actual_cols = [c for c in actual_cols if 'failed' not in c.lower()]

    constants = {}
    variables = []
    for col in actual_cols:
        # Compare as strings so mixed dtypes don't split identical values.
        unique_vals = df_subset[col].astype(str).unique()
        if len(unique_vals) <= 1:
            constants[col] = unique_vals[0] if len(unique_vals) > 0 else "N/A"
        else:
            variables.append(col)
    return constants, variables


def load_data() -> str:
    """Loads data from the configured data folder and responses folder.

    Populates DB["data"] and DB["responses"]; returns a status string for the
    UI (or the error text if loading fails).
    """
    try:
        # Load aggregate metrics data
        df, status_msg = load_csv_from_folder(DATA_FOLDER)
        if not df.empty:
            # Remove failed_samples column if it exists
            if 'failed_samples' in df.columns:
                df = df.drop(columns=['failed_samples'])
            DB["data"] = df

        # Load response data (distinct loop name: don't shadow `df` above)
        DB["responses"] = load_response_data()
        response_count = sum(len(resp) for resp in DB["responses"].values())

        return (f"{status_msg}\nLoaded {len(DB['responses'])} response "
                f"datasets with {response_count} total responses.")
    except Exception as e:
        return f"Error loading data: {str(e)}"


def load_response_data() -> Dict[str, pd.DataFrame]:
    """Load all response CSV files from responses folder."""
    responses_folder = Path("./responses")
    response_db = {}

    # Checkpoint file name -> display name used throughout the UI.
    domain_mapping = {
        'Biomedical_pubmedqa_checkpoint_100.csv': 'Biomedical (PubMedQA)',
        'Customer_Support_techqa_checkpoint_100.csv': 'Customer Support (TechQA)',
        'Finance_finqa_checkpoint_100.csv': 'Finance (FinQA)',
        'General_msmarco_checkpoint_100.csv': 'General (MS MARCO)',
        'Legal_cuad_checkpoint_100.csv': 'Legal (CUAD)'
    }

    for filename, domain_name in domain_mapping.items():
        filepath = responses_folder / filename
        if filepath.exists():
            df = pd.read_csv(filepath)
            # Convert metric columns to numeric; unparseable cells become 0.0
            for col in ['trace_relevance', 'trace_utilization',
                        'trace_completeness', 'trace_adherence']:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
            response_db[domain_name] = df
    return response_db


def get_questions_for_domain(domain):
    """Get list of questions for selected domain."""
    if "responses" not in DB or domain not in DB["responses"]:
        return gr.update(choices=[], value=None)
    df = DB["responses"][domain]
    questions = df['question'].unique().tolist()
    return gr.update(choices=questions, value=None)


def get_response_details(domain, question):
    """Get LLM answer, gold answer, and metrics for selected question.

    Returns:
        (llm_answer, gold_answer, fig) — empty strings and None when the
        domain or question cannot be resolved.
    """
    if "responses" not in DB or domain not in DB["responses"]:
        return "", "", None

    df = DB["responses"][domain]
    row = df[df['question'] == question]
    if row.empty:
        return "", "", None
    row = row.iloc[0]

    llm_answer = str(row.get('answer', 'N/A'))
    gold_answer = str(row.get('gold_answer', 'N/A'))

    # Create metrics visualization
    metrics_data = {
        'Metric': ['Relevance', 'Utilization', 'Completeness', 'Adherence'],
        'Score': [
            row.get('trace_relevance', 0.0),
            row.get('trace_utilization', 0.0),
            row.get('trace_completeness', 0.0),
            row.get('trace_adherence', 0.0)
        ]
    }
    metrics_df = pd.DataFrame(metrics_data)

    # Create bar chart (plain title string — it had no placeholders)
    fig = px.bar(
        metrics_df,
        x='Metric',
        y='Score',
        title='Quality Metrics for Selected Response',
        text_auto='.3f',
        color='Metric',
        range_y=[0, 1]
    )
    fig.update_traces(textposition='outside')

    return llm_answer, gold_answer, fig


# --- 2. UI LOGIC ---

def get_dataset_choices():
    """Safely retrieves dataset choices for dropdown."""
    try:
        if "data" in DB and not DB["data"].empty:
            return get_available_datasets(DB["data"])
        return []
    except Exception as e:
        print(f"Error getting dataset choices: {e}")
        return []


def get_data_preview():
    """Returns separate dataframes for each domain with columns reordered by type.

    One frame per domain in `domain_order`; columns are arranged
    Metadata -> Constants -> Variables -> Results -> leftovers.
    """
    if "data" not in DB:
        # FIX: return empty DataFrames (previously empty dicts) so the five
        # gr.Dataframe outputs always receive a consistent type.
        empty = pd.DataFrame()
        return empty, empty, empty, empty, empty

    df = DB["data"].copy()

    # Remove failed_samples related columns
    columns_to_remove = ['failed_samples', '# Failed/Total Samples',
                         'failedsamples', '%_failed_sample']
    for col in columns_to_remove:
        if col in df.columns:
            df = df.drop(columns=[col])

    # Define explicit domain order matching the UI
    domain_order = ['pubmedqa', 'techqa', 'finqa', 'msmarco', 'cuad']
    # Metric columns (Results)
    result_cols = ['rmse_relevance', 'rmse_utilization', 'rmse_completeness',
                   'f1_score', 'aucroc']
    metadata_cols = ['test_id', 'config_purpose', 'dataset_name']

    domain_dfs = []
    for ds in domain_order:
        domain_df = df[df['dataset_name'] == ds].copy()
        if domain_df.empty:
            domain_dfs.append(pd.DataFrame())
            continue

        # Analyze constants and variables
        consts, variables = analyze_domain_configs(domain_df)

        # Reorder columns: Metadata -> Constants -> Variables -> Results
        ordered_cols = [col for col in metadata_cols if col in domain_df.columns]
        # Constant columns (sorted), then variable columns (sorted)
        ordered_cols.extend(sorted(c for c in consts if c in domain_df.columns))
        ordered_cols.extend(sorted(c for c in variables if c in domain_df.columns))
        # Result columns in their canonical order
        ordered_cols.extend(c for c in result_cols if c in domain_df.columns)
        # Add any remaining columns
        ordered_cols.extend(c for c in domain_df.columns if c not in ordered_cols)

        domain_dfs.append(domain_df[ordered_cols])

    return domain_dfs[0], domain_dfs[1], domain_dfs[2], domain_dfs[3], domain_dfs[4]


def get_domain_state(dataset):
    """Build the constants summary and the five filter-dropdown updates for a domain."""
    empty_update = gr.update(visible=False, value=None, choices=[])
    if "data" not in DB:
        return ("", empty_update, empty_update, empty_update,
                empty_update, empty_update)

    df = DB["data"]
    subset = df[df['dataset_name'] == dataset]
    if subset.empty:
        return ("No data for this domain.", empty_update, empty_update,
                empty_update, empty_update, empty_update)

    consts, _ = analyze_domain_configs(subset)
    const_text = "CONSTANTS (Fixed for this domain):\n" + "\n".join(
        [f"{k}: {v}" for k, v in consts.items()])

    updates = []
    # Module-level FILTER_COLUMNS keeps this in sync with the plot function.
    for col_name in FILTER_COLUMNS:
        if col_name in subset.columns:
            unique_choices = list(subset[col_name].astype(str).unique())
            unique_choices.insert(0, "All")
            updates.append(gr.update(
                label=f"Filter by {col_name}",
                choices=unique_choices,
                value="All",
                visible=True,
                interactive=True
            ))
        else:
            updates.append(empty_update)

    return const_text, updates[0], updates[1], updates[2], updates[3], updates[4]


def plot_metrics_on_x_axis(dataset, f1_val, f2_val, f3_val, f4_val, f5_val):
    """Generates RMSE and Performance metric plots for selected domain and filters.

    Returns:
        (fig_rmse, fig_perf) — either may be None when the relevant columns
        are absent or no rows survive the filters.
    """
    if "data" not in DB or not dataset:
        return None, None

    try:
        df = DB["data"]
        subset = df[df['dataset_name'] == dataset].copy()
    except Exception as e:
        print(f"Error accessing data: {e}")
        return None, None

    # Apply each active filter ("All" / None means no restriction).
    filters = [f1_val, f2_val, f3_val, f4_val, f5_val]
    for i, val in enumerate(filters):
        if i < len(FILTER_COLUMNS) and val != "All" and val is not None:
            col = FILTER_COLUMNS[i]
            if col in subset.columns:
                subset = subset[subset[col].astype(str) == str(val)].copy()

    if subset.empty:
        return None, None

    # Reset index to avoid any index-related issues
    subset = subset.reset_index(drop=True)

    # Legend label; test_id cast to str to prevent concatenation errors.
    subset['Legend'] = ("Test " + subset['test_id'].astype(str) + ": "
                        + subset['config_purpose'].astype(str))

    # --- PLOT 1: RMSE ---
    rmse_cols = ['rmse_relevance', 'rmse_utilization', 'rmse_completeness']
    available_rmse = [c for c in rmse_cols if c in subset.columns]

    if available_rmse:
        rmse_melted = subset.melt(
            id_vars=['Legend', 'test_id'],
            value_vars=available_rmse,
            var_name='Metric Name',
            value_name='Score'
        )
        # Explicitly ensure Score is numeric float
        rmse_melted['Score'] = pd.to_numeric(
            rmse_melted['Score'], errors='coerce').fillna(0.0).astype(float)
        rmse_melted['Metric Name'] = (rmse_melted['Metric Name']
                                      .str.replace('rmse_', '')
                                      .str.capitalize())
        rmse_melted = rmse_melted.reset_index(drop=True)
        # (Leftover [DEBUG] prints removed.)

        fig_rmse = px.bar(
            rmse_melted,
            x="Metric Name",
            y="Score",
            color="Legend",
            barmode="group",
            title=f"RMSE Breakdown (Lower is Better) - {len(subset)} Tests",
            text_auto='.3f'
        )
        fig_rmse.update_traces(textposition='outside')
        fig_rmse.update_layout(legend=dict(orientation="h", yanchor="bottom",
                                           y=1.02, xanchor="right", x=1))
    else:
        fig_rmse = None

    # --- PLOT 2: Performance ---
    perf_cols = ['f1_score', 'aucroc']
    available_perf = [c for c in perf_cols if c in subset.columns]

    if available_perf:
        perf_melted = subset.melt(
            id_vars=['Legend', 'test_id'],
            value_vars=available_perf,
            var_name='Metric Name',
            value_name='Score'
        )
        # Explicitly ensure Score is numeric float
        perf_melted['Score'] = pd.to_numeric(
            perf_melted['Score'], errors='coerce').fillna(0.0).astype(float)
        perf_melted['Metric Name'] = perf_melted['Metric Name'].replace({
            'f1_score': 'F1 Score',
            'aucroc': 'AUC-ROC'
        })
        perf_melted = perf_melted.reset_index(drop=True)

        fig_perf = px.bar(
            perf_melted,
            x="Metric Name",
            y="Score",
            color="Legend",
            barmode="group",
            title=f"Performance Metrics (Higher is Better) - {len(subset)} Tests",
            text_auto='.3f'
        )
        fig_perf.update_traces(textposition='outside')
        fig_perf.update_layout(legend=dict(orientation="h", yanchor="bottom",
                                           y=1.02, xanchor="right", x=1))
    else:
        fig_perf = None

    return fig_rmse, fig_perf


def generate_inter_domain_comparison(metric='f1_score'):
    """Generates comparison table and plot across all domains for selected metric.

    Returns:
        (comp_df, fig_global): table of constant configuration parameters per
        domain ("Variable" marks parameters that vary within a domain), and a
        bar chart of each domain's best value for `metric` (None when the
        metric column is missing everywhere).
    """
    if "data" not in DB:
        return pd.DataFrame(), None

    try:
        df = DB["data"]
    except Exception as e:
        print(f"Error accessing data: {e}")
        return pd.DataFrame(), None

    datasets = df['dataset_name'].unique()

    # Collect each domain's constant configuration parameters.
    all_keys = set()
    domain_constants = {}
    for ds in datasets:
        subset = df[df['dataset_name'] == ds]
        consts, _ = analyze_domain_configs(subset)
        domain_constants[ds] = consts
        all_keys.update(consts.keys())

    # Exclude failed_samples and other unwanted columns
    EXCLUDE_COLUMNS = ['failed_samples', 'failedsamples', '%_failed_sample']
    all_keys = {k for k in all_keys
                if k not in EXCLUDE_COLUMNS and 'failed' not in k.lower()}

    table_rows = []
    for key in sorted(all_keys):
        row = {"Configuration Parameter": key}
        for ds in datasets:
            row[ds] = domain_constants[ds].get(key, "Variable")
        table_rows.append(row)
    comp_df = pd.DataFrame(table_rows)

    # Metric display names
    metric_names = {
        'rmse_relevance': 'RMSE Relevance',
        'rmse_utilization': 'RMSE Utilization',
        'rmse_completeness': 'RMSE Completeness',
        'f1_score': 'F1 Score',
        'aucroc': 'AUC-ROC'
    }
    metric_display = metric_names.get(metric, metric)
    is_rmse = metric.startswith('rmse')
    direction = "Lower is Better" if is_rmse else "Higher is Better"

    best_results = []
    for ds in datasets:
        subset = df[df['dataset_name'] == ds]
        if metric in subset.columns:
            # RMSE metrics: lower is better; otherwise higher is better.
            if is_rmse:
                best_val = subset[metric].min()
                best_idx = subset[metric].idxmin()
            else:
                best_val = subset[metric].max()
                best_idx = subset[metric].idxmax()
            best_row = subset.loc[best_idx]
            best_results.append({
                "Domain": ds,
                metric_display: best_val,
                "Best Config": best_row['config_purpose']
            })

    if best_results:
        best_df = pd.DataFrame(best_results)
        fig_global = px.bar(
            best_df,
            x="Domain",
            y=metric_display,
            color="Domain",
            text_auto='.4f',
            hover_data=["Best Config"],
            title=f"Peak Performance per Domain: {metric_display} ({direction})"
        )
        fig_global.update_traces(textposition='outside')
    else:
        fig_global = None

    return comp_df, fig_global
# --- 3. UI ---

APP_VERSION = "v2.2.0"

# Global constants used across all experiments (rendered as markdown).
GLOBAL_CONSTANTS = """
**Global Constants (Applied to All Domains):**
- Generator Model: **llama-3.1-8b-instant**
- Generator Max Tokens: **512**
- Generator Temperature: **0.2**
- Generator API Provider: **Groq**
- Generation LLM Context Budget: **2000**
- Judge Model: **llama-3.3-70b-versatile**
- Judge Max Tokens: **1024**
- Judge Temperature: **0.0**
- Judge Sentence Attribution: **ENABLED**
- Summarization Model: **fangyuan/nq_abstractive_compressor**
"""

with gr.Blocks(title="RAG Analytics Pro") as demo:
    gr.Markdown("## RAG Pipeline Analytics")
    gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")

    with gr.Accordion("Global Experiment Configuration", open=False):
        gr.Markdown(GLOBAL_CONSTANTS)

    with gr.Row():
        refresh_data_btn = gr.Button("Load/Refresh Data", variant="primary")
        status = gr.Textbox(label="Status (Check here for debug info)",
                            interactive=False, scale=3)

    with gr.Tabs():
        # TAB 1: Main Analytics
        with gr.TabItem("Intra-Domain Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    ds_dropdown = gr.Dropdown(label="1. Select Domain",
                                              choices=[], interactive=True)
                    constants_box = gr.Textbox(label="Domain Constants",
                                               lines=5, interactive=False)
                    gr.Markdown("### Filter Tests")
                    filter_1 = gr.Dropdown(visible=False)
                    filter_2 = gr.Dropdown(visible=False)
                    filter_3 = gr.Dropdown(visible=False)
                    filter_4 = gr.Dropdown(visible=False)
                    filter_5 = gr.Dropdown(visible=False)
                with gr.Column(scale=3):
                    plot_r = gr.Plot(label="RMSE Comparison")
                    plot_p = gr.Plot(label="Performance Comparison")

        # TAB 2: Data Inspector
        with gr.TabItem("Data Preview"):
            gr.Markdown("### All Test Configurations by Domain")
            gr.Markdown("**Biomedical (PubMedQA)**")
            preview_table_1 = gr.Dataframe(interactive=False, wrap=True)
            gr.Markdown("**Customer Support (TechQA)**")
            preview_table_2 = gr.Dataframe(interactive=False, wrap=True)
            gr.Markdown("**Finance (FinQA)**")
            preview_table_3 = gr.Dataframe(interactive=False, wrap=True)
            gr.Markdown("**General (MS MARCO)**")
            preview_table_4 = gr.Dataframe(interactive=False, wrap=True)
            gr.Markdown("**Legal (CUAD)**")
            preview_table_5 = gr.Dataframe(interactive=False, wrap=True)
            preview_btn = gr.Button("Refresh Data Preview")

        # TAB 3: Comparison
        with gr.TabItem("Inter-Domain Comparison"):
            gr.Markdown("### Select Metric to Compare")
            metric_dropdown = gr.Dropdown(
                label="Comparison Metric",
                choices=[
                    ("F1 Score (Higher is Better)", "f1_score"),
                    ("AUC-ROC (Higher is Better)", "aucroc"),
                    ("RMSE Relevance (Lower is Better)", "rmse_relevance"),
                    ("RMSE Utilization (Lower is Better)", "rmse_utilization"),
                    ("RMSE Completeness (Lower is Better)", "rmse_completeness")
                ],
                value="f1_score",
                interactive=True
            )
            refresh_btn = gr.Button("Generate Comparison")
            gr.Markdown("### Configuration Differences")
            comp_table = gr.Dataframe(interactive=False)
            gr.Markdown("### Peak Performance")
            global_plot = gr.Plot()

        # TAB 4: Response Preview & Metrics
        with gr.TabItem("Response Preview & Metrics"):
            gr.Markdown("### Preview LLM Responses and Quality Metrics")
            gr.Markdown("Select a domain and question to view the generated answer, gold answer, and quality metrics.")
            with gr.Row():
                with gr.Column(scale=1):
                    domain_selector = gr.Dropdown(
                        label="Select Domain",
                        choices=[
                            'Biomedical (PubMedQA)',
                            'Customer Support (TechQA)',
                            'Finance (FinQA)',
                            'General (MS MARCO)',
                            'Legal (CUAD)'
                        ],
                        interactive=True
                    )
                    question_selector = gr.Dropdown(
                        label="Select Question",
                        choices=[],
                        interactive=True
                    )
                with gr.Column(scale=2):
                    metrics_plot = gr.Plot(label="Quality Metrics")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### LLM Generated Answer")
                    llm_answer_box = gr.Textbox(
                        label="LLM Answer",
                        lines=12,
                        interactive=False
                    )
                with gr.Column():
                    gr.Markdown("#### Gold Standard Answer")
                    gold_answer_box = gr.Textbox(
                        label="Gold Answer",
                        lines=12,
                        interactive=False
                    )

    # EVENTS
    # Reload data, then repopulate the domain dropdown from the fresh DB.
    refresh_data_btn.click(
        load_data,
        inputs=None,
        outputs=[status]
    ).then(
        lambda: gr.Dropdown(choices=get_dataset_choices()),
        outputs=[ds_dropdown]
    )

    # Selecting a domain refreshes constants + filters, then redraws plots.
    ds_dropdown.change(
        get_domain_state,
        inputs=[ds_dropdown],
        outputs=[constants_box, filter_1, filter_2, filter_3, filter_4, filter_5]
    ).then(
        plot_metrics_on_x_axis,
        inputs=[ds_dropdown, filter_1, filter_2, filter_3, filter_4, filter_5],
        outputs=[plot_r, plot_p]
    )

    # Any filter change redraws both plots.
    gr.on(
        triggers=[filter_1.change, filter_2.change, filter_3.change,
                  filter_4.change, filter_5.change],
        fn=plot_metrics_on_x_axis,
        inputs=[ds_dropdown, filter_1, filter_2, filter_3, filter_4, filter_5],
        outputs=[plot_r, plot_p]
    )

    # Debug Preview Events
    preview_btn.click(
        get_data_preview,
        inputs=None,
        outputs=[preview_table_1, preview_table_2, preview_table_3,
                 preview_table_4, preview_table_5]
    )

    refresh_btn.click(
        generate_inter_domain_comparison,
        inputs=[metric_dropdown],
        outputs=[comp_table, global_plot]
    )

    # Response Preview Events: changing domain repopulates questions and
    # clears the previous answer/metrics display.
    domain_selector.change(
        fn=get_questions_for_domain,
        inputs=[domain_selector],
        outputs=[question_selector]
    ).then(
        fn=lambda: ("", "", None),
        outputs=[llm_answer_box, gold_answer_box, metrics_plot]
    )

    question_selector.change(
        fn=get_response_details,
        inputs=[domain_selector, question_selector],
        outputs=[llm_answer_box, gold_answer_box, metrics_plot]
    )

# Auto-load data on startup
print(f"Loading data from {DATA_FOLDER}...")
startup_status = load_data()
print(startup_status)

# Launch Gradio app (for Hugging Face Spaces, this runs on import)
demo.launch(ssr_mode=False)