Spaces:
Sleeping
Sleeping
updated files
Browse files
app.py
CHANGED
|
@@ -2,6 +2,7 @@ import pandas as pd
|
|
| 2 |
import gradio as gr
|
| 3 |
import plotly.express as px
|
| 4 |
from typing import Dict
|
|
|
|
| 5 |
|
| 6 |
from config import METADATA_COLUMNS, DATA_FOLDER
|
| 7 |
from data_loader import load_csv_from_folder, get_available_datasets
|
|
@@ -30,18 +31,101 @@ def analyze_domain_configs(df_subset):
|
|
| 30 |
return constants, variables
|
| 31 |
|
| 32 |
def load_data() -> str:
|
| 33 |
-
"""Loads data from the configured data folder."""
|
| 34 |
try:
|
|
|
|
| 35 |
df, status_msg = load_csv_from_folder(DATA_FOLDER)
|
| 36 |
if not df.empty:
|
| 37 |
# Remove failed_samples column if it exists
|
| 38 |
if 'failed_samples' in df.columns:
|
| 39 |
df = df.drop(columns=['failed_samples'])
|
| 40 |
DB["data"] = df
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
except Exception as e:
|
| 43 |
return f"Error loading data: {str(e)}"
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
# --- 2. UI LOGIC ---
|
| 46 |
|
| 47 |
def get_dataset_choices():
|
|
@@ -380,6 +464,52 @@ with gr.Blocks(title="RAG Analytics Pro") as demo:
|
|
| 380 |
gr.Markdown("### Peak Performance")
|
| 381 |
global_plot = gr.Plot()
|
| 382 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
# EVENTS
|
| 384 |
refresh_data_btn.click(
|
| 385 |
load_data, inputs=None, outputs=[status]
|
|
@@ -413,6 +543,22 @@ with gr.Blocks(title="RAG Analytics Pro") as demo:
|
|
| 413 |
inputs=[metric_dropdown],
|
| 414 |
outputs=[comp_table, global_plot]
|
| 415 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
# Auto-load data on startup
|
| 418 |
print(f"Loading data from {DATA_FOLDER}...")
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import plotly.express as px
|
| 4 |
from typing import Dict
|
| 5 |
+
from pathlib import Path
|
| 6 |
|
| 7 |
from config import METADATA_COLUMNS, DATA_FOLDER
|
| 8 |
from data_loader import load_csv_from_folder, get_available_datasets
|
|
|
|
| 31 |
return constants, variables
|
| 32 |
|
| 33 |
def load_data() -> str:
    """Load the aggregate metrics table and per-domain responses into DB.

    Runs as a Gradio event handler, so failures are reported in the returned
    status string instead of being raised.

    Returns:
        A human-readable status message describing what was loaded.
    """
    try:
        # Aggregate metrics table from the configured data folder.
        df, status_msg = load_csv_from_folder(DATA_FOLDER)
        if not df.empty:
            # failed_samples is run bookkeeping, not a metric — drop it.
            if 'failed_samples' in df.columns:
                df = df.drop(columns=['failed_samples'])
            DB["data"] = df

        # Per-domain response CSVs.
        responses = load_response_data()
        DB["responses"] = responses
        response_count = sum(len(frame) for frame in responses.values())

        return (
            f"{status_msg}\nLoaded {len(DB['responses'])} response datasets "
            f"with {response_count} total responses."
        )
    except Exception as e:
        return f"Error loading data: {str(e)}"
|
| 51 |
|
| 52 |
+
def load_response_data(responses_folder="./responses") -> Dict[str, pd.DataFrame]:
    """Load per-domain response CSV files into a dict of DataFrames.

    Args:
        responses_folder: Directory containing the checkpoint CSV files.
            Defaults to ``"./responses"`` (backward compatible with the
            previous hard-coded path).

    Returns:
        Mapping of human-readable domain name -> DataFrame of responses.
        Files missing on disk are silently skipped, so a partial deployment
        still loads whatever is available.
    """
    responses_folder = Path(responses_folder)
    response_db = {}

    # Maps on-disk checkpoint filenames to the display names used in the UI
    # dropdown (must stay in sync with the Response Preview tab's choices).
    domain_mapping = {
        'Biomedical_pubmedqa_checkpoint_100.csv': 'Biomedical (PubMedQA)',
        'Customer_Support_techqa_checkpoint_100.csv': 'Customer Support (TechQA)',
        'Finance_finqa_checkpoint_100.csv': 'Finance (FinQA)',
        'General_msmarco_checkpoint_100.csv': 'General (MS MARCO)',
        'Legal_cuad_checkpoint_100.csv': 'Legal (CUAD)'
    }

    # Quality-score columns that may arrive as text in the CSVs.
    metric_columns = (
        'trace_relevance', 'trace_utilization',
        'trace_completeness', 'trace_adherence',
    )

    for filename, domain_name in domain_mapping.items():
        filepath = responses_folder / filename
        if filepath.exists():
            df = pd.read_csv(filepath)
            # Coerce metrics to numeric; unparseable cells become 0.0 so
            # downstream plots never receive NaN.
            for col in metric_columns:
                if col in df.columns:
                    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)
            response_db[domain_name] = df

    return response_db
|
| 76 |
+
|
| 77 |
+
def get_questions_for_domain(domain):
    """Populate the question dropdown for the selected domain.

    Returns a Gradio component update whose ``choices`` are the domain's
    unique questions (empty when the domain has no loaded responses).

    NOTE(review): the previous version returned a bare list; Gradio
    interprets a list returned to a Dropdown output as the dropdown *value*,
    not its choices, so the question selector never populated. Returning
    ``gr.update(choices=...)`` fixes the wiring at
    ``domain_selector.change(..., outputs=[question_selector])``.
    """
    if "responses" not in DB or domain not in DB["responses"]:
        return gr.update(choices=[], value=None)

    questions = DB["responses"][domain]['question'].unique().tolist()
    return gr.update(choices=questions, value=None)
|
| 85 |
+
|
| 86 |
+
def get_response_details(domain, question):
    """Fetch the LLM answer, gold answer, and a metrics chart for one question.

    Args:
        domain: Display name of the domain dataset (key into DB["responses"]).
        question: Exact question text to look up.

    Returns:
        Tuple of (llm_answer, gold_answer, plotly_figure). Returns
        ("", "", None) when the domain is not loaded or the question
        is not found.
    """
    if "responses" not in DB or domain not in DB["responses"]:
        return "", "", None

    matches = DB["responses"][domain]
    matches = matches[matches['question'] == question]
    if matches.empty:
        return "", "", None

    record = matches.iloc[0]
    llm_answer = str(record.get('answer', 'N/A'))
    gold_answer = str(record.get('gold_answer', 'N/A'))

    # Bar chart of the four trace quality scores for this response.
    metric_labels = ['Relevance', 'Utilization', 'Completeness', 'Adherence']
    metric_scores = [
        record.get(f'trace_{label.lower()}', 0.0) for label in metric_labels
    ]
    metrics_df = pd.DataFrame({'Metric': metric_labels, 'Score': metric_scores})

    fig = px.bar(
        metrics_df,
        x='Metric',
        y='Score',
        title='Quality Metrics for Selected Response',
        text_auto='.3f',
        color='Metric',
        range_y=[0, 1],
    )
    fig.update_traces(textposition='outside')

    return llm_answer, gold_answer, fig
|
| 128 |
+
|
| 129 |
# --- 2. UI LOGIC ---
|
| 130 |
|
| 131 |
def get_dataset_choices():
|
|
|
|
| 464 |
gr.Markdown("### Peak Performance")
|
| 465 |
global_plot = gr.Plot()
|
| 466 |
|
| 467 |
+
# TAB 4: Response Preview & Metrics
|
| 468 |
+
with gr.TabItem("Response Preview & Metrics"):
|
| 469 |
+
gr.Markdown("### Preview LLM Responses and Quality Metrics")
|
| 470 |
+
gr.Markdown("Select a domain and question to view the generated answer, gold answer, and quality metrics.")
|
| 471 |
+
|
| 472 |
+
with gr.Row():
|
| 473 |
+
with gr.Column(scale=1):
|
| 474 |
+
domain_selector = gr.Dropdown(
|
| 475 |
+
label="Select Domain",
|
| 476 |
+
choices=[
|
| 477 |
+
'Biomedical (PubMedQA)',
|
| 478 |
+
'Customer Support (TechQA)',
|
| 479 |
+
'Finance (FinQA)',
|
| 480 |
+
'General (MS MARCO)',
|
| 481 |
+
'Legal (CUAD)'
|
| 482 |
+
],
|
| 483 |
+
interactive=True
|
| 484 |
+
)
|
| 485 |
+
question_selector = gr.Dropdown(
|
| 486 |
+
label="Select Question",
|
| 487 |
+
choices=[],
|
| 488 |
+
interactive=True
|
| 489 |
+
)
|
| 490 |
+
|
| 491 |
+
with gr.Column(scale=2):
|
| 492 |
+
metrics_plot = gr.Plot(label="Quality Metrics")
|
| 493 |
+
|
| 494 |
+
with gr.Row():
|
| 495 |
+
with gr.Column():
|
| 496 |
+
gr.Markdown("#### LLM Generated Answer")
|
| 497 |
+
llm_answer_box = gr.Textbox(
|
| 498 |
+
label="LLM Answer",
|
| 499 |
+
lines=12,
|
| 500 |
+
interactive=False,
|
| 501 |
+
show_copy_button=True
|
| 502 |
+
)
|
| 503 |
+
|
| 504 |
+
with gr.Column():
|
| 505 |
+
gr.Markdown("#### Gold Standard Answer")
|
| 506 |
+
gold_answer_box = gr.Textbox(
|
| 507 |
+
label="Gold Answer",
|
| 508 |
+
lines=12,
|
| 509 |
+
interactive=False,
|
| 510 |
+
show_copy_button=True
|
| 511 |
+
)
|
| 512 |
+
|
| 513 |
# EVENTS
|
| 514 |
refresh_data_btn.click(
|
| 515 |
load_data, inputs=None, outputs=[status]
|
|
|
|
| 543 |
inputs=[metric_dropdown],
|
| 544 |
outputs=[comp_table, global_plot]
|
| 545 |
)
|
| 546 |
+
|
| 547 |
+
# Response Preview Events
|
| 548 |
+
domain_selector.change(
|
| 549 |
+
fn=get_questions_for_domain,
|
| 550 |
+
inputs=[domain_selector],
|
| 551 |
+
outputs=[question_selector]
|
| 552 |
+
).then(
|
| 553 |
+
fn=lambda: ("", "", None),
|
| 554 |
+
outputs=[llm_answer_box, gold_answer_box, metrics_plot]
|
| 555 |
+
)
|
| 556 |
+
|
| 557 |
+
question_selector.change(
|
| 558 |
+
fn=get_response_details,
|
| 559 |
+
inputs=[domain_selector, question_selector],
|
| 560 |
+
outputs=[llm_answer_box, gold_answer_box, metrics_plot]
|
| 561 |
+
)
|
| 562 |
|
| 563 |
# Auto-load data on startup
|
| 564 |
print(f"Loading data from {DATA_FOLDER}...")
|
responses/Biomedical_pubmedqa_checkpoint_100.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
responses/Customer_Support_techqa_checkpoint_100.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
responses/Finance_finqa_checkpoint_100.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
responses/General_msmarco_checkpoint_100.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
responses/Legal_cuad_checkpoint_100.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|