# app.py — Agentic Evaluation Framework Gradio Space (upstream commit b16d000)
import os
import tempfile
import pandas as pd
import gradio as gr
from evaluator import evaluate_dataframe
from synthetic import generate_synthetic_dataset
# Helper: resolve a Gradio upload to a filesystem path.
def save_uploaded(file_obj):
    """Return a filesystem path for a Gradio upload, or ``None`` if empty.

    Handles the three shapes Gradio may hand us, depending on version:
    a plain filepath string, a file-like wrapper exposing ``.name``, or an
    in-memory object that only supports ``.read()`` (in which case the bytes
    are persisted to a temporary file whose path is returned).
    """
    if not file_obj:
        return None
    # Newer Gradio versions pass the temp filepath directly as a string.
    if isinstance(file_obj, str):
        return file_obj
    try:
        # Typical case: a tempfile-backed wrapper exposing .name.
        return file_obj.name
    except Exception:
        # Fallback for Gradio versions that supply an in-memory object:
        # persist its bytes to a temp file and return that path.
        data = file_obj.read()
        suffix = ".csv" if getattr(file_obj, "name", "").endswith(".csv") else ".json"
        fd, tmp = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        return tmp
def load_file(path):
    """Load the dataset at *path* into a DataFrame.

    CSV is detected by extension; any other file is treated as JSON,
    trying JSON-Lines first (the common export format) and falling back
    to a regular JSON document. Returns ``None`` when *path* is ``None``.

    Raises whatever pandas raises on an unparseable file — the previous
    ``except Exception as e: raise e`` wrapper was a no-op and is removed.
    """
    if path is None:
        return None
    if path.endswith(".csv"):
        return pd.read_csv(path)
    try:
        # One JSON record per line (JSONL).
        return pd.read_json(path, lines=True)
    except ValueError:
        # Regular JSON array/object document.
        return pd.read_json(path)
def run_eval_and_build_outputs(file_obj, use_llm_judge):
    """Run the evaluation pipeline and assemble all UI outputs.

    Parameters
    ----------
    file_obj : Gradio upload (file object or path) or ``None`` to fall back
        to the built-in synthetic demo dataset.
    use_llm_judge : bool forwarded to the evaluator to enable the (heavier)
        LLM hallucination judge.

    Returns
    -------
    tuple
        ``(metrics_df, leaderboard_df, csv_path, image_items)`` where
        ``image_items`` is a list of ``(path, caption)`` pairs for the gallery.
    """
    # Load the data: the uploaded file if given, otherwise a synthetic demo set.
    if file_obj is None:
        df = generate_synthetic_dataset(num_agents=4, num_samples=24)
    else:
        path = save_uploaded(file_obj)
        df = load_file(path)

    # Normalize column names to the lowercase schema the evaluator expects.
    cols = {c.lower(): c for c in df.columns}
    rename_map = {}
    for k in ["prompt", "response", "task", "agent", "reference", "instruction"]:
        if k in cols and cols[k] != k:
            rename_map[cols[k]] = k
    if "prompt" in cols and "instruction" not in cols:
        # Treat `prompt` as an alias for `instruction` when the latter is absent.
        rename_map[cols["prompt"]] = "instruction"
    if rename_map:
        df = df.rename(columns=rename_map)

    # Run the evaluator.
    metrics_df, images, leaderboard_df = evaluate_dataframe(df, use_llm_judge=use_llm_judge)

    # Persist the per-example metrics for download. Use the platform temp
    # directory rather than a hard-coded "/tmp" for portability.
    out_csv = os.path.join(tempfile.gettempdir(), "eval_results.csv")
    metrics_df.to_csv(out_csv, index=False)

    # Keep only images the evaluator actually wrote to disk.
    image_items = [(p, caption) for p, caption in images if os.path.exists(p)]
    return metrics_df, leaderboard_df, out_csv, image_items
# Build Gradio UI
with gr.Blocks(title="Agentic Evaluation Framework") as demo:
    gr.Markdown("# 🤖 Agentic Evaluation Framework")
    with gr.Tab("Data Preview / Upload"):
        gr.Markdown("Upload a CSV/JSON/JSONL file with columns `prompt`/`instruction`, `response`, `task`, `agent`, `reference` (optional).")
        file_input = gr.File(label="Upload dataset (CSV/JSON/JSONL) — optional", file_types=[".csv", ".json", ".jsonl"])
        run_btn_preview = gr.Button("Run evaluation (use demo if no file)")
    with gr.Tab("Run Evaluation"):
        use_llm = gr.Checkbox(label="Use LLM Judge (Hallucination Detector) — may be heavy", value=False)
        run_button = gr.Button("Evaluate")
        status = gr.Textbox(label="Status", interactive=False)
        # outputs
        metrics_table = gr.Dataframe(label="Per-example Metrics", interactive=False)
        leaderboard_table = gr.Dataframe(label="Leaderboard (Agent, Task, Avg final score)", interactive=False)
        download_csv = gr.File(label="Download CSV report")
        # Visualization gallery
        gallery = gr.Gallery(label="Generated Visualizations", columns=2, height="auto")

        def on_run(file_obj, use_llm_flag):
            """Gradio callback: run the evaluation and map results onto the outputs.

            Returns (status, metrics, leaderboard, csv_path, gallery); on
            failure the error text goes to `status` and the rest are cleared.
            """
            try:
                metrics_df, leaderboard_df, out_csv, images = run_eval_and_build_outputs(file_obj, use_llm_flag)
                gallery_list = [(path, caption) for path, caption in images]
                return "Evaluation complete.", metrics_df, leaderboard_df, out_csv, gallery_list
            except Exception as e:
                return f"Error: {str(e)}", None, None, None, []

        _eval_outputs = [status, metrics_table, leaderboard_table, download_csv, gallery]
        run_button.click(fn=on_run, inputs=[file_input, use_llm], outputs=_eval_outputs)

    # BUGFIX: the preview-tab button was created but never wired to a handler,
    # so clicking it did nothing. Route it through the same evaluation callback.
    run_btn_preview.click(fn=on_run, inputs=[file_input, use_llm], outputs=_eval_outputs)

    with gr.Tab("Usage & Notes"):
        gr.Markdown(
            "- Use the LLM Judge only if you understand the memory cost. If the detector models cannot be loaded, the app will fall back to neutral placeholders and continue.\n"
            "- Visualizations are saved in `/tmp` and displayed in the Gallery; each image is downloadable via right-click or the download button for CSV.\n"
            "- If you want a lighter footprint, disable the LLM Judge toggle.\n"
        )
demo.launch()