|
|
import os |
|
|
import tempfile |
|
|
import pandas as pd |
|
|
import gradio as gr |
|
|
from evaluator import evaluate_dataframe |
|
|
from synthetic import generate_synthetic_dataset |
|
|
|
|
|
|
|
|
def save_uploaded(file_obj):
    """Persist an uploaded file and return a filesystem path to it.

    Gradio usually hands over an object that already lives on disk (it
    exposes a ``.name`` attribute pointing at a temp file); in that case the
    path is returned directly.  Plain file-like objects without ``.name``
    are spilled to a fresh temp file instead.

    Args:
        file_obj: Upload object from Gradio, a raw file-like object, or None.

    Returns:
        Path (str) to a readable file on disk, or None when nothing was
        uploaded.
    """
    if not file_obj:
        return None
    try:
        # Fast path: Gradio's upload wrapper exposes the on-disk temp path.
        return file_obj.name
    except AttributeError:
        # Fallback for bare file-like objects: copy bytes to a temp file.
        # (Narrowed from `except Exception` — only a missing `.name`
        # attribute should trigger this path; other errors must propagate.)
        data = file_obj.read()
        # `.name` is absent here, so getattr yields "" and the suffix
        # defaults to ".json"; kept for parity with the original logic.
        suffix = ".csv" if getattr(file_obj, "name", "").endswith(".csv") else ".json"
        fd, tmp = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        return tmp
|
|
|
|
|
def load_file(path):
    """Load a CSV, JSON, or JSON-lines file into a DataFrame.

    Args:
        path: Filesystem path to the dataset, or None.

    Returns:
        pandas.DataFrame with the file contents, or None when ``path`` is
        None.

    Raises:
        Whatever pandas raises for unreadable or malformed files.  (The
        original wrapped everything in ``except Exception as e: raise e``,
        a no-op that only cluttered the traceback — removed.)
    """
    if path is None:
        return None
    if path.endswith(".csv"):
        return pd.read_csv(path)
    # Try JSON-lines first (one record per line), then fall back to a
    # regular JSON document.
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        return pd.read_json(path)
|
|
|
|
|
def run_eval_and_build_outputs(file_obj, use_llm_judge):
    """Run the evaluation pipeline and assemble Gradio-ready outputs.

    Args:
        file_obj: Uploaded dataset file (or None to use a synthetic demo
            dataset).
        use_llm_judge: Whether to enable the (heavier) LLM-judge metric in
            the evaluator.

    Returns:
        Tuple of (per-example metrics DataFrame, leaderboard DataFrame,
        path to the CSV report, list of (image_path, caption) pairs).
    """
    # Fall back to a small synthetic demo dataset when nothing was uploaded.
    if file_obj is None:
        df = generate_synthetic_dataset(num_agents=4, num_samples=24)
    else:
        path = save_uploaded(file_obj)
        df = load_file(path)

    # Normalize column names case-insensitively to the schema the evaluator
    # expects (lower-case canonical names).
    cols = {c.lower(): c for c in df.columns}
    rename_map = {}
    for k in ["prompt", "response", "task", "agent", "reference", "instruction"]:
        if k in cols and cols[k] != k:
            rename_map[cols[k]] = k
    # Treat `prompt` as an alias for `instruction` when the latter is absent
    # (overrides any case-normalization mapping for the prompt column).
    if "prompt" in cols and "instruction" not in cols:
        rename_map[cols["prompt"]] = "instruction"
    if rename_map:
        df = df.rename(columns=rename_map)

    metrics_df, images, leaderboard_df = evaluate_dataframe(df, use_llm_judge=use_llm_judge)

    # Write the CSV report to the platform temp directory (the original
    # hard-coded "/tmp/...", which breaks on Windows).
    out_csv = os.path.join(tempfile.gettempdir(), "eval_results.csv")
    metrics_df.to_csv(out_csv, index=False)

    # Keep only images that were actually written to disk.
    image_items = [(p, caption) for p, caption in images if os.path.exists(p)]
    return metrics_df, leaderboard_df, out_csv, image_items
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: wires the upload widget and options to the evaluation pipeline.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Agentic Evaluation Framework") as demo:
    gr.Markdown("# 🤖 Agentic Evaluation Framework")

    with gr.Tab("Data Preview / Upload"):
        gr.Markdown("Upload a CSV/JSON/JSONL file with columns `prompt`/`instruction`, `response`, `task`, `agent`, `reference` (optional).")
        file_input = gr.File(label="Upload dataset (CSV/JSON/JSONL) — optional", file_types=[".csv", ".json", ".jsonl"])
        run_btn_preview = gr.Button("Run evaluation (use demo if no file)")

    with gr.Tab("Run Evaluation"):
        use_llm = gr.Checkbox(label="Use LLM Judge (Hallucination Detector) — may be heavy", value=False)
        run_button = gr.Button("Evaluate")
        status = gr.Textbox(label="Status", interactive=False)

        metrics_table = gr.Dataframe(label="Per-example Metrics", interactive=False)
        leaderboard_table = gr.Dataframe(label="Leaderboard (Agent, Task, Avg final score)", interactive=False)
        download_csv = gr.File(label="Download CSV report")

        gallery = gr.Gallery(label="Generated Visualizations", columns=2, height="auto")

    def on_run(file_obj, use_llm_flag):
        """Click handler: run the evaluation, surfacing errors in the status box."""
        try:
            metrics_df, leaderboard_df, out_csv, images = run_eval_and_build_outputs(file_obj, use_llm_flag)
            # `images` is already a list of (path, caption) pairs, which is
            # exactly what gr.Gallery accepts; no per-item rebuild needed.
            return "Evaluation complete.", metrics_df, leaderboard_df, out_csv, list(images)
        except Exception as e:
            # Report the failure in the UI instead of crashing the app.
            return f"Error: {str(e)}", None, None, None, []

    _outputs = [status, metrics_table, leaderboard_table, download_csv, gallery]
    run_button.click(fn=on_run, inputs=[file_input, use_llm], outputs=_outputs)
    # Bug fix: the preview-tab button previously had no handler at all; wire
    # it to the same evaluation flow so its label is truthful.
    run_btn_preview.click(fn=on_run, inputs=[file_input, use_llm], outputs=_outputs)

    with gr.Tab("Usage & Notes"):
        gr.Markdown(
            "- Use the LLM Judge only if you understand the memory cost. If the detector models cannot be loaded, the app will fall back to neutral placeholders and continue.\n"
            "- Visualizations are saved in `/tmp` and displayed in the Gallery; each image is downloadable via right-click or the download button for CSV.\n"
            "- If you want a lighter footprint, disable the LLM Judge toggle.\n"
        )

demo.launch()
|
|
|