import os
import tempfile

import pandas as pd
import gradio as gr

from evaluator import evaluate_dataframe
from synthetic import generate_synthetic_dataset


def save_uploaded(file_obj):
    """Return a filesystem path for a Gradio-uploaded file object.

    Newer Gradio versions expose the temp path via ``.name``; older ones
    hand back a readable file-like object, in which case the bytes are
    copied into a fresh temp file. Returns ``None`` when nothing was
    uploaded.
    """
    if not file_obj:
        return None
    try:
        return file_obj.name
    except Exception:
        # Fallback for Gradio versions that return a readable object
        # instead of a NamedString/tempfile wrapper.
        data = file_obj.read()
        suffix = ".csv" if getattr(file_obj, "name", "").endswith(".csv") else ".json"
        fd, tmp = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        return tmp


def load_file(path):
    """Load the dataset at *path* into a DataFrame.

    ``.csv`` files go through ``read_csv``; anything else is treated as
    JSON, trying JSON-lines first and falling back to a regular JSON
    document. Returns ``None`` when *path* is ``None``.

    Raises whatever pandas raises for unreadable files (the original
    ``except Exception as e: raise e`` wrapper was a no-op and is gone).
    """
    if path is None:
        return None
    if path.endswith(".csv"):
        return pd.read_csv(path)
    try:
        # JSONL (one record per line) is the common export format.
        return pd.read_json(path, lines=True)
    except ValueError:
        return pd.read_json(path)


def _normalize_columns(df):
    """Lower-case known column names and map ``prompt`` -> ``instruction``.

    Only the columns the evaluator cares about are renamed; everything
    else is left untouched.
    """
    cols = {c.lower(): c for c in df.columns}
    rename_map = {}
    for key in ("prompt", "response", "task", "agent", "reference", "instruction"):
        if key in cols and cols[key] != key:
            rename_map[cols[key]] = key
    if "prompt" in cols and "instruction" not in cols:
        # Treat a prompt column as the instruction when none is provided.
        # (Overrides the plain lower-casing entry for the same column.)
        rename_map[cols["prompt"]] = "instruction"
    return df.rename(columns=rename_map) if rename_map else df


def run_eval_and_build_outputs(file_obj, use_llm_judge):
    """Run the evaluation pipeline and build every UI output.

    Parameters:
        file_obj: Gradio upload object, or None to use the demo dataset.
        use_llm_judge: forwarded to ``evaluate_dataframe``.

    Returns:
        (metrics_df, leaderboard_df, path_to_csv_report, image_items)
        where ``image_items`` is a list of (path, caption) pairs for
        images that actually exist on disk.

    Raises:
        ValueError: when an uploaded file cannot be read.
    """
    if file_obj is None:
        # No upload: fall back to a small synthetic demo dataset.
        df = generate_synthetic_dataset(num_agents=4, num_samples=24)
    else:
        df = load_file(save_uploaded(file_obj))
        if df is None:
            # Bug fix: previously a None here would surface later as an
            # opaque AttributeError on df.columns.
            raise ValueError("Could not read the uploaded file.")

    df = _normalize_columns(df)

    metrics_df, images, leaderboard_df = evaluate_dataframe(df, use_llm_judge=use_llm_judge)

    # Persist per-example metrics so the user can download them.
    # Portability fix: use the platform temp dir instead of hard-coded /tmp.
    out_csv = os.path.join(tempfile.gettempdir(), "eval_results.csv")
    metrics_df.to_csv(out_csv, index=False)

    # Only show images that were actually written to disk.
    image_items = [(p, caption) for p, caption in images if os.path.exists(p)]
    return metrics_df, leaderboard_df, out_csv, image_items


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks(title="Agentic Evaluation Framework") as demo:
    gr.Markdown("# 🤖 Agentic Evaluation Framework")

    with gr.Tab("Data Preview / Upload"):
        gr.Markdown(
            "Upload a CSV/JSON/JSONL file with columns "
            "`prompt`/`instruction`, `response`, `task`, `agent`, `reference` (optional)."
        )
        file_input = gr.File(
            label="Upload dataset (CSV/JSON/JSONL) — optional",
            file_types=[".csv", ".json", ".jsonl"],
        )
        run_btn_preview = gr.Button("Run evaluation (use demo if no file)")

    with gr.Tab("Run Evaluation"):
        use_llm = gr.Checkbox(
            label="Use LLM Judge (Hallucination Detector) — may be heavy",
            value=False,
        )
        run_button = gr.Button("Evaluate")
        status = gr.Textbox(label="Status", interactive=False)

    # Shared outputs (visible regardless of the active tab).
    metrics_table = gr.Dataframe(label="Per-example Metrics", interactive=False)
    leaderboard_table = gr.Dataframe(
        label="Leaderboard (Agent, Task, Avg final score)", interactive=False
    )
    download_csv = gr.File(label="Download CSV report")
    gallery = gr.Gallery(label="Generated Visualizations", columns=2, height="auto")

    def on_run(file_obj, use_llm_flag):
        """Click handler: run the evaluation, reporting errors via the status box."""
        try:
            metrics_df, leaderboard_df, out_csv, images = run_eval_and_build_outputs(
                file_obj, use_llm_flag
            )
            return "Evaluation complete.", metrics_df, leaderboard_df, out_csv, list(images)
        except Exception as e:
            # Surface the failure in the UI instead of crashing the app.
            return f"Error: {str(e)}", None, None, None, []

    _outputs = [status, metrics_table, leaderboard_table, download_csv, gallery]
    run_button.click(fn=on_run, inputs=[file_input, use_llm], outputs=_outputs)
    # Bug fix: this button previously had no handler at all — wire it to the
    # same evaluation flow so "use demo if no file" actually works.
    run_btn_preview.click(fn=on_run, inputs=[file_input, use_llm], outputs=_outputs)

    with gr.Tab("Usage & Notes"):
        gr.Markdown(
            "- Use the LLM Judge only if you understand the memory cost. If the detector models cannot be loaded, the app will fall back to neutral placeholders and continue.\n"
            "- Visualizations are saved in `/tmp` and displayed in the Gallery; each image is downloadable via right-click or the download button for CSV.\n"
            "- If you want a lighter footprint, disable the LLM Judge toggle.\n"
        )


if __name__ == "__main__":
    # Guarded so importing this module (e.g. for tests) doesn't start a server.
    demo.launch()