# app.py — Agentic Evaluation Framework Gradio Space (upstream commit b16d000)
import os
import tempfile
import pandas as pd
import gradio as gr
from evaluator import evaluate_dataframe
from synthetic import generate_synthetic_dataset
# Helper: resolve a Gradio upload to a filesystem path.
def save_uploaded(file_obj):
    """Return a filesystem path for a Gradio upload, or ``None`` if empty.

    Handles the three shapes Gradio may hand us, depending on version:
    a plain filepath string, a file-like wrapper exposing ``.name``, or an
    in-memory object that only supports ``.read()`` (in which case the bytes
    are persisted to a temporary file whose path is returned).
    """
    if not file_obj:
        return None
    # Newer Gradio versions pass the temp filepath directly as a string.
    if isinstance(file_obj, str):
        return file_obj
    try:
        # Typical case: a tempfile-backed wrapper exposing .name.
        return file_obj.name
    except Exception:
        # Fallback for Gradio versions that supply an in-memory object:
        # persist its bytes to a temp file and return that path.
        data = file_obj.read()
        suffix = ".csv" if getattr(file_obj, "name", "").endswith(".csv") else ".json"
        fd, tmp = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        return tmp
def load_file(path):
    """Load the dataset at *path* into a DataFrame.

    CSV is detected by extension; any other file is treated as JSON,
    trying JSON-Lines first (the common export format) and falling back
    to a regular JSON document. Returns ``None`` when *path* is ``None``.

    Raises whatever pandas raises on an unparseable file — the previous
    ``except Exception as e: raise e`` wrapper was a no-op and is removed.
    """
    if path is None:
        return None
    if path.endswith(".csv"):
        return pd.read_csv(path)
    try:
        # One JSON record per line (JSONL).
        return pd.read_json(path, lines=True)
    except ValueError:
        # Regular JSON array/object document.
        return pd.read_json(path)
def run_eval_and_build_outputs(file_obj, use_llm_judge):
    """Run the evaluation pipeline and assemble all UI outputs.

    Parameters
    ----------
    file_obj : Gradio upload (file object or path) or ``None`` to fall back
        to the built-in synthetic demo dataset.
    use_llm_judge : bool forwarded to the evaluator to enable the (heavier)
        LLM hallucination judge.

    Returns
    -------
    tuple
        ``(metrics_df, leaderboard_df, csv_path, image_items)`` where
        ``image_items`` is a list of ``(path, caption)`` pairs for the gallery.
    """
    # Load the data: the uploaded file if given, otherwise a synthetic demo set.
    if file_obj is None:
        df = generate_synthetic_dataset(num_agents=4, num_samples=24)
    else:
        path = save_uploaded(file_obj)
        df = load_file(path)

    # Normalize column names to the lowercase schema the evaluator expects.
    cols = {c.lower(): c for c in df.columns}
    rename_map = {}
    for k in ["prompt", "response", "task", "agent", "reference", "instruction"]:
        if k in cols and cols[k] != k:
            rename_map[cols[k]] = k
    if "prompt" in cols and "instruction" not in cols:
        # Treat `prompt` as an alias for `instruction` when the latter is absent.
        rename_map[cols["prompt"]] = "instruction"
    if rename_map:
        df = df.rename(columns=rename_map)

    # Run the evaluator.
    metrics_df, images, leaderboard_df = evaluate_dataframe(df, use_llm_judge=use_llm_judge)

    # Persist the per-example metrics for download. Use the platform temp
    # directory rather than a hard-coded "/tmp" for portability.
    out_csv = os.path.join(tempfile.gettempdir(), "eval_results.csv")
    metrics_df.to_csv(out_csv, index=False)

    # Keep only images the evaluator actually wrote to disk.
    image_items = [(p, caption) for p, caption in images if os.path.exists(p)]
    return metrics_df, leaderboard_df, out_csv, image_items
# Build Gradio UI
with gr.Blocks(title="Agentic Evaluation Framework") as demo:
    gr.Markdown("# 🤖 Agentic Evaluation Framework")
    with gr.Tab("Data Preview / Upload"):
        gr.Markdown("Upload a CSV/JSON/JSONL file with columns `prompt`/`instruction`, `response`, `task`, `agent`, `reference` (optional).")
        file_input = gr.File(label="Upload dataset (CSV/JSON/JSONL) — optional", file_types=[".csv", ".json", ".jsonl"])
        run_btn_preview = gr.Button("Run evaluation (use demo if no file)")
    with gr.Tab("Run Evaluation"):
        use_llm = gr.Checkbox(label="Use LLM Judge (Hallucination Detector) — may be heavy", value=False)
        run_button = gr.Button("Evaluate")
        status = gr.Textbox(label="Status", interactive=False)
        # outputs
        metrics_table = gr.Dataframe(label="Per-example Metrics", interactive=False)
        leaderboard_table = gr.Dataframe(label="Leaderboard (Agent, Task, Avg final score)", interactive=False)
        download_csv = gr.File(label="Download CSV report")
        # Visualization gallery
        gallery = gr.Gallery(label="Generated Visualizations", columns=2, height="auto")

        def on_run(file_obj, use_llm_flag):
            """Gradio callback: run the evaluation and map results onto the outputs.

            Returns (status, metrics, leaderboard, csv_path, gallery); on
            failure the error text goes to `status` and the rest are cleared.
            """
            try:
                metrics_df, leaderboard_df, out_csv, images = run_eval_and_build_outputs(file_obj, use_llm_flag)
                gallery_list = [(path, caption) for path, caption in images]
                return "Evaluation complete.", metrics_df, leaderboard_df, out_csv, gallery_list
            except Exception as e:
                return f"Error: {str(e)}", None, None, None, []

        _eval_outputs = [status, metrics_table, leaderboard_table, download_csv, gallery]
        run_button.click(fn=on_run, inputs=[file_input, use_llm], outputs=_eval_outputs)

    # BUGFIX: the preview-tab button was created but never wired to a handler,
    # so clicking it did nothing. Route it through the same evaluation callback.
    run_btn_preview.click(fn=on_run, inputs=[file_input, use_llm], outputs=_eval_outputs)

    with gr.Tab("Usage & Notes"):
        gr.Markdown(
            "- Use the LLM Judge only if you understand the memory cost. If the detector models cannot be loaded, the app will fall back to neutral placeholders and continue.\n"
            "- Visualizations are saved in `/tmp` and displayed in the Gallery; each image is downloadable via right-click or the download button for CSV.\n"
            "- If you want a lighter footprint, disable the LLM Judge toggle.\n"
        )
demo.launch()