Supastrikas-004 committed on
Commit
5f74c91
·
verified ·
1 Parent(s): 233811e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -34
app.py CHANGED
@@ -1,34 +1,121 @@
1
- import gradio as gr
2
- import pandas as pd
3
- from evaluator import evaluate_responses
4
- from synthetic_data import generate_synthetic_dataset
5
-
6
# Demo synthetic dataset
df = generate_synthetic_dataset(num_agents=10, num_samples=50)


def run_evaluation(use_llm_judge=False):
    """Score every row of the demo dataset and build a per-agent leaderboard.

    Returns (results, leaderboard): the full per-row results frame and the
    mean final_score per agent, best first.
    """
    results = evaluate_responses(df, use_llm_judge=use_llm_judge)
    per_agent = results.groupby("agent")["final_score"].mean().reset_index()
    leaderboard = per_agent.sort_values("final_score", ascending=False)
    return results, leaderboard
14
-
15
# Two-tab UI: preview the synthetic data, then run the evaluation on demand.
with gr.Blocks(title="Agentic Evaluation Framework") as demo:
    gr.Markdown("# πŸ€– Agentic Evaluation Framework")
    gr.Markdown("Automatically evaluate AI agents across multiple dimensions.")

    with gr.Tab("Synthetic Data Preview"):
        gr.DataFrame(df, label="Generated Dataset", interactive=False)

    with gr.Tab("Run Evaluation"):
        use_llm = gr.Checkbox(label="Use LLM Judge (Optional)", value=False)
        run_button = gr.Button("Run Evaluation")
        results_output = gr.DataFrame(label="Evaluation Results")
        leaderboard_output = gr.DataFrame(label="Leaderboard")

        run_button.click(
            fn=run_evaluation,
            inputs=[use_llm],
            outputs=[results_output, leaderboard_output],
        )

demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ """
3
+ Gradio application entrypoint for Hugging Face Spaces.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ import pandas as pd
9
+ import gradio as gr
10
+ from evaluator import evaluate_dataframe
11
+ from synthetic_data import generate_synthetic_dataset
12
+
13
# Helper to save uploaded file to local temp path (gradio File gives a NamedTemporaryFile-like object)
def save_uploaded(file_obj):
    """Return a local filesystem path for an uploaded file.

    Accepts a string path, a file-like object with a ``name`` attribute
    (what Gradio usually passes), or a bare readable object. Returns None
    when nothing was uploaded.
    """
    if not file_obj:
        return None
    # Newer Gradio versions may pass the temp-file path as a plain string.
    if isinstance(file_obj, str):
        return file_obj
    # BUG FIX: the old fallback re-read file_obj.name right after the
    # attribute lookup had already failed — probe with getattr instead.
    name = getattr(file_obj, "name", None)
    if name is not None:
        return name
    # Last resort: object only exposes read(); persist its bytes.
    data = file_obj.read()
    # Without a name we cannot know the original extension — assume JSON,
    # which load_file_to_df() treats as the non-CSV default anyway.
    fd, tmp = tempfile.mkstemp(suffix=".json")
    with os.fdopen(fd, "wb") as f:
        f.write(data)
    return tmp
29
+
30
def load_file_to_df(path):
    """Load a CSV, JSON-Lines, or plain-JSON file into a DataFrame.

    Returns None when ``path`` is None. Parse errors propagate to the
    caller unchanged (the old ``except Exception as e: raise e`` wrapper
    was a no-op and has been removed).
    """
    if path is None:
        return None
    if path.endswith(".csv"):
        return pd.read_csv(path)
    # Prefer JSON Lines; fall back to a regular JSON document.
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        return pd.read_json(path)
45
+
46
def run_evaluation(file_obj):
    """Evaluate an uploaded dataset (or the synthetic demo when none is given).

    Returns ``((gallery_paths, captions), metrics_df, leaderboard)`` —
    the first element is ALWAYS a 2-tuple so the on_run() adapter can
    unpack it, including on the error path (the old code returned
    ``None, "No data loaded", None`` there, which crashed the caller).
    """
    # If no file provided, use synthetic demo
    if file_obj is None:
        df = generate_synthetic_dataset(num_agents=3, num_samples=12)
    else:
        df = load_file_to_df(save_uploaded(file_obj))

    if df is None:
        # Same tuple shape as the success path — see docstring.
        return ([], []), pd.DataFrame({"error": ["No data loaded"]}), None

    # Normalize column names: case-insensitive match plus common aliases
    # for "reference". The old loop looked up k / K / k.upper() in a map
    # already keyed by lowercase, so case variants were never renamed.
    cols = {c.lower(): c for c in df.columns}
    rename_map = {}
    for k in ["prompt", "response", "task", "agent", "reference"]:
        if k in cols:
            if cols[k] != k:
                # e.g. "Prompt" -> "prompt"
                rename_map[cols[k]] = k
        elif k == "reference":
            for alt in ["answer", "ground_truth", "ref"]:
                if alt in cols:
                    rename_map[cols[alt]] = k
                    break
    if rename_map:
        df = df.rename(columns=rename_map)

    metrics_df, images, leaderboard = evaluate_dataframe(df)

    # images is a list of (path, caption) pairs from the evaluator.
    gallery_items = [p for (p, _caption) in images]
    captions = [caption for (_p, caption) in images]

    # Save a CSV report for download.
    # TODO(review): this path is written but never returned — wire it into
    # the download_report widget so the download button actually works.
    metrics_df.to_csv("/tmp/eval_results.csv", index=False)

    return (gallery_items, captions), metrics_df, leaderboard
88
+
89
# Build Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Agentic Evaluation Framework")
    gr.Markdown(
        "Upload a CSV/JSON/JSONL with columns: `prompt,response,task,agent,reference` (reference optional). "
        "If no file is uploaded, a small synthetic demo will run."
    )

    with gr.Row():
        file_input = gr.File(label="Upload CSV / JSON / JSONL (optional)", file_types=[".csv", ".json", ".jsonl"])
        run_btn = gr.Button("Run Evaluation")
        # TODO(review): never populated — add /tmp/eval_results.csv to the
        # click() outputs so the report is actually downloadable.
        download_report = gr.File(label="Download CSV Report")

    # BUG FIX: `.style(grid=2)` was removed in Gradio 4 and crashes at
    # startup; `columns=` is the supported layout parameter.
    gallery = gr.Gallery(label="Visualization Outputs", columns=2)
    table = gr.Dataframe(label="Per-example Metrics (detailed)")
    leaderboard = gr.Dataframe(label="Leaderboard (Avg Final Score per Agent & Task)")

    def on_run(file_in):
        """Adapter: flatten run_evaluation()'s nested return into the three outputs."""
        (gallery_items, captions), metrics_df, lb = run_evaluation(file_in)
        # Pair each image path with its caption for the Gallery component.
        gallery_display = [
            (p, captions[i] if i < len(captions) else "")
            for i, p in enumerate(gallery_items)
        ]
        return gallery_display, metrics_df, lb

    run_btn.click(fn=on_run, inputs=[file_input], outputs=[gallery, table, leaderboard])

    # BUG FIX: the first bullet was missing its trailing newline, so the
    # first two tips rendered as one run-on markdown line.
    gr.Markdown("## Usage tips\n- Columns: `prompt,response,task,agent,reference` (case-insensitive).\n"
                "- `reference` can be empty but accuracy/hallucination will be weaker.\n"
                "- Visualization images are available in the Gallery and a CSV report is downloadable.")

demo.launch()