File size: 4,407 Bytes
3ff2363
 
 
 
 
4a89e34
3ff2363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b16d000
3ff2363
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b16d000
 
3ff2363
 
 
 
 
b16d000
3ff2363
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import tempfile
import pandas as pd
import gradio as gr
from evaluator import evaluate_dataframe
from synthetic import generate_synthetic_dataset

# Helper: save Gradio uploaded file object to a path
def save_uploaded(file_obj):
    if not file_obj:
        return None
    try:
        return file_obj.name
    except Exception:
        # fallback for different gradio versions
        data = file_obj.read()
        suffix = ".csv" if getattr(file_obj, "name", "").endswith(".csv") else ".json"
        fd, tmp = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        return tmp

def load_file(path):
    """Load a CSV / JSON / JSONL file at *path* into a DataFrame.

    Dispatches on the file extension (case-insensitively): ``.csv`` goes
    through ``read_csv``; anything else is tried as JSON Lines first, then
    as plain JSON. Returns ``None`` when *path* is ``None``; pandas parse
    errors propagate to the caller unchanged.
    """
    if path is None:
        return None
    if path.lower().endswith(".csv"):
        return pd.read_csv(path)
    try:
        # JSONL (one record per line) is the more common export format,
        # so try it first; plain JSON raises ValueError under lines=True.
        return pd.read_json(path, lines=True)
    except ValueError:
        return pd.read_json(path)

def _normalize_columns(df):
    """Rename well-known columns to lowercase and alias prompt -> instruction.

    The evaluator expects the canonical lowercase names; any other columns
    pass through untouched. Returns the (possibly renamed) DataFrame.
    """
    cols = {c.lower(): c for c in df.columns}
    rename_map = {}
    for key in ["prompt", "response", "task", "agent", "reference", "instruction"]:
        if key in cols and cols[key] != key:
            rename_map[cols[key]] = key
    if "prompt" in cols and "instruction" not in cols:
        # Accept `prompt` as a synonym for the evaluator's `instruction`
        # column (overrides the lowercase-only rename added above).
        rename_map[cols["prompt"]] = "instruction"
    return df.rename(columns=rename_map) if rename_map else df

def run_eval_and_build_outputs(file_obj, use_llm_judge):
    """Run the evaluation pipeline and assemble everything the UI displays.

    When no file was uploaded, falls back to a small synthetic demo dataset.
    Returns a tuple ``(metrics_df, leaderboard_df, csv_path, image_items)``
    where ``image_items`` is a list of ``(path, caption)`` pairs for images
    that actually exist on disk.
    """
    if file_obj is None:
        df = generate_synthetic_dataset(num_agents=4, num_samples=24)
    else:
        df = load_file(save_uploaded(file_obj))

    df = _normalize_columns(df)

    # Call evaluator
    metrics_df, images, leaderboard_df = evaluate_dataframe(df, use_llm_judge=use_llm_judge)

    # Persist the per-example metrics for the download widget. Use the
    # platform temp dir instead of hard-coding /tmp (Windows-safe).
    out_csv = os.path.join(tempfile.gettempdir(), "eval_results.csv")
    metrics_df.to_csv(out_csv, index=False)

    # Keep only visualizations the evaluator actually wrote to disk.
    image_items = [(path, caption) for path, caption in images if os.path.exists(path)]
    return metrics_df, leaderboard_df, out_csv, image_items

# Build Gradio UI
with gr.Blocks(title="Agentic Evaluation Framework") as demo:
    gr.Markdown("# 🤖 Agentic Evaluation Framework")

    with gr.Tab("Data Preview / Upload"):
        gr.Markdown("Upload a CSV/JSON/JSONL file with columns `prompt`/`instruction`, `response`, `task`, `agent`, `reference` (optional).")
        file_input = gr.File(label="Upload dataset (CSV/JSON/JSONL) — optional", file_types=[".csv", ".json", ".jsonl"])
        run_btn_preview = gr.Button("Run evaluation (use demo if no file)")

    with gr.Tab("Run Evaluation"):
        use_llm = gr.Checkbox(label="Use LLM Judge (Hallucination Detector) — may be heavy", value=False)
        run_button = gr.Button("Evaluate")
        status = gr.Textbox(label="Status", interactive=False)
        # outputs
        metrics_table = gr.Dataframe(label="Per-example Metrics", interactive=False)
        leaderboard_table = gr.Dataframe(label="Leaderboard (Agent, Task, Avg final score)", interactive=False)
        download_csv = gr.File(label="Download CSV report")

        # Visualization gallery
        gallery = gr.Gallery(label="Generated Visualizations", columns=2, height="auto")

        def on_run(file_obj, use_llm_flag):
            try:
                metrics_df, leaderboard_df, out_csv, images = run_eval_and_build_outputs(file_obj, use_llm_flag)
                status_text = "Evaluation complete."
                gallery_list = [(path, caption) for path, caption in images]
                return status_text, metrics_df, leaderboard_df, out_csv, gallery_list
            except Exception as e:
                return f"Error: {str(e)}", None, None, None, []

        run_button.click(fn=on_run, inputs=[file_input, use_llm], outputs=[status, metrics_table, leaderboard_table, download_csv, gallery])

    with gr.Tab("Usage & Notes"):
        gr.Markdown(
            "- Use the LLM Judge only if you understand the memory cost. If the detector models cannot be loaded, the app will fall back to neutral placeholders and continue.\n"
            "- Visualizations are saved in `/tmp` and displayed in the Gallery; each image is downloadable via right-click or the download button for CSV.\n"
            "- If you want a lighter footprint, disable the LLM Judge toggle.\n"
        )

demo.launch()