Supastrikas-004 committed on
Commit
5f74c91
·
verified ·
1 Parent(s): 233811e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -34
app.py CHANGED
@@ -1,34 +1,121 @@
1
- import gradio as gr
2
- import pandas as pd
3
- from evaluator import evaluate_responses
4
- from synthetic_data import generate_synthetic_dataset
5
-
6
# Demo synthetic dataset
df = generate_synthetic_dataset(num_agents=10, num_samples=50)


def run_evaluation(use_llm_judge=False):
    """Score every row of the demo dataset and build a per-agent leaderboard.

    Returns (results, leaderboard): the full per-row results frame and the
    mean final_score per agent, best first.
    """
    results = evaluate_responses(df, use_llm_judge=use_llm_judge)
    per_agent = results.groupby("agent")["final_score"].mean().reset_index()
    leaderboard = per_agent.sort_values("final_score", ascending=False)
    return results, leaderboard
14
-
15
# Two-tab UI: preview the synthetic data, then run the evaluation on demand.
with gr.Blocks(title="Agentic Evaluation Framework") as demo:
    gr.Markdown("# πŸ€– Agentic Evaluation Framework")
    gr.Markdown("Automatically evaluate AI agents across multiple dimensions.")

    with gr.Tab("Synthetic Data Preview"):
        gr.DataFrame(df, label="Generated Dataset", interactive=False)

    with gr.Tab("Run Evaluation"):
        use_llm = gr.Checkbox(label="Use LLM Judge (Optional)", value=False)
        run_button = gr.Button("Run Evaluation")
        results_output = gr.DataFrame(label="Evaluation Results")
        leaderboard_output = gr.DataFrame(label="Leaderboard")

        run_button.click(
            fn=run_evaluation,
            inputs=[use_llm],
            outputs=[results_output, leaderboard_output],
        )

demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ """
3
+ Gradio application entrypoint for Hugging Face Spaces.
4
+ """
5
+
6
+ import os
7
+ import tempfile
8
+ import pandas as pd
9
+ import gradio as gr
10
+ from evaluator import evaluate_dataframe
11
+ from synthetic_data import generate_synthetic_dataset
12
+
13
# Helper to save uploaded file to local temp path (gradio File gives a NamedTemporaryFile-like object)
def save_uploaded(file_obj):
    """Return a local filesystem path for an uploaded file.

    Accepts a string path, a file-like object with a ``name`` attribute
    (what Gradio usually passes), or a bare readable object. Returns None
    when nothing was uploaded.
    """
    if not file_obj:
        return None
    # Newer Gradio versions may pass the temp-file path as a plain string.
    if isinstance(file_obj, str):
        return file_obj
    # BUG FIX: the old fallback re-read file_obj.name right after the
    # attribute lookup had already failed — probe with getattr instead.
    name = getattr(file_obj, "name", None)
    if name is not None:
        return name
    # Last resort: object only exposes read(); persist its bytes.
    data = file_obj.read()
    # Without a name we cannot know the original extension — assume JSON,
    # which load_file_to_df() treats as the non-CSV default anyway.
    fd, tmp = tempfile.mkstemp(suffix=".json")
    with os.fdopen(fd, "wb") as f:
        f.write(data)
    return tmp
29
+
30
def load_file_to_df(path):
    """Load a CSV, JSON-Lines, or plain-JSON file into a DataFrame.

    Returns None when ``path`` is None. Parse errors propagate to the
    caller unchanged (the old ``except Exception as e: raise e`` wrapper
    was a no-op and has been removed).
    """
    if path is None:
        return None
    if path.endswith(".csv"):
        return pd.read_csv(path)
    # Prefer JSON Lines; fall back to a regular JSON document.
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        return pd.read_json(path)
45
+
46
def run_evaluation(file_obj):
    """Evaluate an uploaded dataset (or the synthetic demo when none is given).

    Returns ``((gallery_paths, captions), metrics_df, leaderboard)`` —
    the first element is ALWAYS a 2-tuple so the on_run() adapter can
    unpack it, including on the error path (the old code returned
    ``None, "No data loaded", None`` there, which crashed the caller).
    """
    # If no file provided, use synthetic demo
    if file_obj is None:
        df = generate_synthetic_dataset(num_agents=3, num_samples=12)
    else:
        df = load_file_to_df(save_uploaded(file_obj))

    if df is None:
        # Same tuple shape as the success path — see docstring.
        return ([], []), pd.DataFrame({"error": ["No data loaded"]}), None

    # Normalize column names: case-insensitive match plus common aliases
    # for "reference". The old loop looked up k / K / k.upper() in a map
    # already keyed by lowercase, so case variants were never renamed.
    cols = {c.lower(): c for c in df.columns}
    rename_map = {}
    for k in ["prompt", "response", "task", "agent", "reference"]:
        if k in cols:
            if cols[k] != k:
                # e.g. "Prompt" -> "prompt"
                rename_map[cols[k]] = k
        elif k == "reference":
            for alt in ["answer", "ground_truth", "ref"]:
                if alt in cols:
                    rename_map[cols[alt]] = k
                    break
    if rename_map:
        df = df.rename(columns=rename_map)

    metrics_df, images, leaderboard = evaluate_dataframe(df)

    # images is a list of (path, caption) pairs from the evaluator.
    gallery_items = [p for (p, _caption) in images]
    captions = [caption for (_p, caption) in images]

    # Save a CSV report for download.
    # TODO(review): this path is written but never returned — wire it into
    # the download_report widget so the download button actually works.
    metrics_df.to_csv("/tmp/eval_results.csv", index=False)

    return (gallery_items, captions), metrics_df, leaderboard
88
+
89
# Build Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Agentic Evaluation Framework")
    gr.Markdown(
        "Upload a CSV/JSON/JSONL with columns: `prompt,response,task,agent,reference` (reference optional). "
        "If no file is uploaded, a small synthetic demo will run."
    )

    with gr.Row():
        file_input = gr.File(label="Upload CSV / JSON / JSONL (optional)", file_types=[".csv", ".json", ".jsonl"])
        run_btn = gr.Button("Run Evaluation")
        # TODO(review): never populated — add /tmp/eval_results.csv to the
        # click() outputs so the report is actually downloadable.
        download_report = gr.File(label="Download CSV Report")

    # BUG FIX: `.style(grid=2)` was removed in Gradio 4 and crashes at
    # startup; `columns=` is the supported layout parameter.
    gallery = gr.Gallery(label="Visualization Outputs", columns=2)
    table = gr.Dataframe(label="Per-example Metrics (detailed)")
    leaderboard = gr.Dataframe(label="Leaderboard (Avg Final Score per Agent & Task)")

    def on_run(file_in):
        """Adapter: flatten run_evaluation()'s nested return into the three outputs."""
        (gallery_items, captions), metrics_df, lb = run_evaluation(file_in)
        # Pair each image path with its caption for the Gallery component.
        gallery_display = [
            (p, captions[i] if i < len(captions) else "")
            for i, p in enumerate(gallery_items)
        ]
        return gallery_display, metrics_df, lb

    run_btn.click(fn=on_run, inputs=[file_input], outputs=[gallery, table, leaderboard])

    # BUG FIX: the first bullet was missing its trailing newline, so the
    # first two tips rendered as one run-on markdown line.
    gr.Markdown("## Usage tips\n- Columns: `prompt,response,task,agent,reference` (case-insensitive).\n"
                "- `reference` can be empty but accuracy/hallucination will be weaker.\n"
                "- Visualization images are available in the Gallery and a CSV report is downloadable.")

demo.launch()