Psiska committed on
Commit
4475dcb
·
1 Parent(s): a410403

Evaluation

Browse files
__pycache__/crew.cpython-310.pyc CHANGED
Binary files a/__pycache__/crew.cpython-310.pyc and b/__pycache__/crew.cpython-310.pyc differ
 
evaluation.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import pandas as pd
4
+ import gradio as gr
5
+
6
+ from crew import run_crew
7
+
8
# Configuration: base URL of the GAIA evaluation (scoring) API. The code below
# requests {API_URL}/questions, {API_URL}/files/<task_id> and {API_URL}/submit,
# so this must be the API host itself rather than the Space's web page on
# huggingface.co. Override with the GAIA_API_URL environment variable.
# A trailing slash is stripped so the f-string URL joins never produce "//".
API_URL = os.getenv(
    "GAIA_API_URL", "https://agents-course-unit4-scoring.hf.space"
).rstrip("/")
# Your Space identifier for generating the agent_code URL
SPACE_ID = os.getenv("SPACE_ID", "Psiska/General_AI_Assistant")
12
+
13
+
14
def _download_task_file(task_id, file_name: str) -> str:
    """Download the attachment of *task_id* into ``data/`` and return its path.

    Only the base name of *file_name* is used for the local file, so a
    server-supplied name containing directory components (e.g. ``../app.py``)
    cannot be written outside the ``data`` directory.

    Raises:
        ValueError: if *file_name* has no usable base name.
        requests.RequestException: if the download fails or returns an HTTP
            error status.
        OSError: if the file cannot be written.
    """
    safe_name = os.path.basename(file_name.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        raise ValueError(f"Unsafe attachment name: {file_name!r}")

    file_resp = requests.get(f"{API_URL}/files/{task_id}", timeout=15)
    file_resp.raise_for_status()

    os.makedirs("data", exist_ok=True)
    local_path = os.path.join("data", safe_name)
    with open(local_path, "wb") as f:
        f.write(file_resp.content)
    return local_path


def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all evaluation questions, runs your agent on each,
    and submits the batch to the /submit endpoint.

    Args:
        profile: The logged-in user's OAuth profile, or None when nobody is
            logged in.

    Returns:
        A ``(status, logs)`` tuple: a status message and a DataFrame with one
        row per processed question (columns "Task ID", "Question", "Answer").
        ``logs`` is None when there is nothing to show (not logged in, or a
        failure before any question was processed).

    A question whose attachment download or agent run fails is logged with the
    error text and left out of the submission; the remaining questions are
    still processed. Failures while fetching questions or submitting are
    reported in the status message instead of being raised.
    """
    if profile is None:
        return "🔒 Please log in with your Hugging Face account.", None

    username = profile.username
    # Collected outside the try block so partial results survive a late failure.
    logs = []
    answers = []

    def _logs_frame():
        return pd.DataFrame(logs) if logs else None

    try:
        # 1) Fetch questions
        resp = requests.get(f"{API_URL}/questions", timeout=15)
        resp.raise_for_status()
        questions = resp.json()

        # 2) Run agent on each question
        for item in questions:
            task_id = item.get("task_id") or item.get("id")
            question = item.get("question", "")
            file_name = item.get("file_name", "")

            # An answer without a task id cannot be attributed to a question.
            if task_id is None:
                continue

            try:
                # Optional: download attached file
                if file_name:
                    _download_task_file(task_id, file_name)

                # Get agent's answer.
                # NOTE(review): the agent receives the file name as sent by
                # the API, not the local path — confirm what run_crew expects.
                answer = run_crew(question, file_name)
            except Exception as e:
                # Boundary around the agent: one bad question must not throw
                # away the answers already computed for the others.
                logs.append(
                    {
                        "Task ID": task_id,
                        "Question": question,
                        "Answer": f"❌ Error: {e}",
                    }
                )
                continue

            answers.append({"task_id": task_id, "submitted_answer": answer})
            logs.append({"Task ID": task_id, "Question": question, "Answer": answer})

        if not answers:
            return "❌ Error: no answers were produced; nothing was submitted.", _logs_frame()

        # 3) Prepare payload
        payload = {
            "username": username,
            "agent_code": f"https://huggingface.co/spaces/{SPACE_ID}/tree/main",
            "answers": answers,
        }

        # 4) Submit answers
        submit_resp = requests.post(f"{API_URL}/submit", json=payload, timeout=60)
        submit_resp.raise_for_status()
        result = submit_resp.json()

        # Format status; tolerate missing fields so a successful submission is
        # never reported as an error because of a KeyError here.
        status = (
            f"✅ {result.get('username', username)} scored {result.get('score', '?')}% "
            f"({result.get('correct_count', '?')}/{result.get('total_attempted', '?')} correct)"
        )
        return status, pd.DataFrame(logs)

    except Exception as e:
        # Top-level UI boundary: surface the failure in the status box and keep
        # whatever logs were gathered before it happened.
        return f"❌ Error: {str(e)}", _logs_frame()
74
+
75
+
76
# Build Gradio interface
with gr.Blocks(title="GAIA Evaluation Runner") as demo:
    gr.Markdown("# GAIA Evaluation Runner")
    # Hugging Face OAuth login; required for run_and_submit_all to get a profile.
    login = gr.LoginButton()

    run_btn = gr.Button("Run & Submit All Answers")
    status = gr.Textbox(label="Status", interactive=False)
    table = gr.DataFrame(headers=["Task ID", "Question", "Answer"], label="Log of Q&A")

    # No `inputs`: Gradio injects the gr.OAuthProfile argument automatically
    # from the handler's type hint. Listing the login button as an input would
    # pass its value as an extra positional argument the handler does not take.
    run_btn.click(
        fn=run_and_submit_all,
        outputs=[status, table],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -11,3 +11,5 @@ langchain
11
  redis==4.5.5 # if you choose Redis for persistence
12
  python-dotenv # to load REDIS_URL from .env
13
  faiss-cpu
 
 
 
11
  redis==4.5.5 # if you choose Redis for persistence
12
  python-dotenv # to load REDIS_URL from .env
13
  faiss-cpu
14
+ requests
15
+ pandas