Psiska committed on
Commit
3fe1356
·
1 Parent(s): 4475dcb

Evaluation 1

Browse files
Files changed (3) hide show
  1. app-original.py +158 -0
  2. app.py +78 -146
  3. evaluation.py +0 -92
app-original.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, threading
2
+ import gradio as gr
3
+ from crew import run_parallel_crew
4
+ from crew import run_crew
5
+ from utils import get_questions
6
+
7
+
8
+ def ask(question, openai_api_key, gemini_api_key, anthropic_api_key, file_name = ""):
9
+ """
10
+ Ask General AI Assistant a question to answer.
11
+
12
+ Args:
13
+ question (str): The question to answer
14
+ openai_api_key (str): OpenAI API key
15
+ gemini_api_key (str): Gemini API key
16
+ anthropic_api_key (str): Anthropic API key
17
+ file_name (str): Optional file name
18
+
19
+ Returns:
20
+ str: The answer to the question
21
+ """
22
+ if not question:
23
+ raise gr.Error("Question is required.")
24
+
25
+ if not openai_api_key:
26
+ raise gr.Error("OpenAI API Key is required.")
27
+
28
+ if not gemini_api_key:
29
+ raise gr.Error("Gemini API Key is required.")
30
+
31
+ if not anthropic_api_key:
32
+ raise gr.Error("Anthropic API Key is required.")
33
+
34
+ if file_name:
35
+ file_name = f"data/{file_name}"
36
+
37
+ lock = threading.Lock()
38
+
39
+ with lock:
40
+ answer = ""
41
+
42
+ try:
43
+ os.environ["OPENAI_API_KEY"] = openai_api_key
44
+ os.environ["GEMINI_API_KEY"] = gemini_api_key
45
+ os.environ["MODEL_API_KEY"] = anthropic_api_key
46
+
47
+ #answer = run_parallel_crew(question, file_name)
48
+ answer = run_crew(question, file_name)
49
+ except Exception as e:
50
+ raise gr.Error(e)
51
+ finally:
52
+ del os.environ["OPENAI_API_KEY"]
53
+ del os.environ["GEMINI_API_KEY"]
54
+ del os.environ["MODEL_API_KEY"]
55
+
56
+ return answer
57
+
58
+ gr.close_all()
59
+
60
+ with gr.Blocks() as grady:
61
+ gr.Markdown("## Grady - General AI Assistant")
62
+
63
+ with gr.Tab("Solution"):
64
+ gr.Markdown(os.environ.get("DESCRIPTION"))
65
+
66
+ with gr.Row():
67
+ with gr.Column(scale=3):
68
+ with gr.Row():
69
+ question = gr.Textbox(
70
+ label="Question *",
71
+ placeholder="In the 2025 Gradio Agents & MCP Hackathon, what percentage of participants submitted a solution during the last 24 hours?",
72
+ interactive=True
73
+ )
74
+ with gr.Row():
75
+ level = gr.Radio(
76
+ choices=[1, 2, 3],
77
+ label="GAIA Benchmark Level",
78
+ interactive=True,
79
+ scale=1
80
+ )
81
+ ground_truth = gr.Textbox(
82
+ label="Ground Truth",
83
+ interactive=True,
84
+ scale=1
85
+ )
86
+ file_name = gr.Textbox(
87
+ label="File Name",
88
+ interactive=True,
89
+ scale=2
90
+ )
91
+ with gr.Row():
92
+ openai_api_key = gr.Textbox(
93
+ label="OpenAI API Key *",
94
+ type="password",
95
 + placeholder="sk-...",
96
+ interactive=True
97
+ )
98
+ gemini_api_key = gr.Textbox(
99
+ label="Gemini API Key *",
100
+ type="password",
101
+ interactive=True
102
+ )
103
+ anthropic_api_key = gr.Textbox(
104
+ label="Anthropic API Key *",
105
+ type="password",
106
+ placeholder="sk-ant-...",
107
+ interactive=True
108
+ )
109
+ with gr.Row():
110
+ clear_btn = gr.ClearButton(
111
+ components=[question, level, ground_truth, file_name]
112
+ )
113
+ submit_btn = gr.Button("Submit", variant="primary")
114
+ with gr.Column(scale=1):
115
+ answer = gr.Textbox(
116
+ label="Answer",
117
+ lines=1,
118
+ interactive=False
119
+ )
120
+
121
+ submit_btn.click(
122
+ fn=ask,
123
+ inputs=[question, openai_api_key, gemini_api_key, anthropic_api_key, file_name],
124
+ outputs=answer
125
+ )
126
+
127
+ QUESTION_FILE_PATH = "data/gaia_validation.jsonl"
128
+
129
+ gr.Examples(
130
+ label="GAIA Benchmark Level 1 Problems",
131
+ examples=get_questions(QUESTION_FILE_PATH, 1),
132
+ inputs=[question, level, ground_truth, file_name, openai_api_key, gemini_api_key, anthropic_api_key],
133
+ outputs=answer,
134
+ cache_examples=False
135
+ )
136
+
137
+ gr.Examples(
138
+ label="GAIA Benchmark Level 2 Problems",
139
+ examples=get_questions(QUESTION_FILE_PATH, 2),
140
+ inputs=[question, level, ground_truth, file_name, openai_api_key, gemini_api_key, anthropic_api_key],
141
+ outputs=answer,
142
+ cache_examples=False
143
+ )
144
+
145
+ gr.Examples(
146
+ label="GAIA Benchmark Level 3 Problems",
147
+ examples=get_questions(QUESTION_FILE_PATH, 3),
148
+ inputs=[question, level, ground_truth, file_name, openai_api_key, gemini_api_key, anthropic_api_key],
149
+ outputs=answer,
150
+ cache_examples=False
151
+ )
152
+ with gr.Tab("Documentation"):
153
+ gr.Markdown(os.environ.get("DOCUMENTATION"))
154
+
155
+ grady.launch(mcp_server=True)
156
+
157
+
158
+
app.py CHANGED
@@ -1,158 +1,90 @@
1
- import os, threading
 
 
2
  import gradio as gr
3
- from crew import run_parallel_crew
4
- from crew import run_crew
5
- from utils import get_questions
6
 
 
7
 
8
- def ask(question, openai_api_key, gemini_api_key, anthropic_api_key, file_name = ""):
9
- """
10
- Ask General AI Assistant a question to answer.
 
11
 
12
- Args:
13
- question (str): The question to answer
14
- openai_api_key (str): OpenAI API key
15
- gemini_api_key (str): Gemini API key
16
- anthropic_api_key (str): Anthropic API key
17
- file_name (str): Optional file name
18
 
19
- Returns:
20
- str: The answer to the question
21
  """
22
- if not question:
23
- raise gr.Error("Question is required.")
24
-
25
- if not openai_api_key:
26
- raise gr.Error("OpenAI API Key is required.")
27
-
28
- if not gemini_api_key:
29
- raise gr.Error("Gemini API Key is required.")
30
-
31
- if not anthropic_api_key:
32
- raise gr.Error("Anthropic API Key is required.")
33
-
34
- if file_name:
35
- file_name = f"data/{file_name}"
36
-
37
- lock = threading.Lock()
38
-
39
- with lock:
40
- answer = ""
41
-
42
- try:
43
- os.environ["OPENAI_API_KEY"] = openai_api_key
44
- os.environ["GEMINI_API_KEY"] = gemini_api_key
45
- os.environ["MODEL_API_KEY"] = anthropic_api_key
46
-
47
- #answer = run_parallel_crew(question, file_name)
 
 
 
 
 
48
  answer = run_crew(question, file_name)
49
- except Exception as e:
50
- raise gr.Error(e)
51
- finally:
52
- del os.environ["OPENAI_API_KEY"]
53
- del os.environ["GEMINI_API_KEY"]
54
- del os.environ["MODEL_API_KEY"]
55
-
56
- return answer
57
-
58
- gr.close_all()
59
-
60
- with gr.Blocks() as grady:
61
- gr.Markdown("## Grady - General AI Assistant")
62
-
63
- with gr.Tab("Solution"):
64
- gr.Markdown(os.environ.get("DESCRIPTION"))
65
-
66
- with gr.Row():
67
- with gr.Column(scale=3):
68
- with gr.Row():
69
- question = gr.Textbox(
70
- label="Question *",
71
- placeholder="In the 2025 Gradio Agents & MCP Hackathon, what percentage of participants submitted a solution during the last 24 hours?",
72
- interactive=True
73
- )
74
- with gr.Row():
75
- level = gr.Radio(
76
- choices=[1, 2, 3],
77
- label="GAIA Benchmark Level",
78
- interactive=True,
79
- scale=1
80
- )
81
- ground_truth = gr.Textbox(
82
- label="Ground Truth",
83
- interactive=True,
84
- scale=1
85
- )
86
- file_name = gr.Textbox(
87
- label="File Name",
88
- interactive=True,
89
- scale=2
90
- )
91
- with gr.Row():
92
- openai_api_key = gr.Textbox(
93
- label="OpenAI API Key *",
94
- type="password",
95
- placeholder="sk‑...",
96
- interactive=True
97
- )
98
- gemini_api_key = gr.Textbox(
99
- label="Gemini API Key *",
100
- type="password",
101
- interactive=True
102
- )
103
- anthropic_api_key = gr.Textbox(
104
- label="Anthropic API Key *",
105
- type="password",
106
- placeholder="sk-ant-...",
107
- interactive=True
108
- )
109
- with gr.Row():
110
- clear_btn = gr.ClearButton(
111
- components=[question, level, ground_truth, file_name]
112
- )
113
- submit_btn = gr.Button("Submit", variant="primary")
114
- with gr.Column(scale=1):
115
- answer = gr.Textbox(
116
- label="Answer",
117
- lines=1,
118
- interactive=False
119
- )
120
-
121
- submit_btn.click(
122
- fn=ask,
123
- inputs=[question, openai_api_key, gemini_api_key, anthropic_api_key, file_name],
124
- outputs=answer
125
- )
126
-
127
- QUESTION_FILE_PATH = "data/gaia_validation.jsonl"
128
-
129
- gr.Examples(
130
- label="GAIA Benchmark Level 1 Problems",
131
- examples=get_questions(QUESTION_FILE_PATH, 1),
132
- inputs=[question, level, ground_truth, file_name, openai_api_key, gemini_api_key, anthropic_api_key],
133
- outputs=answer,
134
- cache_examples=False
135
  )
136
-
137
- gr.Examples(
138
- label="GAIA Benchmark Level 2 Problems",
139
- examples=get_questions(QUESTION_FILE_PATH, 2),
140
- inputs=[question, level, ground_truth, file_name, openai_api_key, gemini_api_key, anthropic_api_key],
141
- outputs=answer,
142
- cache_examples=False
143
- )
144
-
145
- gr.Examples(
146
- label="GAIA Benchmark Level 3 Problems",
147
- examples=get_questions(QUESTION_FILE_PATH, 3),
148
- inputs=[question, level, ground_truth, file_name, openai_api_key, gemini_api_key, anthropic_api_key],
149
- outputs=answer,
150
- cache_examples=False
151
- )
152
- with gr.Tab("Documentation"):
153
- gr.Markdown(os.environ.get("DOCUMENTATION"))
154
 
155
- grady.launch(mcp_server=True)
 
 
 
156
 
 
 
 
157
 
 
 
 
 
 
158
 
 
 
 
1
+ import os
2
+ import requests
3
+ import pandas as pd
4
  import gradio as gr
 
 
 
5
 
6
+ from crew import run_crew
7
 
8
+ # Configuration: endpoint for GAIA evaluation API
9
+ API_URL = os.getenv("GAIA_API_URL", "https://huggingface.co/spaces/Psiska/General_AI_Assistant")
10
+ # Your Space identifier for generating the agent_code URL
11
+ SPACE_ID = os.getenv("SPACE_ID", "Psiska/General_AI_Assistant")
12
 
 
 
 
 
 
 
13
 
14
+ def run_and_submit_all(username: str):
 
15
  """
16
+ Fetches all evaluation questions, runs your agent on each,
17
+ and submits the batch to the /submit endpoint.
18
+ Returns a status message and a DataFrame of logs.
19
+ """
20
+ if not username:
21
+ return "🔒 Please enter your Hugging Face username.", None
22
+
23
+ try:
24
+ # 1) Fetch questions
25
+ resp = requests.get(f"{API_URL}/questions", timeout=15)
26
+ resp.raise_for_status()
27
+ questions = resp.json()
28
+
29
+ # 2) Run agent on each question
30
+ logs = []
31
+ answers = []
32
+ for item in questions:
33
+ task_id = item.get("task_id") or item.get("id")
34
+ question = item.get("question", "")
35
+ file_name = item.get("file_name", "")
36
+
37
+ # Optional: download attached file
38
+ if file_name:
39
+ file_resp = requests.get(f"{API_URL}/files/{task_id}", timeout=15)
40
+ file_resp.raise_for_status()
41
+ local_path = os.path.join("data", file_name)
42
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
43
+ with open(local_path, "wb") as f:
44
+ f.write(file_resp.content)
45
+
46
+ # Get agent's answer
47
  answer = run_crew(question, file_name)
48
+ answers.append({"task_id": task_id, "submitted_answer": answer})
49
+ logs.append({"Task ID": task_id, "Question": question, "Answer": answer})
50
+
51
+ # 3) Prepare payload
52
+ payload = {
53
+ "username": username,
54
+ "agent_code": f"https://huggingface.co/spaces/{SPACE_ID}/tree/main",
55
+ "answers": answers
56
+ }
57
+
58
+ # 4) Submit answers
59
+ submit_resp = requests.post(f"{API_URL}/submit", json=payload, timeout=60)
60
+ submit_resp.raise_for_status()
61
+ result = submit_resp.json()
62
+
63
+ # Format status
64
+ status = (
65
+ f"✅ {result['username']} scored {result['score']}% "
66
+ f"({result['correct_count']}/{result['total_attempted']} correct)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  )
68
+ return status, pd.DataFrame(logs)
69
+
70
+ except Exception as e:
71
+ return f"❌ Error: {str(e)}", None
72
+
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ # Build Gradio interface
75
+ with gr.Blocks(title="GAIA Evaluation Runner") as demo:
76
+ gr.Markdown("# GAIA Evaluation Runner")
77
+ username_input = gr.Textbox(label="Hugging Face Username")
78
 
79
+ run_btn = gr.Button("Run & Submit All Answers")
80
+ status = gr.Textbox(label="Status", interactive=False)
81
+ table = gr.DataFrame(headers=["Task ID", "Question", "Answer"], label="Log of Q&A")
82
 
83
+ run_btn.click(
84
+ fn=run_and_submit_all,
85
+ inputs=[username_input],
86
+ outputs=[status, table]
87
+ )
88
 
89
+ if __name__ == "__main__":
90
+ demo.launch()
evaluation.py DELETED
@@ -1,92 +0,0 @@
1
- import os
2
- import requests
3
- import pandas as pd
4
- import gradio as gr
5
-
6
- from crew import run_crew
7
-
8
- # Configuration: endpoint for GAIA evaluation API
9
- API_URL = os.getenv("GAIA_API_URL", "https://huggingface.co/spaces/Psiska/General_AI_Assistant")
10
- # Your Space identifier for generating the agent_code URL
11
- SPACE_ID = os.getenv("SPACE_ID", "Psiska/General_AI_Assistant")
12
-
13
-
14
- def run_and_submit_all(profile: gr.OAuthProfile | None):
15
- """
16
- Fetches all evaluation questions, runs your agent on each,
17
- and submits the batch to the /submit endpoint.
18
- Returns a status message and a DataFrame of logs.
19
- """
20
- if profile is None:
21
- return "🔒 Please log in with your Hugging Face account.", None
22
-
23
- username = profile.username
24
- try:
25
- # 1) Fetch questions
26
- resp = requests.get(f"{API_URL}/questions", timeout=15)
27
- resp.raise_for_status()
28
- questions = resp.json()
29
-
30
- # 2) Run agent on each question
31
- logs = []
32
- answers = []
33
- for item in questions:
34
- task_id = item.get("task_id") or item.get("id")
35
- question = item.get("question", "")
36
- file_name = item.get("file_name", "")
37
-
38
- # Optional: download attached file
39
- if file_name:
40
- file_resp = requests.get(f"{API_URL}/files/{task_id}", timeout=15)
41
- file_resp.raise_for_status()
42
- local_path = os.path.join("data", file_name)
43
- os.makedirs(os.path.dirname(local_path), exist_ok=True)
44
- with open(local_path, "wb") as f:
45
- f.write(file_resp.content)
46
- # pass file_name or path to your agent if needed
47
-
48
- # Get agent's answer
49
- answer = run_crew(question, file_name)
50
- answers.append({"task_id": task_id, "submitted_answer": answer})
51
- logs.append({"Task ID": task_id, "Question": question, "Answer": answer})
52
-
53
- # 3) Prepare payload
54
- payload = {
55
- "username": username,
56
- "agent_code": f"https://huggingface.co/spaces/{SPACE_ID}/tree/main",
57
- "answers": answers
58
- }
59
-
60
- # 4) Submit answers
61
- submit_resp = requests.post(f"{API_URL}/submit", json=payload, timeout=60)
62
- submit_resp.raise_for_status()
63
- result = submit_resp.json()
64
-
65
- # Format status
66
- status = (
67
- f"✅ {result['username']} scored {result['score']}% "
68
- f"({result['correct_count']}/{result['total_attempted']} correct)"
69
- )
70
- return status, pd.DataFrame(logs)
71
-
72
- except Exception as e:
73
- return f"❌ Error: {str(e)}", None
74
-
75
-
76
- # Build Gradio interface
77
- with gr.Blocks(title="GAIA Evaluation Runner") as demo:
78
- gr.Markdown("# GAIA Evaluation Runner")
79
- login = gr.LoginButton()
80
-
81
- run_btn = gr.Button("Run & Submit All Answers")
82
- status = gr.Textbox(label="Status", interactive=False)
83
- table = gr.DataFrame(headers=["Task ID", "Question", "Answer"], label="Log of Q&A")
84
-
85
- run_btn.click(
86
- fn=run_and_submit_all,
87
- inputs=[login],
88
- outputs=[status, table]
89
- )
90
-
91
- if __name__ == "__main__":
92
- demo.launch()