Mehedi2 committed on
Commit
58052c9
·
verified ·
1 Parent(s): e8d4bd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -153
app.py CHANGED
@@ -1,18 +1,9 @@
1
  import os
2
- import re
3
  import json
 
4
  import requests
5
  import gradio as gr
6
 
7
- # Try importing datasets
8
- try:
9
- from datasets import load_dataset
10
- from huggingface_hub import login
11
- DATASETS_AVAILABLE = True
12
- except ImportError:
13
- DATASETS_AVAILABLE = False
14
- print("⚠️ datasets library not found. Install with: pip install datasets huggingface_hub")
15
-
16
  # ===============================
17
  # 1. LLM Wrapper (Your Original)
18
  # ===============================
@@ -21,12 +12,10 @@ class OpenRouterLLM:
21
  self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
22
  self.model = model
23
  self.base_url = "https://openrouter.ai/api/v1"
24
-
25
  if not self.api_key:
26
  raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY environment variable.")
27
 
28
  def generate(self, prompt, system_prompt="You are a helpful AI agent."):
29
- """Send a prompt to OpenRouter and return the model's response"""
30
  headers = {
31
  "Authorization": f"Bearer {self.api_key}",
32
  "Content-Type": "application/json",
@@ -36,185 +25,170 @@ class OpenRouterLLM:
36
  "messages": [
37
  {"role": "system", "content": system_prompt},
38
  {"role": "user", "content": prompt}
39
- ]
 
 
40
  }
41
-
42
  try:
43
- response = requests.post(
44
- f"{self.base_url}/chat/completions",
45
- headers=headers,
46
- data=json.dumps(payload)
47
- )
48
  response.raise_for_status()
49
  data = response.json()
50
  return data["choices"][0]["message"]["content"].strip()
51
  except Exception as e:
52
- print(f"LLM error: {e}")
53
  return f"Error: {e}"
54
 
55
  # ===============================
56
- # 2. GAIA Dataset Loader
57
  # ===============================
58
- class GAIADatasetLoader:
59
- def __init__(self):
60
- self.dataset = None
 
 
61
  self.questions = []
62
-
63
- def load_gaia_dataset(self):
64
- """Load GAIA dataset from HuggingFace with authentication"""
65
- if not DATASETS_AVAILABLE:
66
- return "Error: datasets library not available"
67
-
68
  try:
69
- hf_token = os.getenv("HF_TOKEN")
70
- if not hf_token:
71
- return "Error: HF_TOKEN environment variable not set"
72
-
73
- # Authenticate with HF Hub
74
- login(token=hf_token)
75
-
76
- # Load validation split
77
- dataset = load_dataset(
78
- "gaia-benchmark/GAIA",
79
- split="validation",
80
- use_auth_token=hf_token
81
- )
82
-
83
- self.questions = []
84
- for i, item in enumerate(dataset.select(range(20))): # max 20 for leaderboard
85
- self.questions.append({
86
- "task_id": item["task_id"],
87
- "Question": item["Question"],
88
- "Final answer": str(item["Final answer"]),
89
- "file_name": item.get("file_name", ""),
90
- "file_path": item.get("file_path", "")
91
- })
92
-
93
- return f"✅ Successfully loaded {len(self.questions)} GAIA questions"
94
-
95
  except Exception as e:
96
- print(f"Dataset loading error: {e}")
97
- return self.create_fallback_questions(str(e))
98
-
99
- def create_fallback_questions(self, error_message=""):
100
- """Fallback: create toy questions if dataset fails"""
101
- self.questions = [
102
- {"task_id": "fallback_1", "Question": "What is 2+2?", "Final answer": "4"},
103
- {"task_id": "fallback_2", "Question": "What is the capital of France?", "Final answer": "Paris"},
104
- ]
105
- return f"⚠️ Using fallback questions. Error: {error_message}"
 
106
 
107
- # ===============================
108
- # 3. GAIA Agent (Evaluator)
109
- # ===============================
110
- class GAIAAgent:
111
- def __init__(self, llm: OpenRouterLLM, dataset_loader: GAIADatasetLoader):
112
- self.llm = llm
113
- self.dataset_loader = dataset_loader
114
-
115
  def clean_answer(self, answer: str):
116
- """Clean model output to keep only raw answer"""
117
- if not answer:
118
- return ""
119
  answer = answer.strip()
120
- # Remove "Answer:" or "Final answer:" prefixes
121
- answer = re.sub(r"(?i)^(final\s*answer|answer)\s*[:\-]?\s*", "", answer)
 
 
122
  return answer.strip()
123
-
124
  def answer_question(self, question_obj):
125
- """Ask LLM to answer one question"""
126
- q = question_obj["Question"]
127
  system_prompt = (
128
  "You are solving GAIA benchmark questions. "
129
  "Provide ONLY the final answer, no reasoning."
130
  )
131
  raw_answer = self.llm.generate(q, system_prompt)
132
  return self.clean_answer(raw_answer)
133
-
134
- def evaluate(self):
135
- """Evaluate all questions and compute accuracy"""
136
- results, correct = [], 0
137
- for q in self.dataset_loader.questions:
138
- agent_answer = self.answer_question(q)
139
- expected = str(q["Final answer"]).strip()
140
- is_correct = agent_answer.strip() == expected
 
 
141
  if is_correct:
142
  correct += 1
143
  results.append({
144
- "task_id": q["task_id"],
145
- "question": q["Question"],
146
  "expected": expected,
147
- "answer": agent_answer,
148
  "correct": is_correct
149
  })
150
- accuracy = correct / len(results) if results else 0
151
- return results, accuracy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  # ===============================
154
- # 4. Gradio UI
155
  # ===============================
156
- def build_gradio_interface(agent, dataset_loader):
157
- def load_dataset_ui():
158
- return dataset_loader.load_gaia_dataset()
159
-
160
- def test_single_question(question_text):
161
- return agent.answer_question({"Question": question_text})
162
-
163
- def evaluate_agent():
164
- results, acc = agent.evaluate()
165
- summary = f" Accuracy: {acc*100:.1f}% ({sum(r['correct'] for r in results)}/{len(results)})\n\n"
166
- for r in results:
167
- summary += f"\nQ: {r['question']}\nExpected: {r['expected']} | Got: {r['answer']} | Correct: {r['correct']}\n"
168
- return summary
169
-
170
- def manual_answer_eval(question_text, expected_answer):
171
- agent_answer = agent.answer_question({"Question": question_text})
172
- is_correct = agent_answer.strip() == expected_answer.strip()
173
- return f"Q: {question_text}\nExpected: {expected_answer}\nAgent: {agent_answer}\nCorrect: {is_correct}"
174
-
175
- with gr.Blocks() as demo:
176
- gr.Markdown("# 🤖 GAIA Agent Evaluation")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
178
- with gr.Tab("1. Load Dataset"):
179
- out1 = gr.Textbox(label="Dataset Load Status")
180
- btn1 = gr.Button("Load GAIA Dataset")
181
- btn1.click(load_dataset_ui, outputs=out1)
182
 
183
- with gr.Tab("2. Test Single Question"):
184
- q_in = gr.Textbox(label="Enter a Question")
185
- ans_out = gr.Textbox(label="Agent Answer")
186
- btn2 = gr.Button("Get Answer")
187
- btn2.click(test_single_question, inputs=q_in, outputs=ans_out)
188
 
189
- with gr.Tab("3. Evaluate Full Dataset"):
190
- out3 = gr.Textbox(label="Evaluation Results", lines=20)
191
- btn3 = gr.Button("Run Evaluation")
192
- btn3.click(evaluate_agent, outputs=out3)
 
193
 
194
- with gr.Tab("4. Manual Evaluation"):
195
- q_in2 = gr.Textbox(label="Question")
196
- expected_in = gr.Textbox(label="Expected Answer")
197
- out4 = gr.Textbox(label="Evaluation Result")
198
- btn4 = gr.Button("Evaluate Agent Answer")
199
- btn4.click(manual_answer_eval, inputs=[q_in2, expected_in], outputs=out4)
200
-
201
- return demo
202
 
203
  # ===============================
204
- # 5. Main
205
  # ===============================
206
- def main():
207
- api_key = os.getenv("OPENROUTER_API_KEY")
208
- if not api_key:
209
- print("⚠️ Set OPENROUTER_API_KEY before running.")
210
- return
211
-
212
- llm = OpenRouterLLM(api_key=api_key)
213
- loader = GAIADatasetLoader()
214
- agent = GAIAAgent(llm, loader)
215
-
216
- demo = build_gradio_interface(agent, loader)
217
- demo.launch(share=True)
218
-
219
  if __name__ == "__main__":
220
- main()
 
 
 
 
 
1
  import os
 
2
  import json
3
+ import time
4
  import requests
5
  import gradio as gr
6
 
 
 
 
 
 
 
 
 
 
7
  # ===============================
8
  # 1. LLM Wrapper (Your Original)
9
  # ===============================
 
12
  self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
13
  self.model = model
14
  self.base_url = "https://openrouter.ai/api/v1"
 
15
  if not self.api_key:
16
  raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY environment variable.")
17
 
18
  def generate(self, prompt, system_prompt="You are a helpful AI agent."):
 
19
  headers = {
20
  "Authorization": f"Bearer {self.api_key}",
21
  "Content-Type": "application/json",
 
25
  "messages": [
26
  {"role": "system", "content": system_prompt},
27
  {"role": "user", "content": prompt}
28
+ ],
29
+ "temperature": 0.1,
30
+ "max_tokens": 500
31
  }
 
32
  try:
33
+ response = requests.post(f"{self.base_url}/chat/completions", headers=headers, json=payload)
 
 
 
 
34
  response.raise_for_status()
35
  data = response.json()
36
  return data["choices"][0]["message"]["content"].strip()
37
  except Exception as e:
 
38
  return f"Error: {e}"
39
 
40
  # ===============================
41
+ # 2. GAIA API Loader
42
  # ===============================
43
+ GAIA_API_BASE = "https://gaia-benchmark-hf.fly.dev"
44
+
class GAIAAgent:
    """Answer GAIA benchmark questions with an LLM and talk to the GAIA API.

    Attributes:
        llm: Backend exposing ``generate(prompt, system_prompt)`` and
            returning the model's reply as a string.
        questions: Question records stored by the last successful
            ``fetch_questions`` call (empty until then). Records are read
            through the keys ``"task_id"``, ``"Question"`` and
            ``"Final answer"``.
    """

    def __init__(self, llm: "OpenRouterLLM"):
        self.llm = llm
        self.questions = []

    def fetch_questions(self):
        """Download the question list and store it in ``self.questions``.

        Returns:
            A human-readable status string. Failures are reported in the
            string rather than raised, so the UI can display them directly.
        """
        try:
            resp = requests.get(f"{GAIA_API_BASE}/questions", timeout=30)
            if resp.status_code != 200:
                return f"⚠️ Failed to fetch questions: {resp.status_code}"
            data = resp.json()
            # evaluate_all() iterates over records and calls .get() on each;
            # refuse anything that is not a list instead of failing later.
            if not isinstance(data, list):
                return (
                    "⚠️ Error fetching questions: "
                    f"unexpected response type {type(data).__name__}"
                )
            self.questions = data
            return f"✅ Loaded {len(self.questions)} GAIA questions"
        except Exception as e:
            return f"⚠️ Error fetching questions: {e}"

    def fetch_random_question(self):
        """Fetch one random question record.

        Returns:
            The decoded JSON object, or ``{}`` when the request fails, the
            status is not 200, or the payload is not a JSON object.
        """
        try:
            resp = requests.get(f"{GAIA_API_BASE}/random-question", timeout=10)
            if resp.status_code != 200:
                return {}
            data = resp.json()
        except Exception:
            # Best effort: callers treat an empty dict as "nothing fetched".
            # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            return {}
        return data if isinstance(data, dict) else {}

    def clean_answer(self, answer: str):
        """Strip whitespace and a leading "Answer:"-style label from a reply.

        Args:
            answer: Raw model output; ``None`` or ``""`` yields ``""``.

        Returns:
            The reply without the recognised prefixes (matched
            case-insensitively) and without surrounding whitespace.
        """
        if not answer:
            return ""
        answer = answer.strip()
        prefixes = ["Answer:", "Final answer:", "The answer is:"]
        for prefix in prefixes:
            if answer.lower().startswith(prefix.lower()):
                answer = answer[len(prefix):].strip()
        return answer.strip()

    def answer_question(self, question_obj):
        """Ask the LLM for the final answer to one question record.

        Args:
            question_obj: Mapping whose ``"Question"`` entry holds the question
                text (a missing key is sent as an empty prompt).

        Returns:
            The cleaned model reply.
        """
        q = question_obj.get("Question", "")
        system_prompt = (
            "You are solving GAIA benchmark questions. "
            "Provide ONLY the final answer, no reasoning."
        )
        raw_answer = self.llm.generate(q, system_prompt)
        return self.clean_answer(raw_answer)

    def evaluate_all(self):
        """Answer every loaded question and score by exact string match.

        Returns:
            ``{"error": ...}`` when no questions are loaded; otherwise a dict
            with ``score`` (percentage), ``results`` (one entry per question),
            ``correct`` and ``total``.
        """
        if not self.questions:
            return {"error": "No questions loaded"}
        results = []
        correct = 0
        for q in self.questions:
            # A record without "Final answer" is compared against "".
            expected = str(q.get("Final answer", "")).strip()
            answer = self.answer_question(q)
            is_correct = answer.strip() == expected
            if is_correct:
                correct += 1
            results.append({
                "task_id": q.get("task_id"),
                "question": q.get("Question"),
                "expected": expected,
                "answer": answer,
                "correct": is_correct
            })
        score = (correct / len(results)) * 100 if results else 0
        return {"score": score, "results": results, "correct": correct, "total": len(results)}

    def submit_answers(self, username, agent_code, answers):
        """POST the answers to the GAIA API ``/submit`` endpoint.

        Args:
            username: Value sent as ``"username"``.
            agent_code: Value sent as ``"agent_code"``.
            answers: Value sent as ``"answers"``.

        Returns:
            The decoded JSON response on HTTP 200, otherwise
            ``{"error": <message>}``; request errors are caught and reported
            the same way.
        """
        try:
            payload = {
                "username": username,
                "agent_code": agent_code,
                "answers": answers
            }
            resp = requests.post(f"{GAIA_API_BASE}/submit", json=payload, timeout=60)
            if resp.status_code == 200:
                return resp.json()
            else:
                return {"error": f"Submission failed: {resp.status_code}"}
        except Exception as e:
            return {"error": str(e)}
123
 
124
  # ===============================
125
+ # 3. Gradio UI
126
  # ===============================
127
+ llm = OpenRouterLLM()
128
+ agent = GAIAAgent(llm)
129
+
def load_questions_ui():
    """Gradio callback: load the GAIA questions and return the status text."""
    status = agent.fetch_questions()
    return status
132
+
def test_random_question_ui():
    """Gradio callback: fetch one random question and show the agent's answer.

    Returns:
        A "Question / Answer" text block, or a failure message when no
        question could be fetched.
    """
    question = agent.fetch_random_question()
    if question:
        reply = agent.answer_question(question)
        return f"Question: {question.get('Question')}\nAnswer: {reply}"
    return "Failed to fetch a random question"
139
+
def run_full_evaluation_ui(username):
    """Gradio callback: evaluate all loaded questions and submit the answers.

    Args:
        username: Hugging Face username typed into the UI. It is sent with
            the submission and used to build the agent-code URL.

    Returns:
        A status string: a prompt to load questions or enter a username, an
        evaluation/submission error message, or the score summary.
    """
    if not agent.questions:
        return "Please load questions first."

    # A blank username would yield a malformed agent_code URL and an
    # unattributable submission, so reject it before spending LLM calls.
    username = (username or "").strip()
    if not username:
        return "Please enter your HF username."

    results_data = agent.evaluate_all()
    if "error" in results_data:
        return results_data["error"]

    answers_payload = [
        {"task_id": r["task_id"], "submitted_answer": r["answer"]}
        for r in results_data["results"]
    ]
    agent_code = f"https://huggingface.co/spaces/{username}/Gaia-Test-Agent/tree/main"
    submission_result = agent.submit_answers(username, agent_code, answers_payload)

    # submit_answers reports failures as {"error": ...}; show that instead of
    # a misleading "Score: 0%".
    if not isinstance(submission_result, dict):
        return f"Submission error: unexpected response: {submission_result}"
    if "error" in submission_result:
        return (
            f"Submission error: {submission_result['error']}\n"
            f"Answers attempted: {len(answers_payload)}"
        )

    score = submission_result.get("score", 0)
    return f"Score: {score}%\nAnswers submitted: {len(answers_payload)}\nLeaderboard info: {submission_result}"
155
+
def manual_test_ui(question_text):
    """Gradio callback: answer a question typed in by the user."""
    question_obj = {"Question": question_text}
    return agent.answer_question(question_obj)
158
+
def build_gradio_app():
    """Assemble the four-tab Gradio interface and return the Blocks app.

    Each tab wires one button to one module-level callback; components are
    created in the same order as before so the layout is unchanged.
    """
    with gr.Blocks() as app:
        gr.Markdown("# 🤖 GAIA Benchmark Agent")

        with gr.Tab("Load Questions"):
            status_box = gr.Textbox(label="Status")
            gr.Button("Load GAIA Questions").click(
                load_questions_ui, outputs=status_box
            )

        with gr.Tab("Random Question Test"):
            random_box = gr.Textbox(label="Result", lines=6)
            gr.Button("Test Random Question").click(
                test_random_question_ui, outputs=random_box
            )

        with gr.Tab("Full Evaluation & Submit"):
            username_box = gr.Textbox(label="Your HF Username")
            eval_box = gr.Textbox(label="Evaluation Result", lines=10)
            gr.Button("Run Evaluation & Submit").click(
                run_full_evaluation_ui, inputs=username_box, outputs=eval_box
            )

        with gr.Tab("Manual Test"):
            question_box = gr.Textbox(label="Enter Question")
            answer_box = gr.Textbox(label="Agent Answer", lines=4)
            gr.Button("Get Answer").click(
                manual_test_ui, inputs=question_box, outputs=answer_box
            )
    return app
 
 
185
 
186
  # ===============================
187
+ # 4. Main
188
  # ===============================
 
 
 
 
 
 
 
 
 
 
 
 
 
if __name__ == "__main__":
    app = build_gradio_app()
    # When SPACE_ID is set (presumably by the hosting Space; confirm), bind
    # to all interfaces on port 7860; otherwise open a public share link.
    launch_kwargs = (
        {"server_name": "0.0.0.0", "server_port": 7860}
        if os.getenv("SPACE_ID")
        else {"share": True}
    )
    app.launch(**launch_kwargs)