Spaces:

Mehedi2
/

new_assignment

Sleeping

App Files Files Community

Mehedi2 committed on Sep 28, 2025

Commit

0a876aa

verified ·

1 Parent(s): 288ad5e

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -168

app.py CHANGED Viewed

@@ -7,8 +7,13 @@ import time
 # Your OpenRouter API key
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key")
-GAIA_API_BASE = "https://gaia-benchmark-hf.fly.dev"  # official GAIA API base
 class OpenRouterLLM:
     def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
         self.api_key = api_key
@@ -16,8 +21,6 @@ class OpenRouterLLM:
         self.base_url = "https://openrouter.ai/api/v1/chat/completions"
     def __call__(self, prompt: str, max_tokens: int = 1000, temperature: float = 0.1) -> str:
-        """Make API call to OpenRouter"""
         if not self.api_key or not self.api_key.startswith('sk-or-v1-'):
             return "Error: Invalid OpenRouter API key"
@@ -36,14 +39,11 @@ class OpenRouterLLM:
                     "content": """You are a helpful AI assistant designed to answer questions accurately and concisely.
 For GAIA evaluation, provide EXACT answers without explanation unless asked.
 - For math questions, give just the number
-- For yes/no questions, give just "Yes" or "No"
 - For factual questions, give just the fact
-- Be precise and direct."""
                 },
-                {
-                    "role": "user",
-                    "content": prompt
-                }
             ],
             "temperature": temperature,
             "max_tokens": max_tokens,
@@ -52,8 +52,8 @@ For GAIA evaluation, provide EXACT answers without explanation unless asked.
         try:
             response = requests.post(
-                self.base_url,
-                headers=headers,
                 json=payload,
                 timeout=30
             )
@@ -62,265 +62,185 @@ For GAIA evaluation, provide EXACT answers without explanation unless asked.
                 return f"API Error: {response.status_code}"
             result = response.json()
             if "choices" in result and len(result["choices"]) > 0:
                 return result["choices"][0]["message"]["content"].strip()
             else:
                 return "Error: No response content received"
         except Exception as e:
             return f"Error: {str(e)}"
 class GAIAAgent:
     def __init__(self, api_key: str):
         self.llm = OpenRouterLLM(api_key=api_key)
         self.api_key = api_key
     def run_agent(self, prompt: str) -> str:
-        """
-        Main function for GAIA evaluation
-        This is what GAIA calls to get answers
-        """
         try:
-            # Process the question to get a direct answer
             enhanced_prompt = f"""
 Question: {prompt}
-Analyze this question carefully and provide the exact answer. Do not include explanations, reasoning, or extra text unless specifically asked for reasoning.
-Examples of good responses:
-- Math question "What is 15 + 27?" → Answer: "42"
-- Yes/No question "Is Paris the capital of France?" → Answer: "Yes"
-- Factual question "What is the capital of Japan?" → Answer: "Tokyo"
 Your answer:"""
             response = self.llm(enhanced_prompt, max_tokens=500, temperature=0.1)
-            # Clean up the response to get just the answer
-            answer = self.clean_answer(response)
-            return answer
         except Exception as e:
             return f"Error: {str(e)}"
     def clean_answer(self, response: str) -> str:
-        """Clean the response to extract just the answer"""
         response = response.strip()
-        # Remove common prefixes
         prefixes_to_remove = [
-            "Answer:", "The answer is:", "Response:", "Result:",
             "Final answer:", "Solution:", "A:", "Answer is:"
         ]
         for prefix in prefixes_to_remove:
             if response.lower().startswith(prefix.lower()):
                 response = response[len(prefix):].strip()
-        # Remove quotes if they wrap the entire answer
         if response.startswith('"') and response.endswith('"'):
             response = response[1:-1]
         return response
     def get_questions(self) -> List[Dict]:
-        """Get questions from GAIA API"""
         try:
-            response = requests.get(f"{GAIA_API_BASE}/questions", timeout=30)
-            if response.status_code == 200:
-                return response.json()
-            else:
-                return []
-        except:
             return []
     def get_random_question(self) -> Dict:
-        """Get a random question from GAIA API"""
         try:
-            response = requests.get(f"{GAIA_API_BASE}/random-question", timeout=30)
-            if response.status_code == 200:
-                return response.json()
-            else:
-                return {}
-        except:
             return {}
     def submit_answers(self, username: str, agent_code: str, answers: List[Dict]) -> Dict:
-        """Submit answers to GAIA for scoring"""
         try:
             payload = {
                 "username": username,
                 "agent_code": agent_code,
                 "answers": answers
             }
-            response = requests.post(
-                f"{GAIA_API_BASE}/submit",
-                json=payload,
-                timeout=60
-            )
-            if response.status_code == 200:
-                return response.json()
-            else:
-                return {"error": f"Submission failed: {response.status_code}"}
         except Exception as e:
             return {"error": f"Submission error: {str(e)}"}
-# Initialize the agent
 agent = GAIAAgent(api_key=OPENROUTER_API_KEY)
 def run_agent(prompt: str) -> str:
-    """Main function that GAIA will call"""
     return agent.run_agent(prompt)
 def test_single_question():
-    """Test the agent with a single question"""
     question = agent.get_random_question()
     if question:
-        answer = run_agent(question.get("Question", ""))
-        return f"Question: {question.get('Question', '')}\nAnswer: {answer}"
     return "Failed to get question"
 def run_full_evaluation(username: str, progress=gr.Progress()):
-    """Run full GAIA evaluation"""
     if not username:
         return "Please provide your Hugging Face username"
     if not OPENROUTER_API_KEY:
         return "Please configure your OpenRouter API key"
-    progress(0.1, desc="Getting questions...")
-    # Get all questions
     questions = agent.get_questions()
     if not questions:
         return "Failed to retrieve questions from GAIA API"
-    progress(0.2, desc=f"Processing {len(questions)} questions...")
-    # Process each question
     answers = []
     for i, question in enumerate(questions):
-        progress(0.2 + (0.7 * i / len(questions)), desc=f"Processing question {i+1}/{len(questions)}")
         task_id = question.get("task_id", "")
-        question_text = question.get("Question", "")
-        if question_text:
-            answer = run_agent(question_text)
-            answers.append({
-                "task_id": task_id,
-                "submitted_answer": answer
-            })
-        # Small delay to avoid rate limiting
-        time.sleep(0.5)
     progress(0.9, desc="Submitting answers...")
-    # Submit answers
     agent_code = f"https://huggingface.co/spaces/{username}/Gaia-Test-Agent/tree/main"
     result = agent.submit_answers(username, agent_code, answers)
-    progress(1.0, desc="Complete!")
     if "error" in result:
         return f"Submission failed: {result['error']}"
-    else:
-        score = result.get("score", 0)
-        return f"Evaluation complete!\nScore: {score}%\nAnswers submitted: {len(answers)}\nCheck the leaderboard for your ranking!"
-# Create Gradio interface
 def create_gradio_app():
     with gr.Blocks(title="GAIA Test Agent", theme=gr.themes.Soft()) as app:
         gr.HTML("""
-        <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
             <h1>GAIA Test Agent</h1>
             <p>AI Agent for GAIA Benchmark Evaluation</p>
         </div>
         """)
         with gr.Tab("Single Question Test"):
-            test_btn = gr.Button("Test Random Question", variant="primary")
-            test_output = gr.Textbox(
-                label="Test Result",
-                lines=10,
-                placeholder="Test results will appear here..."
-            )
-            test_btn.click(
-                fn=test_single_question,
-                outputs=[test_output]
-            )
         with gr.Tab("Full Evaluation"):
-            gr.Markdown("### Run Full GAIA Evaluation")
-            username_input = gr.Textbox(
-                label="Hugging Face Username",
-                placeholder="Enter your HF username",
-                info="This will be used for the leaderboard"
-            )
             eval_btn = gr.Button("Run Full Evaluation", variant="primary")
-            eval_output = gr.Textbox(
-                label="Evaluation Results",
-                lines=15,
-                placeholder="Evaluation results will appear here..."
-            )
-            eval_btn.click(
-                fn=run_full_evaluation,
-                inputs=[username_input],
-                outputs=[eval_output],
-                show_progress=True
-            )
-        with gr.Tab("Manual Testing"):
-            gr.Markdown("### Test Individual Questions")
-            manual_input = gr.Textbox(
-                label="Enter Question",
-                placeholder="Type a question to test...",
-                lines=3
-            )
-            manual_btn = gr.Button("Get Answer", variant="primary")
-            manual_output = gr.Textbox(
-                label="Answer",
-                lines=5,
-                placeholder="Answer will appear here..."
-            )
-            manual_btn.click(
-                fn=run_agent,
-                inputs=[manual_input],
-                outputs=[manual_output]
-            )
         gr.Markdown("""
-        ### How to Use:
-        1. **Single Question Test**: Test your agent with one random question from GAIA
-        2. **Full Evaluation**: Run the complete evaluation and submit to leaderboard
-        3. **Manual Testing**: Test your agent with custom questions
-        ### Requirements:
-        - Set your OpenRouter API key in Space secrets as `OPENROUTER_API_KEY`
-        - Keep your Space public for leaderboard verification
-        - Your HF username will appear on the leaderboard
         """)
     return app
-# Launch the app
 if __name__ == "__main__":
     app = create_gradio_app()
     if os.getenv("SPACE_ID"):
         app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
     else:
-        app.launch(share=True, show_api=False)

 # Your OpenRouter API key
 OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key")
+# Base URL of the GAIA leaderboard API (replace it if the official endpoint differs)
+GAIA_API_BASE = "https://gaia-leaderboard.fly.dev"
+# -----------------------------
+# OpenRouter LLM Wrapper
+# -----------------------------
 class OpenRouterLLM:
     def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
         self.api_key = api_key
         self.base_url = "https://openrouter.ai/api/v1/chat/completions"
     def __call__(self, prompt: str, max_tokens: int = 1000, temperature: float = 0.1) -> str:
         if not self.api_key or not self.api_key.startswith('sk-or-v1-'):
             return "Error: Invalid OpenRouter API key"
                     "content": """You are a helpful AI assistant designed to answer questions accurately and concisely.
 For GAIA evaluation, provide EXACT answers without explanation unless asked.
 - For math questions, give just the number
+- For yes/no questions, give just "Yes" or "No"
 - For factual questions, give just the fact
+Be precise and direct."""
                 },
+                {"role": "user", "content": prompt}
             ],
             "temperature": temperature,
             "max_tokens": max_tokens,
         try:
             response = requests.post(
+                self.base_url,
+                headers=headers,
                 json=payload,
                 timeout=30
             )
                 return f"API Error: {response.status_code}"
             result = response.json()
             if "choices" in result and len(result["choices"]) > 0:
                 return result["choices"][0]["message"]["content"].strip()
             else:
                 return "Error: No response content received"
         except Exception as e:
             return f"Error: {str(e)}"
+# -----------------------------
+# GAIA Agent
+# -----------------------------
 class GAIAAgent:
     """Answer GAIA questions with an LLM and exchange data with the GAIA API.

     The agent wraps an ``OpenRouterLLM`` for answering and uses ``requests``
     against ``GAIA_API_BASE`` to fetch questions/files and submit answers.
     All network helpers are best-effort: they log the problem with ``print``
     and return an empty/neutral value instead of raising.
     """

     # Lead-in phrases that models tend to put in front of the actual answer.
     _ANSWER_PREFIXES = (
         "Answer:", "The answer is:", "Response:", "Result:",
         "Final answer:", "Solution:", "A:", "Answer is:",
     )

     def __init__(self, api_key: str):
         """Create the agent and its LLM client from an OpenRouter API key."""
         self.llm = OpenRouterLLM(api_key=api_key)
         self.api_key = api_key

     def run_agent(self, prompt: str) -> str:
         """Ask the LLM for a bare answer to *prompt* and return it cleaned.

         Returns:
             The cleaned answer text, or ``"Error: <message>"`` if the LLM
             call or the clean-up raised.
         """
         try:
             enhanced_prompt = (
                 f"\nQuestion: {prompt}\n"
                 "Analyze this question carefully and provide the exact answer. "
                 "Do not include explanations, reasoning, or extra text.\n"
                 "Your answer:"
             )
             response = self.llm(enhanced_prompt, max_tokens=500, temperature=0.1)
             return self.clean_answer(response)
         except Exception as e:
             return f"Error: {str(e)}"

     def clean_answer(self, response: str) -> str:
         """Strip lead-in phrases and wrapping double quotes from *response*.

         Prefixes are matched case-insensitively and removed repeatedly, so a
         stacked lead-in such as ``"Final answer: Answer: 42"`` yields ``"42"``
         regardless of the order of the prefix table.
         """
         cleaned = response.strip()

         # Each pass removes at least one non-empty prefix, so this terminates.
         removed_prefix = True
         while removed_prefix:
             removed_prefix = False
             for prefix in self._ANSWER_PREFIXES:
                 # Compare on a slice of the original text so the cut position
                 # always matches the characters that were actually compared.
                 if cleaned[:len(prefix)].lower() == prefix.lower():
                     cleaned = cleaned[len(prefix):].strip()
                     removed_prefix = True
                     break

         # Only unwrap when there is an opening AND a closing quote.
         if len(cleaned) >= 2 and cleaned.startswith('"') and cleaned.endswith('"'):
             cleaned = cleaned[1:-1].strip()
         return cleaned

     def get_questions(self) -> List[Dict]:
         """Fetch the question list from ``{GAIA_API_BASE}/questions``.

         Returns:
             The decoded JSON list, or ``[]`` on a non-200 status, a payload
             that is not a list, or any request/decoding error.
         """
         try:
             resp = requests.get(f"{GAIA_API_BASE}/questions", timeout=30)
             if resp.status_code != 200:
                 print("Error fetching questions: HTTP", resp.status_code)
                 return []
             data = resp.json()
             if not isinstance(data, list):
                 # Callers iterate the result and call .get() on each item.
                 print("Error fetching questions: unexpected payload type", type(data).__name__)
                 return []
             return data
         except Exception as e:
             print("Error fetching questions:", e)
             return []

     def get_random_question(self) -> Dict:
         """Fetch one question from ``{GAIA_API_BASE}/random-question``.

         Returns:
             The decoded JSON object, or ``{}`` on a non-200 status, a payload
             that is not an object, or any request/decoding error.
         """
         try:
             resp = requests.get(f"{GAIA_API_BASE}/random-question", timeout=30)
             if resp.status_code != 200:
                 print("Error fetching random question: HTTP", resp.status_code)
                 return {}
             data = resp.json()
             if not isinstance(data, dict):
                 print("Error fetching random question: unexpected payload type", type(data).__name__)
                 return {}
             return data
         except Exception as e:
             print("Error fetching random question:", e)
             return {}

     # The return annotation is quoted so it is never evaluated at import time,
     # which keeps it valid on interpreters without the ``X | None`` syntax.
     def get_file(self, task_id: str) -> "bytes | None":
         """Download the file attached to *task_id*.

         Returns:
             The raw response body on HTTP 200, otherwise ``None`` (non-200
             status or any request error).
         """
         try:
             resp = requests.get(f"{GAIA_API_BASE}/files/{task_id}", timeout=60)
             if resp.status_code == 200:
                 return resp.content
             print("Error fetching file: HTTP", resp.status_code)
             return None
         except Exception as e:
             print("Error fetching file:", e)
             return None

     def submit_answers(self, username: str, agent_code: str, answers: List[Dict]) -> Dict:
         """POST the collected answers to ``{GAIA_API_BASE}/submit``.

         Args:
             username: Account name sent in the payload.
             agent_code: URL of the agent's code sent in the payload.
             answers: Answer records to submit.

         Returns:
             The decoded JSON object on HTTP 200; otherwise a dict with a
             single ``"error"`` key describing what went wrong.
         """
         try:
             payload = {
                 "username": username,
                 "agent_code": agent_code,
                 "answers": answers
             }
             resp = requests.post(f"{GAIA_API_BASE}/submit", json=payload, timeout=60)
             if resp.status_code != 200:
                 return {"error": f"Submission failed: {resp.status_code}"}
             result = resp.json()
             if not isinstance(result, dict):
                 # Callers test '"error" in result' and call result.get(...).
                 return {"error": "Submission returned an unexpected response"}
             return result
         except Exception as e:
             return {"error": f"Submission error: {str(e)}"}
+# -----------------------------
+# Helper Functions
+# -----------------------------
 # Single module-level agent shared by every helper and UI callback below.
 agent = GAIAAgent(api_key=OPENROUTER_API_KEY)
 def run_agent(prompt: str) -> str:
     """Answer *prompt* using the shared module-level ``agent``.

     Thin wrapper around ``agent.run_agent`` so the answer function can be
     referenced as a plain module-level callable.
     """
     return agent.run_agent(prompt)
 def test_single_question():
     """Fetch one random GAIA question, answer it, and format the result.

     Returns:
         ``"Q: <question>\\nA: <answer>"`` on success, or a failure message
         when no question could be fetched or it carried no question text.
     """
     question = agent.get_random_question()
     if not question:
         return "Failed to get question"

     q_text = question.get("Question", "")
     if not q_text:
         # Do not spend an LLM call on an empty prompt.
         return "Failed to get question: response contained no question text"

     answer = run_agent(q_text)
     return f"Q: {q_text}\nA: {answer}"
 def run_full_evaluation(username: str, progress=gr.Progress()):
     """Answer every GAIA question and submit the answers for scoring.

     Args:
         username: Hugging Face username sent with the submission.
         progress: Gradio progress tracker, called with a fraction and a
             description at each stage.

     Returns:
         A human-readable status string: either the score summary or the
         reason the run stopped early.
     """
     username = (username or "").strip()
     if not username:
         return "Please provide your Hugging Face username"
     if not OPENROUTER_API_KEY:
         return "Please configure your OpenRouter API key"

     progress(0.1, desc="Fetching questions...")
     questions = agent.get_questions()
     if not questions:
         return "Failed to retrieve questions from GAIA API"

     total = len(questions)
     answers = []
     for i, question in enumerate(questions):
         progress(0.2 + (0.7 * i / total), desc=f"Processing {i+1}/{total}")
         q_text = question.get("Question", "")
         if not q_text:
             # Nothing to ask the model; skip without the rate-limit pause.
             continue
         ans = run_agent(q_text)
         answers.append({"task_id": question.get("task_id", ""), "submitted_answer": ans})
         time.sleep(0.5)  # avoid hammering

     if not answers:
         return "No questions with text were returned by the GAIA API; nothing was submitted"

     progress(0.9, desc="Submitting answers...")
     # Prefer the real Space id when SPACE_ID is set (expected "owner/name");
     # otherwise keep the previous hard-coded Space name as the fallback.
     space_id = os.getenv("SPACE_ID") or f"{username}/Gaia-Test-Agent"
     agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
     result = agent.submit_answers(username, agent_code, answers)
     progress(1.0, desc="Done")

     if "error" in result:
         return f"Submission failed: {result['error']}"
     score = result.get("score", 0)
     return f"Evaluation complete!\nScore: {score}%\nAnswers submitted: {len(answers)}"
+# -----------------------------
+# Gradio App
+# -----------------------------
 def create_gradio_app():
     """Build the Gradio Blocks interface and return it without launching.

     The layout is a header banner, three tabs (random-question test, full
     evaluation, manual question) and a block of usage instructions.
     """
     banner_html = """
         <div style="text-align:center;background:linear-gradient(90deg,#667eea,#764ba2);color:white;padding:20px;border-radius:10px;margin-bottom:20px;">
             <h1>GAIA Test Agent</h1>
             <p>AI Agent for GAIA Benchmark Evaluation</p>
         </div>
         """
     usage_notes = """
         ### Instructions
         - **Single Question Test**: Try a random GAIA question.
         - **Full Evaluation**: Run through the 20 evaluation questions and submit.
         - **Manual Test**: Try custom questions.
         Requirements:
         - Set your `OPENROUTER_API_KEY` in Space secrets.
         - Keep your Space public for leaderboard verification.
         """

     with gr.Blocks(title="GAIA Test Agent", theme=gr.themes.Soft()) as demo:
         gr.HTML(banner_html)

         # Tab 1: answer one random question fetched from the API.
         with gr.Tab("Single Question Test"):
             random_btn = gr.Button("Test Random Question", variant="primary")
             random_result = gr.Textbox(label="Result", lines=8)
             random_btn.click(
                 fn=test_single_question,
                 outputs=[random_result],
             )

         # Tab 2: run every question and submit the answers.
         with gr.Tab("Full Evaluation"):
             username_box = gr.Textbox(label="HF Username", placeholder="Enter your username")
             evaluate_btn = gr.Button("Run Full Evaluation", variant="primary")
             evaluate_result = gr.Textbox(label="Results", lines=12)
             evaluate_btn.click(
                 fn=run_full_evaluation,
                 inputs=[username_box],
                 outputs=[evaluate_result],
                 show_progress=True,
             )

         # Tab 3: answer a question typed by the user.
         with gr.Tab("Manual Test"):
             question_box = gr.Textbox(label="Enter Question", lines=3)
             answer_btn = gr.Button("Get Answer", variant="primary")
             answer_box = gr.Textbox(label="Answer", lines=4)
             answer_btn.click(
                 fn=run_agent,
                 inputs=[question_box],
                 outputs=[answer_box],
             )

         gr.Markdown(usage_notes)

     return demo
 if __name__ == "__main__":
     app = create_gradio_app()
     # When SPACE_ID is set, bind to all interfaces on port 7860; otherwise
     # launch with a public share link.
     launch_options = (
         {"server_name": "0.0.0.0", "server_port": 7860}
         if os.getenv("SPACE_ID")
         else {"share": True}
     )
     app.launch(show_api=False, **launch_options)