Spaces:

Mehedi2
/

new_assignment

Sleeping

App Files Files Community

Mehedi2 committed on Sep 28, 2025

Commit

58052c9

verified ·

1 Parent(s): e8d4bd6

Update app.py

Browse files

Files changed (1) hide show

app.py +127 -153

app.py CHANGED Viewed

@@ -1,18 +1,9 @@
 import os
-import re
 import json
 import requests
 import gradio as gr
-# Try importing datasets
-try:
-    from datasets import load_dataset
-    from huggingface_hub import login
-    DATASETS_AVAILABLE = True
-except ImportError:
-    DATASETS_AVAILABLE = False
-    print("⚠️ datasets library not found. Install with: pip install datasets huggingface_hub")
 # ===============================
 # 1. LLM Wrapper (Your Original)
 # ===============================
@@ -21,12 +12,10 @@ class OpenRouterLLM:
         self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
         self.model = model
         self.base_url = "https://openrouter.ai/api/v1"
         if not self.api_key:
             raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY environment variable.")
     def generate(self, prompt, system_prompt="You are a helpful AI agent."):
-        """Send a prompt to OpenRouter and return the model's response"""
         headers = {
             "Authorization": f"Bearer {self.api_key}",
             "Content-Type": "application/json",
@@ -36,185 +25,170 @@ class OpenRouterLLM:
             "messages": [
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": prompt}
-            ]
         }
         try:
-            response = requests.post(
-                f"{self.base_url}/chat/completions",
-                headers=headers,
-                data=json.dumps(payload)
-            )
             response.raise_for_status()
             data = response.json()
             return data["choices"][0]["message"]["content"].strip()
         except Exception as e:
-            print(f"LLM error: {e}")
             return f"Error: {e}"
 # ===============================
-# 2. GAIA Dataset Loader
 # ===============================
-class GAIADatasetLoader:
-    def __init__(self):
-        self.dataset = None
         self.questions = []
-    def load_gaia_dataset(self):
-        """Load GAIA dataset from HuggingFace with authentication"""
-        if not DATASETS_AVAILABLE:
-            return "Error: datasets library not available"
         try:
-            hf_token = os.getenv("HF_TOKEN")
-            if not hf_token:
-                return "Error: HF_TOKEN environment variable not set"
-            # Authenticate with HF Hub
-            login(token=hf_token)
-            # Load validation split
-            dataset = load_dataset(
-                "gaia-benchmark/GAIA",
-                split="validation",
-                use_auth_token=hf_token
-            )
-            self.questions = []
-            for i, item in enumerate(dataset.select(range(20))):  # max 20 for leaderboard
-                self.questions.append({
-                    "task_id": item["task_id"],
-                    "Question": item["Question"],
-                    "Final answer": str(item["Final answer"]),
-                    "file_name": item.get("file_name", ""),
-                    "file_path": item.get("file_path", "")
-                })
-            return f"✅ Successfully loaded {len(self.questions)} GAIA questions"
         except Exception as e:
-            print(f"Dataset loading error: {e}")
-            return self.create_fallback_questions(str(e))
-    def create_fallback_questions(self, error_message=""):
-        """Fallback: create toy questions if dataset fails"""
-        self.questions = [
-            {"task_id": "fallback_1", "Question": "What is 2+2?", "Final answer": "4"},
-            {"task_id": "fallback_2", "Question": "What is the capital of France?", "Final answer": "Paris"},
-        ]
-        return f"⚠️ Using fallback questions. Error: {error_message}"
-# ===============================
-# 3. GAIA Agent (Evaluator)
-# ===============================
-class GAIAAgent:
-    def __init__(self, llm: OpenRouterLLM, dataset_loader: GAIADatasetLoader):
-        self.llm = llm
-        self.dataset_loader = dataset_loader
     def clean_answer(self, answer: str):
-        """Clean model output to keep only raw answer"""
-        if not answer:
-            return ""
         answer = answer.strip()
-        # Remove "Answer:" or "Final answer:" prefixes
-        answer = re.sub(r"(?i)^(final\s*answer|answer)\s*[:\-]?\s*", "", answer)
         return answer.strip()
     def answer_question(self, question_obj):
-        """Ask LLM to answer one question"""
-        q = question_obj["Question"]
         system_prompt = (
             "You are solving GAIA benchmark questions. "
             "Provide ONLY the final answer, no reasoning."
         )
         raw_answer = self.llm.generate(q, system_prompt)
         return self.clean_answer(raw_answer)
-    def evaluate(self):
-        """Evaluate all questions and compute accuracy"""
-        results, correct = [], 0
-        for q in self.dataset_loader.questions:
-            agent_answer = self.answer_question(q)
-            expected = str(q["Final answer"]).strip()
-            is_correct = agent_answer.strip() == expected
             if is_correct:
                 correct += 1
             results.append({
-                "task_id": q["task_id"],
-                "question": q["Question"],
                 "expected": expected,
-                "answer": agent_answer,
                 "correct": is_correct
             })
-        accuracy = correct / len(results) if results else 0
-        return results, accuracy
 # ===============================
-# 4. Gradio UI
 # ===============================
-def build_gradio_interface(agent, dataset_loader):
-    def load_dataset_ui():
-        return dataset_loader.load_gaia_dataset()
-    def test_single_question(question_text):
-        return agent.answer_question({"Question": question_text})
-    def evaluate_agent():
-        results, acc = agent.evaluate()
-        summary = f"✅ Accuracy: {acc*100:.1f}% ({sum(r['correct'] for r in results)}/{len(results)})\n\n"
-        for r in results:
-            summary += f"\nQ: {r['question']}\nExpected: {r['expected']} | Got: {r['answer']} | Correct: {r['correct']}\n"
-        return summary
-    def manual_answer_eval(question_text, expected_answer):
-        agent_answer = agent.answer_question({"Question": question_text})
-        is_correct = agent_answer.strip() == expected_answer.strip()
-        return f"Q: {question_text}\nExpected: {expected_answer}\nAgent: {agent_answer}\nCorrect: {is_correct}"
-    with gr.Blocks() as demo:
-        gr.Markdown("# 🤖 GAIA Agent Evaluation")
-        with gr.Tab("1. Load Dataset"):
-            out1 = gr.Textbox(label="Dataset Load Status")
-            btn1 = gr.Button("Load GAIA Dataset")
-            btn1.click(load_dataset_ui, outputs=out1)
-        with gr.Tab("2. Test Single Question"):
-            q_in = gr.Textbox(label="Enter a Question")
-            ans_out = gr.Textbox(label="Agent Answer")
-            btn2 = gr.Button("Get Answer")
-            btn2.click(test_single_question, inputs=q_in, outputs=ans_out)
-        with gr.Tab("3. Evaluate Full Dataset"):
-            out3 = gr.Textbox(label="Evaluation Results", lines=20)
-            btn3 = gr.Button("Run Evaluation")
-            btn3.click(evaluate_agent, outputs=out3)
-        with gr.Tab("4. Manual Evaluation"):
-            q_in2 = gr.Textbox(label="Question")
-            expected_in = gr.Textbox(label="Expected Answer")
-            out4 = gr.Textbox(label="Evaluation Result")
-            btn4 = gr.Button("Evaluate Agent Answer")
-            btn4.click(manual_answer_eval, inputs=[q_in2, expected_in], outputs=out4)
-    return demo
 # ===============================
-# 5. Main
 # ===============================
-def main():
-    api_key = os.getenv("OPENROUTER_API_KEY")
-    if not api_key:
-        print("⚠️ Set OPENROUTER_API_KEY before running.")
-        return
-    llm = OpenRouterLLM(api_key=api_key)
-    loader = GAIADatasetLoader()
-    agent = GAIAAgent(llm, loader)
-    demo = build_gradio_interface(agent, loader)
-    demo.launch(share=True)
 if __name__ == "__main__":
-    main()

 import os
 import json
+import time
 import requests
 import gradio as gr
 # ===============================
 # 1. LLM Wrapper (Your Original)
 # ===============================
         self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
         self.model = model
         self.base_url = "https://openrouter.ai/api/v1"
         if not self.api_key:
             raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY environment variable.")
     def generate(self, prompt, system_prompt="You are a helpful AI agent."):
         headers = {
             "Authorization": f"Bearer {self.api_key}",
             "Content-Type": "application/json",
             "messages": [
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": prompt}
+            ],
+            "temperature": 0.1,
+            "max_tokens": 500
         }
         try:
+            response = requests.post(f"{self.base_url}/chat/completions", headers=headers, json=payload)
             response.raise_for_status()
             data = response.json()
             return data["choices"][0]["message"]["content"].strip()
         except Exception as e:
             return f"Error: {e}"
 # ===============================
+# 2. GAIA API Loader
 # ===============================
+GAIA_API_BASE = "https://gaia-benchmark-hf.fly.dev"
+class GAIAAgent:
     # Talks to the HTTP API at GAIA_API_BASE: downloads questions, asks the
     # wrapped LLM to answer them, scores the answers locally by exact string
     # match, and posts answers to the /submit endpoint.
+    def __init__(self, llm: OpenRouterLLM):
         # llm: object whose generate(prompt, system_prompt) returns a string.
+        self.llm = llm
         # Replaced by fetch_questions() on a successful download; empty until then.
         self.questions = []
+    def fetch_questions(self):
         # GET {GAIA_API_BASE}/questions with a 30 s timeout.
         # On HTTP 200 the decoded JSON is stored in self.questions.
         # Always returns a human-readable status string (success, HTTP failure
         # code, or the exception text); it never raises to the caller.
         # Presumably the JSON is a list of question dicts -- verify against the API.
         try:
+            resp = requests.get(f"{GAIA_API_BASE}/questions", timeout=30)
+            if resp.status_code == 200:
+                self.questions = resp.json()
+                return f"✅ Loaded {len(self.questions)} GAIA questions"
+            else:
+                return f"⚠️ Failed to fetch questions: {resp.status_code}"
         except Exception as e:
+            return f"⚠️ Error fetching questions: {e}"
+    def fetch_random_question(self):
         # GET {GAIA_API_BASE}/random-question with a 10 s timeout.
         # Returns the decoded JSON on HTTP 200, otherwise an empty dict.
         # NOTE(review): the bare `except:` below also catches KeyboardInterrupt
         # and SystemExit and discards the error detail; consider narrowing it.
+        try:
+            resp = requests.get(f"{GAIA_API_BASE}/random-question", timeout=10)
+            if resp.status_code == 200:
+                return resp.json()
+            else:
+                return {}
+        except:
+            return {}
     def clean_answer(self, answer: str):
         # Trim whitespace and drop a leading "Answer:", "Final answer:" or
         # "The answer is:" label (compared case-insensitively).
         # Each prefix is tested once, in list order, against the current value,
         # so a label exposed by a later strip is not re-checked against earlier ones.
         answer = answer.strip()
+        prefixes = ["Answer:", "Final answer:", "The answer is:"]
+        for prefix in prefixes:
+            if answer.lower().startswith(prefix.lower()):
+                answer = answer[len(prefix):].strip()
         return answer.strip()
     def answer_question(self, question_obj):
         # question_obj: mapping; only its "Question" key is read (default "").
         # Sends the question text to the LLM with a fixed GAIA system prompt and
         # returns the cleaned reply.
+        q = question_obj.get("Question", "")
         system_prompt = (
             "You are solving GAIA benchmark questions. "
             "Provide ONLY the final answer, no reasoning."
         )
         raw_answer = self.llm.generate(q, system_prompt)
         return self.clean_answer(raw_answer)
+    def evaluate_all(self):
         # Answers every entry in self.questions and scores it locally.
         # Returns {"error": ...} when no questions are loaded, otherwise
         # {"score": percent, "results": [...], "correct": n, "total": n}.
         # Scoring is an exact, case-sensitive comparison of stripped strings.
         # NOTE(review): if a question has no "Final answer" key, expected is ""
         # and any non-empty answer is counted as incorrect.
+        if not self.questions:
+            return {"error": "No questions loaded"}
+        results = []
+        correct = 0
+        for q in self.questions:
+            expected = str(q.get("Final answer", "")).strip()
+            answer = self.answer_question(q)
+            is_correct = answer.strip() == expected
             if is_correct:
                 correct += 1
             results.append({
+                "task_id": q.get("task_id"),
+                "question": q.get("Question"),
                 "expected": expected,
+                "answer": answer,
                 "correct": is_correct
             })
+        score = (correct / len(results)) * 100 if results else 0
+        return {"score": score, "results": results, "correct": correct, "total": len(results)}
+    def submit_answers(self, username, agent_code, answers):
         # POST {GAIA_API_BASE}/submit with a JSON body holding username,
         # agent_code and answers (60 s timeout).
         # Returns the decoded JSON response on HTTP 200; otherwise a dict with
         # a single "error" key (HTTP status or exception text). Never raises.
+        try:
+            payload = {
+                "username": username,
+                "agent_code": agent_code,
+                "answers": answers
+            }
+            resp = requests.post(f"{GAIA_API_BASE}/submit", json=payload, timeout=60)
+            if resp.status_code == 200:
+                return resp.json()
+            else:
+                return {"error": f"Submission failed: {resp.status_code}"}
+        except Exception as e:
+            return {"error": str(e)}
 # ===============================
+# 3. Gradio UI
 # ===============================
+llm = OpenRouterLLM()
 # Both objects are created at module import time (outside the __main__ guard)
 # and are shared by all UI callbacks defined below.
 # NOTE(review): OpenRouterLLM() is called with no arguments; the constructor
 # fragment shown earlier on this page raises ValueError when no API key is
 # available, which would then happen on import -- confirm this is intended.
+agent = GAIAAgent(llm)
+def load_questions_ui():
     # UI callback: delegates to the shared agent and returns its status string.
+    return agent.fetch_questions()
+def test_random_question_ui():
     # UI callback: fetches one random question, has the agent answer it, and
     # returns a two-line "Question / Answer" string.
     # An empty/falsy fetch result (the agent returns {} on any failure) yields
     # a fixed failure message instead.
+    q = agent.fetch_random_question()
+    if not q:
+        return "Failed to fetch a random question"
+    ans = agent.answer_question(q)
+    return f"Question: {q.get('Question')}\nAnswer: {ans}"
+def run_full_evaluation_ui(username):
     # UI callback: answers all loaded questions, submits them under `username`,
     # and returns a text summary.
     # Returns early with a message when no questions are loaded or when
     # evaluate_all() reports an error.
+    if not agent.questions:
+        return "Please load questions first."
+    results_data = agent.evaluate_all()
+    if "error" in results_data:
+        return results_data["error"]
     # Only task_id and the agent's answer are sent; the locally computed score
     # in results_data is not used in the returned summary.
+    answers_payload = [
+        {"task_id": r["task_id"], "submitted_answer": r["answer"]}
+        for r in results_data["results"]
+    ]
     # NOTE(review): the Space name "Gaia-Test-Agent" is hard-coded here; only
     # the username part of the URL varies.
+    agent_code = f"https://huggingface.co/spaces/{username}/Gaia-Test-Agent/tree/main"
+    submission_result = agent.submit_answers(username, agent_code, answers_payload)
     # NOTE(review): on a failed submission submit_answers returns {"error": ...},
     # so this reads 0 and the summary shows "Score: 0%" with the error only
     # visible inside the "Leaderboard info" dict.
+    score = submission_result.get("score", 0)
+    return f"Score: {score}%\nAnswers submitted: {len(answers_payload)}\nLeaderboard info: {submission_result}"
+def manual_test_ui(question_text):
     # UI callback: wraps free-form text in the {"Question": ...} shape that
     # answer_question() reads and returns the agent's cleaned answer.
+    return agent.answer_question({"Question": question_text})
+def build_gradio_app():
     # Builds and returns the Gradio Blocks app (it does not launch it).
     # Four tabs, each wiring one button to one module-level callback:
     #   Load Questions           -> load_questions_ui
     #   Random Question Test     -> test_random_question_ui
     #   Full Evaluation & Submit -> run_full_evaluation_ui (takes the username box)
     #   Manual Test              -> manual_test_ui (takes the question box)
+    with gr.Blocks() as app:
+        gr.Markdown("# 🤖 GAIA Benchmark Agent")
+        with gr.Tab("Load Questions"):
+            out_load = gr.Textbox(label="Status")
+            btn_load = gr.Button("Load GAIA Questions")
+            btn_load.click(load_questions_ui, outputs=out_load)
+        with gr.Tab("Random Question Test"):
+            out_test = gr.Textbox(label="Result", lines=6)
+            btn_test = gr.Button("Test Random Question")
+            btn_test.click(test_random_question_ui, outputs=out_test)
+        with gr.Tab("Full Evaluation & Submit"):
+            username_input = gr.Textbox(label="Your HF Username")
+            out_eval = gr.Textbox(label="Evaluation Result", lines=10)
+            btn_eval = gr.Button("Run Evaluation & Submit")
+            btn_eval.click(run_full_evaluation_ui, inputs=username_input, outputs=out_eval)
+        with gr.Tab("Manual Test"):
+            manual_input = gr.Textbox(label="Enter Question")
+            manual_output = gr.Textbox(label="Agent Answer", lines=4)
+            manual_btn = gr.Button("Get Answer")
+            manual_btn.click(manual_test_ui, inputs=manual_input, outputs=manual_output)
+    return app
 # ===============================
+# 4. Main
 # ===============================
 if __name__ == "__main__":
     # Script entry point: build the UI, then choose launch options from the
     # environment. When SPACE_ID is set, bind to all interfaces on port 7860;
     # otherwise launch with share=True.
     # Presumably SPACE_ID is set by the Hugging Face Spaces runtime -- confirm.
+    app = build_gradio_app()
+    if os.getenv("SPACE_ID"):
+        app.launch(server_name="0.0.0.0", server_port=7860)
+    else:
+        app.launch(share=True)