Mehedi2 committed on
Commit
8f02790
·
verified ·
1 Parent(s): a793777

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +170 -335
app.py CHANGED
@@ -1,400 +1,235 @@
1
  import os
2
- import requests
3
  import json
 
4
  import gradio as gr
5
- from typing import Dict, List, Any
6
 
 
7
  try:
8
  from datasets import load_dataset
9
  DATASETS_AVAILABLE = True
10
  except ImportError:
11
  DATASETS_AVAILABLE = False
 
12
 
13
- # Your OpenRouter API key
14
- OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") or os.getenv("my_key")
15
-
16
  class OpenRouterLLM:
17
- def __init__(self, api_key: str, model: str = "deepseek/deepseek-v3.1-terminus"):
18
- self.api_key = api_key
19
  self.model = model
20
- self.base_url = "https://openrouter.ai/api/v1/chat/completions"
21
-
22
- def __call__(self, prompt: str, max_tokens: int = 1000, temperature: float = 0.1) -> str:
23
- """Make API call to OpenRouter"""
24
-
25
- if not self.api_key or not self.api_key.startswith('sk-or-v1-'):
26
- return "Error: Invalid OpenRouter API key"
27
 
 
 
 
 
 
28
  headers = {
29
  "Authorization": f"Bearer {self.api_key}",
30
  "Content-Type": "application/json",
31
  }
32
-
33
  payload = {
34
  "model": self.model,
35
  "messages": [
36
- {
37
- "role": "system",
38
- "content": """You are a helpful AI assistant. Provide exact, direct answers without explanations unless specifically asked.
39
-
40
- Examples:
41
- - Math: "15 + 27" → "42"
42
- - Yes/No: "Is Paris the capital of France?" → "Yes"
43
- - Facts: "Capital of Japan?" → "Tokyo"
44
-
45
- Be concise and precise."""
46
- },
47
- {
48
- "role": "user",
49
- "content": prompt
50
- }
51
- ],
52
- "temperature": temperature,
53
- "max_tokens": max_tokens,
54
  }
55
 
56
  try:
57
- response = requests.post(self.base_url, headers=headers, json=payload, timeout=30)
58
-
59
- if response.status_code != 200:
60
- return f"API Error: {response.status_code}"
61
-
62
- result = response.json()
63
-
64
- if "choices" in result and len(result["choices"]) > 0:
65
- answer = result["choices"][0]["message"]["content"].strip()
66
- return self.clean_answer(answer)
67
- else:
68
- return "Error: No response content received"
69
-
70
  except Exception as e:
71
- return f"Error: {str(e)}"
72
-
73
- def clean_answer(self, response: str) -> str:
74
- """Clean the response to extract just the answer"""
75
- response = response.strip()
76
-
77
- # Remove common prefixes
78
- prefixes = [
79
- "Answer:", "The answer is:", "Response:", "Result:",
80
- "Final answer:", "Solution:", "A:", "Answer is:",
81
- "The final answer is:", "My answer is:"
82
- ]
83
-
84
- for prefix in prefixes:
85
- if response.lower().startswith(prefix.lower()):
86
- response = response[len(prefix):].strip()
87
- break
88
-
89
- # Remove quotes
90
- if (response.startswith('"') and response.endswith('"')) or (response.startswith("'") and response.endswith("'")):
91
- response = response[1:-1]
92
-
93
- # Remove trailing periods for short answers
94
- if len(response.split()) <= 3 and response.endswith('.'):
95
- response = response[:-1]
96
-
97
- return response
98
 
 
 
 
99
  class GAIADatasetLoader:
100
- def __init__(self, api_key: str):
101
- self.llm = OpenRouterLLM(api_key=api_key)
102
- self.questions = []
103
  self.dataset = None
104
-
 
105
  def load_gaia_dataset(self):
106
- """Load GAIA dataset properly"""
107
  if not DATASETS_AVAILABLE:
108
  return "Error: datasets library not available. Install with: pip install datasets"
109
-
110
  try:
111
- # Load the GAIA dataset - it has different subsets
112
  print("Loading GAIA dataset...")
113
-
114
- # The GAIA dataset structure from HuggingFace
115
- dataset = load_dataset("gaia-benchmark/GAIA", "2023_all")
116
-
117
- # Check available splits
118
  available_splits = list(dataset.keys())
119
  print(f"Available splits: {available_splits}")
120
 
121
- # Use validation split if available, otherwise use test
122
  if "validation" in available_splits:
123
  self.dataset = dataset["validation"]
124
  elif "test" in available_splits:
125
- self.dataset = dataset["test"]
126
  else:
127
- # Use the first available split
128
  split_name = available_splits[0]
129
  self.dataset = dataset[split_name]
130
 
131
- # Convert to our format
132
  self.questions = []
133
- level_1_count = 0
134
-
135
- for i, item in enumerate(self.dataset):
136
- # Extract fields from the dataset
137
- task_id = item.get("task_id", f"gaia_{i}")
138
- question = item.get("Question", "")
139
- final_answer = item.get("Final answer", "")
140
- level = item.get("Level", 1) # Default to level 1
141
-
142
- # Only include level 1 questions for the leaderboard
143
- if level == 1 and question and final_answer:
144
- self.questions.append({
145
- "task_id": task_id,
146
- "Question": question,
147
- "Final answer": str(final_answer),
148
- "Level": level,
149
- "file_name": item.get("file_name", ""),
150
- "file_path": item.get("file_path", "")
151
- })
152
- level_1_count += 1
153
-
154
- # Limit to 20 questions for the leaderboard
155
- if level_1_count >= 20:
156
- break
157
-
158
- if not self.questions:
159
- return self.create_fallback_questions("No level 1 questions found in dataset")
160
-
161
- return f"Successfully loaded {len(self.questions)} level 1 GAIA questions"
162
 
 
 
163
  except Exception as e:
164
  print(f"Dataset loading error: {e}")
165
  return self.create_fallback_questions(str(e))
166
 
167
- def create_fallback_questions(self, error_msg: str):
168
- """Create fallback questions if dataset loading fails"""
169
  self.questions = [
170
- {"task_id": "fallback_1", "Question": "What is 15 + 27?", "Final answer": "42", "Level": 1},
171
- {"task_id": "fallback_2", "Question": "What is the capital of France?", "Final answer": "Paris", "Level": 1},
172
- {"task_id": "fallback_3", "Question": "Is 64 divisible by 8?", "Final answer": "Yes", "Level": 1},
173
- {"task_id": "fallback_4", "Question": "What is 7 × 6?", "Final answer": "42", "Level": 1},
174
- {"task_id": "fallback_5", "Question": "What color do you get when you mix red and yellow?", "Final answer": "Orange", "Level": 1},
175
- {"task_id": "fallback_6", "Question": "How many days are in a week?", "Final answer": "7", "Level": 1},
176
- {"task_id": "fallback_7", "Question": "What is the square root of 16?", "Final answer": "4", "Level": 1},
177
- {"task_id": "fallback_8", "Question": "Is the Sun a star?", "Final answer": "Yes", "Level": 1},
178
- {"task_id": "fallback_9", "Question": "What is 100 - 37?", "Final answer": "63", "Level": 1},
179
- {"task_id": "fallback_10", "Question": "What is the largest planet in our solar system?", "Final answer": "Jupiter", "Level": 1}
180
  ]
181
-
182
- return f"Dataset loading failed: {error_msg}. Using {len(self.questions)} fallback questions for testing."
183
-
184
- def run_agent(self, prompt: str) -> str:
185
- """Main function that GAIA will call"""
186
- return self.llm(prompt, max_tokens=500, temperature=0.1)
187
-
188
- def get_random_question(self):
189
- """Get a random question"""
190
- if not self.questions:
191
- return None
192
-
193
- import random
194
- return random.choice(self.questions)
195
-
196
- def evaluate_agent(self, max_questions: int = None):
197
- """Evaluate the agent on loaded questions"""
198
- if not self.questions:
199
- return {"error": "No questions loaded"}
200
-
201
- eval_questions = self.questions[:max_questions] if max_questions else self.questions
202
-
203
- results = []
204
- correct = 0
205
-
206
- for q in eval_questions:
207
- question = q["Question"]
208
- expected = q["Final answer"]
209
-
210
- agent_answer = self.run_agent(question)
211
-
212
- # Exact match comparison (case-insensitive)
213
- is_correct = agent_answer.lower().strip() == expected.lower().strip()
 
 
 
 
 
214
  if is_correct:
215
  correct += 1
216
 
217
  results.append({
218
  "task_id": q["task_id"],
219
- "question": question,
220
  "expected": expected,
221
- "agent_answer": agent_answer,
222
  "correct": is_correct
223
  })
224
 
225
- score = (correct / len(eval_questions)) * 100 if eval_questions else 0
226
-
227
- return {
228
- "score": score,
229
- "correct": correct,
230
- "total": len(eval_questions),
231
- "results": results
232
- }
233
-
234
- # Initialize the agent
235
- agent = GAIADatasetLoader(api_key=OPENROUTER_API_KEY)
236
-
237
- def run_agent(prompt: str) -> str:
238
- """Main function for GAIA evaluation - this is what gets called"""
239
- return agent.run_agent(prompt)
240
-
241
- def load_dataset_action():
242
- """Load the GAIA dataset"""
243
- return agent.load_gaia_dataset()
244
-
245
- def test_random_question():
246
- """Test with a random question"""
247
- if not agent.questions:
248
- return "Please load the dataset first"
249
-
250
- q = agent.get_random_question()
251
- if not q:
252
- return "No questions available"
253
-
254
- question = q["Question"]
255
- expected = q["Final answer"]
256
- agent_answer = agent.run_agent(question)
257
- is_correct = agent_answer.lower().strip() == expected.lower().strip()
258
-
259
- return f"""Question: {question}
260
-
261
- Expected Answer: {expected}
262
- Agent Answer: {agent_answer}
263
- Correct: {'Yes' if is_correct else 'No'}
264
-
265
- Status: {'EXACT MATCH' if is_correct else 'NO MATCH'}"""
266
-
267
- def run_full_evaluation():
268
- """Run evaluation on all loaded questions"""
269
- if not agent.questions:
270
- return "Please load the dataset first"
271
-
272
- results = agent.evaluate_agent()
273
-
274
- if "error" in results:
275
- return f"Error: {results['error']}"
276
-
277
- summary = f"""GAIA Evaluation Results:
278
-
279
- Score: {results['score']:.1f}%
280
- Correct: {results['correct']}/{results['total']}
281
-
282
- Detailed Results:
283
- """
284
-
285
- for i, result in enumerate(results['results'][:10]): # Show first 10
286
- status_icon = "✓" if result['correct'] else "✗"
287
- summary += f"\n{i+1}. [{status_icon}] {result['question'][:60]}..."
288
- summary += f"\n Expected: {result['expected']}"
289
- summary += f"\n Got: {result['agent_answer']}"
290
- summary += "\n"
291
 
292
- if len(results['results']) > 10:
293
- summary += f"\n... and {len(results['results']) - 10} more questions"
 
 
294
 
295
- return summary
296
-
297
- def create_gradio_app():
298
- with gr.Blocks(title="GAIA Dataset Agent", theme=gr.themes.Soft()) as app:
299
-
300
- gr.HTML("""
301
- <div style="text-align: center; background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
302
- <h1>GAIA Dataset Agent</h1>
303
- <p>Load and evaluate on real GAIA questions</p>
304
- </div>
305
- """)
306
-
307
- # Status indicators
308
- with gr.Row():
309
- api_status = gr.HTML(
310
- f"""<div style="padding: 10px; border-radius: 5px; {'background: lightgreen;' if OPENROUTER_API_KEY else 'background: lightcoral;'}">
311
- API Key: {'✓ Configured' if OPENROUTER_API_KEY else '✗ Missing - Set OPENROUTER_API_KEY'}
312
- </div>"""
313
- )
314
-
315
- datasets_status = gr.HTML(
316
- f"""<div style="padding: 10px; border-radius: 5px; {'background: lightgreen;' if DATASETS_AVAILABLE else 'background: lightcoral;'}">
317
- Datasets Library: {'✓ Available' if DATASETS_AVAILABLE else '✗ Missing - Install datasets'}
318
- </div>"""
319
- )
320
 
321
  with gr.Tab("1. Load Dataset"):
322
- gr.Markdown("### Load GAIA Questions from HuggingFace")
323
-
324
- load_btn = gr.Button("Load GAIA Dataset", variant="primary", size="lg")
325
- load_output = gr.Textbox(
326
- label="Loading Status",
327
- lines=5,
328
- placeholder="Click to load GAIA questions..."
329
- )
330
-
331
- load_btn.click(fn=load_dataset_action, outputs=[load_output])
332
-
333
- with gr.Tab("2. Test Single"):
334
- gr.Markdown("### Test with Random Question")
335
-
336
- test_btn = gr.Button("Test Random Question", variant="primary")
337
- test_output = gr.Textbox(
338
- label="Test Result",
339
- lines=10,
340
- placeholder="Load dataset first, then test..."
341
- )
342
-
343
- test_btn.click(fn=test_random_question, outputs=[test_output])
344
-
345
- with gr.Tab("3. Full Evaluation"):
346
- gr.Markdown("### Run Complete Evaluation")
347
-
348
- eval_btn = gr.Button("Run Full Evaluation", variant="primary", size="lg")
349
- eval_output = gr.Textbox(
350
- label="Evaluation Results",
351
- lines=20,
352
- placeholder="Load dataset first, then run evaluation...",
353
- show_copy_button=True
354
- )
355
-
356
- eval_btn.click(fn=run_full_evaluation, outputs=[eval_output])
357
-
358
- with gr.Tab("4. Manual Test"):
359
- gr.Markdown("### Test Your Own Questions")
360
-
361
- manual_input = gr.Textbox(
362
- label="Enter Question",
363
- placeholder="What is 2 + 2?",
364
- lines=2
365
- )
366
-
367
- manual_btn = gr.Button("Get Answer", variant="primary")
368
- manual_output = gr.Textbox(
369
- label="Agent Answer",
370
- lines=3
371
- )
372
-
373
- manual_btn.click(fn=run_agent, inputs=[manual_input], outputs=[manual_output])
374
-
375
- gr.Markdown("""
376
- ### Instructions:
377
- 1. **Load Dataset**: Click to download GAIA questions from HuggingFace
378
- 2. **Test Single**: Try your agent on one random question
379
- 3. **Full Evaluation**: Get your score on all loaded questions
380
- 4. **Manual Test**: Test with your own questions
381
-
382
- ### Requirements:
383
- - OpenRouter API key in Space secrets as `OPENROUTER_API_KEY`
384
- - The `datasets` library (should be in requirements.txt)
385
-
386
- ### Notes:
387
- - Uses real GAIA level 1 questions (20 max for leaderboard)
388
- - Scoring is exact match only (case-insensitive)
389
- - Your `run_agent` function is ready for GAIA API integration
390
- """)
391
-
392
- return app
393
 
394
  if __name__ == "__main__":
395
- app = create_gradio_app()
396
-
397
- if os.getenv("SPACE_ID"):
398
- app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
399
- else:
400
- app.launch(share=True, show_api=False)
 
1
  import os
2
+ import re
3
  import json
4
+ import requests
5
  import gradio as gr
 
6
 
7
+ # Try importing datasets
8
  try:
9
  from datasets import load_dataset
10
  DATASETS_AVAILABLE = True
11
  except ImportError:
12
  DATASETS_AVAILABLE = False
13
+ print("⚠️ datasets library not found. Install with: pip install datasets")
14
 
15
+ # ===============================
16
+ # 1. Your Original LLM Wrapper
17
+ # ===============================
18
class OpenRouterLLM:
    """Minimal wrapper around the OpenRouter chat-completions HTTP API."""

    def __init__(self, api_key=None, model="deepseek/deepseek-v3.1-terminus"):
        # Fall back to the environment so Space secrets work out of the box.
        self.api_key = api_key or os.getenv("OPENROUTER_API_KEY")
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1"

        if not self.api_key:
            raise ValueError("Missing OpenRouter API key. Set OPENROUTER_API_KEY environment variable.")

    def generate(self, prompt, system_prompt="You are a helpful AI agent."):
        """Send a prompt to OpenRouter and return the model's response"""
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ]
        }

        try:
            # json= lets requests serialize the payload itself; the timeout
            # prevents the Gradio UI from hanging forever on a stalled call.
            response = requests.post(
                f"{self.base_url}/chat/completions",
                headers=headers,
                json=payload,
                timeout=60,
            )
            response.raise_for_status()
            data = response.json()
            return data["choices"][0]["message"]["content"].strip()
        except Exception as e:
            # Best-effort: surface the error as a string so the UI can
            # display it instead of crashing.
            print(f"LLM error: {e}")
            return f"Error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ # ===============================
55
+ # 2. GAIA Dataset Loader
56
+ # ===============================
57
class GAIADatasetLoader:
    """Loads GAIA benchmark questions from Hugging Face, with a toy fallback."""

    def __init__(self):
        self.dataset = None   # raw HF split once loaded
        self.questions = []   # normalized question dicts used by the agent

    def load_gaia_dataset(self):
        """Load GAIA dataset from Hugging Face"""
        if not DATASETS_AVAILABLE:
            return "Error: datasets library not available. Install with: pip install datasets"

        try:
            print("Loading GAIA dataset...")
            # GAIA is published with named configs; "2023_all" is the full set.
            dataset = load_dataset("gaia-benchmark/GAIA", "2023_all")

            available_splits = list(dataset.keys())
            print(f"Available splits: {available_splits}")

            # Prefer validation (answers available), then test, then whatever exists.
            if "validation" in available_splits:
                self.dataset = dataset["validation"]
            elif "test" in available_splits:
                self.dataset = dataset["test"]
            else:
                split_name = available_splits[0]
                self.dataset = dataset[split_name]

            # Pick the first up-to-20 items (GAIA leaderboard setup).
            # min() guards against splits with fewer than 20 rows, where
            # select(range(20)) would raise.
            self.questions = []
            limit = min(20, len(self.dataset))
            for item in self.dataset.select(range(limit)):
                self.questions.append({
                    "task_id": item["task_id"],
                    "Question": item["Question"],
                    "Final answer": str(item["Final answer"]),
                    "file_name": item.get("file_name", ""),
                    "file_path": item.get("file_path", "")
                })

            return f"✅ Successfully loaded {len(self.questions)} GAIA questions"

        except Exception as e:
            # Any failure (auth, network, schema) falls back to toy questions
            # so the rest of the UI remains usable.
            print(f"Dataset loading error: {e}")
            return self.create_fallback_questions(str(e))

    def create_fallback_questions(self, error_message=""):
        """Fallback: create toy questions if dataset fails"""
        self.questions = [
            {
                "task_id": "test_1",
                "Question": "What is 2+2?",
                "Final answer": "4"
            },
            {
                "task_id": "test_2",
                "Question": "What is the capital of France?",
                "Final answer": "Paris"
            }
        ]
        return f"⚠️ Using fallback questions. Error: {error_message}"
114
+
115
+ # ===============================
116
+ # 3. GAIA Agent (Evaluator)
117
+ # ===============================
118
class GAIAAgent:
    """Answers GAIA questions via the LLM and scores them by exact match."""

    def __init__(self, llm: "OpenRouterLLM", dataset_loader: "GAIADatasetLoader"):
        self.llm = llm
        self.dataset_loader = dataset_loader

    def clean_answer(self, answer: str):
        """Clean model output to keep only raw answer"""
        if not answer:
            return ""
        answer = answer.strip()
        # Remove "Answer:" or "Final answer:" prefixes
        answer = re.sub(r"(?i)^(final\s*answer|answer)\s*[:\-]?\s*", "", answer)
        return answer.strip()

    def answer_question(self, question_obj):
        """Ask LLM to answer one question"""
        q = question_obj["Question"]
        system_prompt = (
            "You are solving GAIA benchmark questions. "
            "Provide ONLY the final answer, no reasoning."
        )
        raw_answer = self.llm.generate(q, system_prompt)
        return self.clean_answer(raw_answer)

    def evaluate(self):
        """Evaluate all questions and compute accuracy"""
        results, correct = [], 0
        for q in self.dataset_loader.questions:
            agent_answer = self.answer_question(q)
            expected = str(q["Final answer"]).strip()

            # Exact match, case-insensitive: GAIA scoring should not fail an
            # answer purely on letter case (the previous revision lowered both
            # sides; this restores that behavior).
            is_correct = agent_answer.strip().lower() == expected.lower()
            if is_correct:
                correct += 1

            results.append({
                "task_id": q["task_id"],
                "question": q["Question"],
                "expected": expected,
                "answer": agent_answer,
                "correct": is_correct
            })

        accuracy = correct / len(results) if results else 0
        return results, accuracy
164
+
165
+ # ===============================
166
+ # 4. Gradio UI
167
+ # ===============================
168
def build_gradio_interface(agent, dataset_loader):
    """Assemble the Gradio demo: dataset loading, single-question testing,
    full evaluation, and manual answer checking."""

    def _load():
        # Tab 1 callback: pull GAIA questions into the loader.
        return dataset_loader.load_gaia_dataset()

    def _ask(question_text):
        # Tab 2 callback: answer one ad-hoc question.
        return agent.answer_question({"Question": question_text})

    def _run_eval():
        # Tab 3 callback: score every loaded question and format a report.
        results, acc = agent.evaluate()
        summary = f"✅ Accuracy: {acc*100:.1f}% ({sum(r['correct'] for r in results)}/{len(results)})\n\n"
        for r in results:
            summary += (
                f"\nQ: {r['question']}\n"
                f"Expected: {r['expected']} | Got: {r['answer']} | Correct: {r['correct']}\n"
            )
        return summary

    def _manual(question_text, expected_answer):
        # Tab 4 callback: compare the agent's answer against a user-supplied one.
        agent_answer = agent.answer_question({"Question": question_text})
        is_correct = agent_answer.strip() == expected_answer.strip()
        return f"Q: {question_text}\nExpected: {expected_answer}\nAgent: {agent_answer}\nCorrect: {is_correct}"

    with gr.Blocks() as demo:
        gr.Markdown("# 🤖 GAIA Agent Evaluation")

        with gr.Tab("1. Load Dataset"):
            out1 = gr.Textbox(label="Dataset Load Status")
            btn1 = gr.Button("Load GAIA Dataset")
            btn1.click(_load, outputs=out1)

        with gr.Tab("2. Test Single Question"):
            q_in = gr.Textbox(label="Enter a Question")
            ans_out = gr.Textbox(label="Agent Answer")
            btn2 = gr.Button("Get Answer")
            btn2.click(_ask, inputs=q_in, outputs=ans_out)

        with gr.Tab("3. Evaluate Full Dataset"):
            out3 = gr.Textbox(label="Evaluation Results", lines=20)
            btn3 = gr.Button("Run Evaluation")
            btn3.click(_run_eval, outputs=out3)

        with gr.Tab("4. Manual Evaluation"):
            q_in2 = gr.Textbox(label="Question")
            expected_in = gr.Textbox(label="Expected Answer")
            out4 = gr.Textbox(label="Evaluation Result")
            btn4 = gr.Button("Evaluate Agent Answer")
            btn4.click(_manual, inputs=[q_in2, expected_in], outputs=out4)

    return demo
217
+
218
+ # ===============================
219
+ # 5. Main
220
+ # ===============================
221
def main():
    """Wire the LLM, loader, and agent together, then launch the Gradio app."""
    api_key = os.getenv("OPENROUTER_API_KEY")
    if not api_key:
        print("⚠️ Set OPENROUTER_API_KEY before running.")
        return

    llm = OpenRouterLLM(api_key=api_key, model="deepseek/deepseek-v3.1-terminus")
    loader = GAIADatasetLoader()
    agent = GAIAAgent(llm, loader)

    demo = build_gradio_interface(agent, loader)
    if os.getenv("SPACE_ID"):
        # Running inside a Hugging Face Space: bind to the host/port the
        # Space runtime expects; share links are not usable there.
        demo.launch(server_name="0.0.0.0", server_port=7860)
    else:
        # Local run: expose a temporary public share link.
        demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
# Script entry point: only launch the app when run directly, not on import.
if __name__ == "__main__":
    main()