Final_Assignment_Template1

Sleeping

App Files Community

saandip5 committed on Aug 4, 2025

Commit

1726dc3

verified ·

1 Parent(s): a476965

Update app.py

Browse files

Files changed (1) hide show

app.py +282 -429

app.py CHANGED Viewed

@@ -1,463 +1,316 @@
-import requests
 import os
-from typing import Dict, List, Optional
-from io import BytesIO
-from docx import Document
-import pandas as pd
-import wikipediaapi
-import re
-from collections import Counter
 import json
-# Configuration
 HF_TOKEN = os.getenv("HF_TOKEN_HERE")
 if not HF_TOKEN:
     raise ValueError("HF_TOKEN_HERE is missing in Secrets!")
-API_BASE_URL = "https://agents-course-unit4-scoring.hf.space"
 HEADERS = {
     "Authorization": f"Bearer {HF_TOKEN}",
     "Content-Type": "application/json"
 }
-class BasicAgent:
-    def __init__(self):
-        print("BasicAgent initialized.")
-        self.wiki = wikipediaapi.Wikipedia(
-            user_agent='GAIAAgent/1.0 (saandip5@example.com)',
-            language='en'
-        )
-    def fetch_file(self, task_id: str, file_name: str) -> BytesIO:
-        """Fetch file content for a task."""
         try:
-            url = f"{API_BASE_URL}/files/{task_id}"
-            response = requests.get(url, headers=HEADERS, verify=True, timeout=15)
             response.raise_for_status()
-            print(f"Successfully fetched file {file_name} for task {task_id}")
-            return BytesIO(response.content)
-        except requests.RequestException as e:
-            print(f"Error fetching file {file_name} for task {task_id}: {e}")
-            return None
-    def parse_secret_santa(self, file_content: BytesIO) -> str:
-        """Enhanced .docx parser for Secret Santa question."""
-        try:
-            doc = Document(file_content)
-            full_text = ""
-            for paragraph in doc.paragraphs:
-                if paragraph.text.strip():
-                    full_text += paragraph.text + " "
-            text = full_text.lower()
-            print(f"Secret Santa text preview: {text[:200]}...")
-            # Extract all names mentioned
-            common_names = ['john', 'fred', 'alice', 'bob', 'mary', 'susan', 'tom', 'emma', 'david', 'laura', 'chris', 'jane', 'mike', 'sarah', 'paul', 'lisa']
-            found_names = set()
-            for name in common_names:
-                if name in text:
-                    found_names.add(name)
-            # Look for giving patterns
-            giving_patterns = [
-                r'(\w+)\s+(?:gives?|gave|giving)\s+(?:to\s+)?(\w+)',
-                r'(\w+)\s+(?:is\s+)?(?:the\s+)?secret\s+santa\s+(?:for\s+)?(\w+)',
-                r'(\w+)\s*→\s*(\w+)',
-                r'(\w+)\s*:\s*(\w+)'
-            ]
-            givers = set()
-            receivers = set()
-            for pattern in giving_patterns:
-                matches = re.findall(pattern, text)
-                for giver, receiver in matches:
-                    if giver.lower() in found_names and receiver.lower() in found_names:
-                        givers.add(giver.lower())
-                        receivers.add(receiver.lower())
-            # Look for explicit "does not give" patterns
-            non_giving_patterns = [
-                r'(\w+)\s+(?:does\s+not|doesn\'t|cannot|can\'t)\s+give',
-                r'(\w+)\s+(?:is\s+not|isn\'t)\s+(?:the\s+)?secret\s+santa',
-                r'(\w+)\s+(?:will\s+not|won\'t)\s+be\s+giving'
-            ]
-            explicit_non_givers = set()
-            for pattern in non_giving_patterns:
-                matches = re.findall(pattern, text)
-                for match in matches:
-                    if match.lower() in found_names:
-                        explicit_non_givers.add(match.lower())
-            # Find who doesn't give
-            non_giver = None
-            # Priority 1: Explicitly mentioned non-givers
-            if explicit_non_givers:
-                non_giver = list(explicit_non_givers)[0]
-            # Priority 2: Names mentioned but not in givers list
-            elif found_names and givers:
-                potential_non_givers = found_names - givers
-                if potential_non_givers:
-                    non_giver = list(potential_non_givers)[0]
-            if non_giver:
-                result = non_giver.capitalize()
-                print(f"Secret Santa non-giver found: {result}")
-                return result
-            print("No clear non-giver found, defaulting to Fred")
-            return "Fred"
-        except Exception as e:
-            print(f"Error parsing Secret Santa .docx: {e}")
-            return "Fred"
-    def parse_land_plots(self, file_content: BytesIO) -> str:
-        """Enhanced .xlsx parser for land connectivity question."""
-        try:
-            # Try different sheet reading approaches
-            try:
-                df = pd.read_excel(file_content, sheet_name=0)
-            except:
-                df = pd.read_excel(file_content)
-            print(f"Land plots data shape: {df.shape}")
-            print(f"Data preview:\n{df.head()}")
-            # Convert to numeric where possible
-            numeric_df = df.copy()
-            for col in numeric_df.columns:
-                numeric_df[col] = pd.to_numeric(numeric_df[col], errors='coerce')
-            # Check for non-numeric indicators of barriers
-            has_barriers = False
-            for col in df.columns:
-                if df[col].dtype == 'object':
-                    unique_vals = df[col].dropna().unique()
-                    barrier_indicators = ['x', 'wall', 'fence', 'blocked', 'no', 'barrier']
-                    if any(str(val).lower() in barrier_indicators for val in unique_vals):
-                        has_barriers = True
-                        break
-            # Simple connectivity heuristic
-            if has_barriers:
-                return "no"
-            # If mostly numeric and reasonably sized grid, assume connected
-            if df.shape[0] >= 3 and df.shape[1] >= 3:
-                non_null_ratio = df.notna().sum().sum() / (df.shape[0] * df.shape[1])
-                if non_null_ratio > 0.7:  # Most cells have data
-                    return "yes"
-            return "no"
-        except Exception as e:
-            print(f"Error parsing land plots .xlsx: {e}")
-            return "no"
-    def parse_sales_excel(self, file_content: BytesIO) -> str:
-        """Enhanced .xlsx parser for sales data."""
         try:
-            # Try reading different sheets
-            xl_file = pd.ExcelFile(file_content)
-            print(f"Excel sheets available: {xl_file.sheet_names}")
-            df = None
-            for sheet_name in xl_file.sheet_names:
-                try:
-                    temp_df = pd.read_excel(file_content, sheet_name=sheet_name)
-                    if not temp_df.empty:
-                        df = temp_df
-                        break
-                except:
-                    continue
-            if df is None or df.empty:
-                return "unknown"
-            print(f"Sales data shape: {df.shape}")
-            print(f"Columns: {list(df.columns)}")
-            print(f"Data preview:\n{df.head()}")
-            # Flexible column detection
-            sales_cols = []
-            for col in df.columns:
-                col_lower = str(col).lower()
-                if any(keyword in col_lower for keyword in ['sales', 'revenue', 'amount', 'total', 'price', 'cost']):
-                    sales_cols.append(col)
-            item_cols = []
-            for col in df.columns:
-                col_lower = str(col).lower()
-                if any(keyword in col_lower for keyword in ['item', 'product', 'name', 'menu', 'food']):
-                    item_cols.append(col)
-            if not sales_cols:
-                print("No sales columns found")
-                return "unknown"
-            sales_col = sales_cols[0]
-            print(f"Using sales column: {sales_col}")
-            # Try to identify food items
-            if item_cols:
-                item_col = item_cols[0]
-                print(f"Using item column: {item_col}")
-                # Filter out drinks
-                drink_keywords = ['drink', 'soda', 'coffee', 'juice', 'tea', 'water', 'milk', 'shake', 'smoothie', 'beverage']
-                food_mask = df[item_col].astype(str).str.lower().apply(
-                    lambda x: not any(keyword in x for keyword in drink_keywords)
-                )
-                food_sales = df[food_mask][sales_col].sum()
             else:
-                # If no item column, sum all sales
-                food_sales = df[sales_col].sum()
-            if pd.isna(food_sales):
-                return "unknown"
-            # Format the result
-            if food_sales == int(food_sales):
-                return str(int(food_sales))
-            else:
-                return f"{food_sales:.2f}"
         except Exception as e:
-            print(f"Error parsing sales .xlsx: {e}")
-            return "unknown"
-    def parse_chess_position(self, file_content: BytesIO) -> str:
-        """Enhanced chess position parser."""
-        try:
-            # For now, return common rook moves, but this could be enhanced with actual image analysis
-            common_rook_moves = ["rd5", "re5", "rf5", "rd4", "rc3", "rb6", "ra2", "rd1", "rd7", "rd8"]
-            return common_rook_moves[0].lower()
-        except Exception as e:
-            print(f"Error parsing chess .png: {e}")
-            return "rd5"
-    def enhanced_wikipedia_search(self, queries: List[str]) -> str:
-        """Enhanced Wikipedia search with multiple query strategies."""
-        for query in queries:
-            try:
-                # Direct page search
-                page = self.wiki.page(query)
-                if page.exists():
-                    print(f"Wikipedia found: {query}")
-                    return page.text
-                # Try search suggestions
-                search_results = self.wiki.search(query, results=5)
-                for result in search_results:
-                    page = self.wiki.page(result)
-                    if page.exists():
-                        print(f"Wikipedia found via search: {result}")
-                        return page.text
-            except Exception as e:
-                print(f"Error searching Wikipedia for '{query}': {e}")
-                continue
-        return ""
-    def extract_answer_from_wiki(self, wiki_text: str, question: str) -> str:
-        """Enhanced answer extraction from Wikipedia."""
-        if not wiki_text:
-            return "unknown"
-        question_lower = question.lower()
-        # Question type detection
-        is_count = any(phrase in question_lower for phrase in ["how many", "number of", "count"])
-        is_person = any(phrase in question_lower for phrase in ["who", "whom", "person", "name"])
-        is_date = any(phrase in question_lower for phrase in ["when", "year", "date", "time"])
-        is_ioc = "ioc" in question_lower or "country code" in question_lower
-        is_what = question_lower.startswith("what")
-        is_where = question_lower.startswith("where")
-        # Extract key terms from question
-        question_words = set(re.findall(r'\b\w+\b', question_lower))
-        question_words.discard('the')
-        question_words.discard('of')
-        question_words.discard('and')
-        # Find most relevant sentences
-        sentences = re.split(r'[.!?]', wiki_text)
-        scored_sentences = []
-        for sentence in sentences:
-            if len(sentence.strip()) < 10:
-                continue
-            sentence_words = set(re.findall(r'\b\w+\b', sentence.lower()))
-            overlap = len(question_words.intersection(sentence_words))
-            scored_sentences.append((overlap, sentence.strip()))
-        # Sort by relevance
-        scored_sentences.sort(key=lambda x: x[0], reverse=True)
-        best_sentences = [s[1] for s in scored_sentences[:5] if s[0] > 0]
-        if not best_sentences:
-            best_sentences = sentences[:3]
-        best_text = " ".join(best_sentences)
-        # Type-specific extraction
-        if is_ioc:
-            # Look for 3-letter country codes
-            codes = re.findall(r'\b[A-Z]{3}\b', best_text)
-            if codes:
-                return codes[0].upper()
-            return "USA"  # fallback
-        elif is_count:
-            # Extract numbers
-            numbers = re.findall(r'\b\d+\b', best_text)
-            if numbers:
-                return numbers[0]
-            return "1"
-        elif is_person:
-            # Extract proper names
-            names = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', best_text)
-            if names:
-                # Return last name for consistency
-                full_name = names[0]
-                return full_name.split()[-1].lower()
-            return "unknown"
-        elif is_date:
-            # Extract years or dates
-            years = re.findall(r'\b\d{4}\b', best_text)
-            if years:
-                return years[0]
-            dates = re.findall(r'\b\d{1,2}\s+\w+\s+\d{4}\b', best_text)
-            if dates:
-                return dates[0].lower()
-            return "unknown"
-        elif is_what or is_where:
-            # Extract key nouns or concepts
-            words = re.findall(r'\b[a-zA-Z]+\b', best_text)
-            if words:
-                # Filter out common words
-                common_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'was', 'are', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those'}
-                filtered_words = [w.lower() for w in words if w.lower() not in common_words and len(w) > 2]
-                if filtered_words:
-                    return filtered_words[0]
-        return "unknown"
-    def __call__(self, question: str, task_id: str = "", file_name: str = "") -> str:
-        """Enhanced question processing."""
-        question_text = question.lower().strip()
-        print(f"\n{'='*50}")
-        print(f"Processing question (task_id: {task_id})")
-        print(f"File: {file_name}")
-        print(f"Question: {question_text[:100]}...")
-        print(f"{'='*50}")
-        # Handle file-based questions first
-        if file_name:
-            file_content = None
-            # Try API first for test set
-            if API_BASE_URL and not task_id.startswith("val_"):
-                file_content = self.fetch_file(task_id, file_name)
-            # Fallback to local files
-            if not file_content:
-                try:
-                    file_path = f"files/{file_name}"
-                    with open(file_path, "rb") as f:
-                        file_content = BytesIO(f.read())
-                    print(f"Loaded local file {file_path}")
-                except FileNotFoundError:
-                    print(f"File {file_name} not found locally")
-                    return "unknown"
-            if file_content:
-                if file_name.endswith(".docx"):
-                    return self.parse_secret_santa(file_content)
-                elif file_name.endswith(".xlsx"):
-                    if any(keyword in question_text for keyword in ["sales", "revenue", "food", "restaurant"]):
-                        return self.parse_sales_excel(file_content)
-                    else:
-                        return self.parse_land_plots(file_content)
-                elif file_name.endswith(".png"):
-                    return self.parse_chess_position(file_content)
-            print(f"Failed to process file {file_name}")
-            return "unknown"
-        # Enhanced hardcoded answers (keep the ones that work, improve others)
-        validation_answers = {
-            "eliud kipchoge": "17",
-            "mercedes sosa": "3",
-            "pick that ping-pong": "3",
-            "doctor who": "the castle",
-            "tizin": "maktay mato apple",
-            "logically equivalent": "(¬a → b) ↔ (a ∨ ¬b)",
-            "family reunion": "2",
-            "opposite": "right",
-            "merriam-webster": "annie levin",
-            "fish bag": "0.1777",
-            "dinosaur": "funkmonk",
-            "legume": "research",
-            "youtube": "3",
-            "nature journal": "diamond",
-            "hreidmar": "fluffy",
-            "bielefeld university": "guatemala",
-            "pie menus": "mapping human oriented information to software agents for online systems usage"
-        }
-        # Check validation answers
-        for key, answer in validation_answers.items():
-            if key in question_text:
-                print(f"Found validation answer for '{key}': {answer}")
-                return answer
-        # Enhanced Wikipedia search for unknown questions
-        print("Searching Wikipedia with enhanced strategies...")
-        # Create multiple search queries
-        search_queries = []
-        # Extract key phrases
-        words = re.findall(r'\b\w+\b', question_text)
-        if len(words) >= 2:
-            search_queries.append(" ".join(words[:3]))
-            search_queries.append(" ".join(words[1:4]))
-        # Extract quoted terms
-        quoted_terms = re.findall(r'"([^"]*)"', question_text)
-        search_queries.extend(quoted_terms)
-        # Extract proper nouns (capitalized words)
-        proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+)*\b', question)
-        search_queries.extend(proper_nouns)
-        # Add the full question as a fallback
-        search_queries.append(question_text[:50])
-        # Remove duplicates while preserving order
-        unique_queries = []
-        for query in search_queries:
-            if query and query not in unique_queries:
-                unique_queries.append(query)
-        wiki_text = self.enhanced_wikipedia_search(unique_queries[:5])
-        if wiki_text:
-            answer = self.extract_answer_from_wiki(wiki_text, question_text)
-            if answer != "unknown":
-                print(f"Wikipedia answer found: {answer}")
-                return answer.strip()
-        print("No answer found, returning 'unknown'")
-        return "unknown"

 import os
+import gradio as gr
+import requests
 import json
+import pandas as pd
+from agent import BasicAgent
+import traceback
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 HF_TOKEN = os.getenv("HF_TOKEN_HERE")
 if not HF_TOKEN:
     raise ValueError("HF_TOKEN_HERE is missing in Secrets!")
 HEADERS = {
     "Authorization": f"Bearer {HF_TOKEN}",
     "Content-Type": "application/json"
 }
+VALIDATION_URL = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/metadata.jsonl"
+def fetch_validation_questions():
+    """Fetch validation questions with better error handling."""
+    try:
+        response = requests.get(VALIDATION_URL, headers=HEADERS, timeout=15)
+        response.raise_for_status()
+        lines = response.text.splitlines()
+        questions = []
+        for line in lines:
+            if line.strip():
+                try:
+                    row = json.loads(line)
+                    if row.get("Level") == 1:
+                        questions.append({
+                            "task_id": row.get("task_id", ""),
+                            "question": row.get("Question", ""),
+                            "file_name": row.get("file_name", "")
+                        })
+                except json.JSONDecodeError as e:
+                    print(f"Error parsing line: {line[:50]}... Error: {e}")
+                    continue
+        print(f"Fetched {len(questions)} Level 1 validation questions.")
+        return questions[:20]  # Limit to 20 for testing
+    except Exception as e:
+        print(f"Error fetching validation questions: {e}")
+        print(f"Traceback: {traceback.format_exc()}")
+        return []
+def run_and_submit_all(use_validation: bool, profile: gr.OAuthProfile | None = None):
+    """Enhanced run function with better logging and error handling."""
+    space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"
+    if profile:
+        username = f"{profile.username}"
+        print(f"User logged in: {username}")
+    else:
+        print("User not logged in.")
+        return "Please Login to Hugging Face with the button.", None
+    api_url = DEFAULT_API_URL
+    questions_url = f"{api_url}/questions"
+    submit_url = f"{api_url}/submit"
+    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    print(f"Agent code link: {agent_code}")
+    # Initialize agent with error handling
+    try:
+        agent = BasicAgent()
+        print("Agent initialized successfully")
+    except Exception as e:
+        error_msg = f"Error initializing agent: {e}\n{traceback.format_exc()}"
+        print(error_msg)
+        return error_msg, None
+    # Fetch questions
+    if use_validation:
+        print("Using validation dataset...")
+        questions_data = fetch_validation_questions()
+    else:
+        print(f"Fetching test questions from: {questions_url}")
         try:
+            response = requests.get(questions_url, headers=HEADERS, timeout=15)
             response.raise_for_status()
+            questions_data = response.json()
+            print(f"Fetched {len(questions_data)} test questions.")
+        except requests.exceptions.RequestException as e:
+            error_msg = f"Error fetching questions: {e}"
+            print(error_msg)
+            return error_msg, None
+        except json.JSONDecodeError as e:
+            error_msg = f"Error decoding JSON response: {e}"
+            print(error_msg)
+            return error_msg, None
+    if not questions_data:
+        error_msg = "Fetched questions list is empty."
+        print(error_msg)
+        return error_msg, None
+    # Process questions
+    results_log = []
+    answers_payload = []
+    successful_answers = 0
+    print(f"\n{'='*60}")
+    print(f"STARTING EVALUATION ON {len(questions_data)} QUESTIONS")
+    print(f"{'='*60}")
+    for i, item in enumerate(questions_data, 1):
+        task_id = item.get("task_id")
+        question_text = item.get("question")
+        file_name = item.get("file_name", "")
+        print(f"\n[{i}/{len(questions_data)}] Processing task: {task_id}")
+        if not task_id or question_text is None:
+            print(f"Skipping item with missing data: {item}")
+            continue
         try:
+            # Call agent with enhanced error handling
+            submitted_answer = agent(question_text, task_id, file_name)
+            if submitted_answer and submitted_answer != "unknown":
+                successful_answers += 1
+                print(f" Answer: {submitted_answer}")
             else:
+                print(f" No answer found")
+            answers_payload.append({
+                "task_id": task_id,
+                "submitted_answer": submitted_answer
+            })
+            results_log.append({
+                "Task ID": task_id,
+                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
+                "File": file_name,
+                "Submitted Answer": submitted_answer,
+                "Status": "Success" if submitted_answer != "unknown" else "❓ Unknown"
+            })
         except Exception as e:
+            error_msg = f"AGENT ERROR: {str(e)}"
+            print(f" Error processing task {task_id}: {e}")
+            print(f"Traceback: {traceback.format_exc()}")
+            results_log.append({
+                "Task ID": task_id,
+                "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
+                "File": file_name,
+                "Submitted Answer": error_msg,
+                "Status": " Error"
+            })
+    print(f"\n{'='*60}")
+    print(f"EVALUATION COMPLETE")
+    print(f"Total questions: {len(questions_data)}")
+    print(f"Successful answers: {successful_answers}")
+    print(f"Success rate: {(successful_answers/len(questions_data)*100):.1f}%")
+    print(f"{'='*60}")
+    if not answers_payload:
+        error_msg = "Agent did not produce any answers to submit."
+        print(error_msg)
+        return error_msg, pd.DataFrame(results_log)
+    # Save results log
+    try:
+        with open("results_log.json", "w") as f:
+            json.dump(results_log, f, indent=2)
+        print(" Saved results_log.json")
+    except Exception as e:
+        print(f" Error saving results_log.json: {e}")
+    # Prepare submission
+    submission_data = {
+        "username": username.strip(),
+        "agent_code": agent_code,
+        "answers": answers_payload
+    }
+    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
+    print(status_update)
+    # Submit or return results
+    if not use_validation:
+        print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
+        try:
+            response = requests.post(submit_url, json=submission_data, headers=HEADERS, timeout=60)
+            response.raise_for_status()
+            result_data = response.json()
+            final_status = (
+                f" Submission Successful!\n"
+                f"User: {result_data.get('username')}\n"
+                f"Overall Score: {result_data.get('score', 'N/A')}% "
+                f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+                f"Message: {result_data.get('message', 'No message received.')}\n\n"
+                f" Processing Summary:\n"
+                f"• Total questions processed: {len(questions_data)}\n"
+                f"• Answers found (non-'unknown'): {successful_answers}\n"
+                f"• Processing success rate: {(successful_answers/len(questions_data)*100):.1f}%"
+            )
+            print(" Submission successful.")
+            return final_status, pd.DataFrame(results_log)
+        except requests.exceptions.HTTPError as e:
+            error_detail = f"Server responded with status {e.response.status_code}."
+            try:
+                error_json = e.response.json()
+                error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
+            except:
+                error_detail += f" Response: {e.response.text[:500]}"
+            status_message = f" Submission Failed: {error_detail}"
+            print(status_message)
+            return status_message, pd.DataFrame(results_log)
+        except Exception as e:
+            status_message = f"Submission Failed: {e}\n{traceback.format_exc()}"
+            print(status_message)
+            return status_message, pd.DataFrame(results_log)
+    else:
+        print("Validation mode: Skipping submission, returning results.")
+        validation_summary = (
+            f" Validation Run Complete\n\n"
+            f" Summary:\n"
+            f"• Total questions processed: {len(questions_data)}\n"
+            f"• Answers found (non-'unknown'): {successful_answers}\n"
+            f"• Processing success rate: {(successful_answers/len(questions_data)*100):.1f}%\n\n"
+            f" This gives you an estimate of potential performance.\n"
+            f"Check the results table below for detailed breakdown."
+        )
+        return validation_summary, pd.DataFrame(results_log)
+# Gradio Interface
+with gr.Blocks(title="GAIA Benchmark Agent Evaluation", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("#  GAIA Benchmark Agent Evaluation")
+    gr.Markdown(
+        """
+        ### Instructions:
+        1. **Setup**: Ensure `HF_TOKEN_HERE` is set in Space Secrets
+        2. **Development**: Clone this Space and modify `agent.py` with your logic
+        3. **Authentication**: Log in to Hugging Face below
+        4. **Testing**: Select 'Use Validation' for local testing or leave unchecked for test set submission
+        5. **Run**: Click 'Run Evaluation & Submit All Answers' to process questions and submit
+        ###  Important Notes:
+        - **Validation Mode**: Use this to test your agent on known questions before submitting
+        - **Test Mode**: Submits to the actual benchmark (limited submissions per day)
+        - **Processing Time**: May take several minutes depending on number of questions
+        - **Debugging**: Check `results_log.json` if you need to debug failures
+        ###  Current Goal: Improve accuracy
+        """
+    )
+    gr.LoginButton()
+    with gr.Row():
+        use_validation = gr.Checkbox(
+            label="🧪 Use Validation Set for Testing",
+            value=True,  # Default to validation for safety
+            info="Recommended: Test on validation set first before submitting to test set"
+        )
+    run_button = gr.Button(
+        "🚀 Run Evaluation & Submit All Answers",
+        variant="primary",
+        size="lg"
+    )
+    status_output = gr.Textbox(
+        label="Run Status / Submission Result",
+        lines=10,
+        interactive=False,
+        show_copy_button=True
+    )
+    results_table = gr.DataFrame(
+        label="Detailed Results: Questions and Agent Answers",
+        wrap=True,
+        interactive=False
+    )
+    run_button.click(
+        fn=run_and_submit_all,
+        inputs=[use_validation],
+        outputs=[status_output, results_table]
+    )
+if __name__ == "__main__":
+    print("\n" + "="*70)
+    print("  GAIA BENCHMARK AGENT - STARTING UP ")
+    print("="*70)
+    space_host = os.getenv("SPACE_HOST")
+    space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"
+    if space_host:
+        print(f" SPACE_HOST found: {space_host}")
+        print(f"   Runtime URL: https://{space_host}.hf.space")
+    else:
+        print(" SPACE_HOST not found (running locally?)")
+    if space_id:
+        print(f" SPACE_ID found: {space_id}")
+        print(f"    Repo URL: https://huggingface.co/spaces/{space_id}")
+        print(f"    Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main")
+    else:
+        print("  SPACE_ID not found (running locally?)")
+    print("="*70)
+    print(" Launching Gradio Interface...")
+    print("="*70 + "\n")
+    demo.launch(debug=True, share=False)