selim-ba committed on
Commit
99b5ec7
·
verified ·
1 Parent(s): 23b780e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +1262 -0
app.py ADDED
@@ -0,0 +1,1262 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import requests
4
+ import inspect
5
+ import pandas as pd
6
+ from langgraph.graph import StateGraph, END
7
+ from typing import TypedDict
8
+ import string
9
+
10
+
11
+ from transformers import pipeline
12
+ import re
13
+ import wikipedia
14
+ import wikipediaapi
15
+
16
+ import spacy
17
+
18
# Load the small English spaCy pipeline, fetching it on first run if absent.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # Model is not installed in this environment: download once, then retry.
    print("Downloading spaCy model 'en_core_web_sm'...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
24
+
25
+
26
+ # (Keep Constants as is)
27
+ # --- Constants ---
28
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
29
+
30
+ # --- Basic Agent Definition ---
31
+ # ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
32
+ # class BasicAgent:
33
+ # def __init__(self):
34
+ # print("BasicAgent initialized.")
35
+ # def __call__(self, question: str) -> str:
36
+ # print(f"Agent received question (first 50 chars): {question[:50]}...")
37
+ # fixed_answer = "This is a default answer."
38
+ # print(f"Agent returning fixed answer: {fixed_answer}")
39
+ # return fixed_answer
40
+
41
+
42
# --- Constants ---
# (duplicate removed: DEFAULT_API_URL is already defined above with the same value)
44
+
45
+
46
class SuperSmartAgent:
    """Wikipedia-backed question-answering agent driven by a LangGraph workflow."""

    def __init__(self):
        # Plain-text article extracts; the Wikipedia API expects a descriptive user agent.
        self.wiki_wiki = wikipediaapi.Wikipedia(
            language='en',
            extract_format=wikipediaapi.ExtractFormat.WIKI,
            user_agent='SelimResearchAgent/1.0',
        )
        # Build the graph last so its nodes see a fully initialized agent (wiki_wiki ready).
        self.graph = self._build_graph()
54
+
55
+ def _build_graph(self):
56
+ # Helper functions (can be class methods or nested as before)
57
def score_text(text):
    """Heuristic "readability" score for a string.

    Counts alphanumeric characters plus spaces, with a +5 bonus when the text
    ends in sentence punctuation. Used to decide whether a question reads
    better forwards or reversed.

    Fixes vs. original: an empty string no longer raises IndexError on
    ``text[-1]``, and the unused ``punctuation_count`` variable is removed.
    """
    if not text:
        return 0
    alnum_count = sum(c.isalnum() for c in text)
    space_count = text.count(' ')
    score = alnum_count + space_count
    if text[-1] in '.!?':
        score += 5
    return score
66
+
67
def check_reversed(state):
    """Set ``state['is_reversed']`` when the question scores better read backwards."""
    original = state["question"]
    flipped = original[::-1]
    state["is_reversed"] = score_text(flipped) > score_text(original)
    return state
77
+
78
def fix_question(state):
    """Restore normal reading order for a question flagged as reversed."""
    reversed_flag = state.get("is_reversed", False)
    if reversed_flag:
        state["question"] = "".join(reversed(state["question"]))
    return state
82
+
83
def check_riddle_or_trick(state):
    """Mark ``state['is_riddle']`` when the question contains riddle-like phrasing."""
    text = state["question"].lower()
    triggers = ("opposite of", "if you understand", "riddle",
                "trick question", "what comes next", "i speak without")
    state["is_riddle"] = False
    for trigger in triggers:
        if trigger in text:
            state["is_riddle"] = True
            break
    return state
88
+
89
def solve_riddle(state):
    """Answer a small set of hard-coded "opposite of the word X" riddles."""
    text = state["question"].lower()
    if "opposite of the word" not in text:
        state["response"] = "Could not solve riddle."
        return state
    # Checked in the same order as the original if/elif chain.
    opposites = (("left", "right"), ("up", "down"), ("hot", "cold"))
    for word, answer in opposites:
        if word in text:
            state["response"] = answer
            return state
    state["response"] = "Unknown opposite."
    return state
103
+
104
def check_python_suitability(state):
    """Flag questions that look answerable by generating a Python snippet."""
    text = state["question"].lower()
    code_markers = ("sum", "average", "count", "sort", "generate", "regex", "convert")
    state["is_python"] = False
    for marker in code_markers:
        if marker in text:
            state["is_python"] = True
            break
    return state
109
+
110
def generate_code(state):
    """Emit a canned Python snippet for a few recognized task keywords."""
    text = state["question"].lower()
    # Keyword order matches the original if/elif chain ("sum" wins over "average").
    snippets = (
        ("sum", "numbers = [1, 2, 3]\nprint(sum(numbers))"),
        ("average", "numbers = [1, 2, 3]\nprint(sum(numbers) / len(numbers))"),
        ("sort", "data = [3, 1, 2]\ndata.sort()\nprint(data)"),
    )
    for keyword, snippet in snippets:
        if keyword in text:
            state["response"] = snippet
            return state
    state["response"] = "# Code generation not implemented for this case."
    return state
121
+
122
def fallback(state):
    """Default node: used when no other route can handle the question."""
    state["response"] = "This question doesn't require Python or is unclear."
    return state
125
+
126
def check_reasoning_needed(state):
    """Flag questions whose wording suggests multi-step reasoning is required."""
    text = state["question"].lower()
    cues = ("whose", "only", "first", "after", "before",
            "no longer", "not", "but", "except")
    state["needs_reasoning"] = any(cue in text for cue in cues)
    return state
131
+
132
def check_wikipedia_suitability(state):
    """Flag factual questions that a Wikipedia lookup is likely to answer."""
    text = state["question"].lower()
    lookup_cues = (
        "wikipedia", "who is", "what is", "when did", "where is",
        "tell me about", "how many", "how much", "what was the",
        "describe", "explain", "information about", "details about",
        "history of", "facts about", "define", "give me data on",
    )
    state["is_wiki"] = any(cue in text for cue in lookup_cues)
    return state
142
+
143
+ # --- MODIFIED/NEW HELPER METHODS (NOW PART OF THE CLASS) ---
144
+ # These methods are now part of the SuperSmartAgent class,
145
+ # so they can access self.wiki_wiki and other class properties.
146
+
147
def get_relevant_context(self, question, search_results):
    """
    Build a context string from the top Wikipedia search results, keeping the
    article sections that best match the question's key phrases.

    Sections that look statistical are moved to the front; when no section
    scores, a short chunk of the raw page text is used instead.
    """
    if not search_results:
        return ""

    collected = []
    for title in search_results[:2]:  # top two results for broader coverage
        try:
            page = self.wiki_wiki.page(title)
            if not page.exists():
                continue
            full_text = page.text[:20000]  # cap raw text before scoring

            key_phrases = self.extract_key_phrases(question)
            # Split on major '== Heading ==' markers.
            sections = re.split(r'\n==\s*[^=]+\s*==\n', full_text)

            chosen = []
            for section in sections:
                lowered = section.lower()
                relevance = sum(1 for phrase in key_phrases if phrase.lower() in lowered)
                heading = re.search(r'==\s*([^=]+)\s*==', section)
                if heading and any(phrase.lower() in heading.group(1).lower()
                                   for phrase in key_phrases):
                    relevance += 5  # a heading match is a strong signal
                if relevance > 0:
                    if self.section_contains_statistics(section):
                        chosen.insert(0, section)  # statistics sections first
                    else:
                        chosen.append(section)

            if chosen:
                collected.append("\n\n".join(chosen))
            else:
                # Nothing specific matched: keep a smaller generic chunk.
                collected.append(full_text[:5000])
        except Exception as e:
            print(f"Error processing page '{title}': {e}")
            continue

    return "\n\n".join(collected)
203
+
204
def section_contains_statistics(self, section):
    """
    Heuristically decide whether a section of article text contains statistics.

    Plain keyword indicators are regex-escaped and matched on word boundaries
    (so 'era' does not match inside 'general'); the regex-style indicators are
    searched as-is.

    Fix vs. original: every indicator — including the literal '%' — was
    wrapped in word boundaries, and a percent sign followed by whitespace or
    end-of-string can never satisfy a trailing word boundary, so the '%'
    indicator could not fire. Regex-style indicators are now searched raw.
    """
    word_indicators = [
        'statistics', 'stats', 'season', 'player',
        'year', 'at bat', 'walk', 'home run', 'rbi',
        'era', 'career', 'record', 'totals', 'rank', 'chart', 'table',
        'average', 'sum', 'count', 'total', 'percent',
    ]
    pattern_indicators = [
        r'\d{4}-\d{2}',  # season-style years such as 2020-21
        r'%',
    ]
    section_lower = section.lower()
    if any(re.search(r'\b' + re.escape(word) + r'\b', section_lower)
           for word in word_indicators):
        return True
    return any(re.search(pattern, section_lower) for pattern in pattern_indicators)
215
+
216
def preprocess_context(self, context):
    """Strip citations, wiki markup, and trailing boilerplate sections, then normalize whitespace."""
    # Applied in the same order as the original chain of re.sub calls.
    cleanup_steps = (
        (r'\[\d+\]', '', 0),                                           # numeric citations like [1]
        (r'<ref[^>]*>.*?<\/ref>', '', re.DOTALL | re.IGNORECASE),      # <ref>...</ref> footnotes
        (r'\{\{.*?\}\}', '', re.DOTALL),                               # {{template}} markup
        (r'{\|.*?\|\}', '', re.DOTALL),                                # leftover wiki tables
        (r'==\s*See also\s*==.*?$', '', re.DOTALL | re.IGNORECASE),    # drop "See also" onward
        (r'==\s*References\s*==.*?$', '', re.DOTALL | re.IGNORECASE),  # drop "References" onward
    )
    for pattern, replacement, flags in cleanup_steps:
        context = re.sub(pattern, replacement, context, flags=flags)
    return re.sub(r'\s+', ' ', context).strip()
226
+
227
def extract_key_phrases(self, question):
    """Collect unique content-word lemmas and clean noun chunks from the question."""
    doc = nlp(question)
    # Single-token lemmas, skipping stopwords, punctuation and whitespace.
    phrases = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct
        and not token.is_space and token.text.strip()
    ]
    # Multi-word noun phrases, skipping chunks polluted by stopwords/punctuation.
    phrases.extend(
        chunk.text
        for chunk in doc.noun_chunks
        if not any(token.is_stop or token.is_punct for token in chunk)
    )
    return list(set(phrases))
240
+
241
def general_reasoning_qa(self, state):
    """
    Answer a general-knowledge question: search Wikipedia, try a targeted
    answer extraction, then fall back to the best-matching sentence and
    finally to a page summary. Always returns the (mutated) state.
    """
    question = state["question"]

    try:
        hits = wikipedia.search(question, results=3)
        if not hits:
            state["response"] = "Sorry, I couldn't find relevant information on Wikipedia."
            return state

        context = self.get_relevant_context(question, hits)
        if not context:
            state["response"] = "Sorry, I couldn't find detailed relevant information."
            return state

        context = self.preprocess_context(context)
        tables = self.extract_tables_from_wikipedia(context)

        # Best case: a targeted answer that passes the sanity check.
        answer = self.extract_answer(question, context, tables)
        if answer and self.validate_answer(question, answer):
            state["response"] = answer
            return state

        # Second choice: the sentence that best matches the question keywords.
        if not answer:
            keywords = self.extract_key_phrases(question)
            if keywords:
                # Split so terminal punctuation stays attached to its sentence.
                sentences = re.split(r'(?<=[.!?])\s+', context)
                ranked = []
                for candidate in sentences:
                    candidate = candidate.strip()
                    if not candidate:
                        continue
                    lowered = candidate.lower()
                    weight = sum(1 for kw in keywords if kw.lower() in lowered)
                    if any(ch.isdigit() for ch in candidate):
                        weight += 0.5  # numbers often carry the answer
                    if any(ent.label_ in ["PERSON", "ORG", "GPE", "DATE"]
                           for ent in nlp(candidate).ents):
                        weight += 0.7  # named entities are answer-like
                    if weight > 0:
                        ranked.append((weight, candidate))

                if ranked:
                    # Highest weight first; longer sentences break ties.
                    ranked.sort(key=lambda item: (-item[0], -len(item[1])))
                    best = ranked[0][1]
                    state["response"] = best if best.endswith(('.', '!', '?')) else best + "."
                    return state

        # Last resort: a summary of the top search result.
        try:
            first_page = self.wiki_wiki.page(hits[0])
            if first_page.exists():
                summary = first_page.summary[:700] + "..."
                state["response"] = f"I couldn't find a specific answer, but here's some relevant information: {summary}"
            else:
                state["response"] = "No relevant information found."
        except Exception:
            state["response"] = "I couldn't find a specific answer in the available information."

    except Exception as e:
        state["response"] = f"An error occurred while searching for information: {str(e)}"

    return state
320
+
321
def validate_answer(self, question, answer):
    """
    Sanity-check an extracted answer against the question type: count
    questions need a digit, date questions need a year or full date, and
    very short non-numeric fragments are rejected.
    """
    q = question.lower()

    # Count questions must contain at least one digit.
    if ("how many" in q or "how much" in q) and not re.search(r'\d+', answer):
        return False

    # Date questions need a 4-digit year or a spelled-out month date.
    if "when" in q or "year" in q:
        has_year = re.search(r'\b\d{4}\b', answer)
        has_full_date = re.search(
            r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{1,2}(?:st|nd|rd|th)?,\s+\d{4}\b',
            answer,
        )
        if not has_year and not has_full_date:
            return False

    # Reject tiny fragments unless they are numeric.
    if len(answer.split()) < 3 and not re.search(r'\d+', answer):
        return False

    return True
341
+
342
def extract_tables_from_wikipedia(self, content):
    """
    Parse wiki-markup and basic HTML tables out of article text.

    Returns a list of tables; each table is a list of rows, each row a list
    of cleaned cell strings. A detected header row comes first.
    """
    tables = []

    # --- Wiki markup tables: {| ... |} ---
    wiki_table_pattern = r'\{\|\s*(?:class="[^"]*")?.*?\|\}(?=\n|\Z)'
    for match in re.finditer(wiki_table_pattern, content, re.DOTALL):
        markup = match.group(0)
        # Rows are delimited by |- markers.
        rows = re.findall(r'\|\-(.*?)(?=\|\-|\{\||\Z)', markup, re.DOTALL)
        parsed_rows = []

        # A caption-only table (|+ with no |- rows) carries no data.
        if not rows and '|+' in markup:
            continue

        # Pull out an explicit header line (! cells) if one exists.
        header_match = re.search(r'\|\n(?:!\s*[^|!]+\s*(?:\|\|)?)+\n', markup)
        if header_match:
            header_line = header_match.group(0).strip()
            headers = re.findall(r'!\s*([^|!]+?)\s*(?:\|\||(?=\n))', header_line)
            cleaned_headers = [self._clean_cell_content(h) for h in headers]
            if cleaned_headers:
                parsed_rows.append(cleaned_headers)
            # Re-scan the rows with the header line removed.
            markup = markup.replace(header_line, '', 1)
            rows = re.findall(r'\|\-(.*?)(?=\|\-|\{\||\Z)', markup, re.DOTALL)

        for row in rows:
            # Cells may start with | or ! and are separated by ||.
            cells = re.findall(r'(?:\||\!)\s*([^|!]+?)(?:\|\||(?=\n)|(?=\Z))', row, re.DOTALL)
            cleaned_cells = [self._clean_cell_content(cell) for cell in cells]
            if cleaned_cells:
                parsed_rows.append(cleaned_cells)

        if parsed_rows:
            tables.append(parsed_rows)

    # --- Basic HTML tables (rarer in plain-text extracts) ---
    for html_match in re.finditer(r'<table.*?</table>', content, re.DOTALL | re.IGNORECASE):
        markup = html_match.group(0)
        parsed_rows = []
        for row in re.findall(r'<tr.*?</tr>', markup, re.DOTALL | re.IGNORECASE):
            cells = re.findall(r'<t[dh].*?</t[dh]>', row, re.DOTALL | re.IGNORECASE)
            cleaned_cells = [self._clean_cell_content(cell) for cell in cells]
            if cleaned_cells:
                parsed_rows.append(cleaned_cells)
        if parsed_rows:
            tables.append(parsed_rows)

    return tables
404
+
405
+ def _clean_cell_content(self, cell):
406
+ """Helper to clean individual table cell content."""
407
+ cell = re.sub(r'\[\[(?:[^|\]]+\|)?([^\]]+)\]\]', r'\1', cell) # Remove wiki links, keep text
408
+ cell = re.sub(r'<[^>]+>', '', cell) # Remove HTML tags
409
+ cell = re.sub(r'\{\{.*?\}\}', '', cell) # Remove templates within cells
410
+ cell = re.sub(r'\s+', ' ', cell).strip()
411
+ return cell
412
+
413
+
414
def extract_answer(self, question, context, tables=None):
    """
    Try, in order: a table lookup, a question-type-specific entity
    extraction, and a keyword-scored best sentence.

    Returns an answer string, or None when nothing scores.
    """
    if tables is None:
        tables = []

    question_lower = question.lower()
    doc_context = nlp(context)

    # 1) Tables give the most precise answers when they match.
    table_answer = self.find_answer_in_tables(question, tables)
    if table_answer:
        return table_answer

    question_type = self.detect_question_type(question_lower)

    # Named entities with character offsets for locating answers in context.
    entities = [(ent.text, ent.label_, ent.start_char, ent.end_char)
                for ent in doc_context.ents]

    # All numeric and date-like tokens with their positions.
    numbers_dates = []
    number_date_pattern = (
        r'(\d[\d,]*\d*|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+'
        r'\d{1,2}(?:st|nd|rd|th)?(?:,\s+\d{4})?|\b\d{1,2}/\d{1,2}/\d{2,4}\b|\b\d{4}\b)'
    )
    for match in re.finditer(number_date_pattern, context, re.IGNORECASE):
        numbers_dates.append((match.group(1).replace(',', ''), match.start(), match.end()))

    # 2) Type-specific extraction.
    if question_type in ["count", "how many"]:
        number_hit = self.find_best_number_match_spacy(question_lower, numbers_dates, context)
        if number_hit:
            return f"The answer is {number_hit[0]}."
    elif question_type == "person":
        person = self.find_relevant_person_spacy(question_lower, entities)
        if person:
            return f"The answer is {person}."
    elif question_type == "date":
        date = self.find_relevant_date_spacy(question_lower, numbers_dates, entities)
        if date:
            return f"The answer is {date}."
    elif question_type == "location":
        location = self.find_relevant_location_spacy(question_lower, entities)
        if location:
            return f"The answer is {location}."

    # 3) Fallback: pick the sentence that best matches the question.
    key_phrases = self.extract_key_phrases(question)
    ranked = []
    for sentence in re.split(r'(?<=[.!?])\s+', context):
        sentence = sentence.strip()
        if not sentence:
            continue
        lowered = sentence.lower()
        weight = sum(1 for kw in key_phrases if kw.lower() in lowered)
        # Reward sentences that contain the entity type the question asks for.
        for ent in nlp(sentence).ents:
            if (question_type == "person" and ent.label_ == "PERSON") or \
               (question_type == "date" and ent.label_ == "DATE") or \
               (question_type == "location" and ent.label_ in ["GPE", "LOC", "ORG"]) or \
               (question_type == "count" and ent.label_ == "CARDINAL"):
                weight += 2
        if weight > 0:
            ranked.append((weight, sentence))

    if ranked:
        ranked.sort(key=lambda item: (-item[0], -len(item[1])))
        best = ranked[0][1]
        if best.endswith(('.', '!', '?')):
            return best
        return best + "."

    return None
497
+
498
def detect_question_type(self, question):
    """Classify a (lowercased) question as count/person/date/location/definition/list/general."""
    doc = nlp(question)

    # Cheap surface-pattern checks first, in the original priority order.
    surface_rules = (
        ("count", ("how many", "how much", "total", "number of")),
        ("person", ("who", "which person", "which player")),
        ("date", ("when", "what year", "what date")),
        ("location", ("where", "what location", "in what city")),
        ("definition", ("what is", "what was", "define")),
        ("list", ("list of", "list the", "enumerate")),
    )
    for question_type, cues in surface_rules:
        if any(cue in question for cue in cues):
            return question_type

    # Fall back to the dependency parse: subjects of "be"/"do" hint at the type.
    for token in doc:
        if token.dep_ == "nsubj" and token.head.lemma_ in ["be", "do"]:
            if token.ent_type_ == "PERSON":
                return "person"
            if token.ent_type_ in ["GPE", "LOC"]:
                return "location"
            if token.text.lower() in ["number", "amount", "total"]:
                return "count"

    return "general"
524
+
525
def find_best_number_match_spacy(self, question, numbers_dates, context):
    """
    Score each numeric/date candidate by how well its containing sentence
    matches the question; return (number, sentence) for the best candidate,
    or None when nothing can be located.

    Fixes vs. original (behavior unchanged, work hoisted):
    - the question was re-parsed with spaCy once per candidate inside the
      loop; it is now parsed once up front;
    - each candidate's sentence was parsed twice (entities + lemmas); it is
      now parsed once per candidate.

    NOTE(review): the distance term mixes ``token.i`` from the sentence-level
    parse with ``span.start`` from the context-level parse — two different
    index spaces. Preserved as-is to keep scores identical; confirm intent.
    """
    if not numbers_dates:
        return None

    question_keywords = self.extract_key_phrases(question)
    doc_context = nlp(context)
    # Parse the question once and keep only content tokens.
    question_tokens = [t for t in nlp(question) if not t.is_stop and not t.is_punct]

    scored = []
    for number, start_char, end_char in numbers_dates:
        score = 0
        span = doc_context.char_span(start_char, end_char)
        if span and span.sent:
            sentence = span.sent.text
            sentence_lower = sentence.lower()
            sentence_doc = nlp(sentence)  # parsed once, reused below

            # Keyword overlap between the question and the candidate's sentence.
            for keyword in question_keywords:
                if keyword.lower() in sentence_lower:
                    score += 1

            # Recognized cardinal numbers are more likely to be real answers.
            for ent in sentence_doc.ents:
                if ent.text == number and ent.label_ == "CARDINAL":
                    score += 2

            # Reward proximity between question lemmas and the candidate.
            for q_token in question_tokens:
                for token in sentence_doc:
                    if token.lemma_ == q_token.lemma_:
                        distance = abs(token.i - span.start - (span.end - span.start) // 2)
                        score += max(0, 1.0 - (distance / 20.0))

            scored.append((score, number, sentence))

    if not scored:
        return None

    scored.sort(reverse=True, key=lambda x: x[0])
    return (scored[0][1], scored[0][2])
568
+
569
def extract_named_entities(self, text):
    """Return (text, label, start_char, end_char) tuples for PERSON/ORG/GPE/LOC entities."""
    wanted_labels = {"PERSON", "ORG", "GPE", "LOC"}
    return [
        (ent.text, ent.label_, ent.start_char, ent.end_char)
        for ent in nlp(text).ents
        if ent.label_ in wanted_labels
    ]
577
+
578
def find_relevant_person_spacy(self, question, entities):
    """
    Pick the PERSON entity whose containing sentence best matches the question.

    Fixes vs. original:
    - ``self.current_context`` was re-parsed with spaCy once per candidate;
      it is now parsed once before the loop;
    - a missing ``current_context`` attribute raised AttributeError; it is now
      treated as an empty context (no candidate gets a sentence, so scoring
      degrades gracefully instead of crashing).

    NOTE(review): nothing visible in this file assigns ``self.current_context``;
    confirm it is set before this method is called.
    """
    person_entities = [ent for ent in entities if ent[1] == "PERSON"]
    if not person_entities:
        return None

    question_doc = nlp(question)
    question_keywords = [token.lemma_ for token in question_doc
                         if not token.is_stop and not token.is_punct]

    # Parse the context once; candidates only need char_span lookups in it.
    context_doc = nlp(getattr(self, "current_context", ""))

    best_score = -1
    best_person = None

    for person_text, _, start_char, end_char in person_entities:
        score = 0
        span = context_doc.char_span(start_char, end_char)
        if span and span.sent:
            sentence_doc = nlp(span.sent.text)

            # Lemma overlap between the question and the candidate's sentence.
            for q_lemma in question_keywords:
                for s_token in sentence_doc:
                    if s_token.lemma_ == q_lemma:
                        score += 1

            # Strong boost when the person is the subject of a telling verb.
            for token in sentence_doc:
                if token.text == person_text and token.dep_ == "nsubj":
                    if token.head.lemma_ in ["be", "do", "play", "win", "create", "discover", "lead"]:
                        score += 2

        if score > best_score:
            best_score = score
            best_person = person_text

    return best_person
615
+
616
def find_relevant_date_spacy(self, question, numbers_dates, entities):
    """
    Pick the most question-relevant date among spaCy DATE entities and
    regex-matched year/date candidates.

    Fixes vs. original: the context was re-parsed with spaCy once per
    candidate (now parsed once before the loop), and a missing
    ``current_context`` attribute raised AttributeError (now treated as an
    empty context so scoring degrades gracefully).

    NOTE(review): nothing visible in this file assigns ``self.current_context``;
    confirm it is set before this method is called.
    """
    date_entities = [ent for ent in entities if ent[1] == "DATE"]

    # Add regex-matched candidates that look like years or full dates.
    candidates = []
    for date_text, start_char, end_char in numbers_dates:
        looks_like_date = (
            re.fullmatch(r'\d{4}', date_text)
            or re.fullmatch(
                r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+'
                r'\d{1,2}(?:st|nd|rd|th)?(?:,\s+\d{4})?\b',
                date_text, re.IGNORECASE)
            or re.fullmatch(r'\d{1,2}/\d{1,2}/\d{2,4}', date_text)
        )
        if looks_like_date:
            candidates.append((date_text, "DATE_CANDIDATE", start_char, end_char))
    candidates.extend(date_entities)

    if not candidates:
        return None

    question_doc = nlp(question)
    question_keywords = [token.lemma_ for token in question_doc
                         if not token.is_stop and not token.is_punct]

    # Parse the context once; candidates only need char_span lookups in it.
    context_doc = nlp(getattr(self, "current_context", ""))

    best_score = -1
    best_date = None

    for date_text, _, start_char, end_char in candidates:
        score = 0
        span = context_doc.char_span(start_char, end_char)
        if span and span.sent:
            sentence_doc = nlp(span.sent.text)

            # Lemma overlap between the question and the candidate's sentence.
            for q_lemma in question_keywords:
                for s_token in sentence_doc:
                    if s_token.lemma_ == q_lemma:
                        score += 1

            # spaCy-confirmed DATE entities beat bare regex candidates.
            for ent in sentence_doc.ents:
                if ent.text == date_text and ent.label_ == "DATE":
                    score += 2

        if score > best_score:
            best_score = score
            best_date = date_text

    return best_date
662
+
663
def find_relevant_location_spacy(self, question, entities):
    """
    Pick the GPE/LOC entity whose containing sentence best matches the question.

    Fixes vs. original: the context was re-parsed with spaCy once per
    candidate (now parsed once before the loop), and a missing
    ``current_context`` attribute raised AttributeError (now treated as an
    empty context so scoring degrades gracefully).

    NOTE(review): nothing visible in this file assigns ``self.current_context``;
    confirm it is set before this method is called.
    """
    location_entities = [ent for ent in entities if ent[1] in ["GPE", "LOC"]]
    if not location_entities:
        return None

    question_doc = nlp(question)
    question_keywords = [token.lemma_ for token in question_doc
                         if not token.is_stop and not token.is_punct]

    # Parse the context once; candidates only need char_span lookups in it.
    context_doc = nlp(getattr(self, "current_context", ""))

    best_score = -1
    best_location = None

    for loc_text, _, start_char, end_char in location_entities:
        score = 0
        span = context_doc.char_span(start_char, end_char)
        if span and span.sent:
            sentence_doc = nlp(span.sent.text)

            # Lemma overlap between the question and the candidate's sentence.
            for q_lemma in question_keywords:
                for s_token in sentence_doc:
                    if s_token.lemma_ == q_lemma:
                        score += 1

            # Confirmed geo-political / location entities get a boost.
            for ent in sentence_doc.ents:
                if ent.text == loc_text and ent.label_ in ["GPE", "LOC"]:
                    score += 2

        if score > best_score:
            best_score = score
            best_location = loc_text

    return best_location
697
+
698
+
699
    def find_answer_in_tables(self, question, tables):
        """
        Search through extracted tables to find an answer to the question.

        Strategy: skip tables with no keyword overlap with the question, then
        route on the question phrasing — count/total questions look in numeric
        columns, who/player questions in name columns, when/year/date questions
        in date columns — and return the cell from the best-scoring row.

        Args:
            question: Natural-language question string.
            tables: List of tables; each table is a list of rows and each row a
                list of raw cells. The first row is treated as the header.
                NOTE(review): assumed structure — confirm against the table
                extractor that produces these.

        Returns:
            A formatted answer string ("The answer is ...") or None when no
            table yields a confident match.
        """
        if not tables:
            return None

        question_keywords = self.extract_key_phrases(question)
        question_lower = question.lower()

        for table in tables:
            if not table:
                continue

            # Assuming the first row is headers if present
            headers = [self._clean_cell_content(cell).lower() for cell in table[0]] if table else []
            data_rows = table[1:] if len(table) > 1 else []

            # Determine column types ('number' / 'name' / 'date' / 'text')
            column_types = self.detect_column_types(table)

            # Check if table is relevant to the question by checking headers and sample data
            table_is_relevant = any(phrase.lower() in ' '.join(headers) for phrase in question_keywords) or \
                any(any(phrase.lower() in self._clean_cell_content(cell).lower() for phrase in question_keywords) for row in data_rows for cell in row[:min(len(row), 3)])  # Check first few cells of each row

            if not table_is_relevant:
                continue

            # Prioritize based on question type
            if "how many" in question_lower or "what was the" in question_lower or "total" in question_lower:
                numeric_columns_indices = [i for i, col_type in enumerate(column_types) if col_type == 'number']

                if numeric_columns_indices and data_rows:
                    best_match_score = -1
                    best_numeric_answer = None

                    for row in data_rows:
                        row_text_lower = ' '.join([self._clean_cell_content(c).lower() for c in row])
                        # Score row based on how many question keywords it contains
                        row_score = sum(1 for kw in question_keywords if kw.lower() in row_text_lower)

                        if row_score > best_match_score:
                            for col_idx in numeric_columns_indices:
                                if col_idx < len(row):
                                    cell_content = self._clean_cell_content(row[col_idx])
                                    numbers = re.findall(r'(\d[\d,]*\d*)', cell_content)
                                    if numbers:
                                        # Take the first number found in the cell
                                        clean_num = numbers[0].replace(',', '')
                                        if clean_num.isdigit():
                                            best_match_score = row_score
                                            best_numeric_answer = clean_num
                                            break  # Found a number, move on to the next row

                    if best_numeric_answer:
                        return f"The answer is {best_numeric_answer}."

            elif "who" in question_lower or "which person" in question_lower or "player" in question_lower:
                name_columns_indices = [i for i, col_type in enumerate(column_types) if col_type == 'name']

                if name_columns_indices and data_rows:
                    best_match_score = -1
                    best_name_answer = None

                    for row in data_rows:
                        row_text_lower = ' '.join([self._clean_cell_content(c).lower() for c in row])
                        row_score = sum(1 for kw in question_keywords if kw.lower() in row_text_lower)

                        if row_score > best_match_score:
                            for col_idx in name_columns_indices:
                                if col_idx < len(row):
                                    cell_content = self._clean_cell_content(row[col_idx])
                                    # Check if the cell content looks like a name using spaCy
                                    doc_cell = nlp(cell_content)
                                    if any(ent.label_ == "PERSON" for ent in doc_cell.ents):
                                        best_match_score = row_score
                                        best_name_answer = cell_content.strip()
                                        break
                    if best_name_answer:
                        return f"The answer is {best_name_answer}."

            elif "when" in question_lower or "year" in question_lower or "date" in question_lower:
                date_columns_indices = [i for i, col_type in enumerate(column_types) if col_type == 'date']

                if date_columns_indices and data_rows:
                    best_match_score = -1
                    best_date_answer = None

                    for row in data_rows:
                        row_text_lower = ' '.join([self._clean_cell_content(c).lower() for c in row])
                        row_score = sum(1 for kw in question_keywords if kw.lower() in row_text_lower)

                        if row_score > best_match_score:
                            for col_idx in date_columns_indices:
                                if col_idx < len(row):
                                    cell_content = self._clean_cell_content(row[col_idx])
                                    # Accept a bare 19xx/20xx year or a "12 Jan 2020"-style date
                                    if re.search(r'\b(19|20)\d{2}\b', cell_content) or \
                                       re.search(r'\b\d{1,2}\s+(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s*\d{4}\b', cell_content, re.IGNORECASE):
                                        best_match_score = row_score
                                        best_date_answer = cell_content.strip()
                                        break
                    if best_date_answer:
                        return f"The answer is {best_date_answer}."

        return None
806
+
807
    def detect_column_types(self, table):
        """
        Detects the type of data in each column ('number', 'name', 'date', 'text').

        Samples at most the first four data rows and assigns a label per column
        by majority thresholds: >70% numeric -> 'number', >70% date-like ->
        'date', >50% PERSON entities with zero numbers -> 'name'; everything
        else stays 'text'. Uses spaCy for entity recognition.

        Args:
            table: List of rows; row 0 is treated as the header.

        Returns:
            List of type labels, one per column of the header row.
        """
        if not table:
            return []

        num_columns = len(table[0]) if table else 0
        column_types = ['text'] * num_columns

        # Sample a few rows to determine type (skip the header row)
        sample_rows = table[1:min(len(table), 5)]

        for col_idx in range(num_columns):
            # Guard against ragged rows shorter than the header
            col_values = [self._clean_cell_content(row[col_idx]) for row in sample_rows if col_idx < len(row)]

            num_count = 0
            name_count = 0
            date_count = 0

            for value in col_values:
                value_doc = nlp(value)

                # Check for numbers
                if re.fullmatch(r'[\d,.-]+', value.replace(' ', '')):  # Allow for decimals, negatives, commas
                    num_count += 1

                # Check for dates: prefer spaCy's DATE entity, fall back to regex
                if any(ent.label_ == "DATE" for ent in value_doc.ents):
                    date_count += 1
                elif re.search(r'\b\d{4}\b|\b\d{1,2}/\d{1,2}/\d{2,4}\b', value):
                    date_count += 1

                # Check for names (PERSON entity)
                if any(ent.label_ == "PERSON" for ent in value_doc.ents):
                    name_count += 1

            # Heuristic to assign type: majority rules or strong indicators
            if len(col_values) > 0:
                if num_count / len(col_values) > 0.7:  # More than 70% numbers
                    column_types[col_idx] = 'number'
                elif date_count / len(col_values) > 0.7:  # More than 70% dates
                    column_types[col_idx] = 'date'
                elif name_count / len(col_values) > 0.5 and num_count == 0:  # More than 50% names and no numbers
                    column_types[col_idx] = 'name'
                # Default remains 'text'

        return column_types
856
+
857
+ def column_looks_like_names(self, sample_values):
858
+ """Checks if a sample of values from a column primarily contains names using spaCy."""
859
+ if not sample_values:
860
+ return False
861
+
862
+ name_like_count = 0
863
+ for value in sample_values:
864
+ doc = nlp(value)
865
+ # A value looks like a name if spaCy identifies a PERSON entity
866
+ if any(ent.label_ == "PERSON" for ent in doc.ents):
867
+ name_like_count += 1
868
+
869
+ return name_like_count / len(sample_values) > 0.6 # Majority are name-like
870
+
871
+
872
    class AgentState(TypedDict, total=False):
        """Mutable state passed between LangGraph nodes; all keys are optional."""
        question: str          # The (possibly un-reversed) question text
        is_reversed: bool      # True when the raw question arrived mirrored
        is_python: bool        # Question looks like a code-generation task
        is_riddle: bool        # Question matched a riddle/trick trigger phrase
        is_wiki: bool          # Question looks answerable from Wikipedia
        needs_reasoning: bool  # Question contains qualifiers needing reasoning
        response: str          # Final answer produced by a terminal node
        use_tool: str          # Name of an external tool to invoke, if any
        # Stores the context retrieved from Wikipedia for helpers such as
        # find_relevant_person_spacy
        current_context: str
883
+
884
+
885
    def _build_graph(self):
        """
        Build and compile the LangGraph state machine for answering a question.

        Flow: reversal detection -> question fix-up -> riddle check ->
        Wikipedia-suitability check -> reasoning check -> Python-suitability
        check -> fallback. Each node is a thin ``*_node`` wrapper method so the
        graph can call bound instance methods.

        Returns:
            The compiled graph (callable on an AgentState dict).
        """
        builder = StateGraph(self.AgentState)

        # Register one node per ``*_node`` wrapper method
        builder.add_node("check_reversed", self.check_reversed_node)
        builder.add_node("fix_question", self.fix_question_node)
        builder.add_node("check_riddle_or_trick", self.check_riddle_or_trick_node)
        builder.add_node("solve_riddle", self.solve_riddle_node)
        builder.add_node("check_wikipedia_suitability", self.check_wikipedia_suitability_node)
        builder.add_node("check_reasoning_needed", self.check_reasoning_needed_node)
        builder.add_node("general_reasoning_qa", self.general_reasoning_qa_node)
        builder.add_node("check_python_suitability", self.check_python_suitability_node)
        builder.add_node("generate_code", self.generate_code_node)
        builder.add_node("fallback", self.fallback_node)

        # Set entry point and define edges
        builder.set_entry_point("check_reversed")
        builder.add_edge("check_reversed", "fix_question")
        builder.add_edge("fix_question", "check_riddle_or_trick")
        builder.add_conditional_edges(
            "check_riddle_or_trick",
            lambda s: "solve_riddle" if s.get("is_riddle") else "check_wikipedia_suitability"
        )
        builder.add_conditional_edges(
            "check_wikipedia_suitability",
            lambda s: "general_reasoning_qa" if s.get("is_wiki") else "check_reasoning_needed"  # Go directly to general_reasoning_qa for wiki
        )
        builder.add_conditional_edges(
            "check_reasoning_needed",
            lambda s: "general_reasoning_qa" if s.get("needs_reasoning") else "check_python_suitability"
        )
        builder.add_conditional_edges(
            "check_python_suitability",
            lambda s: "generate_code" if s.get("is_python") else "fallback"
        )

        # All answer-producing nodes are terminal
        builder.add_edge("solve_riddle", END)
        builder.add_edge("general_reasoning_qa", END)
        builder.add_edge("generate_code", END)
        builder.add_edge("fallback", END)

        return builder.compile()
953
+
954
+ # --- Wrapper methods for the graph nodes ---
955
+ # These call the actual logic methods. This is a common pattern
956
+ # when your graph functions are class methods and need `self`.
957
    def check_reversed_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._check_reversed(state)
959
+
960
    def fix_question_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._fix_question(state)
962
+
963
    def check_riddle_or_trick_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._check_riddle_or_trick(state)
965
+
966
    def solve_riddle_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._solve_riddle(state)
968
+
969
    def check_wikipedia_suitability_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._check_wikipedia_suitability(state)
971
+
972
    def check_reasoning_needed_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._check_reasoning_needed(state)
974
+
975
    def general_reasoning_qa_node(self, state):
        # Graph-node wrapper around general_reasoning_qa, which fetches its
        # own context (e.g. from Wikipedia) and is expected to populate
        # state["current_context"] so helpers such as
        # find_relevant_person_spacy can read it afterwards.
        # NOTE(review): whether current_context is actually written back into
        # the state happens inside general_reasoning_qa — confirm there.
        response_state = self.general_reasoning_qa(state)
        return response_state
987
+
988
    def check_python_suitability_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._check_python_suitability(state)
990
+
991
    def generate_code_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._generate_code(state)
993
+
994
    def fallback_node(self, state):
        # Graph-node wrapper: delegates to the bound instance method.
        return self._fallback(state)
996
+
997
+ # --- Renamed original helper functions to be internal methods ---
998
+ # These are the actual implementations, now as instance methods.
999
+ def _check_reversed(self, state):
1000
+ question = state["question"]
1001
+ reversed_candidate = question[::-1]
1002
+ original_score = self._score_text(question)
1003
+ reversed_score = self._score_text(reversed_candidate)
1004
+ if reversed_score > original_score:
1005
+ state["is_reversed"] = True
1006
+ else:
1007
+ state["is_reversed"] = False
1008
+ return state
1009
+
1010
+ def _fix_question(self, state):
1011
+ if state.get("is_reversed", False):
1012
+ state["question"] = state["question"][::-1]
1013
+ return state
1014
+
1015
+ def _check_riddle_or_trick(self, state):
1016
+ q = state["question"].lower()
1017
+ keywords = ["opposite of", "if you understand", "riddle", "trick question", "what comes next", "i speak without"]
1018
+ state["is_riddle"] = any(kw in q for kw in keywords)
1019
+ return state
1020
+
1021
+ def _solve_riddle(self, state):
1022
+ q = state["question"].lower()
1023
+ if "opposite of the word" in q:
1024
+ if "left" in q:
1025
+ state["response"] = "right"
1026
+ elif "up" in q:
1027
+ state["response"] = "down"
1028
+ elif "hot" in q:
1029
+ state["response"] = "cold"
1030
+ else:
1031
+ state["response"] = "Unknown opposite."
1032
+ else:
1033
+ state["response"] = "Could not solve riddle."
1034
+ return state
1035
+
1036
+ def _check_python_suitability(self, state):
1037
+ question = state["question"].lower()
1038
+ patterns = ["sum", "average", "count", "sort", "generate", "regex", "convert"]
1039
+ state["is_python"] = any(word in question for word in patterns)
1040
+ return state
1041
+
1042
+ def _generate_code(self, state):
1043
+ q = state["question"].lower()
1044
+ if "sum" in q:
1045
+ state["response"] = "numbers = [1, 2, 3]\nprint(sum(numbers))"
1046
+ elif "average" in q:
1047
+ state["response"] = "numbers = [1, 2, 3]\nprint(sum(numbers) / len(numbers))"
1048
+ elif "sort" in q:
1049
+ state["response"] = "data = [3, 1, 2]\ndata.sort()\nprint(data)"
1050
+ else:
1051
+ state["response"] = "# Code generation not implemented for this case."
1052
+ return state
1053
+
1054
+ def _fallback(self, state):
1055
+ state["response"] = "This question doesn't require Python or is unclear."
1056
+ return state
1057
+
1058
+ def _check_reasoning_needed(self, state):
1059
+ q = state["question"].lower()
1060
+ needs_reasoning = any(word in q for word in ["whose", "only", "first", "after", "before", "no longer", "not", "but", "except"])
1061
+ state["needs_reasoning"] = needs_reasoning
1062
+ return state
1063
+
1064
+ def _check_wikipedia_suitability(self, state):
1065
+ q = state["question"].lower()
1066
+ triggers = [
1067
+ "wikipedia", "who is", "what is", "when did", "where is",
1068
+ "tell me about", "how many", "how much", "what was the",
1069
+ "describe", "explain", "information about", "details about",
1070
+ "history of", "facts about", "define", "give me data on"
1071
+ ]
1072
+ state["is_wiki"] = any(trigger in q for trigger in triggers)
1073
+ return state
1074
+
1075
+ def _score_text(self, text):
1076
+ alnum_count = sum(c.isalnum() for c in text)
1077
+ space_count = text.count(' ')
1078
+ punctuation_count = sum(c in string.punctuation for c in text)
1079
+ ends_properly = text[-1] in '.!?'
1080
+ score = alnum_count + space_count
1081
+ if ends_properly:
1082
+ score += 5
1083
+ return score
1084
+
1085
+
1086
+
1087
+ ########################################
1088
def run_and_submit_all(profile: "gr.OAuthProfile | None"):
    """
    Fetches all questions, runs the agent on them, submits all answers,
    and displays the results.

    Args:
        profile: Hugging Face OAuth profile of the logged-in user, or None
            when nobody is logged in. (The annotation is a string so the
            module can be imported even where gradio isn't installed.)

    Returns:
        Tuple of (status message, pandas DataFrame of per-question results
        or None on early failure).
    """
    # --- Determine HF Space Runtime URL and Repo URL ---
    # BUG FIX: os.getenv takes the environment-variable NAME; the previous
    # code passed a full Space URL, so space_id was always None and the
    # submitted agent_code link was broken.
    space_id = os.getenv("SPACE_ID")  # Used to build the public link to this code

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    try:
        agent = SuperSmartAgent()  # BasicAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        return f"Error initializing agent: {e}", None
    # For an app running as a Hugging Face Space, this link points toward
    # your codebase (useful for others, so please keep it public)
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(agent_code)

    # 2. Fetch Questions
    print(f"Fetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            print("Fetched questions list is empty.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching questions: {e}")
        return f"Error fetching questions: {e}", None
    except requests.exceptions.JSONDecodeError as e:
        print(f"Error decoding JSON response from questions endpoint: {e}")
        print(f"Response text: {response.text[:500]}")
        return f"Error decoding server response for questions: {e}", None
    except Exception as e:
        print(f"An unexpected error occurred fetching questions: {e}")
        return f"An unexpected error occurred fetching questions: {e}", None

    # 3. Run the Agent on every fetched question
    results_log = []
    answers_payload = []
    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"Skipping item with missing task_id or question: {item}")
            continue
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
        except Exception as e:
            # A failing question must not abort the whole run; record the error
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Prepare Submission
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # 5. Submit
    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except requests.exceptions.JSONDecodeError:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.Timeout:
        status_message = "Submission Failed: The request timed out."
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
1207
+
1208
+
1209
# --- Build Gradio Interface using Blocks ---
# Top-level UI wiring: a login button, one run button, a status textbox and a
# results table. The click handler receives the OAuth profile implicitly
# because run_and_submit_all declares a gr.OAuthProfile parameter.
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner")
    gr.Markdown(
        """
        **Instructions:**

        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.

        ---
        **Disclaimers:**
        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
        """
    )

    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    # Removed max_rows=10 from DataFrame constructor (unsupported keyword)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # No explicit inputs: gradio injects the OAuth profile from the login state
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )
1239
+
1240
if __name__ == "__main__":
    # Startup banner plus environment diagnostics, then launch the UI.
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    # Check for SPACE_HOST and SPACE_ID at startup for information
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup

    if space_host_startup:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
    else:
        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")

    if space_id_startup:  # Print repo URLs if SPACE_ID is found
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
    else:
        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")

    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for Basic Agent Evaluation...")
    # debug=True surfaces tracebacks in the console; share=False keeps it local
    demo.launch(debug=True, share=False)