sajjadpsavoji committed on
Commit
e05ac72
·
1 Parent(s): f79b0ac
Files changed (1) hide show
  1. app.py +305 -91
app.py CHANGED
@@ -48,7 +48,7 @@ def extract_final_answer(text: str) -> str:
48
  matches = FINAL_ANSWER_RE.findall(text)
49
  if matches:
50
  return matches[-1].strip()
51
- return text.strip()
52
 
53
  def is_number(s: str) -> bool:
54
  try:
@@ -106,6 +106,56 @@ def fast_heuristic_match(pred: str, gold: str) -> bool:
106
  return True
107
  return False
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  # --- Gold Answers Loader ---
110
  class GoldAnswers:
111
  """
@@ -182,14 +232,12 @@ class JudgeAgent:
182
  direct use of model.generate signatures — this mirrors the GAIA agent path.
183
  """
184
  def __init__(self, base_model: OpenAIServerModel, verbose: bool = False):
185
- # Reuse the exact same OpenAIServerModel instance (base_model)
186
  self.verbose = verbose
187
- # No tools required for judging; keep it simple
188
  self.agent = CodeAgent(
189
  tools=[],
190
  model=base_model,
191
- add_base_tools=False, # no need for memory/python exec for judging
192
- planning_interval=0, # no re-planning needed
193
  verbosity_level=2 if verbose else 0,
194
  additional_authorized_imports=[]
195
  )
@@ -199,14 +247,10 @@ class JudgeAgent:
199
  if fast_heuristic_match(predicted, gold):
200
  return {"is_correct": True, "score": 1.0, "justification": "Heuristic match."}
201
 
202
- # Build a single prompt that includes the system guidance and the user content.
203
- # With CodeAgent, we put the system message at the top of the prompt text.
204
  prompt = f"{JUDGE_SYSTEM}\n\n{build_judge_prompt(question, predicted, gold)}"
205
-
206
  try:
207
- raw = self.agent.run(prompt) # returns a string via the same path as GAIAAgent
208
  text = (raw or "").strip()
209
- # Extract the JSON object
210
  m = re.search(r"\{.*\}", text, flags=re.DOTALL)
211
  payload = json.loads(m.group(0) if m else text)
212
 
@@ -215,7 +259,6 @@ class JudgeAgent:
215
  justification = str(payload.get("justification", "")).strip()[:300]
216
 
217
  return {"is_correct": is_correct, "score": score, "justification": justification}
218
-
219
  except Exception as e:
220
  return {"is_correct": False, "score": 0.0, "justification": f"Judge error: {e}"}
221
 
@@ -276,28 +319,26 @@ class GAIAAgent:
276
  "sdrawkcab" in text
277
  )
278
 
279
- def __call__(self, question: str) -> str:
280
- if self.verbose:
281
- print(f"Processing question: {question[:100]}..." if len(question) > 100 else f"Processing question: {question}")
282
-
283
- if self._is_reversed_text(question):
284
- prompt = f"""
285
- You are a general AI assistant. I will ask you a question.
286
-
287
- This question appears to be in reversed text. Here is the reversed version for clarity:
288
- {question[::-1]}
289
-
290
- Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
291
 
292
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
293
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
294
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
295
- - If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
 
 
296
 
297
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
298
  - Use web search sparingly and only when absolutely necessary.
299
- - Limit to 1-2 web searches per question.
300
- - If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
301
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
302
  - Focus on answering directly with what you already know when possible.
303
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
@@ -305,21 +346,29 @@ IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
305
 
306
  Remember to structure your response in Python code format using the final_answer() function.
307
  """
308
- else:
309
- prompt = f"""
310
- You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
313
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
314
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
315
- - If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
316
-
317
- Question: {question}
318
 
319
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
320
  - Use web search sparingly and only when absolutely necessary.
321
- - Limit to 1-2 web searches per question.
322
- - If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
323
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
324
  - Focus on answering directly with what you already know when possible.
325
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
@@ -327,6 +376,17 @@ IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
327
 
328
  Remember to structure your response in Python code format using the final_answer() function.
329
  """
 
 
 
 
 
 
 
 
 
 
 
330
  try:
331
  answer = self.agent.run(prompt)
332
  if self.verbose:
@@ -338,15 +398,55 @@ Remember to structure your response in Python code format using the final_answer
338
  print(error_msg)
339
  return error_msg
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  # --- Singletons for judge/gold ---
342
  gold_answers = GoldAnswers(path=DEFAULT_GOLD_CSV)
343
  _judge_agent_singleton: Optional[JudgeAgent] = None
344
 
345
  # --- Runner & Submitter (with judge integration) ---
346
- def run_and_submit_all(sample_size: int = 0):
 
 
 
 
 
 
347
  """
348
  Fetches all questions, runs the agent on them, judges locally (if gold available),
349
- submits answers, and returns a results table for the UI.
 
 
 
350
  """
351
  username = "Gralon"
352
  print(f"Using username: {username}")
@@ -360,19 +460,14 @@ def run_and_submit_all(sample_size: int = 0):
360
  agent = GAIAAgent(verbose=True)
361
  except Exception as e:
362
  print(f"Error instantiating agent: {e}")
363
- return f"Error initializing agent: {e}", None
364
 
365
  # 1b. Init JudgeAgent once, reusing the SAME model instance
366
- global _judge_agent_singleton
367
- if _judge_agent_singleton is None:
368
- _judge_agent_singleton = JudgeAgent(base_model=agent.agent.model, verbose=False)
369
 
370
  # Derive code URL for submission
371
  space_id = os.getenv("SPACE_ID")
372
- if space_id:
373
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
374
- else:
375
- agent_code = "local"
376
 
377
  # 2. Fetch Questions
378
  print(f"Fetching questions from: {questions_url}")
@@ -382,62 +477,146 @@ def run_and_submit_all(sample_size: int = 0):
382
  questions_data = response.json()
383
  if not questions_data:
384
  print("Fetched questions list is empty.")
385
- return "Fetched questions list is empty or invalid format.", None
386
  print(f"Fetched {len(questions_data)} questions.")
387
  except requests.exceptions.RequestException as e:
388
  print(f"Error fetching questions: {e}")
389
- return f"Error fetching questions: {e}", None
390
  except json.JSONDecodeError as e:
391
  print(f"Error decoding JSON response from questions endpoint: {e}")
392
  print(f"Response text: {response.text[:500]}")
393
- return f"Error decoding server response for questions: {e}", None
394
  except Exception as e:
395
  print(f"An unexpected error occurred fetching questions: {e}")
396
- return f"An unexpected error occurred fetching questions: {e}", None
397
-
398
- # 3. Run Agent + Judge
399
- results_log = []
400
- answers_payload = []
401
 
 
402
  if sample_size > 0 and sample_size < len(questions_data):
403
  import random
404
  print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
405
  questions_data = random.sample(questions_data, sample_size)
406
 
407
  print(f"Running agent on {len(questions_data)} questions...")
 
 
 
 
408
  for i, item in enumerate(questions_data):
409
  task_id = item.get("task_id")
410
  question_text = item.get("question")
411
  if not task_id or question_text is None:
412
  print(f"Skipping item with missing task_id or question: {item}")
413
  continue
 
 
 
 
414
  try:
415
  print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
416
- submitted_answer_raw = agent(question_text)
417
- submitted_answer = extract_final_answer(submitted_answer_raw)
418
-
419
- # Local judge (if we have gold)
420
- gold = gold_answers.by_task_id.get(task_id)
421
- judge_is_correct = None
422
- judge_score = None
423
- judge_just = None
 
 
 
424
  if gold:
425
- judge_res = _judge_agent_singleton.judge(question_text, submitted_answer, gold)
426
- judge_is_correct = judge_res.get("is_correct")
427
- judge_score = judge_res.get("score")
428
- judge_just = judge_res.get("justification")
 
 
 
 
 
 
 
 
 
 
429
 
430
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  results_log.append({
432
  "Task ID": task_id,
433
  "Question": question_text,
434
- "Submitted Answer": submitted_answer,
435
  "Gold (local)": gold if gold else "",
436
- "Judge Correct?": judge_is_correct,
437
- "Judge Score": judge_score,
438
- "Judge Note": judge_just
439
  })
440
- print(f"Successfully processed question {i+1}")
 
 
 
441
 
442
  if i < len(questions_data) - 1:
443
  print("Waiting 2 seconds before next question...")
@@ -457,7 +636,7 @@ def run_and_submit_all(sample_size: int = 0):
457
 
458
  if not answers_payload:
459
  print("Agent did not produce any answers to submit.")
460
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
461
 
462
  # 4. Prepare Submission
463
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
@@ -479,7 +658,8 @@ def run_and_submit_all(sample_size: int = 0):
479
  )
480
  print("Submission successful.")
481
  results_df = pd.DataFrame(results_log)
482
- return final_status, results_df
 
483
  except requests.exceptions.HTTPError as e:
484
  error_detail = f"Server responded with status {e.response.status_code}."
485
  try:
@@ -490,29 +670,50 @@ def run_and_submit_all(sample_size: int = 0):
490
  status_message = f"Submission Failed: {error_detail}"
491
  print(status_message)
492
  results_df = pd.DataFrame(results_log)
493
- return status_message, results_df
 
494
  except requests.exceptions.Timeout:
495
  status_message = "Submission Failed: The request timed out."
496
  print(status_message)
497
  results_df = pd.DataFrame(results_log)
498
- return status_message, results_df
 
499
  except requests.exceptions.RequestException as e:
500
  status_message = f"Submission Failed: Network error - {e}"
501
  print(status_message)
502
  results_df = pd.DataFrame(results_log)
503
- return status_message, results_df
 
504
  except Exception as e:
505
  status_message = f"An unexpected error occurred during submission: {e}"
506
  print(status_message)
507
  results_df = pd.DataFrame(results_log)
508
- return status_message, results_df
 
509
 
510
- def test_single_question(question: str) -> str:
511
- """Test the agent on a single question (no submission)."""
512
  try:
513
  agent = GAIAAgent(verbose=True)
514
- answer = agent(question)
515
- return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
516
  except Exception as e:
517
  return f"Error: {e}"
518
 
@@ -521,10 +722,8 @@ def local_judge_single(question: str, predicted: str, task_id_or_gold: str):
521
  # try task_id lookup first
522
  gold = gold_answers.by_task_id.get(task_id_or_gold, task_id_or_gold)
523
  agent = GAIAAgent(verbose=False)
524
- global _judge_agent_singleton
525
- if _judge_agent_singleton is None:
526
- _judge_agent_singleton = JudgeAgent(base_model=agent.agent.model, verbose=False)
527
- res = _judge_agent_singleton.judge(question, predicted, gold)
528
  out = {
529
  "Gold": gold,
530
  "is_correct": res["is_correct"],
@@ -535,7 +734,7 @@ def local_judge_single(question: str, predicted: str, task_id_or_gold: str):
535
 
536
  # --- Build Gradio Interface using Blocks ---
537
  with gr.Blocks() as demo:
538
- gr.Markdown("# GAIA Agent Evaluation Runner + Local LLM Judge")
539
  gr.Markdown(
540
  """
541
  ## Instructions:
@@ -545,7 +744,7 @@ with gr.Blocks() as demo:
545
  3. Run the full evaluation on the GAIA benchmark in the Evaluation tab
546
 
547
  This agent runs locally, uses an LLM judge against your answers.csv (if present),
548
- and then submits answers to the server.
549
  """
550
  )
551
 
@@ -553,12 +752,13 @@ with gr.Blocks() as demo:
553
 
554
  with gr.Tab("Test Single Question"):
555
  test_input = gr.Textbox(label="Enter a question to test", lines=3)
 
556
  test_output = gr.Textbox(label="Answer", lines=3)
557
  test_button = gr.Button("Test Question")
558
 
559
  test_button.click(
560
  fn=test_single_question,
561
- inputs=test_input,
562
  outputs=test_output
563
  )
564
 
@@ -579,15 +779,29 @@ with gr.Blocks() as demo:
579
  label="Sample Size (0 for all questions)",
580
  info="Set a number to limit how many questions to process (reduces costs)"
581
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
- run_button = gr.Button("Run Evaluation, Judge Locally & Submit")
584
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
585
- results_table = gr.DataFrame(label="Questions, Answers & Local Judge", wrap=True)
 
586
 
587
  run_button.click(
588
  fn=run_and_submit_all,
589
- inputs=sample_size,
590
- outputs=[status_output, results_table]
591
  )
592
 
593
  if __name__ == "__main__":
 
48
  matches = FINAL_ANSWER_RE.findall(text)
49
  if matches:
50
  return matches[-1].strip()
51
+ return (text or "").strip()
52
 
53
  def is_number(s: str) -> bool:
54
  try:
 
106
  return True
107
  return False
108
 
109
def quick_format_fix(answer: str, question: str) -> str:
    """
    Deterministic, judge-friendly cleanup of a final answer. Gold is never used.

    Steps, in order:
    - Strip surrounding code fences.
    - Collapse whitespace and drop trailing sentence punctuation.
    - Convert semicolon/slash separators in list-like answers to commas.
    - Drop a leading article ("a"/"an"/"the").
    - Remove thousands separators from purely numeric answers (1,234 -> 1234).
    - Strip "$" / "%" unless the question appears to ask for them explicitly.

    Args:
        answer: Raw final-answer string from the agent.
        question: Original question text, used only to decide whether
            currency/percent signs should be kept.

    Returns:
        The cleaned answer string; non-string inputs are returned unchanged.
    """
    if not isinstance(answer, str):
        return answer

    s = answer.strip()

    # remove code fences around final answer if any
    s = re.sub(r"^```.*?\n", "", s, flags=re.DOTALL)
    s = s.replace("```", "").strip()

    # normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()

    # drop trailing period if looks like a sentence end
    s = re.sub(r"[.。]+$", "", s)

    # if list-like but uses semicolons or slashes, convert to commas
    if ";" in s or "/" in s:
        s = re.sub(r"[;/]+", ",", s)
        s = re.sub(r"\s*,\s*", ", ", s)  # pretty spacing

    # remove leading articles for string-y answers
    # BUGFIX: the original pattern "^(?i)(a|an|the)\s+" put the global (?i)
    # flag mid-pattern, which raises re.error on Python 3.11+; use flags=.
    s = re.sub(r"^(a|an|the)\s+", "", s, flags=re.IGNORECASE)

    # remove thousands commas in numbers like 1,234 -> 1234
    # BUGFIX: the old guard `"," in s and not re.search(r".*,.*", s)` was
    # always False (any comma matched the regex), so this never ran. The
    # fullmatch alone is the correct guard: it only matches a single
    # comma-grouped number, never a comma-separated list of answers.
    if re.fullmatch(r"\d{1,3}(,\d{3})+(\.\d+)?", s):
        s = s.replace(",", "")

    # remove currency unless explicitly requested
    if "$" in s and not re.search(r"(?i)\b(dollar|usd|\$)\b.*(include|keep|use)|include\s*\$", question):
        s = s.replace("$", "")

    # percent sign rules: keep only if question appears to require it explicitly
    needs_percent = bool(re.search(r"(?i)\b(percent|%)\b.*(include|with|as sign)|include\s*%", question))
    if "%" in s and not needs_percent:
        s = s.replace("%", "")

    return s.strip()
158
+
159
  # --- Gold Answers Loader ---
160
  class GoldAnswers:
161
  """
 
232
  direct use of model.generate signatures — this mirrors the GAIA agent path.
233
  """
234
def __init__(self, base_model: OpenAIServerModel, verbose: bool = False):
    """Wrap the shared OpenAIServerModel in a minimal, tool-free CodeAgent."""
    self.verbose = verbose
    # Judging needs no tools, no base tooling, and no planning loop.
    agent_kwargs = {
        "tools": [],
        "model": base_model,
        "add_base_tools": False,
        "planning_interval": 0,
        "verbosity_level": 2 if verbose else 0,
        "additional_authorized_imports": [],
    }
    self.agent = CodeAgent(**agent_kwargs)
 
247
  if fast_heuristic_match(predicted, gold):
248
  return {"is_correct": True, "score": 1.0, "justification": "Heuristic match."}
249
 
 
 
250
  prompt = f"{JUDGE_SYSTEM}\n\n{build_judge_prompt(question, predicted, gold)}"
 
251
  try:
252
+ raw = self.agent.run(prompt)
253
  text = (raw or "").strip()
 
254
  m = re.search(r"\{.*\}", text, flags=re.DOTALL)
255
  payload = json.loads(m.group(0) if m else text)
256
 
 
259
  justification = str(payload.get("justification", "")).strip()[:300]
260
 
261
  return {"is_correct": is_correct, "score": score, "justification": justification}
 
262
  except Exception as e:
263
  return {"is_correct": False, "score": 0.0, "justification": f"Judge error: {e}"}
264
 
 
319
  "sdrawkcab" in text
320
  )
321
 
322
+ def _base_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
323
+ # Let retries slightly relax the search budget
324
+ search_budget_line = (
325
+ "- Limit to 1-2 web searches per question.\n"
326
+ if not allow_extra_searches else
327
+ "- You may use up to 3-4 web searches if needed.\n"
328
+ )
329
+ return f"""
330
+ You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
 
 
 
331
 
332
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
333
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
334
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
335
+ - If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
336
+
337
+ Question: {question}
338
 
339
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
340
  - Use web search sparingly and only when absolutely necessary.
341
+ {search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
 
342
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
343
  - Focus on answering directly with what you already know when possible.
344
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
 
346
 
347
  Remember to structure your response in Python code format using the final_answer() function.
348
  """
349
+
350
+ def _reversed_prompt(self, question: str, allow_extra_searches: bool = False) -> str:
351
+ search_budget_line = (
352
+ "- Limit to 1-2 web searches per question.\n"
353
+ if not allow_extra_searches else
354
+ "- You may use up to 3-4 web searches if needed.\n"
355
+ )
356
+ return f"""
357
+ You are a general AI assistant. I will ask you a question.
358
+
359
+ This question appears to be in reversed text. Here is the reversed version for clarity:
360
+ {question[::-1]}
361
+
362
+ Report your thoughts, and finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
363
 
364
  YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
365
  - If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
366
  - If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
367
+ - If you are asked for a comma separated list, apply the above rules depending on whether the element to be put in the list is a number or a string.
 
 
368
 
369
  IMPORTANT NOTES TO LIMIT COSTS AND PREVENT ERRORS:
370
  - Use web search sparingly and only when absolutely necessary.
371
+ {search_budget_line}- If a search fails due to rate limiting, add a 3-5 second delay using time.sleep() before retrying with a different search term.
 
372
  - Do not import libraries that aren't available - stick to basic Python and the tools provided.
373
  - Focus on answering directly with what you already know when possible.
374
  - If you've made more than 3 attempts to solve a problem, prioritize providing your best guess.
 
376
 
377
  Remember to structure your response in Python code format using the final_answer() function.
378
  """
379
+
380
+ def __call__(self, question: str, allow_extra_searches: bool = False) -> str:
381
+ if self.verbose:
382
+ msg = f"Processing question: {question[:100]}..." if len(question) > 100 else f"Processing question: {question}"
383
+ print(msg)
384
+
385
+ prompt = (
386
+ self._reversed_prompt(question, allow_extra_searches)
387
+ if self._is_reversed_text(question)
388
+ else self._base_prompt(question, allow_extra_searches)
389
+ )
390
  try:
391
  answer = self.agent.run(prompt)
392
  if self.verbose:
 
398
  print(error_msg)
399
  return error_msg
400
 
401
def refine(self, question: str, prev_answer: str, judge_feedback: str, attempt_no: int) -> str:
    """
    Reflection-based reattempt without using gold.
    """
    if self.verbose:
        print(f"Refining (attempt {attempt_no}) based on judge note: {judge_feedback}")

    # From the second retry onward, relax the search budget.
    allow_extra = attempt_no >= 2
    base = self._base_prompt(question, allow_extra_searches=allow_extra)

    refinement_addendum = f"""
Your previous FINAL ANSWER was:
{prev_answer}

A strict judge said this answer was incorrect for the following reason(s) (be concise): {judge_feedback}

Re-evaluate the question carefully. Consider possible formatting issues (units, articles, thousands commas), list ordering (only if the question requires a specific order), and rounding.
Produce a NEW final answer. Do not repeat the previous final answer if you think it was wrong.
"""

    try:
        result = self.agent.run(base + refinement_addendum)
    except Exception as exc:
        result = f"Error refining: {exc}"
        if self.verbose:
            print(result)
        return result

    if self.verbose:
        print(f"Refined answer: {result}")
    return result
431
+
432
  # --- Singletons for judge/gold ---
433
  gold_answers = GoldAnswers(path=DEFAULT_GOLD_CSV)
434
  _judge_agent_singleton: Optional[JudgeAgent] = None
435
 
436
  # --- Runner & Submitter (with judge integration) ---
437
def _ensure_judge(model: OpenAIServerModel) -> JudgeAgent:
    """Return the shared JudgeAgent, creating it lazily on first use."""
    global _judge_agent_singleton
    if _judge_agent_singleton is not None:
        return _judge_agent_singleton
    # First call: build the singleton, reusing the caller's model instance.
    _judge_agent_singleton = JudgeAgent(base_model=model, verbose=False)
    return _judge_agent_singleton
442
+
443
+ def run_and_submit_all(sample_size: int = 0, max_retries: int = 1, use_local_judge_to_select: bool = True):
444
  """
445
  Fetches all questions, runs the agent on them, judges locally (if gold available),
446
+ optionally reattempts on incorrect results, submits answers, and returns:
447
+ - final status string
448
+ - final results dataframe (one row per question)
449
+ - attempt log dataframe (one row per attempt)
450
  """
451
  username = "Gralon"
452
  print(f"Using username: {username}")
 
460
  agent = GAIAAgent(verbose=True)
461
  except Exception as e:
462
  print(f"Error instantiating agent: {e}")
463
+ return f"Error initializing agent: {e}", None, None
464
 
465
  # 1b. Init JudgeAgent once, reusing the SAME model instance
466
+ judge_agent = _ensure_judge(agent.agent.model)
 
 
467
 
468
  # Derive code URL for submission
469
  space_id = os.getenv("SPACE_ID")
470
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
 
 
 
471
 
472
  # 2. Fetch Questions
473
  print(f"Fetching questions from: {questions_url}")
 
477
  questions_data = response.json()
478
  if not questions_data:
479
  print("Fetched questions list is empty.")
480
+ return "Fetched questions list is empty or invalid format.", None, None
481
  print(f"Fetched {len(questions_data)} questions.")
482
  except requests.exceptions.RequestException as e:
483
  print(f"Error fetching questions: {e}")
484
+ return f"Error fetching questions: {e}", None, None
485
  except json.JSONDecodeError as e:
486
  print(f"Error decoding JSON response from questions endpoint: {e}")
487
  print(f"Response text: {response.text[:500]}")
488
+ return f"Error decoding server response for questions: {e}", None, None
489
  except Exception as e:
490
  print(f"An unexpected error occurred fetching questions: {e}")
491
+ return f"An unexpected error occurred fetching questions: {e}", None, None
 
 
 
 
492
 
493
+ # Sampling
494
  if sample_size > 0 and sample_size < len(questions_data):
495
  import random
496
  print(f"Using a sample of {sample_size} questions from {len(questions_data)} total questions")
497
  questions_data = random.sample(questions_data, sample_size)
498
 
499
  print(f"Running agent on {len(questions_data)} questions...")
500
+ results_log: List[Dict[str, Any]] = []
501
+ attempts_log: List[Dict[str, Any]] = []
502
+ answers_payload: List[Dict[str, Any]] = []
503
+
504
  for i, item in enumerate(questions_data):
505
  task_id = item.get("task_id")
506
  question_text = item.get("question")
507
  if not task_id or question_text is None:
508
  print(f"Skipping item with missing task_id or question: {item}")
509
  continue
510
+
511
+ gold = gold_answers.by_task_id.get(task_id)
512
+ per_question_attempts: List[Dict[str, Any]] = []
513
+
514
  try:
515
  print(f"Processing question {i+1}/{len(questions_data)}: Task ID {task_id}")
516
+
517
+ # -- First attempt
518
+ raw = agent(question_text, allow_extra_searches=False)
519
+ ans = extract_final_answer(raw)
520
+ fixed = quick_format_fix(ans, question_text) or ans
521
+
522
+ # judge first (on fixed)
523
+ jres = None
524
+ j_is_correct = None
525
+ j_score = None
526
+ j_note = None
527
  if gold:
528
+ jres = judge_agent.judge(question_text, fixed, gold)
529
+ j_is_correct = jres.get("is_correct")
530
+ j_score = jres.get("score")
531
+ j_note = jres.get("justification")
532
+
533
+ per_question_attempts.append({
534
+ "Task ID": task_id,
535
+ "Attempt": 1,
536
+ "Submitted Answer (raw)": ans,
537
+ "Submitted Answer (fixed)": fixed,
538
+ "Judge Correct?": j_is_correct,
539
+ "Judge Score": j_score,
540
+ "Judge Note": j_note
541
+ })
542
 
543
+ best_answer = fixed
544
+ best_score = j_score if j_score is not None else 0.0
545
+ best_correct = j_is_correct
546
+
547
+ retries = 0
548
+ while (j_is_correct is False) and (retries < max_retries):
549
+ retries += 1
550
+
551
+ # Try reflective retry
552
+ refined_raw = agent.refine(
553
+ question=question_text,
554
+ prev_answer=fixed,
555
+ judge_feedback=j_note or "Format/content mismatch.",
556
+ attempt_no=retries
557
+ )
558
+ refined = extract_final_answer(refined_raw)
559
+ refined_fixed = quick_format_fix(refined, question_text) or refined
560
+
561
+ # Judge the refined answer
562
+ j2 = None
563
+ j2_is_correct = None
564
+ j2_score = None
565
+ j2_note = None
566
+ if gold:
567
+ j2 = judge_agent.judge(question_text, refined_fixed, gold)
568
+ j2_is_correct = j2.get("is_correct")
569
+ j2_score = j2.get("score")
570
+ j2_note = j2.get("justification")
571
+
572
+ per_question_attempts.append({
573
+ "Task ID": task_id,
574
+ "Attempt": retries + 1,
575
+ "Submitted Answer (raw)": refined,
576
+ "Submitted Answer (fixed)": refined_fixed,
577
+ "Judge Correct?": j2_is_correct,
578
+ "Judge Score": j2_score,
579
+ "Judge Note": j2_note
580
+ })
581
+
582
+ # Decide whether to keep this as best
583
+ if use_local_judge_to_select and gold and (j2_score is not None):
584
+ if (j2_score > (best_score or 0)) or (best_score is None):
585
+ best_answer, best_score, best_correct = refined_fixed, j2_score, j2_is_correct
586
+ else:
587
+ # If we don't have gold/judge, prefer the newest answer
588
+ best_answer = refined_fixed
589
+ best_score = j2_score if j2_score is not None else best_score
590
+ best_correct = j2_is_correct if j2_is_correct is not None else best_correct
591
+
592
+ # Prepare for another retry if needed
593
+ fixed = refined_fixed
594
+ j_is_correct = j2_is_correct
595
+ j_score = j2_score
596
+ j_note = j2_note
597
+
598
+ if j2_is_correct:
599
+ break
600
+
601
+ if retries < max_retries:
602
+ print("Waiting 2 seconds before next attempt...")
603
+ time.sleep(2)
604
+
605
+ # Append final choice per question
606
+ answers_payload.append({"task_id": task_id, "submitted_answer": best_answer})
607
  results_log.append({
608
  "Task ID": task_id,
609
  "Question": question_text,
610
+ "Submitted Answer": best_answer,
611
  "Gold (local)": gold if gold else "",
612
+ "Judge Correct?": best_correct,
613
+ "Judge Score": best_score,
614
+ "Judge Note": j_note
615
  })
616
+ print(f"Finished question {i+1}")
617
+
618
+ # Add to global attempts log
619
+ attempts_log.extend(per_question_attempts)
620
 
621
  if i < len(questions_data) - 1:
622
  print("Waiting 2 seconds before next question...")
 
636
 
637
  if not answers_payload:
638
  print("Agent did not produce any answers to submit.")
639
+ return "Agent did not produce any answers to submit.", pd.DataFrame(results_log), pd.DataFrame(attempts_log)
640
 
641
  # 4. Prepare Submission
642
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
 
658
  )
659
  print("Submission successful.")
660
  results_df = pd.DataFrame(results_log)
661
+ attempts_df = pd.DataFrame(attempts_log)
662
+ return final_status, results_df, attempts_df
663
  except requests.exceptions.HTTPError as e:
664
  error_detail = f"Server responded with status {e.response.status_code}."
665
  try:
 
670
  status_message = f"Submission Failed: {error_detail}"
671
  print(status_message)
672
  results_df = pd.DataFrame(results_log)
673
+ attempts_df = pd.DataFrame(attempts_log)
674
+ return status_message, results_df, attempts_df
675
  except requests.exceptions.Timeout:
676
  status_message = "Submission Failed: The request timed out."
677
  print(status_message)
678
  results_df = pd.DataFrame(results_log)
679
+ attempts_df = pd.DataFrame(attempts_log)
680
+ return status_message, results_df, attempts_df
681
  except requests.exceptions.RequestException as e:
682
  status_message = f"Submission Failed: Network error - {e}"
683
  print(status_message)
684
  results_df = pd.DataFrame(results_log)
685
+ attempts_df = pd.DataFrame(attempts_log)
686
+ return status_message, results_df, attempts_df
687
  except Exception as e:
688
  status_message = f"An unexpected error occurred during submission: {e}"
689
  print(status_message)
690
  results_df = pd.DataFrame(results_log)
691
+ attempts_df = pd.DataFrame(attempts_log)
692
+ return status_message, results_df, attempts_df
693
 
694
def test_single_question(question: str, retries: int = 1) -> str:
    """Run the agent on a single question (no submission).

    There is no task_id context here, so no gold answer exists and no real
    judging happens; when `retries` > 0 we perform heuristic reflective
    retries using a generic feedback note.

    Args:
        question: The question to answer.
        retries: Number of reflective re-attempts after the first answer.

    Returns:
        The format-fixed final answer, or an "Error: ..." string on failure.
    """
    try:
        agent = GAIAAgent(verbose=True)
        # Warm the judge singleton so later judged runs reuse this model.
        # (The previous version bound this to an unused local; it also kept
        # an unused `gold = None` — both removed.)
        _ensure_judge(agent.agent.model)

        # First attempt.
        raw = agent(question)
        ans = extract_final_answer(raw)
        fixed = quick_format_fix(ans, question) or ans

        if retries <= 0:
            return fixed

        # Without gold we can't score correctness; do generic reflective retries.
        last = fixed
        note = "Possible format/content mismatch; re-evaluate."
        for attempt in range(retries):
            refined_raw = agent.refine(
                question, prev_answer=last, judge_feedback=note, attempt_no=attempt + 1
            )
            refined = extract_final_answer(refined_raw)
            last = quick_format_fix(refined, question) or refined
        return last
    except Exception as e:
        return f"Error: {e}"
719
 
 
722
  # try task_id lookup first
723
  gold = gold_answers.by_task_id.get(task_id_or_gold, task_id_or_gold)
724
  agent = GAIAAgent(verbose=False)
725
+ judge_agent = _ensure_judge(agent.agent.model)
726
+ res = judge_agent.judge(question, predicted, gold)
 
 
727
  out = {
728
  "Gold": gold,
729
  "is_correct": res["is_correct"],
 
734
 
735
  # --- Build Gradio Interface using Blocks ---
736
  with gr.Blocks() as demo:
737
+ gr.Markdown("# GAIA Agent Evaluation Runner + Local LLM Judge (with smart retries)")
738
  gr.Markdown(
739
  """
740
  ## Instructions:
 
744
  3. Run the full evaluation on the GAIA benchmark in the Evaluation tab
745
 
746
  This agent runs locally, uses an LLM judge against your answers.csv (if present),
747
+ **retries intelligently** when the judge says 'incorrect', and then submits answers to the server.
748
  """
749
  )
750
 
 
752
 
753
  with gr.Tab("Test Single Question"):
754
  test_input = gr.Textbox(label="Enter a question to test", lines=3)
755
+ test_retries = gr.Slider(minimum=0, maximum=3, value=1, step=1, label="Retries (no gold here, heuristic only)")
756
  test_output = gr.Textbox(label="Answer", lines=3)
757
  test_button = gr.Button("Test Question")
758
 
759
  test_button.click(
760
  fn=test_single_question,
761
+ inputs=[test_input, test_retries],
762
  outputs=test_output
763
  )
764
 
 
779
  label="Sample Size (0 for all questions)",
780
  info="Set a number to limit how many questions to process (reduces costs)"
781
  )
782
+ max_retries = gr.Slider(
783
+ minimum=0,
784
+ maximum=3,
785
+ value=1,
786
+ step=1,
787
+ label="Max judge-driven retries per question",
788
+ info="0 = no retries; 1-3 = progressively more effort"
789
+ )
790
+ use_local = gr.Checkbox(
791
+ value=True,
792
+ label="Use local judge (gold) to pick best attempt when available",
793
+ info="If unchecked, we submit the last attempt instead."
794
+ )
795
 
796
+ run_button = gr.Button("Run Evaluation, Judge Locally, Retry & Submit")
797
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
798
+ results_table = gr.DataFrame(label="Final Results (per question)", wrap=True)
799
+ attempts_table = gr.DataFrame(label="Attempt Log (expanded)", wrap=True)
800
 
801
  run_button.click(
802
  fn=run_and_submit_all,
803
+ inputs=[sample_size, max_retries, use_local],
804
+ outputs=[status_output, results_table, attempts_table]
805
  )
806
 
807
  if __name__ == "__main__":