Final_Assignment_Template

Sleeping

App Files Community

maytemuma committed on Apr 29

Commit

634f467

verified ·

1 Parent(s): 01674ea

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -70

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import gradio as gr
 import requests
 import pandas as pd
@@ -66,7 +67,7 @@ def download_file_from_api(task_id: str) -> str:
                 text = ""
                 for page in reader.pages:
                     text += page.extract_text() or ""
-                return text[:15000] if text.strip() else "PDF found but could not extract text (may be scanned/image-based)."
             except Exception as e:
                 return f"PDF file detected but error reading: {str(e)}"
@@ -102,7 +103,7 @@ def download_file_from_api(task_id: str) -> str:
         # --- FALLBACK ---
         with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as f:
             f.write(response.content)
-            return f"File downloaded to {f.name} (type: {content_type}). Size: {len(response.content)} bytes. Could not auto-parse."
     except Exception as e:
         return f"Error downloading file for task {task_id}: {str(e)}"
@@ -111,8 +112,8 @@ def download_file_from_api(task_id: str) -> str:
 @tool
 def describe_image(image_path: str) -> str:
     """Describes the content of an image file using an AI vision model.
-    Use this when you have an image file path (e.g. from IMAGE_FILE_SAVED)
-    and need to understand what the image shows, including any text in it.
     Args:
         image_path: The local file path to the image to describe.
@@ -120,26 +121,22 @@ def describe_image(image_path: str) -> str:
     try:
         from huggingface_hub import InferenceClient
-        token = os.getenv("HF_TOKEN")
-        client = InferenceClient(token=token)
         with open(image_path, "rb") as f:
             image_bytes = f.read()
-        # Use BLIP2 for image captioning
         result = client.image_to_text(
             image=image_bytes,
             model="Salesforce/blip2-opt-2.7b",
         )
         if isinstance(result, str):
-            description = result
         elif hasattr(result, "generated_text"):
-            description = result.generated_text
         else:
-            description = str(result)
-        return f"Image description: {description}"
     except Exception as e:
         return f"Could not describe image at {image_path}. Error: {str(e)}"
@@ -148,8 +145,7 @@ def describe_image(image_path: str) -> str:
 @tool
 def transcribe_audio(audio_path: str) -> str:
     """Transcribes an audio file to text using Whisper speech recognition.
-    Use this when you have an audio file path (e.g. from AUDIO_FILE_SAVED)
-    and need to know what is spoken in the recording.
     Args:
         audio_path: The local file path to the audio file to transcribe.
@@ -157,8 +153,7 @@ def transcribe_audio(audio_path: str) -> str:
     try:
         from huggingface_hub import InferenceClient
-        token = os.getenv("HF_TOKEN")
-        client = InferenceClient(token=token)
         with open(audio_path, "rb") as f:
             audio_bytes = f.read()
@@ -198,7 +193,6 @@ def read_local_file(file_path: str) -> str:
 @tool
 def execute_python_file(file_path: str) -> str:
     """Executes a Python script file and returns its stdout output.
-    Use this when you receive a .py file that needs to be run to get the answer.
     Args:
         file_path: The path to the Python file to execute.
@@ -216,8 +210,6 @@ def execute_python_file(file_path: str) -> str:
             output += result.stdout
         if result.stderr:
             output += f"\nSTDERR: {result.stderr}"
-        if result.returncode != 0:
-            output += f"\nReturn code: {result.returncode}"
         return output.strip() if output.strip() else "Script executed but produced no output."
     except subprocess.TimeoutExpired:
         return "Script execution timed out after 30 seconds."
@@ -229,22 +221,7 @@ def execute_python_file(file_path: str) -> str:
 # AGENT CLASS
 # =============================================
-class BasicAgent:
-    """An agent using smolagents CodeAgent with web search, file handling,
-    image description, and audio transcription tools.
-    Uses HF Inference API — no GPU needed."""
-    def __init__(self):
-        print("Initializing SmolAgent for GAIA benchmark...")
-        model = InferenceClientModel(
-            model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
-            token=os.getenv("HF_TOKEN"),
-            max_tokens=2096,
-            temperature=0.1,
-        )
-        custom_instructions = """You are a precise AI assistant solving GAIA benchmark questions.
 CRITICAL RULES FOR ANSWERING:
 1. Your final answer must be ONLY the answer itself — no explanations, no "The answer is", no extra words.
@@ -255,8 +232,8 @@ CRITICAL RULES FOR ANSWERING:
 TOOL USAGE RULES:
 6. If a question mentions an attached file, image, audio, spreadsheet, or document, FIRST use download_file_from_api with the task_id.
-7. If download returns "IMAGE_FILE_SAVED:/some/path", then call describe_image("/some/path") to see what the image contains.
-8. If download returns "AUDIO_FILE_SAVED:/some/path", then call transcribe_audio("/some/path") to hear what is said.
 9. If the file is a Python script (.py), you can use read_local_file to view it or execute_python_file to run it.
 10. Use DuckDuckGoSearchTool when you need factual information from the internet.
 11. Use visit_webpage to read the full content of a specific URL.
@@ -266,6 +243,23 @@ REASONING:
 13. Double-check your answer before giving it.
 """
         self.agent = CodeAgent(
             model=model,
             tools=[
@@ -278,14 +272,15 @@ REASONING:
                 execute_python_file,
             ],
             max_steps=10,
-            verbosity_level=1,
-            instructions=custom_instructions,
             additional_authorized_imports=[
                 "json", "re", "math", "datetime", "collections",
                 "csv", "io", "os", "tempfile", "subprocess",
                 "base64", "hashlib", "unicodedata", "string",
             ],
         )
         print("SmolAgent initialized successfully!")
     def __call__(self, question: str, task_id: str = None) -> str:
@@ -304,37 +299,43 @@ Question: {question}
 Remember: respond with ONLY the final answer, nothing else."""
-        try:
-            result = self.agent.run(prompt)
-            answer = str(result).strip()
-            # Clean up common LLM prefixes
-            prefixes_to_remove = [
-                "The answer is ", "The answer is: ",
-                "Answer: ", "FINAL ANSWER: ",
-                "Final answer: ", "The final answer is ",
-                "The final answer is: ", "Result: ",
-            ]
-            for prefix in prefixes_to_remove:
-                if answer.lower().startswith(prefix.lower()):
-                    answer = answer[len(prefix):].strip()
-            # Remove wrapping quotes
-            if len(answer) > 2 and \
-               ((answer.startswith('"') and answer.endswith('"')) or
-                (answer.startswith("'") and answer.endswith("'"))):
-                answer = answer[1:-1].strip()
-            # Remove trailing period for short answers
-            if answer.endswith(".") and len(answer.split()) <= 5:
-                answer = answer[:-1].strip()
-            print(f"Final answer: {answer}")
-            return answer
-        except Exception as e:
-            print(f"Agent error: {e}")
-            return "Unable to determine the answer."
 # =============================================
@@ -405,6 +406,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
                 "Submitted Answer": f"AGENT ERROR: {e}"
             })
     if not answers_payload:
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
@@ -448,7 +452,7 @@ with gr.Blocks() as demo:
     gr.Markdown("# 🤖 GAIA Agent — Final Assignment")
     gr.Markdown(
         """
-        **Agent**: SmolAgent (CodeAgent) with Qwen2.5-Coder-32B via HF Inference API
         **Tools**: Web Search · Webpage Visitor · File Downloader · Image Describer · Audio Transcriber · Python Executor

 import os
+import time
 import gradio as gr
 import requests
 import pandas as pd
                 text = ""
                 for page in reader.pages:
                     text += page.extract_text() or ""
+                return text[:15000] if text.strip() else "PDF found but could not extract text."
             except Exception as e:
                 return f"PDF file detected but error reading: {str(e)}"
         # --- FALLBACK ---
         with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as f:
             f.write(response.content)
+            return f"File downloaded to {f.name} (type: {content_type}). Size: {len(response.content)} bytes."
     except Exception as e:
         return f"Error downloading file for task {task_id}: {str(e)}"
 @tool
 def describe_image(image_path: str) -> str:
     """Describes the content of an image file using an AI vision model.
+    Use this when you have an image file path (from IMAGE_FILE_SAVED)
+    and need to understand what the image shows.
     Args:
         image_path: The local file path to the image to describe.
     try:
         from huggingface_hub import InferenceClient
+        client = InferenceClient(token=os.getenv("HF_TOKEN"))
         with open(image_path, "rb") as f:
             image_bytes = f.read()
         result = client.image_to_text(
             image=image_bytes,
             model="Salesforce/blip2-opt-2.7b",
         )
         if isinstance(result, str):
+            return f"Image description: {result}"
         elif hasattr(result, "generated_text"):
+            return f"Image description: {result.generated_text}"
         else:
+            return f"Image description: {str(result)}"
     except Exception as e:
         return f"Could not describe image at {image_path}. Error: {str(e)}"
 @tool
 def transcribe_audio(audio_path: str) -> str:
     """Transcribes an audio file to text using Whisper speech recognition.
+    Use this when you have an audio file path (from AUDIO_FILE_SAVED).
     Args:
         audio_path: The local file path to the audio file to transcribe.
     try:
         from huggingface_hub import InferenceClient
+        client = InferenceClient(token=os.getenv("HF_TOKEN"))
         with open(audio_path, "rb") as f:
             audio_bytes = f.read()
 @tool
 def execute_python_file(file_path: str) -> str:
     """Executes a Python script file and returns its stdout output.
     Args:
         file_path: The path to the Python file to execute.
             output += result.stdout
         if result.stderr:
             output += f"\nSTDERR: {result.stderr}"
         return output.strip() if output.strip() else "Script executed but produced no output."
     except subprocess.TimeoutExpired:
         return "Script execution timed out after 30 seconds."
 # AGENT CLASS
 # =============================================
+CUSTOM_INSTRUCTIONS = """You are a precise AI assistant solving GAIA benchmark questions.
 CRITICAL RULES FOR ANSWERING:
 1. Your final answer must be ONLY the answer itself — no explanations, no "The answer is", no extra words.
 TOOL USAGE RULES:
 6. If a question mentions an attached file, image, audio, spreadsheet, or document, FIRST use download_file_from_api with the task_id.
+7. If download returns "IMAGE_FILE_SAVED:/some/path", then call describe_image with that path.
+8. If download returns "AUDIO_FILE_SAVED:/some/path", then call transcribe_audio with that path.
 9. If the file is a Python script (.py), you can use read_local_file to view it or execute_python_file to run it.
 10. Use DuckDuckGoSearchTool when you need factual information from the internet.
 11. Use visit_webpage to read the full content of a specific URL.
 13. Double-check your answer before giving it.
 """
+class BasicAgent:
+    """Agent using smolagents CodeAgent with HF Inference API."""
+    def __init__(self):
+        print("Initializing SmolAgent for GAIA benchmark...")
+        # Use the default model with Nebius provider for better reliability
+        model = InferenceClientModel(
+            model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
+            provider="nebius",
+            token=os.getenv("HF_TOKEN"),
+            timeout=180,
+            max_tokens=2096,
+            temperature=0.1,
+        )
         self.agent = CodeAgent(
             model=model,
             tools=[
                 execute_python_file,
             ],
             max_steps=10,
+            verbosity_level=2,
+            instructions=CUSTOM_INSTRUCTIONS,
             additional_authorized_imports=[
                 "json", "re", "math", "datetime", "collections",
                 "csv", "io", "os", "tempfile", "subprocess",
                 "base64", "hashlib", "unicodedata", "string",
             ],
         )
         print("SmolAgent initialized successfully!")
     def __call__(self, question: str, task_id: str = None) -> str:
 Remember: respond with ONLY the final answer, nothing else."""
+        # Retry logic: try up to 2 times
+        for attempt in range(2):
+            try:
+                result = self.agent.run(prompt)
+                answer = str(result).strip()
+                # Clean up common LLM prefixes
+                prefixes_to_remove = [
+                    "The answer is ", "The answer is: ",
+                    "Answer: ", "FINAL ANSWER: ",
+                    "Final answer: ", "The final answer is ",
+                    "The final answer is: ", "Result: ",
+                ]
+                for prefix in prefixes_to_remove:
+                    if answer.lower().startswith(prefix.lower()):
+                        answer = answer[len(prefix):].strip()
+                # Remove wrapping quotes
+                if len(answer) > 2 and \
+                   ((answer.startswith('"') and answer.endswith('"')) or
+                    (answer.startswith("'") and answer.endswith("'"))):
+                    answer = answer[1:-1].strip()
+                # Remove trailing period for short answers
+                if answer.endswith(".") and len(answer.split()) <= 5:
+                    answer = answer[:-1].strip()
+                print(f"Final answer: {answer}")
+                return answer
+            except Exception as e:
+                print(f"Agent error (attempt {attempt + 1}): {e}")
+                if attempt == 0:
+                    print("Retrying in 5 seconds...")
+                    time.sleep(5)
+        return "Unable to determine the answer."
 # =============================================
                 "Submitted Answer": f"AGENT ERROR: {e}"
             })
+        # Small delay between questions to avoid rate limiting
+        time.sleep(2)
     if not answers_payload:
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
     gr.Markdown("# 🤖 GAIA Agent — Final Assignment")
     gr.Markdown(
         """
+        **Agent**: SmolAgent (CodeAgent) with Qwen2.5-Coder-32B via Nebius (HF Inference)
         **Tools**: Web Search · Webpage Visitor · File Downloader · Image Describer · Audio Transcriber · Python Executor