Final_Assignment_Template

Running

App Files Files Community

Ghisalbertifederico committed 18 days ago

Commit

84ffff4

verified ·

1 Parent(s): 3dead3c

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -12

app.py CHANGED Viewed

@@ -1,7 +1,12 @@
 import os
 import re as _re
 import time
 import concurrent.futures
 import gradio as gr
 import pypdf
 import requests
@@ -14,29 +19,35 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 HF_TOKEN = os.environ.get("HF_TOKEN")
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
 SYSTEM_PROMPT_ADDITION = """You are a general-purpose assistant that answers questions accurately and concisely.
 ANSWERING STRATEGY — follow this order strictly:
-STEP 1 — ATTACHED FILE (if a file path is mentioned in the task context)
-  Call `read_task_file(file_path)` immediately with the provided path.
   Parse the returned content and try to extract the answer from it.
   If the file contains the answer, call `final_answer` right away.
-STEP 2 — WEB SEARCH (if step 1 was not applicable or did not yield an answer)
   Choose the most appropriate tool:
-  * `wikipedia_search(query)` — encyclopedic facts: people, places, history, science.
-  * `web_search(query)` — recent events, statistics, niche facts.
   * `visit_webpage(url)` — fetch the full text of a URL returned by web_search.
   If you are able to extract the answer from the results, call `final_answer`.
-STEP 3 — NATIVE LLM KNOWLEDGE (only if steps 1 and 2 both failed or were not applicable)
   Reason from your own training knowledge and call `final_answer` with your best answer.
   Clearly prefix with "Based on my training knowledge:" so it is distinguishable.
 GENERAL RULES:
-- Available tools: `read_task_file`, `web_search`, `visit_webpage`, `wikipedia_search`, `final_answer`.
 - Do NOT invent other tool names.
 - Do NOT import modules not in additional_authorized_imports.
 - Always wrap code in <code> and </code> tags.
@@ -44,6 +55,77 @@ GENERAL RULES:
 - Give a SHORT final answer: a number, a name, a word — not a paragraph.
 """
 @tool
 def read_task_file(file_path: str) -> str:
@@ -100,7 +182,10 @@ class WebSearchAgent:
         self.agent = CodeAgent(
             model=OpenAIServerModel(
-                model_id="llama-3.1-8b-instant",
                 api_base="https://api.groq.com/openai/v1",
                 api_key=GROQ_API_KEY,
             ),
@@ -108,6 +193,8 @@ class WebSearchAgent:
                 DuckDuckGoSearchTool(),
                 VisitWebpageTool(),
                 WikipediaSearchTool(),
                 read_task_file,
             ],
             name="fast_agent",
@@ -134,7 +221,6 @@ class WebSearchAgent:
             print("Agent error:", e)
             return f"AGENT ERROR: {e}"
 MAX_WORKERS = 2        # keep low to avoid burning through Groq's free TPD limit
 QUESTION_TIMEOUT = 300  # seconds before a single question is abandoned
@@ -319,11 +405,9 @@ with gr.Blocks() as demo:
     gr.Markdown(
         """
         **Instructions:**
         1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
         2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
         3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
         ---
         **Disclaimers:**
         Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).

 import os
 import re as _re
+import sys
 import time
 import concurrent.futures
+# Force UTF-8 output on Windows to avoid charmap crashes with Unicode characters.
+# The default Windows console code page cannot encode many characters the agent
+# prints, so unencodable characters are replaced instead of raising.
+if sys.platform == "win32":
+    for _stream in (sys.stdout, sys.stderr):
+        # A stream may be None (e.g. pythonw) or a replacement object without
+        # reconfigure(); skip those instead of failing at import time.
+        if hasattr(_stream, "reconfigure"):
+            _stream.reconfigure(encoding="utf-8", errors="replace")
 import gradio as gr
 import pypdf
 import requests
 HF_TOKEN = os.environ.get("HF_TOKEN")
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
 SYSTEM_PROMPT_ADDITION = """You are a general-purpose assistant that answers questions accurately and concisely.
 ANSWERING STRATEGY — follow this order strictly:
+Step 1: Analyze the question and identify what type of information is strictly needed to answer it.
+For example if the question is "Who is the person that discovered or published x?" find the info about the person by looking up data related to x. Only lookup data once you have identified what is strictly needed to answer the question. Don't make a second search if you are unable to find the answer after the first one.
+For example if the question is about an artist's numer of albums, only look for information about discography, not biography or other unneccesary information. No tools are needed for this step, just careful reading and understanding of the question.
+STEP 2 — ATTACHED FILE (if a file path is mentioned in the task context)
+  * If the file is an image (.png, .jpg, .jpeg, .gif, .webp): call `describe_image(file_path, question)` with a focused question derived from the task.
+  * Otherwise: call `read_task_file(file_path)` immediately with the provided path.
   Parse the returned content and try to extract the answer from it.
   If the file contains the answer, call `final_answer` right away.
+STEP 3 — WEB SEARCH (if step 1 was not applicable or did not yield an answer)
   Choose the most appropriate tool:
+  * `get_youtube_transcript(video_url)` — fetch the transcript/captions of a YouTube video.
+  * `wikipedia_search(query)` — encyclopedic facts: people, places, history, science, discoveries.
+  * `web_search(query)` — recent events, statistics, niche facts, articles.
   * `visit_webpage(url)` — fetch the full text of a URL returned by web_search.
+  If the url contains "youtube.com" or "youtu.be", only use `get_youtube_transcript`. If this tool is unable to extract the answer skip to the next step.
   If you are able to extract the answer from the results, call `final_answer`.
+STEP 4 — NATIVE LLM KNOWLEDGE (only if steps 1, 2, and 3 all failed or were not applicable)
   Reason from your own training knowledge and call `final_answer` with your best answer.
   Clearly prefix with "Based on my training knowledge:" so it is distinguishable.
 GENERAL RULES:
+- Available tools: `read_task_file`, `get_youtube_transcript`, `describe_image`, `web_search`, `visit_webpage`, `wikipedia_search`, `final_answer`.
+- If the task context mentions an image file path, call `describe_image(image_path, question)` with a focused question to extract the answer.
 - Do NOT invent other tool names.
 - Do NOT import modules not in additional_authorized_imports.
 - Always wrap code in <code> and </code> tags.
 - Give a SHORT final answer: a number, a name, a word — not a paragraph.
 """
+@tool
+def get_youtube_transcript(video_url: str) -> str:
+    """Fetch the transcript/captions of a YouTube video.
+    Args:
+        video_url: Full YouTube URL (watch, youtu.be, shorts, embed or live form) or just the video ID.
+    Returns:
+        The full transcript as a single string, with caption segments joined by spaces.
+    """
+    import re
+    from youtube_transcript_api import YouTubeTranscriptApi
+    # Extract the 11-character video ID from the common URL shapes; input that
+    # does not match is assumed to already be a bare video ID.
+    match = re.search(
+        r"(?:v=|youtu\.be/|/shorts/|/embed/|/live/)([A-Za-z0-9_-]{11})", video_url
+    )
+    video_id = match.group(1) if match else video_url
+    if hasattr(YouTubeTranscriptApi, "fetch"):
+        # Newer youtube-transcript-api releases: instance API.
+        entries = YouTubeTranscriptApi().fetch(video_id)
+    else:
+        # Older releases only provide the static get_transcript() helper.
+        entries = YouTubeTranscriptApi.get_transcript(video_id)
+    # Older releases yield dicts with a "text" key; newer ones yield snippet
+    # objects exposing a .text attribute. Support both shapes.
+    return " ".join(
+        e["text"] if isinstance(e, dict) else e.text for e in entries
+    )
+@tool
+def describe_image(image_path: str, question: str = "Describe this image in detail.") -> str:
+    """Use a vision model to interpret or answer questions about an image file.
+    Args:
+        image_path: The local path to the image file (.png, .jpg, .jpeg, .gif, .webp).
+        question: Specific question to ask about the image content.
+    Returns:
+        A text description or answer about the image content, or a short error message if the image is missing or the API key is not configured.
+    """
+    import base64
+    import os
+    import requests as _req
+    # isfile (not exists) so that a directory path is reported instead of crashing in open().
+    if not os.path.isfile(image_path):
+        return f"Image not found: {image_path}"
+    if not GROQ_API_KEY:
+        # Avoid sending "Bearer None" and surfacing an opaque 401 to the agent.
+        return "Cannot describe image: GROQ_API_KEY is not set."
+    ext = os.path.splitext(image_path)[1].lower().lstrip(".")
+    mime_map = {"jpg": "image/jpeg", "jpeg": "image/jpeg", "png": "image/png",
+                "gif": "image/gif", "webp": "image/webp"}
+    # Unknown extensions are sent as PNG, matching the previous behaviour.
+    mime_type = mime_map.get(ext, "image/png")
+    with open(image_path, "rb") as f:
+        image_data = base64.standard_b64encode(f.read()).decode("utf-8")
+    payload = {
+        # The llama-3.2 vision preview models were retired by Groq; Llama 4 Scout
+        # accepts image input and is the same model the agent itself runs on.
+        "model": "meta-llama/llama-4-scout-17b-16e-instruct",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    # The image is inlined as a base64 data URL in OpenAI-compatible format.
+                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{image_data}"}},
+                    {"type": "text", "text": question},
+                ],
+            }
+        ],
+        "max_tokens": 1024,
+    }
+    headers = {"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"}
+    resp = _req.post(
+        "https://api.groq.com/openai/v1/chat/completions",
+        json=payload, headers=headers, timeout=60,
+    )
+    # HTTP errors propagate as requests.HTTPError, as before.
+    resp.raise_for_status()
+    return resp.json()["choices"][0]["message"]["content"]
 @tool
 def read_task_file(file_path: str) -> str:
         self.agent = CodeAgent(
             model=OpenAIServerModel(
+                # model_id="gemma2-9b-it",
+                # model_id="llama-3.1-8b-instant",
+                # model_id="llama-3.3-70b-versatile",
+                model_id="meta-llama/llama-4-scout-17b-16e-instruct",
                 api_base="https://api.groq.com/openai/v1",
                 api_key=GROQ_API_KEY,
             ),
                 DuckDuckGoSearchTool(),
                 VisitWebpageTool(),
                 WikipediaSearchTool(),
+                get_youtube_transcript,
+                describe_image,
                 read_task_file,
             ],
             name="fast_agent",
             print("Agent error:", e)
             return f"AGENT ERROR: {e}"
 MAX_WORKERS = 2        # keep low to avoid burning through Groq's free TPD limit
 QUESTION_TIMEOUT = 300  # seconds before a single question is abandoned
     gr.Markdown(
         """
         **Instructions:**
         1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
         2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
         3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
         ---
         **Disclaimers:**
         Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).