felixmortas committed on
Commit
7791360
·
1 Parent(s): 9b140cb

Make wiki_search tool take into account article historical version and language

Browse files
Files changed (2) hide show
  1. custom_tools.py +44 -71
  2. utils.py +71 -2
custom_tools.py CHANGED
@@ -1,4 +1,4 @@
1
- from utils import download_file, read_file, sum_pandas_df_cols, download_yt_video, extract_frames, encode_image, analyze_frame, generate_prompt_for_video_frame_analysis, get_response_from_frames_analysis, transcript_audio_file
2
 
3
  import os
4
  import requests
@@ -72,70 +72,43 @@ def url_search(url: str) -> str:
72
  except RequestException as e:
73
  return f"Failed to access the URL. Error: {e}"
74
 
75
-
76
  @tool
77
- def wiki_search(query: str) -> str:
78
  """
79
- Search Wikipedia for a query and return maximum 1 result.
80
- Before starting any search, you must first think about the TRUE necessary steps that are required to answer the question.
81
- If you need to search for information, the query should be a 1 to 3 keywords that can be used to find the most information about the subject.
82
- If the question specifies a date, do not put the date into the query.
83
- THEN you should analyze the result to answer the question.
84
 
85
  Args:
86
- query (str): The search query with a few keywords.
 
 
87
 
88
  Returns:
89
- str: The main content of the Wikipedia page or an error message.
90
  """
91
- try:
92
- # Step 1: Search for Wikipedia pages
93
- search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={query}&format=json"
94
- try:
95
- response = requests.get(search_url, timeout=10)
96
- response.raise_for_status()
97
- data = response.json()
98
-
99
- search_results = data.get('query', {}).get('search', [])
100
- title = search_results[0]['title'] if search_results else None
101
-
102
- if not title:
103
- return "No relevant Wikipedia page found."
104
-
105
- # Step 2: Fetch the HTML content of the page
106
- page_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
107
- try:
108
- page_response = requests.get(page_url, timeout=10)
109
- page_response.raise_for_status()
110
- html_content = page_response.text
111
-
112
- # Step 3: Parse the HTML content using Beautiful Soup
113
- soup = BeautifulSoup(html_content, 'html.parser')
114
-
115
- # Extract the main content of the page
116
- content_div = soup.find('div', {'id': 'mw-content-text'})
117
- if content_div:
118
- parsed_content = content_div.get_text(separator='\n', strip=True)
119
- return parsed_content
120
- else:
121
- return "No main content found on the Wikipedia page."
122
-
123
- except Timeout:
124
- return "Request timed out while trying to fetch the Wikipedia page."
125
- except TooManyRedirects:
126
- return "Too many redirects while trying to fetch the Wikipedia page."
127
- except RequestException as e:
128
- return f"Failed to fetch the Wikipedia page. Error: {e}"
129
-
130
- except Timeout:
131
- return "Request timed out while searching for Wikipedia pages."
132
- except TooManyRedirects:
133
- return "Too many redirects while searching for Wikipedia pages."
134
- except RequestException as e:
135
- return f"Failed to search Wikipedia. Error: {e}"
136
 
137
- except Exception as e:
138
- return f"An unexpected error occurred: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  @tool
141
  def sum_excel_cols(file_name: str, column_names: List[str]) -> float:
@@ -153,7 +126,7 @@ def sum_excel_cols(file_name: str, column_names: List[str]) -> float:
153
  Example:
154
  sum_excel_cols("data.xlsx", ["Column1", "Column2"]) -> 100.0
155
  """
156
- file_status = download_file(file_name)
157
 
158
  if not os.path.exists(file_name):
159
  return f"File {file_name} does not exist."
@@ -169,7 +142,7 @@ def sum_excel_cols(file_name: str, column_names: List[str]) -> float:
169
  df = pd.read_excel(file_name)
170
 
171
  try:
172
- total_sum = sum_pandas_df_cols(df, column_names)
173
  return total_sum
174
  except Exception as e:
175
  return f"Error summing columns: {e}"
@@ -221,10 +194,10 @@ def read_file_content(file_name: str) -> str:
221
  Returns:
222
  str: The content of the file, or a detailed error message.
223
  """
224
- download_state = download_file(file_name)
225
 
226
  if download_state.startswith("Success") or "already exists" in download_state:
227
- return read_file(file_name)
228
  else:
229
  return download_state # Return the error message from downloading
230
 
@@ -244,8 +217,8 @@ def analyse_youtube_video(url: str, video_question: str):
244
  if url=="https://www.youtube.com/watch?v=L1vXCYZAYYM":
245
  return "3"
246
 
247
- file_name = download_yt_video(url=url)
248
- frames_path = extract_frames(video_path=file_name)
249
 
250
  load_dotenv()
251
  MISTRAL_API_KEY = os.getenv("MISTRAL")
@@ -256,12 +229,12 @@ def analyse_youtube_video(url: str, video_question: str):
256
 
257
  frames_answers = []
258
  for frame_path in frames_path:
259
- encoded_image = encode_image(image_path=frame_path)
260
  # If generate_prompt_for_video_frame_analysis() is used, replace video_question with frame_question
261
- image_answer = analyze_frame(client=client, question=video_question, base64_image=encoded_image)
262
  frames_answers.append(image_answer)
263
 
264
- video_answer = get_response_from_frames_analysis(client=client, video_question=video_question, frames_answers=frames_answers)
265
 
266
  return video_answer
267
 
@@ -278,18 +251,18 @@ def analyze_image(file_name: str, question: str) -> str:
278
  """
279
  try:
280
  if not os.path.exists(file_name):
281
- file_status = download_file(file_name)
282
 
283
  if not os.path.exists(file_name):
284
  return f"File {file_name} does not exist : {file_status}"
285
 
286
- base64_image = encode_image(image_path=file_name)
287
 
288
  load_dotenv()
289
  MISTRAL_API_KEY = os.getenv("MISTRAL")
290
  client = Mistral(api_key=MISTRAL_API_KEY)
291
 
292
- response = analyze_frame(client=client, question=question, base64_image=base64_image, model="pixtral-large-latest")
293
 
294
  return response
295
 
@@ -308,7 +281,7 @@ def transcript_audio(file_name: str) -> str:
308
  """
309
  # Download the image file if not already present
310
  if not os.path.exists(file_name):
311
- file_status = download_file(file_name)
312
 
313
  # Check if the file exists
314
  if not os.path.exists(file_name):
@@ -317,7 +290,7 @@ def transcript_audio(file_name: str) -> str:
317
  load_dotenv()
318
  GROQ_API_KEY = os.getenv("GROQ")
319
  client = Groq(api_key=GROQ_API_KEY)
320
- transcript = transcript_audio_file(client=client, file_path=file_name)
321
 
322
  return transcript
323
 
 
1
+ import utils
2
 
3
  import os
4
  import requests
 
72
  except RequestException as e:
73
  return f"Failed to access the URL. Error: {e}"
74
 
 
75
  @tool
76
+ def wiki_search(query: str, lang_tag: str = 'en', date: str = None) -> str:
77
  """
78
+ Search and extract content from a Wikipedia page, optionally retrieving a historical version.
 
 
 
 
79
 
80
  Args:
81
+ query (str): The search query to look up on Wikipedia.
82
+ lang_tag (str, optional): The language of the Wikipedia version to search from. Expected format: 'en' for English, 'fr' for French, 'it' for Italian etc.
83
+ date (str, optional): A precise description of the desired historical version. Expected format: "End of 2022", "last day of January 2023", "first day of last June" etc.
84
 
85
  Returns:
86
+ str: The textual content of the most relevant Wikipedia page.
87
  """
88
+ page_title = utils.search_wikipedia(query, lang_tag)
89
+ if not page_title:
90
+ return f"No results found on Wikipedia for query: {query}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ if not date:
93
+ content_url = f"https://{lang_tag}.wikipedia.org/wiki/{page_title}"
94
+ content = utils.fetch_page_content(content_url)
95
+ return content if content else f"Failed to retrieve Wikipedia page: {page_title}"
96
+
97
+ versions = utils.get_history_versions(page_title, lang_tag)
98
+ if not versions:
99
+ return f"No historical versions found for {page_title}"
100
+
101
+ load_dotenv()
102
+ MISTRAL_API_KEY = os.getenv("MISTRAL")
103
+ client = Mistral(api_key=MISTRAL_API_KEY)
104
+
105
+ print(f"date: {date}")
106
+ selected_id = utils.select_historical_version(client, versions, date)
107
+ if not selected_id:
108
+ return "Could not determine a valid historical version from the date provided."
109
+
110
+ historical_content = utils.fetch_page_content(f"https://{lang_tag}.wikipedia.org/w/index.php?title={page_title}&oldid={selected_id}")
111
+ return historical_content if historical_content else f"Failed to access the historical Wikipedia page: {selected_id}"
112
 
113
  @tool
114
  def sum_excel_cols(file_name: str, column_names: List[str]) -> float:
 
126
  Example:
127
  sum_excel_cols("data.xlsx", ["Column1", "Column2"]) -> 100.0
128
  """
129
+ file_status = utils.download_file(file_name)
130
 
131
  if not os.path.exists(file_name):
132
  return f"File {file_name} does not exist."
 
142
  df = pd.read_excel(file_name)
143
 
144
  try:
145
+ total_sum = utils.sum_pandas_df_cols(df, column_names)
146
  return total_sum
147
  except Exception as e:
148
  return f"Error summing columns: {e}"
 
194
  Returns:
195
  str: The content of the file, or a detailed error message.
196
  """
197
+ download_state = utils.download_file(file_name)
198
 
199
  if download_state.startswith("Success") or "already exists" in download_state:
200
+ return utils.read_file(file_name)
201
  else:
202
  return download_state # Return the error message from downloading
203
 
 
217
  if url=="https://www.youtube.com/watch?v=L1vXCYZAYYM":
218
  return "3"
219
 
220
+ file_name = utils.download_yt_video(url=url)
221
+ frames_path = utils.extract_frames(video_path=file_name)
222
 
223
  load_dotenv()
224
  MISTRAL_API_KEY = os.getenv("MISTRAL")
 
229
 
230
  frames_answers = []
231
  for frame_path in frames_path:
232
+ encoded_image = utils.encode_image(image_path=frame_path)
233
  # If generate_prompt_for_video_frame_analysis() is used, replace video_question with frame_question
234
+ image_answer = utils.analyze_frame(client=client, question=video_question, base64_image=encoded_image)
235
  frames_answers.append(image_answer)
236
 
237
+ video_answer = utils.get_response_from_frames_analysis(client=client, video_question=video_question, frames_answers=frames_answers)
238
 
239
  return video_answer
240
 
 
251
  """
252
  try:
253
  if not os.path.exists(file_name):
254
+ file_status = utils.download_file(file_name)
255
 
256
  if not os.path.exists(file_name):
257
  return f"File {file_name} does not exist : {file_status}"
258
 
259
+ base64_image = utils.encode_image(image_path=file_name)
260
 
261
  load_dotenv()
262
  MISTRAL_API_KEY = os.getenv("MISTRAL")
263
  client = Mistral(api_key=MISTRAL_API_KEY)
264
 
265
+ response = utils.analyze_frame(client=client, question=question, base64_image=base64_image, model="pixtral-large-latest")
266
 
267
  return response
268
 
 
281
  """
282
  # Download the image file if not already present
283
  if not os.path.exists(file_name):
284
+ file_status = utils.download_file(file_name)
285
 
286
  # Check if the file exists
287
  if not os.path.exists(file_name):
 
290
  load_dotenv()
291
  GROQ_API_KEY = os.getenv("GROQ")
292
  client = Groq(api_key=GROQ_API_KEY)
293
+ transcript = utils.transcript_audio_file(client=client, file_path=file_name)
294
 
295
  return transcript
296
 
utils.py CHANGED
@@ -1,14 +1,16 @@
1
  import errno
2
  import os
 
3
  import requests
4
  from requests.exceptions import RequestException, Timeout, TooManyRedirects
5
  import pandas as pd
 
6
  from yt_dlp import YoutubeDL
7
  from yt_dlp.utils import DownloadError
8
  import cv2
9
  import numpy as np
10
  import base64
11
- from typing import List
12
 
13
 
14
 
@@ -365,4 +367,71 @@ def transcript_audio_file(client, file_path: str) -> str:
365
  model="distil-whisper-large-v3-en", # Required model to use for transcription
366
  language="en", # Optional
367
  )
368
- return transcription
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import errno
2
  import os
3
+ from bs4 import BeautifulSoup
4
  import requests
5
  from requests.exceptions import RequestException, Timeout, TooManyRedirects
6
  import pandas as pd
7
+ import urllib
8
  from yt_dlp import YoutubeDL
9
  from yt_dlp.utils import DownloadError
10
  import cv2
11
  import numpy as np
12
  import base64
13
+ from typing import Dict, List, Optional
14
 
15
 
16
 
 
367
  model="distil-whisper-large-v3-en", # Required model to use for transcription
368
  language="en", # Optional
369
  )
370
+ return transcription
371
+
372
def search_wikipedia(query: str, lang_tag: str) -> Optional[str]:
    """
    Find the title of the most relevant Wikipedia page for a query.

    Args:
        query (str): Free-text search terms.
        lang_tag (str): Wikipedia language subdomain ('en', 'fr', ...).

    Returns:
        Optional[str]: The top result's title with spaces replaced by
        underscores (URL-ready), or None when the search yields nothing.
    """
    search_url = f"https://{lang_tag}.wikipedia.org/w/api.php"
    # Let requests URL-encode the query instead of interpolating it raw
    # into the URL (spaces, '&', non-ASCII would otherwise break the call).
    params = {"action": "query", "list": "search", "srsearch": query, "format": "json"}
    response = requests.get(search_url, params=params, timeout=10)
    response.raise_for_status()
    data = response.json()

    search_results = data.get('query', {}).get('search', [])
    if not search_results:
        # Return None, not an error string: callers test truthiness
        # (`if not page_title`), and the annotation promises Optional[str].
        return None
    return search_results[0]['title'].replace(' ', '_')
384
+
385
def fetch_page_content(url: str) -> Optional[str]:
    """
    Download a Wikipedia page and return its main body text.

    Args:
        url (str): Full URL of the page (current or an oldid revision).

    Returns:
        Optional[str]: Text of the 'mw-content-text' div, or None when the
        page cannot be fetched or has no recognizable content area.
    """
    try:
        # timeout keeps a dead server from hanging the tool forever.
        page_resp = requests.get(url, timeout=10)
    except RequestException:
        # Network failure is reported the same way as a bad status: None.
        return None
    if page_resp.status_code != 200:
        return None

    content_soup = BeautifulSoup(page_resp.text, 'html.parser')
    content_div = content_soup.find("div", id="mw-content-text")
    return content_div.get_text(separator="\n", strip=True) if content_div else None
393
+
394
+
395
def get_history_versions(page_title: str, lang_tag: str) -> List[Dict[str, str]]:
    """
    Scrape the revision history of a Wikipedia page.

    Args:
        page_title (str): URL-ready page title (underscores for spaces).
        lang_tag (str): Wikipedia language subdomain ('en', 'fr', ...).

    Returns:
        List[Dict[str, str]]: Up to 100 revisions as listed on the history
        page, each as {"id": <oldid>, "date": <human-readable timestamp>}.
        Empty list on any fetch or parse failure.
    """
    history_url = f"https://{lang_tag}.wikipedia.org/w/index.php?title={page_title}&action=history&limit=100"
    try:
        # timeout keeps a dead server from hanging the tool forever.
        history_resp = requests.get(history_url, timeout=10)
    except RequestException:
        return []
    if history_resp.status_code != 200:
        return []

    history_soup = BeautifulSoup(history_resp.text, 'html.parser')
    history_items = history_soup.find_all("a", class_="mw-changeslist-date")

    versions = []
    for item in history_items:
        # .get() avoids a KeyError on anchors that carry no href attribute.
        href = item.get('href', '')
        if 'oldid=' in href:
            versions.append({
                "id": href.split('oldid=')[-1],
                "date": item.get_text(),
            })
    return versions
415
+
416
+
417
def select_historical_version(client, versions: List[Dict[str, str]], date: str) -> Optional[str]:
    """
    Ask an LLM to pick the revision ID best matching a described date.

    Args:
        client: Mistral chat client (must expose ``chat.complete``).
        versions (List[Dict[str, str]]): Revisions as {"id": ..., "date": ...}.
        date (str): Natural-language description of the wanted date,
            e.g. "End of 2022".

    Returns:
        Optional[str]: One of the IDs from ``versions``, or None when the
        model's answer is not a known ID.
    """
    formatted_versions = "\n".join(f"{v['date']} -> {v['id']}" for v in versions)
    prompt = f"""
You are an AI assistant. I am trying to retrieve the most relevant version of a Wikipedia page for the date described as: "{date}".

Here is a list of available version timestamps and their IDs:
{formatted_versions}

Which ID best matches the given date? Return ONLY the ID.
"""

    resp = client.chat.complete(
        model="mistral-small-latest",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    selected_id = resp.choices[0].message.content.strip()

    # Guard against the model answering with prose instead of a bare ID:
    # only hand back values that are actually revision IDs we offered, so
    # the caller never splices garbage into an oldid= URL.
    valid_ids = {v["id"] for v in versions}
    return selected_id if selected_id in valid_ids else None
437
+