| | '''Tools for GAIA question answering agent.'''
|
| |
|
import logging
import time
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from googlesearch import search
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from smolagents import tool

from functions.tool_helper_functions import (
    libretext_book_parser,
    libretext_chapter_parser,
    save_libretext_book_as_markdown,
    WikipediaFetcher
)
|
| |
|
| |
|
# Module-level logger; configuration (handlers/level) is inherited from the host application.
logger = logging.getLogger(__name__)
|
| |
|
| |
|
@tool
def google_search(query: str) -> dict:
    """
    Perform a Google search and return the top 10 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # advanced=True makes the library yield result objects carrying
    # title/url/description rather than bare URL strings.
    hits = search(query, num_results=10, advanced=True)

    return {
        index: {
            'title': hit.title,
            'url': hit.url,
            'description': hit.description
        }
        for index, hit in enumerate(hits)
    }
|
| |
|
| |
|
@tool
def wikipedia_search(query: str) -> dict:
    """
    Perform a search for wikipedia pages and return the top 5 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'description': str}, ...}
        On HTTP failure, returns {'error': str} instead.
    """

    repo_url = 'https://github.com/gperdrizet/unit-four-final-project'

    language_code = 'en'
    number_of_results = 5

    # Wikimedia API etiquette asks for a descriptive User-Agent with a
    # point of contact.
    headers = {
        'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
    }

    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
    endpoint = '/search/page'
    url = base_url + language_code + endpoint
    parameters = {'q': query, 'limit': number_of_results}
    response = requests.get(url, headers=headers, params=parameters, timeout=15)

    # BUG FIX: the original returned a plain string on non-200 responses,
    # contradicting the declared `dict` return type. Return an error dict,
    # consistent with the other tools in this module.
    if response.status_code != 200:
        return {'error': f'Unable to retrieve page. Status code {response.status_code}'}

    results = response.json().get('pages', [])

    return {
        i: {
            'title': result.get('title', None),
            'description': result.get('description', None)
        }
        for i, result in enumerate(results)
    }
|
| |
|
| |
|
@tool
def get_wikipedia_page(query: str) -> str:
    """
    Get the content of a Wikipedia page as HTML. Use this tool when trying to
    retrieve information from a Wikipedia page or article.

    Args:
        query (str): The title of the Wikipedia page.

    Returns:
        str: The HTML content of the Wikipedia page.
    """

    # Wikipedia page titles use underscores in place of spaces.
    page_title = query.replace(' ', '_')
    fetched = WikipediaFetcher().fetch(page_title)
    content = fetched['content']

    # Truncate at the "Further reading" / "References" headings, if present,
    # so the returned HTML stays focused on the article body.
    boilerplate_markers = (
        '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>',
        '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>',
    )

    for marker in boilerplate_markers:
        content = content.partition(marker)[0]

    return content
|
| |
|
| |
|
def _find_libretext_result_divs(soup):
    """Locate search-result divs, trying selectors from most to least specific."""
    divs = soup.find_all('div', class_='mt-search-information')

    if not divs:
        divs = soup.find_all('div', class_='search-result')
    if not divs:
        divs = soup.find_all('div', class_='result')
    if not divs:
        # Last resort: take the direct children of the results container.
        container = soup.find('div', id='mt-search-spblls')
        if container:
            divs = container.find_all('div', recursive=False)

    return divs


def _parse_libretext_result_div(div):
    """Extract {'title', 'url', 'description'} from one result div, or None if no usable title."""
    title = None
    url = None
    summary = None

    title_link = div.find('a')
    if title_link:
        title = title_link.get_text(strip=True)
        url = title_link.get('href', '')

        # Make relative links absolute.
        if url and url.startswith('/'):
            url = 'https://chem.libretexts.org' + url

    # Use the first sufficiently long text element that differs from the
    # title as the summary.
    # BUG FIX: the original condition was
    #   `text and len(text) > 20 and not title or text != title`
    # which, by operator precedence, accepted ANY text differing from the
    # title (including one-word fragments) whenever a title existed.
    for element in div.find_all(['p', 'span', 'div']):
        text = element.get_text(strip=True)
        if text and len(text) > 20 and text != title:
            summary = text
            break

    # Require a non-trivial title to filter out decorative/navigation divs.
    if title and len(title) > 3:
        return {'title': title, 'url': url or '', 'description': summary or ''}

    return None


@tool
def libretext_book_search(query: str) -> dict:
    """
    Search for LibreTexts books using Selenium to handle JavaScript-rendered content.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
        On failure, returns {'error': str} instead.
    """

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = None

    try:
        driver = webdriver.Chrome(options=chrome_options)

        # BUG FIX: build the query string with urlencode() so spaces and
        # special characters (&, =, #) in the query cannot corrupt the URL;
        # the original joined raw key=value pairs by hand.
        params = {
            'qid': '',
            'fpid': '230',
            'fpth': '',
            'query': query
        }
        full_url = f"https://chem.libretexts.org/Special:Search?{urlencode(params)}"

        logger.info('Selenium search URL: %s', full_url)

        driver.get(full_url)

        wait = WebDriverWait(driver, 15)

        try:
            # Wait for the JavaScript-rendered results container to appear.
            _ = wait.until(
                EC.presence_of_element_located((By.ID, "mt-search-spblls"))
            )

            # Give the page a moment to finish populating individual results.
            time.sleep(3)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            search_info_divs = _find_libretext_result_divs(soup)

            logger.info('Found %d potential search result divs', len(search_info_divs))

            parsed_results = {}
            result_count = 0

            for div in search_info_divs:
                result = _parse_libretext_result_div(div)

                if result is not None:
                    parsed_results[result_count] = result

                    logger.debug(
                        'Extracted result %d: title="%s", url="%s"',
                        result_count,
                        result['title'],
                        result['url']
                    )

                    result_count += 1

            logger.info('Successfully extracted %d search results', len(parsed_results))
            return parsed_results

        except TimeoutException:
            logger.error('Timeout waiting for search results to load')
            return {'error': 'Timeout waiting for search results to load'}

    except WebDriverException as e:
        logger.error('WebDriver error: %s', str(e))
        return {'error': f'WebDriver error: {str(e)}'}

    except Exception as e:
        logger.error('Unexpected error in Selenium search: %s', str(e))
        return {'error': f'Unexpected error: {str(e)}'}

    finally:
        # Always release the browser, even on failure paths.
        if driver:
            try:
                driver.quit()
            except Exception as e:
                logger.warning('Error closing driver: %s', str(e))
|
| |
|
| |
|
@tool
def get_libretext_book(url: str) -> dict:
    """
    Get the complete content of a LibreTexts book including all chapters and sections.

    Args:
        url (str): The URL of the LibreTexts book page.

    Returns:
        dict: A dictionary containing the complete book structure in the following format.
        {
            'title': 'book title string',
            'chapters': {
                'Chapter title': {
                    'sections': {
                        'Section title': {
                            'Section summary': 'Section summary string',
                            'Section url': 'https://example.com/section-url',
                        },
                        ...
                    }
                },
                ...
            }
        }
    """

    logger.info('Getting complete LibreTexts book: %s', url)

    toc = libretext_book_parser(url)

    # Propagate parser failures straight back to the caller.
    if 'error' in toc:
        logger.error('Failed to parse book structure: %s', toc['error'])
        return toc

    # Derive a human-readable title from the URL slug, dropping any
    # parenthesized suffix (e.g. author/edition annotations).
    title = url.split('/')[-1].replace('%3A', ':').replace('_', ' ')
    if '(' in title:
        title = title.split('(')[0].strip()

    book = {
        'title': title,
        'chapters': {}
    }

    logger.info('Found %d chapters to process', len(toc))

    for chapter in toc.values():
        chapter_name = chapter['title']
        chapter_link = chapter['url']

        logger.info('Processing chapter: %s', chapter_name)

        section_data = libretext_chapter_parser(chapter_link)

        chapter_sections = {}
        book['chapters'][chapter_name] = {'sections': chapter_sections}

        if 'error' in section_data:
            # Record the failure in-line so the book structure stays complete.
            logger.warning('Failed to parse sections for chapter "%s": %s',
                           chapter_name, section_data['error'])
            chapter_sections['Error'] = {
                'Section summary': f"Failed to parse sections: {section_data['error']}",
                'Section url': chapter_link
            }
            continue

        for section in section_data.values():
            chapter_sections[section['title']] = {
                'Section summary': section['description'],
                'Section url': section['url']
            }

            logger.debug('Added section: %s', section['title'])

        logger.info('Successfully processed %d sections for chapter "%s"',
                    len(section_data), chapter_name)

    logger.info('Successfully compiled complete book with %d chapters',
                len(book['chapters']))

    save_libretext_book_as_markdown(book, filename=f"{title}.md", source_url=url)

    return book
|
| |
|
| |
|