"""Tools for GAIA question answering agent."""

import time
import logging

import requests
from smolagents import tool
from googlesearch import search
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException

from functions.tool_helper_functions import (
    libretext_book_parser,
    libretext_chapter_parser,
    save_libretext_book_as_markdown,
    WikipediaFetcher
)

# Get logger for this module
logger = logging.getLogger(__name__)


@tool
def google_search(query: str) -> dict:
    """
    Perform a Google search and return the top 10 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Run the query
    results = list(search(query, num_results=10, advanced=True))

    # Parse and format the results
    parsed_results = {}

    for i, result in enumerate(results):
        parsed_results[i] = {
            'title': result.title,
            'url': result.url,
            'description': result.description
        }

    return parsed_results
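
# Illustrative usage sketch (kept as a comment so it never runs at import
# time); the query and result values are hypothetical:
#
#   results = google_search('GAIA benchmark')
#   results[0]  # -> {'title': '...', 'url': 'https://...', 'description': '...'}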


@tool
def wikipedia_search(query: str) -> dict:
    """
    Perform a search for Wikipedia pages and return the top 5 results.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'description': str}, ...}
    """

    repo_url = 'https://github.com/gperdrizet/unit-four-final-project'
    language_code = 'en'
    number_of_results = 5

    headers = {
        'User-Agent': f'HuggingFace Agents course final project ({repo_url})'
    }

    base_url = 'https://api.wikimedia.org/core/v1/wikipedia/'
    endpoint = '/search/page'
    url = base_url + language_code + endpoint
    parameters = {'q': query, 'limit': number_of_results}

    response = requests.get(url, headers=headers, params=parameters, timeout=15)

    if response.status_code != 200:
        return {'error': f'Unable to retrieve search results. Status code {response.status_code}'}

    results = response.json().get('pages', [])
    parsed_results = {}

    for i, result in enumerate(results):
        parsed_results[i] = {
            'title': result.get('title', None),
            'description': result.get('description', None)
        }

    return parsed_results
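
# Illustrative usage sketch; the query string is a hypothetical example and
# the call needs network access to the Wikimedia REST API:
#
#   pages = wikipedia_search('Python (programming language)')
#   pages[0]  # -> {'title': 'Python (programming language)', 'description': '...'}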


@tool
def get_wikipedia_page(query: str) -> str:
    """
    Get the content of a Wikipedia page as HTML. Use this tool when trying to
    retrieve information from a Wikipedia page or article.

    Args:
        query (str): The title of the Wikipedia page.

    Returns:
        str: The HTML content of the Wikipedia page.
    """

    fetcher = WikipediaFetcher()
    html_result = fetcher.fetch(query.replace(' ', '_'))
    content = html_result['content']

    # Truncate the page at the 'Further reading' and 'References' headings,
    # if present, to drop trailing material the agent does not need
    content = content.split(
        '<div class="mw-heading mw-heading2"><h2 id="Further_reading">Further reading</h2></div>'
    )[0]

    content = content.split(
        '<div class="mw-heading mw-heading2"><h2 id="References">References</h2></div>'
    )[0]

    return content
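
# Illustrative usage sketch; spaces in the title are converted to the
# underscore form Wikipedia expects before fetching:
#
#   html = get_wikipedia_page('Alan Turing')  # fetches the 'Alan_Turing' page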


@tool
def libretext_book_search(query: str) -> dict:
    """
    Search for LibreTexts books using Selenium to handle JavaScript-rendered content.

    Args:
        query (str): The search query.

    Returns:
        dict: A dictionary containing the search results in the following format.
        {0: {'title': str, 'url': str, 'description': str}, ...}
    """

    # Configure Chrome options for headless mode
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

    driver = None

    try:
        # Initialize the Chrome driver
        driver = webdriver.Chrome(options=chrome_options)

        # Construct search URL
        search_url = 'https://chem.libretexts.org/Special:Search'
        params = {
            'qid': '',
            'fpid': '230',
            'fpth': '',
            'query': query
        }

        # Build URL with parameters
        param_string = '&'.join([f"{k}={v}" for k, v in params.items()])
        full_url = f"{search_url}?{param_string}"
        logger.info('Selenium search URL: %s', full_url)

        # Navigate to the search page
        driver.get(full_url)

        # Wait for either search results or an indication that search is complete
        wait = WebDriverWait(driver, 15)

        try:
            # Wait for the search results container to be present
            _ = wait.until(
                EC.presence_of_element_located((By.ID, "mt-search-spblls"))
            )

            # Give additional time for JavaScript to populate results
            time.sleep(3)

            # Get the page source after JavaScript execution
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Look for search results using multiple possible selectors
            search_info_divs = soup.find_all('div', class_='mt-search-information')

            # If no results with that class, try other common search result patterns
            if not search_info_divs:
                # Try alternative selectors that might be used for search results
                search_info_divs = soup.find_all('div', class_='search-result')

            if not search_info_divs:
                search_info_divs = soup.find_all('div', class_='result')

            if not search_info_divs:
                # Look for any divs within the search results container
                results_container = soup.find('div', id='mt-search-spblls')

                if results_container:
                    search_info_divs = results_container.find_all('div', recursive=False)

            logger.info('Found %d potential search result divs', len(search_info_divs))

            # Parse the search results
            parsed_results = {}
            result_count = 0

            for div in search_info_divs:

                # Try to extract title and URL from various possible structures
                title = None
                url = None
                summary = None

                # Look for title in anchor tags
                title_link = div.find('a')

                if title_link:
                    title = title_link.get_text(strip=True)
                    url = title_link.get('href', '')

                    # Make URL absolute if it's relative
                    if url and url.startswith('/'):
                        url = 'https://chem.libretexts.org' + url

                # Look for description/summary text; try multiple possible elements
                text_elements = div.find_all(['p', 'span', 'div'])

                for element in text_elements:
                    text = element.get_text(strip=True)

                    # Take the first substantial text that is not the title itself
                    if text and len(text) > 20 and (not title or text != title):
                        summary = text
                        break

                # Only add to results if we have at least a meaningful title
                if title and len(title) > 3:
                    parsed_results[result_count] = {
                        'title': title,
                        'url': url or '',
                        'description': summary or ''
                    }

                    logger.debug(
                        'Extracted result %d: title="%s", url="%s"',
                        result_count,
                        title,
                        url
                    )

                    result_count += 1

            logger.info('Successfully extracted %d search results', len(parsed_results))
            return parsed_results

        except TimeoutException:
            logger.error('Timeout waiting for search results to load')
            return {'error': 'Timeout waiting for search results to load'}

    except WebDriverException as e:
        logger.error('WebDriver error: %s', str(e))
        return {'error': f'WebDriver error: {str(e)}'}

    except Exception as e:  # pylint:disable=broad-exception-caught
        logger.error('Unexpected error in Selenium search: %s', str(e))
        return {'error': f'Unexpected error: {str(e)}'}

    finally:
        # Always clean up the driver
        if driver:
            try:
                driver.quit()
            except Exception as e:  # pylint:disable=broad-exception-caught
                logger.warning('Error closing driver: %s', str(e))
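
# Illustrative usage sketch; needs a working local Chrome/chromedriver
# install and returns an {'error': ...} dictionary instead of raising:
#
#   hits = libretext_book_search('introductory chemistry')
#   # -> {0: {'title': '...', 'url': 'https://chem.libretexts.org/...',
#   #         'description': '...'}, ...}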


@tool
def get_libretext_book(url: str) -> dict:
    """
    Get the complete content of a LibreTexts book including all chapters and sections.

    Args:
        url (str): The URL of the LibreTexts book page.

    Returns:
        dict: A dictionary containing the complete book structure in the following format.
        {
            'title': 'book title string',
            'chapters': {
                'Chapter title': {
                    'sections': {
                        'Section title': {
                            'Section summary': 'Section summary string',
                            'Section url': 'https://example.com/section-url',
                        },
                        ...
                    }
                },
                ...
            }
        }
    """

    logger.info('Getting complete LibreTexts book: %s', url)

    # First, get the book structure (chapters)
    book_data = libretext_book_parser(url)

    if 'error' in book_data:
        logger.error('Failed to parse book structure: %s', book_data['error'])
        return book_data

    # Extract the book title from the URL
    book_title = url.split('/')[-1].replace('%3A', ':').replace('_', ' ')

    if '(' in book_title:
        book_title = book_title.split('(')[0].strip()

    # Initialize the complete book structure
    complete_book = {
        'title': book_title,
        'chapters': {}
    }

    logger.info('Found %d chapters to process', len(book_data))

    # Process each chapter
    for chapter_info in book_data.values():
        chapter_title = chapter_info['title']
        chapter_url = chapter_info['url']
        logger.info('Processing chapter: %s', chapter_title)

        # Get sections for this chapter
        sections_data = libretext_chapter_parser(chapter_url)

        # Initialize chapter structure
        complete_book['chapters'][chapter_title] = {
            'sections': {}
        }

        if 'error' in sections_data:
            logger.warning('Failed to parse sections for chapter "%s": %s',
                           chapter_title, sections_data['error'])

            complete_book['chapters'][chapter_title]['sections']['Error'] = {
                'Section summary': f"Failed to parse sections: {sections_data['error']}",
                'Section url': chapter_url
            }

        else:
            # Process each section
            for section_info in sections_data.values():
                section_title = section_info['title']
                section_url = section_info['url']
                section_description = section_info['description']

                complete_book['chapters'][chapter_title]['sections'][section_title] = {
                    'Section summary': section_description,
                    'Section url': section_url
                }

                logger.debug('Added section: %s', section_title)

            logger.info('Successfully processed %d sections for chapter "%s"',
                        len(sections_data), chapter_title)

    logger.info('Successfully compiled complete book with %d chapters',
                len(complete_book['chapters']))

    # Save the assembled book to disk as markdown for later reference
    save_libretext_book_as_markdown(complete_book, filename=f"{book_title}.md", source_url=url)

    return complete_book
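

# Minimal manual smoke test: a sketch for local debugging only, assuming
# network access; the queries are hypothetical examples, not GAIA questions
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    print(wikipedia_search('GAIA benchmark'))
    print(google_search('LibreTexts chemistry'))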