File size: 4,684 Bytes
3adfe4f
92f38fd
eb3f029
 
0242ef6
 
 
3adfe4f
 
baeb823
3adfe4f
92f38fd
baeb823
 
3adfe4f
a40ea82
baeb823
3adfe4f
eb3f029
 
92f38fd
 
 
 
 
baeb823
92f38fd
 
 
baeb823
92f38fd
 
 
eb3f029
 
 
 
 
baeb823
eb3f029
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0242ef6
 
 
 
 
 
 
 
 
 
 
 
 
eb3f029
 
0242ef6
 
 
 
 
 
 
 
 
 
 
 
 
eb3f029
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
from langchain_core.messages.utils import count_tokens_approximately, trim_messages
from langchain_core.messages import HumanMessage
from agent.config import MAX_TOKENS

@tool
def wiki_fetch_article(article_title: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Remember that Wikipedia titles must be exact and describe the subject of the article in a general way.
    (For instance, to get "The Beatles" info including discography use "The Beatles" as the title, not "The Beatles discography")

    Args:
        article_title (str): The article's title.
    """
    # Client configured with a descriptive user agent, as required by
    # Wikipedia's API etiquette guidelines.
    client = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )

    # Look the page up by its exact title; guard-clause on a miss so the
    # success path stays flat.
    page = client.page(article_title)
    if not page.exists():
        return f"No Wikipedia page found for '{article_title}'. Please try a different search term."

    # Full article text: summary plus every section, prefixed with title and URL.
    return f"Title: {page.title}\n\nURL: {page.fullurl}\n\n{page.text}"

def _sanitize_wiki_html(raw_html: str) -> str:
    """Strip style/script tags and all attributes from Wikipedia HTML, keeping tag structure."""
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Remove unwanted tags entirely
    for tag in soup(['style', 'script']):
        tag.decompose()

    # Strip attributes from all remaining tags (e.g., <div class=".." id=".."> -> <div>)
    from bs4.element import Tag as _Tag
    for tag in soup.find_all(True):
        if isinstance(tag, _Tag):
            tag.attrs.clear()

    return str(soup)


def _trim_to_token_budget(text: str) -> str:
    """Return *text* trimmed to MAX_TOKENS (approximate count); unchanged if no budget is set."""
    if not MAX_TOKENS:
        return text
    trimmed = trim_messages(
        [HumanMessage(content=text)],
        strategy="last",
        token_counter=count_tokens_approximately,
        allow_partial=True,
        max_tokens=MAX_TOKENS,
    )
    # trim_messages may drop everything; fall back to the untrimmed text then.
    return trimmed[0].content if trimmed else text


@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_fetch_article tool returns insufficient text for a section.

    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID number to parse (e.g., "1" for first section). 
                                   If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text'
    }

    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)

    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }

    try:
        # Timeout prevents the tool from hanging the agent on a stalled connection.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()

        if 'error' in data:
            return f"Error: {data['error']['info']}"

        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"

        # Raw HTML content from Wikipedia
        raw_html = data['parse']['text']['*']

        # Sanitize the markup; fall back to the raw HTML if sanitization fails
        # (best-effort — a broken parse should not lose the content).
        try:
            text = _sanitize_wiki_html(raw_html)
        except Exception:
            text = raw_html

        # NOTE: the original code referenced the trimmed result even when
        # MAX_TOKENS was unset, raising UnboundLocalError; trimming is now
        # applied uniformly and is a no-op without a token budget.
        return _trim_to_token_budget(text)

    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"

if __name__ == "__main__":
    # Smoke test. The @tool decorator wraps the function in a BaseTool, so it
    # must be called via .invoke() with a dict of arguments — calling the tool
    # object directly with a positional string is deprecated in langchain_core
    # and bypasses argument validation.
    query = "Malko Competition"
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)