Spaces:
Runtime error
Runtime error
File size: 4,684 Bytes
3adfe4f 92f38fd eb3f029 0242ef6 3adfe4f baeb823 3adfe4f 92f38fd baeb823 3adfe4f a40ea82 baeb823 3adfe4f eb3f029 92f38fd baeb823 92f38fd baeb823 92f38fd eb3f029 baeb823 eb3f029 0242ef6 eb3f029 0242ef6 eb3f029 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from langchain_core.tools import tool
import wikipediaapi
import requests
from bs4 import BeautifulSoup
from langchain_core.messages.utils import count_tokens_approximately, trim_messages
from langchain_core.messages import HumanMessage
from agent.config import MAX_TOKENS
@tool
def wiki_fetch_article(article_title: str) -> str:
    """
    Search Wikipedia for a given query and return the full page content.
    Remember that Wikipedia titles must be exact and describe the subject of the article in a general way.
    (For instance, to get "The Beatles" info including discography use "The Beatles" as the title, not "The Beatles discography")
    Args:
        article_title (str): The article's title.
    """
    # Build an API client identifying this bot per Wikipedia's user-agent policy.
    client = wikipediaapi.Wikipedia(
        user_agent='OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)',
        language='en',
    )
    article = client.page(article_title)
    # Missing pages are reported back to the agent rather than raising.
    if not article.exists():
        return f"No Wikipedia page found for '{article_title}'. Please try a different search term."
    # Full text = summary plus every section, preceded by title and canonical URL.
    return f"Title: {article.title}\n\nURL: {article.fullurl}\n\n{article.text}"
def _sanitize_html(raw_html: str) -> str:
    """Strip <style>/<script> tags and all tag attributes, keeping tag structure."""
    soup = BeautifulSoup(raw_html, 'html.parser')
    # Remove unwanted tags entirely
    for tag in soup(['style', 'script']):
        tag.decompose()
    # Strip attributes from all remaining tags (e.g., <div class=".." id=".."> -> <div>)
    from bs4.element import Tag as _Tag
    for tag in soup.find_all(True):
        if isinstance(tag, _Tag):
            tag.attrs.clear()
    return str(soup)


def _fit_to_token_budget(text: str) -> str:
    """Trim text to MAX_TOKENS using trim_messages; no-op when MAX_TOKENS is falsy.

    Fixes a bug in the original implementation, which returned None (fell off
    the function) whenever MAX_TOKENS was falsy.
    """
    if not MAX_TOKENS:
        return text
    trimmed = trim_messages(
        [HumanMessage(content=text)],
        strategy="last",
        token_counter=count_tokens_approximately,
        allow_partial=True,
        max_tokens=MAX_TOKENS,
    )
    return trimmed[0].content if trimmed else text


@tool
def wiki_parse_html(page_title: str, section_id: int | None = None) -> str:
    """
    Get Wikipedia page HTML content using the parse API.
    Use only if the standard wiki_fetch_article tool returns insufficient text for a section.
    Args:
        page_title (str): The exact title of the Wikipedia page.
        section_id (int, optional): The section ID number to parse (e.g., "1" for first section).
            If None, returns the entire page.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': 'parse',
        'page': page_title,
        'format': 'json',
        'prop': 'text',
    }
    # Add section parameter if provided
    if section_id is not None:
        params['section'] = str(section_id)
    headers = {
        'User-Agent': 'OracleBot/0.1.0 (https://github.com/abtsousa/oraclebot)'
    }
    try:
        # Timeout so a stalled connection cannot hang the agent indefinitely.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        if 'error' in data:
            return f"Error: {data['error']['info']}"
        if 'parse' not in data or 'text' not in data['parse']:
            return f"No content found for page '{page_title}'"
        # Raw HTML content from Wikipedia
        raw_html = data['parse']['text']['*']
        try:
            text = _sanitize_html(raw_html)
        except Exception:
            # Fallback to raw HTML if sanitization fails. (The original referenced
            # a possibly-unbound `text` variable here, risking a NameError.)
            text = raw_html
        return _fit_to_token_budget(text)
    except requests.RequestException as e:
        return f"Error fetching page: {str(e)}"
    except Exception as e:
        return f"Error parsing response: {str(e)}"
if __name__ == "__main__":
    query = "Malko Competition"
    # After the @tool decorator, wiki_parse_html is a StructuredTool, not a plain
    # function: direct __call__ is deprecated/removed in langchain-core, so use
    # .invoke() with the tool's argument schema.
    result = wiki_parse_html.invoke({"page_title": query})
    print(result)
|