Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -629,19 +629,13 @@ def validate_answer(proposed_answer: str, original_question: str) -> str:
|
|
| 629 |
class WikipediaInput(BaseModel):
|
| 630 |
query: str = Field(description="Topic to search (e.g., 'Mercedes Sosa', 'Python programming')")
|
| 631 |
|
|
|
|
|
|
|
| 632 |
@tool(args_schema=WikipediaInput)
|
| 633 |
@retry_with_backoff(max_retries=2)
|
| 634 |
def wikipedia_search(query: str) -> str:
|
| 635 |
"""
|
| 636 |
-
Search Wikipedia
|
| 637 |
-
|
| 638 |
-
Better than search_tool for:
|
| 639 |
-
- Biographical information
|
| 640 |
-
- Historical facts
|
| 641 |
-
- Scientific concepts
|
| 642 |
-
- Counting items in lists (discography, filmography, etc.)
|
| 643 |
-
|
| 644 |
-
Returns full article sections, not just snippets.
|
| 645 |
"""
|
| 646 |
start_time = time.time()
|
| 647 |
|
|
@@ -656,71 +650,51 @@ def wikipedia_search(query: str) -> str:
|
|
| 656 |
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 657 |
return cached
|
| 658 |
|
| 659 |
-
|
|
|
|
|
|
|
| 660 |
|
| 661 |
-
|
| 662 |
-
search_url = "https://en.wikipedia.org/w/api.php"
|
| 663 |
-
search_params = {
|
| 664 |
-
'action': 'opensearch',
|
| 665 |
-
'search': query,
|
| 666 |
-
'limit': 1,
|
| 667 |
-
'namespace': 0,
|
| 668 |
-
'format': 'json'
|
| 669 |
-
}
|
| 670 |
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
|
| 675 |
-
|
| 676 |
-
result = f"No Wikipedia article found for: '{query}'"
|
| 677 |
-
search_cache.put(cache_key, result)
|
| 678 |
-
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 679 |
-
return result
|
| 680 |
|
| 681 |
-
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
| 683 |
|
| 684 |
-
|
| 685 |
-
print(f" URL: {page_url}")
|
| 686 |
|
| 687 |
-
|
| 688 |
-
content_params = {
|
| 689 |
-
'action': 'query',
|
| 690 |
-
'titles': page_title,
|
| 691 |
-
'prop': 'extracts',
|
| 692 |
-
'explaintext': True,
|
| 693 |
-
'format': 'json'
|
| 694 |
-
}
|
| 695 |
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
|
| 700 |
-
|
| 701 |
-
|
| 702 |
|
| 703 |
-
if
|
| 704 |
-
|
| 705 |
-
search_cache.put(cache_key, result)
|
| 706 |
-
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 707 |
-
return result
|
| 708 |
|
| 709 |
-
|
|
|
|
|
|
|
| 710 |
|
| 711 |
-
|
| 712 |
-
result = f"Wikipedia page found but content empty: '{page_title}'"
|
| 713 |
-
search_cache.put(cache_key, result)
|
| 714 |
-
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 715 |
-
return result
|
| 716 |
|
| 717 |
print(f" Retrieved {len(content)} chars")
|
| 718 |
|
| 719 |
# Format result
|
| 720 |
-
result = f"Wikipedia: {
|
| 721 |
-
result += f"URL: {
|
| 722 |
result += content
|
| 723 |
-
result = truncate_if_needed(result, max_length=12000)
|
| 724 |
|
| 725 |
# Cache result
|
| 726 |
search_cache.put(cache_key, result)
|
|
@@ -730,7 +704,7 @@ def wikipedia_search(query: str) -> str:
|
|
| 730 |
|
| 731 |
except Exception as e:
|
| 732 |
telemetry.record_call("wikipedia_search", time.time() - start_time, False)
|
| 733 |
-
raise ToolError("wikipedia_search", e, "Try
|
| 734 |
|
| 735 |
|
| 736 |
class SearchInput(BaseModel):
|
|
|
|
| 629 |
class WikipediaInput(BaseModel):
|
| 630 |
query: str = Field(description="Topic to search (e.g., 'Mercedes Sosa', 'Python programming')")
|
| 631 |
|
| 632 |
+
# Wikipedia lookup tool: scrapes the article HTML page directly (the API is blocked on HuggingFace).
|
| 633 |
+
|
| 634 |
@tool(args_schema=WikipediaInput)
|
| 635 |
@retry_with_backoff(max_retries=2)
|
| 636 |
def wikipedia_search(query: str) -> str:
|
| 637 |
"""
|
| 638 |
+
Search Wikipedia by scraping (API blocked on HuggingFace).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
"""
|
| 640 |
start_time = time.time()
|
| 641 |
|
|
|
|
| 650 |
telemetry.record_call("wikipedia_search", time.time() - start_time, True)
|
| 651 |
return cached
|
| 652 |
|
| 653 |
+
# Build direct Wikipedia URL
|
| 654 |
+
wiki_title = query.replace(' ', '_')
|
| 655 |
+
wiki_url = f"https://en.wikipedia.org/wiki/{wiki_title}"
|
| 656 |
|
| 657 |
+
print(f" Trying: {wiki_url}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
|
| 659 |
+
headers = {
|
| 660 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
| 661 |
+
}
|
| 662 |
|
| 663 |
+
response = requests.get(wiki_url, headers=headers, timeout=10)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
|
| 665 |
+
# If 404, try search page
|
| 666 |
+
if response.status_code == 404:
|
| 667 |
+
search_url = f"https://en.wikipedia.org/w/index.php?search={query.replace(' ', '+')}"
|
| 668 |
+
print(f" 404, trying search: {search_url}")
|
| 669 |
+
response = requests.get(search_url, headers=headers, timeout=10)
|
| 670 |
|
| 671 |
+
response.raise_for_status()
|
|
|
|
| 672 |
|
| 673 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
|
| 675 |
+
# Get title
|
| 676 |
+
title_elem = soup.find('h1', {'id': 'firstHeading'})
|
| 677 |
+
title = title_elem.get_text() if title_elem else query
|
| 678 |
|
| 679 |
+
# Get main content
|
| 680 |
+
content_div = soup.find('div', {'class': 'mw-parser-output'})
|
| 681 |
|
| 682 |
+
if not content_div:
|
| 683 |
+
raise ValueError("No content found on Wikipedia page")
|
|
|
|
|
|
|
|
|
|
| 684 |
|
| 685 |
+
# Remove unwanted elements
|
| 686 |
+
for tag in content_div(['script', 'style', 'table', 'sup', 'span.reference']):
|
| 687 |
+
tag.extract()
|
| 688 |
|
| 689 |
+
content = content_div.get_text(separator='\n', strip=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 690 |
|
| 691 |
print(f" Retrieved {len(content)} chars")
|
| 692 |
|
| 693 |
# Format result
|
| 694 |
+
result = f"Wikipedia: {title}\n"
|
| 695 |
+
result += f"URL: {response.url}\n\n"
|
| 696 |
result += content
|
| 697 |
+
result = truncate_if_needed(result, max_length=12000)
|
| 698 |
|
| 699 |
# Cache result
|
| 700 |
search_cache.put(cache_key, result)
|
|
|
|
| 704 |
|
| 705 |
except Exception as e:
|
| 706 |
telemetry.record_call("wikipedia_search", time.time() - start_time, False)
|
| 707 |
+
raise ToolError("wikipedia_search", e, "Try using search_tool instead")
|
| 708 |
|
| 709 |
|
| 710 |
class SearchInput(BaseModel):
|