gabejavitt committed on
Commit
c47deb7
·
verified ·
1 Parent(s): e6af6ca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -59
app.py CHANGED
@@ -629,19 +629,13 @@ def validate_answer(proposed_answer: str, original_question: str) -> str:
629
  class WikipediaInput(BaseModel):
630
  query: str = Field(description="Topic to search (e.g., 'Mercedes Sosa', 'Python programming')")
631
 
 
 
632
  @tool(args_schema=WikipediaInput)
633
  @retry_with_backoff(max_retries=2)
634
  def wikipedia_search(query: str) -> str:
635
  """
636
- Search Wikipedia with automatic page retrieval.
637
-
638
- Better than search_tool for:
639
- - Biographical information
640
- - Historical facts
641
- - Scientific concepts
642
- - Counting items in lists (discography, filmography, etc.)
643
-
644
- Returns full article sections, not just snippets.
645
  """
646
  start_time = time.time()
647
 
@@ -656,71 +650,51 @@ def wikipedia_search(query: str) -> str:
656
  telemetry.record_call("wikipedia_search", time.time() - start_time, True)
657
  return cached
658
 
659
- import requests
 
 
660
 
661
- # Step 1: Search for page
662
- search_url = "https://en.wikipedia.org/w/api.php"
663
- search_params = {
664
- 'action': 'opensearch',
665
- 'search': query,
666
- 'limit': 1,
667
- 'namespace': 0,
668
- 'format': 'json'
669
- }
670
 
671
- response = requests.get(search_url, params=search_params, timeout=10)
672
- response.raise_for_status()
673
- search_results = response.json()
674
 
675
- if not search_results[1]: # No results
676
- result = f"No Wikipedia article found for: '{query}'"
677
- search_cache.put(cache_key, result)
678
- telemetry.record_call("wikipedia_search", time.time() - start_time, True)
679
- return result
680
 
681
- page_title = search_results[1][0]
682
- page_url = search_results[3][0]
 
 
 
683
 
684
- print(f" Found: {page_title}")
685
- print(f" URL: {page_url}")
686
 
687
- # Step 2: Get full page content
688
- content_params = {
689
- 'action': 'query',
690
- 'titles': page_title,
691
- 'prop': 'extracts',
692
- 'explaintext': True,
693
- 'format': 'json'
694
- }
695
 
696
- response = requests.get(search_url, params=content_params, timeout=10)
697
- response.raise_for_status()
698
- data = response.json()
699
 
700
- pages = data['query']['pages']
701
- page_id = list(pages.keys())[0]
702
 
703
- if page_id == '-1':
704
- result = f"Wikipedia page not found: '{query}'"
705
- search_cache.put(cache_key, result)
706
- telemetry.record_call("wikipedia_search", time.time() - start_time, True)
707
- return result
708
 
709
- content = pages[page_id].get('extract', '')
 
 
710
 
711
- if not content:
712
- result = f"Wikipedia page found but content empty: '{page_title}'"
713
- search_cache.put(cache_key, result)
714
- telemetry.record_call("wikipedia_search", time.time() - start_time, True)
715
- return result
716
 
717
  print(f" Retrieved {len(content)} chars")
718
 
719
  # Format result
720
- result = f"Wikipedia: {page_title}\n"
721
- result += f"URL: {page_url}\n\n"
722
  result += content
723
- result = truncate_if_needed(result, max_length=12000) # Allow more for Wikipedia
724
 
725
  # Cache result
726
  search_cache.put(cache_key, result)
@@ -730,7 +704,7 @@ def wikipedia_search(query: str) -> str:
730
 
731
  except Exception as e:
732
  telemetry.record_call("wikipedia_search", time.time() - start_time, False)
733
- raise ToolError("wikipedia_search", e, "Try a more specific search term")
734
 
735
 
736
  class SearchInput(BaseModel):
 
629
  class WikipediaInput(BaseModel):
630
  query: str = Field(description="Topic to search (e.g., 'Mercedes Sosa', 'Python programming')")
631
 
632
+ # Scraping-based wikipedia_search (the Wikipedia API is blocked on HuggingFace Spaces)
633
+
634
  @tool(args_schema=WikipediaInput)
635
  @retry_with_backoff(max_retries=2)
636
  def wikipedia_search(query: str) -> str:
637
  """
638
+ Search Wikipedia by scraping (API blocked on HuggingFace).
 
 
 
 
 
 
 
 
639
  """
640
  start_time = time.time()
641
 
 
650
  telemetry.record_call("wikipedia_search", time.time() - start_time, True)
651
  return cached
652
 
653
+ # Build direct Wikipedia URL
654
+ wiki_title = query.replace(' ', '_')
655
+ wiki_url = f"https://en.wikipedia.org/wiki/{wiki_title}"
656
 
657
+ print(f" Trying: {wiki_url}")
 
 
 
 
 
 
 
 
658
 
659
+ headers = {
660
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
661
+ }
662
 
663
+ response = requests.get(wiki_url, headers=headers, timeout=10)
 
 
 
 
664
 
665
+ # If 404, try search page
666
+ if response.status_code == 404:
667
+ search_url = f"https://en.wikipedia.org/w/index.php?search={query.replace(' ', '+')}"
668
+ print(f" 404, trying search: {search_url}")
669
+ response = requests.get(search_url, headers=headers, timeout=10)
670
 
671
+ response.raise_for_status()
 
672
 
673
+ soup = BeautifulSoup(response.text, 'html.parser')
 
 
 
 
 
 
 
674
 
675
+ # Get title
676
+ title_elem = soup.find('h1', {'id': 'firstHeading'})
677
+ title = title_elem.get_text() if title_elem else query
678
 
679
+ # Get main content
680
+ content_div = soup.find('div', {'class': 'mw-parser-output'})
681
 
682
+ if not content_div:
683
+ raise ValueError("No content found on Wikipedia page")
 
 
 
684
 
685
+ # Remove unwanted elements
686
+ for tag in content_div(['script', 'style', 'table', 'sup', 'span.reference']):
687
+ tag.extract()
688
 
689
+ content = content_div.get_text(separator='\n', strip=True)
 
 
 
 
690
 
691
  print(f" Retrieved {len(content)} chars")
692
 
693
  # Format result
694
+ result = f"Wikipedia: {title}\n"
695
+ result += f"URL: {response.url}\n\n"
696
  result += content
697
+ result = truncate_if_needed(result, max_length=12000)
698
 
699
  # Cache result
700
  search_cache.put(cache_key, result)
 
704
 
705
  except Exception as e:
706
  telemetry.record_call("wikipedia_search", time.time() - start_time, False)
707
+ raise ToolError("wikipedia_search", e, "Try using search_tool instead")
708
 
709
 
710
  class SearchInput(BaseModel):