Vladyslav Khaitov committed on
Commit
8e26419
·
1 Parent(s): e354c90

Add a more readable version of Wikipedia pages when visiting a web page (live or archived)

Browse files
Files changed (1) hide show
  1. tools/text_web_browser.py +33 -13
tools/text_web_browser.py CHANGED
@@ -13,6 +13,7 @@ from urllib.parse import unquote, urljoin, urlparse
13
 
14
  import pathvalidate
15
  import requests
 
16
  # from serpapi import GoogleSearch
17
 
18
  from markdownify import markdownify
@@ -471,10 +472,19 @@ class VisitTool(Tool):
471
  self.browser = browser
472
 
473
  def forward(self, url: str) -> str:
474
- self.browser.visit_page(url)
475
- header, content = self.browser._state()
476
- # return header.strip() + "\n=======================\n" + content
477
- return markdownify(content)
 
 
 
 
 
 
 
 
 
478
 
479
 
480
  class DownloadTool(Tool):
@@ -545,15 +555,25 @@ class ArchiveSearchTool(Tool):
545
  else:
546
  raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
547
  target_url = closest["url"]
548
- self.browser.visit_page(target_url)
549
- header, content = self.browser._state()
550
- # return (
551
- # f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
552
- # + header.strip()
553
- # + "\n=======================\n"
554
- # + content
555
- # )
556
- return markdownify(content)
 
 
 
 
 
 
 
 
 
 
557
 
558
 
559
  class PageUpTool(Tool):
 
13
 
14
  import pathvalidate
15
  import requests
16
+ import trafilatura
17
  # from serpapi import GoogleSearch
18
 
19
  from markdownify import markdownify
 
472
  self.browser = browser
473
 
474
  def forward(self, url: str) -> str:
475
+ if 'wikipedia.org/wiki/' in url:
476
+ downloaded = trafilatura.fetch_url(url)
477
+ if downloaded is None:
478
+ return "Failed to fetch the Wikipedia page content."
479
+ extracted = trafilatura.extract(downloaded)
480
+ if not extracted:
481
+ return "Failed to extract main content from the Wikipedia page."
482
+ return extracted
483
+ else:
484
+ self.browser.visit_page(url)
485
+ header, content = self.browser._state()
486
+ # return header.strip() + "\n=======================\n" + content
487
+ return markdownify(content)
488
 
489
 
490
  class DownloadTool(Tool):
 
555
  else:
556
  raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
557
  target_url = closest["url"]
558
+
559
+ if 'wikipedia.org/wiki/' in target_url:
560
+ downloaded = trafilatura.fetch_url(target_url)
561
+ if downloaded is None:
562
+ return "Failed to fetch the Wikipedia page content."
563
+ extracted = trafilatura.extract(downloaded)
564
+ if not extracted:
565
+ return "Failed to extract main content from the Wikipedia page."
566
+ return extracted
567
+ else:
568
+ self.browser.visit_page(target_url)
569
+ header, content = self.browser._state()
570
+ # return (
571
+ # f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
572
+ # + header.strip()
573
+ # + "\n=======================\n"
574
+ # + content
575
+ # )
576
+ return markdownify(content)
577
 
578
 
579
  class PageUpTool(Tool):