Spaces:
Sleeping
Sleeping
Vladyslav Khaitov committed on
Commit ·
8e26419
1
Parent(s): e354c90
Add a more readable version of Wikipedia pages when visiting a web page (live or archived)
Browse files
- tools/text_web_browser.py +33 -13
tools/text_web_browser.py
CHANGED
|
@@ -13,6 +13,7 @@ from urllib.parse import unquote, urljoin, urlparse
|
|
| 13 |
|
| 14 |
import pathvalidate
|
| 15 |
import requests
|
|
|
|
| 16 |
# from serpapi import GoogleSearch
|
| 17 |
|
| 18 |
from markdownify import markdownify
|
|
@@ -471,10 +472,19 @@ class VisitTool(Tool):
|
|
| 471 |
self.browser = browser
|
| 472 |
|
| 473 |
def forward(self, url: str) -> str:
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 478 |
|
| 479 |
|
| 480 |
class DownloadTool(Tool):
|
|
@@ -545,15 +555,25 @@ class ArchiveSearchTool(Tool):
|
|
| 545 |
else:
|
| 546 |
raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
|
| 547 |
target_url = closest["url"]
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
|
| 558 |
|
| 559 |
class PageUpTool(Tool):
|
|
|
|
| 13 |
|
| 14 |
import pathvalidate
|
| 15 |
import requests
|
| 16 |
+
import trafilatura
|
| 17 |
# from serpapi import GoogleSearch
|
| 18 |
|
| 19 |
from markdownify import markdownify
|
|
|
|
| 472 |
self.browser = browser
|
| 473 |
|
| 474 |
def forward(self, url: str) -> str:
|
| 475 |
+
if 'wikipedia.org/wiki/' in url:
|
| 476 |
+
downloaded = trafilatura.fetch_url(url)
|
| 477 |
+
if downloaded is None:
|
| 478 |
+
return "Failed to fetch the Wikipedia page content."
|
| 479 |
+
extracted = trafilatura.extract(downloaded)
|
| 480 |
+
if not extracted:
|
| 481 |
+
return "Failed to extract main content from the Wikipedia page."
|
| 482 |
+
return extracted
|
| 483 |
+
else:
|
| 484 |
+
self.browser.visit_page(url)
|
| 485 |
+
header, content = self.browser._state()
|
| 486 |
+
# return header.strip() + "\n=======================\n" + content
|
| 487 |
+
return markdownify(content)
|
| 488 |
|
| 489 |
|
| 490 |
class DownloadTool(Tool):
|
|
|
|
| 555 |
else:
|
| 556 |
raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
|
| 557 |
target_url = closest["url"]
|
| 558 |
+
|
| 559 |
+
if 'wikipedia.org/wiki/' in target_url:
|
| 560 |
+
downloaded = trafilatura.fetch_url(target_url)
|
| 561 |
+
if downloaded is None:
|
| 562 |
+
return "Failed to fetch the Wikipedia page content."
|
| 563 |
+
extracted = trafilatura.extract(downloaded)
|
| 564 |
+
if not extracted:
|
| 565 |
+
return "Failed to extract main content from the Wikipedia page."
|
| 566 |
+
return extracted
|
| 567 |
+
else:
|
| 568 |
+
self.browser.visit_page(target_url)
|
| 569 |
+
header, content = self.browser._state()
|
| 570 |
+
# return (
|
| 571 |
+
# f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
|
| 572 |
+
# + header.strip()
|
| 573 |
+
# + "\n=======================\n"
|
| 574 |
+
# + content
|
| 575 |
+
# )
|
| 576 |
+
return markdownify(content)
|
| 577 |
|
| 578 |
|
| 579 |
class PageUpTool(Tool):
|