Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -1,41 +1,13 @@
|
|
| 1 |
-
import
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def fetch_html(url: str, timeout=15) -> str:
    """Download *url* with a GET request and return the response body as text.

    The request carries the module-level ``USER_AGENT`` value as its
    ``User-Agent`` header and is bounded by *timeout*. ``raise_for_status()``
    is called on the response before the text is returned, so an error
    status propagates as an exception instead of yielding an error page.
    """
    response = requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text
|
| 20 |
-
|
| 21 |
-
def soup_cleanup(soup: BeautifulSoup, remove_images=False) -> None:
    """Remove non-content elements from *soup* in place.

    Every ``script``, ``style``, ``noscript``, ``iframe``, ``header``,
    ``footer`` and ``svg`` element is decomposed. When *remove_images* is
    true, every ``img`` element is decomposed as well. Nothing is returned;
    the tree passed in is modified directly.
    """
    unwanted = ["script", "style", "noscript", "iframe", "header", "footer", "svg"]
    for element in soup(unwanted):
        element.decompose()

    # Images are only stripped on request.
    if not remove_images:
        return

    for image in soup.find_all("img"):
        image.decompose()
|
| 27 |
-
|
| 28 |
-
def html_to_markdown(html: str) -> str:
    """Convert an HTML string to Markdown using ``html2text``.

    A fresh ``HTML2Text`` converter is configured on every call with
    ``ignore_links`` turned off and ``body_width`` set to 0, then asked to
    handle *html*; its result is returned unchanged.
    """
    converter = html2text.HTML2Text()
    # NOTE(review): body_width = 0 presumably disables line wrapping --
    # confirm against the html2text documentation.
    converter.body_width = 0
    converter.ignore_links = False
    return converter.handle(html)
|
| 33 |
-
|
| 34 |
-
def scrape_with_bs(url: str, remove_images=False) -> Tuple[str, Optional[str]]:
    """Fetch *url*, clean its HTML, and return ``(markdown, title)``.

    The page is downloaded with ``fetch_html``, parsed with the
    ``html.parser`` backend, and cleaned with ``soup_cleanup`` (forwarding
    *remove_images*). The title is the stripped text of the ``<title>``
    element looked up after cleanup, or ``None`` when that lookup yields
    nothing. The Markdown is produced by ``html_to_markdown`` from the
    cleaned tree.
    """
    soup = BeautifulSoup(fetch_html(url), "html.parser")
    soup_cleanup(soup, remove_images=remove_images)

    title_node = soup.find("title")
    if title_node:
        title = title_node.get_text(strip=True)
    else:
        title = None

    return html_to_markdown(str(soup)), title
|
|
|
|
| 1 |
+
import subprocess, sys
|
| 2 |
+
|
| 3 |
+
def ensure_package(package_name: str, import_name: str = None):
    """Import a module, installing it with pip first if the import fails.

    Args:
        package_name: Name passed to ``pip install`` when the import fails.
        import_name: Module name to import. Falls back to *package_name*
            when omitted or empty, for distributions whose import name
            matches the name used to install them.

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If the ``pip install`` command exits
            with a non-zero status.
        ImportError: If the module still cannot be imported after the
            install step.
    """
    import importlib

    module_name = import_name or package_name
    try:
        return importlib.import_module(module_name)
    except ImportError:
        # Run pip through the current interpreter so the package lands in
        # the environment this process is actually using.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
        # The import system caches directory listings; files written by the
        # pip subprocess may be invisible until those caches are dropped.
        importlib.invalidate_caches()
        return importlib.import_module(module_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|