Spaces:

Payam75
/

WEB2MARKDOWN

Sleeping

Payam75 committed on Aug 23, 2025

Commit

cb1827c

verified ·

1 Parent(s): eb81514

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -1,41 +1,13 @@
-import re
-import requests
-from bs4 import BeautifulSoup
-import html2text
-from typing import Tuple, Optional
-USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
-def normalize_url(url: str) -> str:
-    url = url.strip()
-    if not url.startswith("http"):
-        url = "https://" + url
-    return url
-def fetch_html(url: str, timeout=15) -> str:
-    headers = {"User-Agent": USER_AGENT}
-    resp = requests.get(url, headers=headers, timeout=timeout)
-    resp.raise_for_status()
-    return resp.text
-def soup_cleanup(soup: BeautifulSoup, remove_images=False) -> None:
-    for tag in soup(["script", "style", "noscript", "iframe", "header", "footer", "svg"]):
-        tag.decompose()
-    if remove_images:
-        for img in soup.find_all("img"):
-            img.decompose()
-def html_to_markdown(html: str) -> str:
-    h = html2text.HTML2Text()
-    h.ignore_links = False
-    h.body_width = 0
-    return h.handle(html)
-def scrape_with_bs(url: str, remove_images=False) -> Tuple[str, Optional[str]]:
-    html = fetch_html(url)
-    soup = BeautifulSoup(html, "html.parser")
-    soup_cleanup(soup, remove_images=remove_images)
-    title_tag = soup.find("title")
-    title = title_tag.get_text(strip=True) if title_tag else None
-    md = html_to_markdown(str(soup))
-    return md, title

+import subprocess, sys
+def ensure_package(package_name: str, import_name: str = None):
+    """
+    Ensures a package is installed at runtime.
+    If missing, installs it via pip.
+    """
+    import importlib
+    try:
+        return importlib.import_module(import_name or package_name)
+    except ImportError:
+        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+        return importlib.import_module(import_name or package_name)