Payam75 committed on
Commit
cb1827c
·
verified ·
1 Parent(s): eb81514

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +13 -41
utils.py CHANGED
@@ -1,41 +1,13 @@
1
- import re
2
- import requests
3
- from bs4 import BeautifulSoup
4
- import html2text
5
- from typing import Tuple, Optional
6
-
7
- USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
8
-
9
- def normalize_url(url: str) -> str:
10
- url = url.strip()
11
- if not url.startswith("http"):
12
- url = "https://" + url
13
- return url
14
-
15
- def fetch_html(url: str, timeout=15) -> str:
16
- headers = {"User-Agent": USER_AGENT}
17
- resp = requests.get(url, headers=headers, timeout=timeout)
18
- resp.raise_for_status()
19
- return resp.text
20
-
21
- def soup_cleanup(soup: BeautifulSoup, remove_images=False) -> None:
22
- for tag in soup(["script", "style", "noscript", "iframe", "header", "footer", "svg"]):
23
- tag.decompose()
24
- if remove_images:
25
- for img in soup.find_all("img"):
26
- img.decompose()
27
-
28
- def html_to_markdown(html: str) -> str:
29
- h = html2text.HTML2Text()
30
- h.ignore_links = False
31
- h.body_width = 0
32
- return h.handle(html)
33
-
34
- def scrape_with_bs(url: str, remove_images=False) -> Tuple[str, Optional[str]]:
35
- html = fetch_html(url)
36
- soup = BeautifulSoup(html, "html.parser")
37
- soup_cleanup(soup, remove_images=remove_images)
38
- title_tag = soup.find("title")
39
- title = title_tag.get_text(strip=True) if title_tag else None
40
- md = html_to_markdown(str(soup))
41
- return md, title
 
1
+ import subprocess, sys
2
+
3
+ def ensure_package(package_name: str, import_name: str = None):
4
+ """
5
+ Ensures a package is installed at runtime.
6
+ If missing, installs it via pip.
7
+ """
8
+ import importlib
9
+ try:
10
+ return importlib.import_module(import_name or package_name)
11
+ except ImportError:
12
+ subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
13
+ return importlib.import_module(import_name or package_name)