Payam75 committed on
Commit
eb81514
·
verified ·
1 Parent(s): e600655

Create scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +35 -0
scraper.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils import ensure_package
2
+
3
+ # Ensure runtime dependencies
4
+ requests = ensure_package("requests")
5
+ bs4 = ensure_package("beautifulsoup4", "bs4")
6
+ mistune = ensure_package("mistune")
7
+
8
def scrape_to_markdown(url: str) -> "tuple[str, str]":  # quoted so it parses on any Python 3
    """
    Fetch a webpage and convert its main text elements to Markdown.

    Only ``h1``-``h3``, ``p`` and ``li`` elements are collected, in document
    order. Headings become ``#``-prefixed lines (one ``#`` per heading level),
    list items become ``- `` bullets, and paragraphs are emitted as plain
    text. Elements whose text is empty after stripping are skipped so they do
    not produce empty headings, bullets or blank blocks.

    Args:
        url: Address of the page to download (10-second request timeout).

    Returns:
        A ``(md_text, html_preview)`` pair: the generated Markdown, and the
        HTML that mistune renders from that Markdown.

    Raises:
        requests.HTTPError: If the response carries a 4xx/5xx status
            (raised by ``response.raise_for_status()``).
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Local import; presumably relies on the module-level ensure_package call
    # having made bs4 importable -- verify against utils.ensure_package.
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract text from the main content tags
    elements = []
    for tag in soup.find_all(["h1", "h2", "h3", "p", "li"]):
        text = tag.get_text(strip=True)
        if not text:
            # Nothing visible in this tag; emitting it would only add noise.
            continue
        if tag.name == "li":
            elements.append(f"- {text}")
        elif tag.name == "p":
            elements.append(text)
        else:
            # Remaining tags are h1/h2/h3; the digit is the heading level.
            level = int(tag.name[1])
            elements.append(f"{'#' * level} {text}")

    # Blank line between blocks so each one is parsed as its own element.
    md_text = "\n\n".join(elements)

    # Convert markdown to HTML for preview
    markdown_parser = mistune.create_markdown(renderer=mistune.HTMLRenderer())
    html_preview = markdown_parser(md_text)

    return md_text, html_preview