# Page-extraction residue (not part of the program): Spaces: Sleeping; file size: 1,143 bytes; eb81514.
from utils import ensure_package
# Ensure runtime dependencies.
# NOTE(review): ensure_package comes from the project-local utils module and is
# not shown here; presumably it imports the named package (installing it first
# if missing) and returns the module — confirm against utils.ensure_package.
requests = ensure_package("requests")
# Two arguments here: presumably the pip distribution name followed by the
# import name, since the two differ for Beautiful Soup — TODO confirm.
bs4 = ensure_package("beautifulsoup4", "bs4")
mistune = ensure_package("mistune")
def scrape_to_markdown(url: str) -> "tuple[str, str]":
    """Fetch a webpage and convert its headings, paragraphs and list items to Markdown.

    Only ``h1``-``h3``, ``p`` and ``li`` elements are considered, in document
    order. Elements with no visible text are skipped so the output contains
    no empty headings, bullets or paragraphs.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(md_text, html_preview)`` pair: the generated Markdown source and
        the HTML that mistune renders from that Markdown.

    Raises:
        requests.RequestException: If the request fails or times out, or if
            the server answers with an error status (``raise_for_status``).
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Function-scope import kept from the original; presumably it relies on
    # the module-level ensure_package("beautifulsoup4", "bs4") call having
    # made bs4 importable first — confirm against utils.ensure_package.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(response.text, "html.parser")

    # Markdown prefix for each tag we extract; paragraphs are emitted as-is.
    prefixes = {"h1": "# ", "h2": "## ", "h3": "### ", "li": "- ", "p": ""}

    elements = []
    for tag in soup.find_all(list(prefixes)):
        text = tag.get_text(strip=True)
        if not text:
            # An empty element would yield a bare "## " or "- " line.
            continue
        elements.append(prefixes[tag.name] + text)

    # A blank line between blocks keeps each one a separate Markdown block.
    md_text = "\n\n".join(elements)

    # Convert markdown to HTML for preview
    markdown_parser = mistune.create_markdown(renderer=mistune.HTMLRenderer())
    html_preview = markdown_parser(md_text)
    return md_text, html_preview