Spaces:
Sleeping
Sleeping
| from utils import ensure_package | |
| # Ensure runtime dependencies | |
| requests = ensure_package("requests") | |
| bs4 = ensure_package("beautifulsoup4", "bs4") | |
| mistune = ensure_package("mistune") | |
def scrape_to_markdown(url: str) -> "tuple[str, str]":
    """Fetch a webpage and convert its headings, paragraphs and list items to markdown.

    Only ``h1``/``h2``/``h3``, ``p`` and ``li`` tags are considered, in document
    order. Headings become ``#``-prefixed lines (one ``#`` per heading level),
    list items become ``- `` bullets, and paragraphs are emitted as plain text.
    Tags whose stripped text is empty are skipped.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(md_text, html_preview)`` pair: the generated markdown, and the
        result of rendering that markdown with mistune's HTML renderer.

    Raises:
        Any exception raised by ``requests.get`` (10 second timeout) or by
        ``response.raise_for_status()`` is propagated to the caller.
    """
    # The return annotation is quoted so it is never evaluated at import time,
    # which keeps the module loadable on interpreters without builtin generics.
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    # Imported locally: the module-level ensure_package("beautifulsoup4", "bs4")
    # call has already run by the time this function is invoked.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(response.text, "html.parser")

    # Extract text from the main content tags.
    elements = []
    for tag in soup.find_all(["h1", "h2", "h3", "p", "li"]):
        text = tag.get_text(strip=True)
        if not text:
            # An empty tag would otherwise yield a bare "# " / "- " / "" entry.
            continue
        if tag.name.startswith("h"):
            # tag.name is "h1".."h3" here, so the second character is the level.
            level = int(tag.name[1])
            elements.append("#" * level + " " + text)
        elif tag.name == "li":
            elements.append(f"- {text}")
        else:
            elements.append(text)
    md_text = "\n\n".join(elements)

    # Convert markdown to HTML for preview.
    markdown_parser = mistune.create_markdown(renderer=mistune.HTMLRenderer())
    html_preview = markdown_parser(md_text)
    return md_text, html_preview