File size: 1,143 Bytes
eb81514
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from utils import ensure_package

# Ensure runtime dependencies.
# Each third-party package is obtained through utils.ensure_package and bound
# to a module-level name used by the code below.
# NOTE(review): presumably ensure_package imports the package (installing it
# first if missing) and returns the module; the optional second argument looks
# like the import name when it differs from the distribution name
# ("beautifulsoup4" -> "bs4") — confirm against utils.ensure_package.
requests = ensure_package("requests")
bs4 = ensure_package("beautifulsoup4", "bs4")
mistune = ensure_package("mistune")

def scrape_to_markdown(url: str) -> "tuple[str, str]":
    """
    Fetch a webpage and convert its headings, paragraphs and list items to markdown.

    Only ``h1``-``h3``, ``p`` and ``li`` tags are considered, in document
    order. Headings become ``#``-prefixed lines, list items become ``- ``
    bullets and paragraphs are emitted as plain text. Tags without any
    visible text are skipped.

    Args:
        url: Address of the page to download (fetched with a 10 second timeout).

    Returns:
        A ``(md_text, html_preview)`` pair: the generated markdown, and the
        HTML obtained by rendering that markdown with mistune.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``response.raise_for_status()``).
        Any other exception ``requests.get`` raises (e.g. on timeout or
        connection failure) propagates unchanged.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    from bs4 import BeautifulSoup
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract text from the main content tags, in document order.
    # NOTE(review): a <p> nested inside an <li> is matched twice (once for each
    # tag), so its text appears both as a bullet and as a paragraph.
    elements = []
    for tag in soup.find_all(["h1", "h2", "h3", "p", "li"]):
        text = tag.get_text(strip=True)
        if not text:
            # An empty tag would otherwise yield a bare "# " / "- " marker or
            # a blank paragraph in the markdown output.
            continue
        if tag.name.startswith("h"):
            # "h1".."h3": the digit after "h" is the heading depth.
            level = int(tag.name[1])
            elements.append("#" * level + " " + text)
        elif tag.name == "li":
            elements.append(f"- {text}")
        else:
            elements.append(text)

    md_text = "\n\n".join(elements)

    # Convert markdown to HTML for preview
    markdown_parser = mistune.create_markdown(renderer=mistune.HTMLRenderer())
    html_preview = markdown_parser(md_text)

    return md_text, html_preview