Spaces:

build-small-hackathon
/

KnowledgeMesh

Running on Zero

File size: 4,134 Bytes

9707a84

from urllib.parse import quote, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from app.core.models import Document, SourceType


# Root of the Freedium mirror; article URLs are appended to it as a path
# when building the candidate URLs to fetch.
FREEDIUM_BASE = "https://freedium-mirror.cfd"
# User-Agent header value sent with every request to the mirror
# (a desktop Chrome-on-macOS browser string).
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36"
)


def extract_medium(url: str) -> Document:
    """Fetch a Medium article through the Freedium mirror and build a Document.

    Args:
        url: Address of the article (or of a Freedium page); surrounding
            whitespace is ignored.

    Returns:
        A ``Document`` of type ``SourceType.MEDIUM`` whose text is the
        extracted text blocks followed by the image references, separated by
        blank lines. Its metadata records the mirror URL that was fetched,
        the number of image references, and the extractor name.

    Raises:
        ValueError: If ``url`` is blank, the page cannot be fetched, the page
            has no usable body element, or fewer than 300 characters of
            content were extracted.
    """
    source_url = url.strip()
    if not source_url:
        # A blank URL would otherwise resolve to the mirror's own root page,
        # which could then be ingested as if it were an article.
        raise ValueError("A Medium article URL is required.")

    html, mirror_url = _fetch_freedium_html(source_url)
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content elements so they do not leak into the extracted text.
    for tag in soup(["script", "style", "noscript", "svg", "form", "nav", "header", "footer"]):
        tag.decompose()

    title = _extract_title(soup) or "Medium Article"
    # Prefer the most specific content container the page offers.
    body = soup.find("article") or soup.find("main") or soup.body
    if body is None:
        raise ValueError("Freedium returned a page without readable article content.")

    text_parts = _extract_text_parts(body)
    # Image URLs are resolved against the URL that was actually fetched.
    image_parts = _extract_images(body, mirror_url)
    combined = "\n\n".join([*text_parts, *image_parts]).strip()

    if len(combined) < 300:
        raise ValueError(
            "Could not extract enough readable content from the Medium article through Freedium. "
            "Check that the article URL is public and try again."
        )

    return Document(
        source_type=SourceType.MEDIUM,
        title=title,
        text=combined,
        source=source_url,
        metadata={
            "mirror_url": mirror_url,
            "images": len(image_parts),
            "extractor": "freedium-mirror.cfd",
        },
    )


def _fetch_freedium_html(source_url: str) -> tuple[str, str]:
    """Download the article HTML from the first mirror URL that yields content.

    Each candidate from ``_freedium_candidates`` is tried in order; the first
    successful response with a non-blank body wins.

    Args:
        source_url: The article URL supplied by the caller.

    Returns:
        A ``(html, final_url)`` tuple, where ``final_url`` is the response's
        URL as reported by ``requests``.

    Raises:
        ValueError: If no candidate produced a usable page. The message
            includes the last two recorded failures.
    """
    request_headers = {"User-Agent": USER_AGENT, "Accept": "text/html,application/xhtml+xml"}
    errors: list[str] = []
    for candidate in _freedium_candidates(source_url):
        try:
            response = requests.get(candidate, headers=request_headers, timeout=45)
            response.raise_for_status()
        except requests.RequestException as exc:
            errors.append(f"{candidate}: {exc}")
            continue
        if response.text.strip():
            return response.text, response.url
        # Record blank pages too, so the final error is never without detail.
        errors.append(f"{candidate}: empty response body")
    raise ValueError("Could not fetch the Medium article through Freedium. " + " | ".join(errors[-2:]))


def _freedium_candidates(source_url: str) -> list[str]:
    parsed = urlparse(source_url)
    if "freedium" in parsed.netloc:
        return [source_url]
    return [
        f"{FREEDIUM_BASE}/{source_url}",
        f"{FREEDIUM_BASE}/{quote(source_url, safe='')}",
    ]


def _extract_title(soup: BeautifulSoup) -> str:
    """Return the best available title for the parsed page.

    Sources are tried in order: the ``og:title`` meta tag, the
    ``twitter:title`` meta tag, the first ``<h1>``, then the ``<title>``
    element. A source whose text is blank is skipped so that a later source
    can still supply the title.

    Args:
        soup: The parsed page.

    Returns:
        The stripped title text, or ``""`` if no source provides one.
    """
    for selector in ('meta[property="og:title"]', 'meta[name="twitter:title"]'):
        tag = soup.select_one(selector)
        content = (tag.get("content") or "").strip() if tag else ""
        if content:
            return content
    heading = soup.find("h1")
    if heading:
        heading_text = heading.get_text(" ", strip=True)
        # An empty <h1> must not hide a usable <title>.
        if heading_text:
            return heading_text
    if soup.title:
        return soup.title.get_text(" ", strip=True)
    return ""


def _extract_text_parts(body) -> list[str]:
    parts: list[str] = []
    seen: set[str] = set()
    for tag in body.find_all(["h1", "h2", "h3", "p", "li", "blockquote", "pre", "figcaption"]):
        text = tag.get_text(" ", strip=True)
        if not text or text in seen:
            continue
        seen.add(text)
        if tag.name in {"h1", "h2", "h3"}:
            parts.append(f"## {text}")
        elif tag.name == "blockquote":
            parts.append(f"> {text}")
        else:
            parts.append(text)
    return parts


def _extract_images(body, base_url: str) -> list[str]:
    images: list[str] = []
    seen: set[str] = set()
    for image in body.find_all("img"):
        src = image.get("src") or image.get("data-src") or image.get("data-original")
        if not src:
            continue
        absolute_src = urljoin(base_url, src)
        if absolute_src in seen:
            continue
        seen.add(absolute_src)
        alt = image.get("alt", "").strip()
        if alt:
            images.append(f"Image: {alt}\nURL: {absolute_src}")
        else:
            images.append(f"Image URL: {absolute_src}")
    return images