Spaces:
Running on Zero
Running on Zero
| from urllib.parse import quote, urljoin, urlparse | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from app.core.models import Document, SourceType | |
# Root of the Freedium mirror; article URLs are appended to it as a path.
FREEDIUM_BASE = "https://freedium-mirror.cfd"

# Browser-like User-Agent header value sent with every mirror request.
USER_AGENT = " ".join(
    [
        "Mozilla/5.0",
        "(Macintosh; Intel Mac OS X 10_15_7)",
        "AppleWebKit/537.36",
        "(KHTML, like Gecko)",
        "Chrome/125.0",
        "Safari/537.36",
    ]
)
def extract_medium(url: str) -> Document:
    """Build a ``Document`` from a Medium article fetched through Freedium.

    The URL is stripped of surrounding whitespace, the mirrored HTML is
    downloaded, non-content tags are removed, and the remaining headings,
    paragraphs and image references are joined into one text blob.

    Args:
        url: Address of the article to extract.

    Returns:
        A ``Document`` of type ``SourceType.MEDIUM`` whose ``text`` holds the
        extracted text parts followed by the image lines, and whose
        ``metadata`` records the mirror URL and the image count.

    Raises:
        ValueError: If the page has no usable container element, or if the
            combined extracted text is shorter than 300 characters.
    """
    source_url = url.strip()
    page_html, mirror_url = _fetch_freedium_html(source_url)
    soup = BeautifulSoup(page_html, "html.parser")

    # Drop page chrome and non-textual elements before looking for content.
    noise_tags = ["script", "style", "noscript", "svg", "form", "nav", "header", "footer"]
    for unwanted in soup.find_all(noise_tags):
        unwanted.decompose()

    title = _extract_title(soup) or "Medium Article"

    # Prefer the most specific container that exists on the page.
    container = soup.find("article") or soup.find("main") or soup.body
    if container is None:
        raise ValueError("Freedium returned a page without readable article content.")

    paragraphs = _extract_text_parts(container)
    image_lines = _extract_images(container, mirror_url)
    full_text = "\n\n".join(paragraphs + image_lines).strip()

    if len(full_text) < 300:
        raise ValueError(
            "Could not extract enough readable content from the Medium article through Freedium. "
            "Check that the article URL is public and try again."
        )

    metadata = {
        "mirror_url": mirror_url,
        "images": len(image_lines),
        "extractor": "freedium-mirror.cfd",
    }
    return Document(
        source_type=SourceType.MEDIUM,
        title=title,
        text=full_text,
        source=source_url,
        metadata=metadata,
    )
def _fetch_freedium_html(source_url: str) -> tuple[str, str]:
    """Download the mirrored HTML for an article, trying each candidate URL.

    Candidates come from ``_freedium_candidates`` and are requested in order.
    The first response that succeeds and has a non-blank body wins.

    Args:
        source_url: Article URL (or an existing mirror URL).

    Returns:
        A ``(html, final_url)`` pair, where ``final_url`` is the response URL
        reported by ``requests``.

    Raises:
        ValueError: If no candidate produced usable HTML. The message includes
            the last two recorded failures.
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        "Accept": "text/html,application/xhtml+xml",
    }
    errors: list[str] = []

    for candidate in _freedium_candidates(source_url):
        # Keep the try body limited to the calls that can raise RequestException.
        try:
            response = requests.get(candidate, headers=request_headers, timeout=45)
            response.raise_for_status()
        except requests.RequestException as exc:
            errors.append(f"{candidate}: {exc}")
            continue

        if response.text.strip():
            return response.text, response.url

        # A successful status with a blank body is still a failure; record it
        # so the final error message explains why this candidate was rejected.
        errors.append(f"{candidate}: empty response body")

    raise ValueError("Could not fetch the Medium article through Freedium. " + " | ".join(errors[-2:]))
| def _freedium_candidates(source_url: str) -> list[str]: | |
| parsed = urlparse(source_url) | |
| if "freedium" in parsed.netloc: | |
| return [source_url] | |
| return [ | |
| f"{FREEDIUM_BASE}/{source_url}", | |
| f"{FREEDIUM_BASE}/{quote(source_url, safe='')}", | |
| ] | |
def _extract_title(soup: BeautifulSoup) -> str:
    """Return the best available title for the parsed page.

    Sources are tried in order: the ``og:title`` meta tag, the
    ``twitter:title`` meta tag, the first ``<h1>``, then ``<title>``. A source
    whose text is empty or only whitespace is skipped so that a later source
    can still provide the title.

    Args:
        soup: Parsed document to inspect.

    Returns:
        The stripped title text, or ``""`` if no source yields any text.
    """
    for selector in ('meta[property="og:title"]', 'meta[name="twitter:title"]'):
        tag = soup.select_one(selector)
        content = tag.get("content") if tag else None
        if content:
            cleaned = content.strip()
            # A whitespace-only attribute must not short-circuit the fallbacks.
            if cleaned:
                return cleaned

    heading = soup.find("h1")
    if heading:
        heading_text = heading.get_text(" ", strip=True)
        if heading_text:
            return heading_text

    if soup.title:
        return soup.title.get_text(" ", strip=True)
    return ""
| def _extract_text_parts(body) -> list[str]: | |
| parts: list[str] = [] | |
| seen: set[str] = set() | |
| for tag in body.find_all(["h1", "h2", "h3", "p", "li", "blockquote", "pre", "figcaption"]): | |
| text = tag.get_text(" ", strip=True) | |
| if not text or text in seen: | |
| continue | |
| seen.add(text) | |
| if tag.name in {"h1", "h2", "h3"}: | |
| parts.append(f"## {text}") | |
| elif tag.name == "blockquote": | |
| parts.append(f"> {text}") | |
| else: | |
| parts.append(text) | |
| return parts | |
| def _extract_images(body, base_url: str) -> list[str]: | |
| images: list[str] = [] | |
| seen: set[str] = set() | |
| for image in body.find_all("img"): | |
| src = image.get("src") or image.get("data-src") or image.get("data-original") | |
| if not src: | |
| continue | |
| absolute_src = urljoin(base_url, src) | |
| if absolute_src in seen: | |
| continue | |
| seen.add(absolute_src) | |
| alt = image.get("alt", "").strip() | |
| if alt: | |
| images.append(f"Image: {alt}\nURL: {absolute_src}") | |
| else: | |
| images.append(f"Image URL: {absolute_src}") | |
| return images | |