# Spaces: Running on Zero
# File size: 4,134 Bytes
# 9707a84
from urllib.parse import quote, urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from app.core.models import Document, SourceType
# Root of the Freedium mirror; the article URL is appended to it as a path.
FREEDIUM_BASE = "https://freedium-mirror.cfd"

# Browser-like User-Agent header value sent when requesting pages from the mirror.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0 Safari/537.36"
)
def extract_medium(url: str) -> Document:
    """Build a ``Document`` from a Medium article fetched through Freedium.

    The page is downloaded with ``_fetch_freedium_html``, stripped of
    non-content tags, and flattened into text blocks followed by image
    references, all joined by blank lines.

    Args:
        url: Address of the article. Surrounding whitespace is ignored.

    Returns:
        A ``Document`` with ``source_type=SourceType.MEDIUM``, the extracted
        title and text, the cleaned ``url`` as ``source``, and metadata
        holding the mirror URL, the number of images and the extractor name.

    Raises:
        ValueError: If ``url`` is blank, the page cannot be fetched, the page
            has no ``<article>``/``<main>``/``<body>`` container, or fewer
            than 300 characters of content were extracted.
    """
    source_url = (url or "").strip()
    if not source_url:
        # A blank URL would otherwise resolve to the mirror's own home page,
        # which could then be returned as if it were the requested article.
        raise ValueError("A Medium article URL is required.")

    html, mirror_url = _fetch_freedium_html(source_url)
    soup = BeautifulSoup(html, "html.parser")

    # Remove scripts, styling and page chrome before looking for content.
    for tag in soup(["script", "style", "noscript", "svg", "form", "nav", "header", "footer"]):
        tag.decompose()

    title = _extract_title(soup) or "Medium Article"

    # Prefer the most specific content container that exists.
    body = soup.find("article") or soup.find("main") or soup.body
    if body is None:
        raise ValueError("Freedium returned a page without readable article content.")

    text_parts = _extract_text_parts(body)
    image_parts = _extract_images(body, mirror_url)
    combined = "\n\n".join([*text_parts, *image_parts]).strip()

    # Very short output is treated as a failed extraction rather than an article.
    if len(combined) < 300:
        raise ValueError(
            "Could not extract enough readable content from the Medium article through Freedium. "
            "Check that the article URL is public and try again."
        )

    return Document(
        source_type=SourceType.MEDIUM,
        title=title,
        text=combined,
        source=source_url,
        metadata={
            "mirror_url": mirror_url,
            "images": len(image_parts),
            "extractor": "freedium-mirror.cfd",
        },
    )
def _fetch_freedium_html(source_url: str) -> tuple[str, str]:
    """Download the article HTML, trying each candidate mirror URL in order.

    Args:
        source_url: Article URL as supplied by the caller.

    Returns:
        ``(html, response_url)`` for the first candidate that answers with a
        successful status and a non-blank body. ``response_url`` is the URL
        reported by the response object.

    Raises:
        ValueError: If no candidate produced usable HTML. The message lists
            the last two recorded failures.
    """
    headers = {"User-Agent": USER_AGENT, "Accept": "text/html,application/xhtml+xml"}
    errors: list[str] = []

    for candidate in _freedium_candidates(source_url):
        try:
            response = requests.get(candidate, headers=headers, timeout=45)
            response.raise_for_status()
        except requests.RequestException as exc:
            errors.append(f"{candidate}: {exc}")
            continue

        if response.text.strip():
            return response.text, response.url

        # A successful status with a blank body is still a failure; record it
        # so the final error message is not left without any diagnostic.
        errors.append(f"{candidate}: empty response body")

    raise ValueError("Could not fetch the Medium article through Freedium. " + " | ".join(errors[-2:]))
def _freedium_candidates(source_url: str) -> list[str]:
parsed = urlparse(source_url)
if "freedium" in parsed.netloc:
return [source_url]
return [
f"{FREEDIUM_BASE}/{source_url}",
f"{FREEDIUM_BASE}/{quote(source_url, safe='')}",
]
def _extract_title(soup: BeautifulSoup) -> str:
    """Return the best available article title, or ``""`` when none is found.

    Sources are tried in order: the ``og:title`` meta tag, the
    ``twitter:title`` meta tag, the first ``<h1>``, and the ``<title>``
    element. A source that is missing or holds only whitespace is skipped so
    that a later source can still supply the title.

    Args:
        soup: Parsed page to search.

    Returns:
        The stripped title text, or an empty string.
    """
    for selector in ('meta[property="og:title"]', 'meta[name="twitter:title"]'):
        tag = soup.select_one(selector)
        content = tag.get("content") if tag else None
        # Whitespace-only content must not short-circuit the fallbacks below.
        if content and content.strip():
            return content.strip()

    for element in (soup.find("h1"), soup.title):
        if element is None:
            continue
        text = element.get_text(" ", strip=True)
        if text:
            return text

    return ""
def _extract_text_parts(body) -> list[str]:
    """Collect the readable text blocks found under ``body``.

    Headings, paragraphs, list items, block quotes, preformatted blocks and
    figure captions are visited in the order ``find_all`` returns them.
    Headings (``h1``-``h3``) are prefixed with ``"## "`` and block quotes with
    ``"> "``. Blocks with no text, or whose text repeats an earlier block,
    are dropped.

    Args:
        body: Element to search; it must provide ``find_all``.

    Returns:
        The formatted text blocks.
    """
    wanted = ["h1", "h2", "h3", "p", "li", "blockquote", "pre", "figcaption"]
    prefixes = {"h1": "## ", "h2": "## ", "h3": "## ", "blockquote": "> "}

    collected: list[str] = []
    already_emitted: set[str] = set()
    for node in body.find_all(wanted):
        content = node.get_text(" ", strip=True)
        if content and content not in already_emitted:
            already_emitted.add(content)
            collected.append(prefixes.get(node.name, "") + content)
    return collected
def _extract_images(body, base_url: str) -> list[str]:
images: list[str] = []
seen: set[str] = set()
for image in body.find_all("img"):
src = image.get("src") or image.get("data-src") or image.get("data-original")
if not src:
continue
absolute_src = urljoin(base_url, src)
if absolute_src in seen:
continue
seen.add(absolute_src)
alt = image.get("alt", "").strip()
if alt:
images.append(f"Image: {alt}\nURL: {absolute_src}")
else:
images.append(f"Image URL: {absolute_src}")
return images
|