# Medium HTML extraction helpers: parse search-result cards and article pages
# into plain metadata dictionaries.
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Import centralized image URL utilities from utils
from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH, get_medium_image_url, upgrade_medium_image_url
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extracts article metadata from search result cards.

    Medium's DOM changes frequently, so several card selectors are tried in
    order and the first one that matches anything wins. Cards that yield no
    article URL are dropped.
    """
    # Candidate selectors for article cards, most-current markup first.
    card_selectors = ("article", 'div[role="article"]', ".postArticle", ".js-block")
    cards: List[Any] = []
    for selector in card_selectors:
        cards = soup.select(selector)
        if cards:
            break

    extracted: List[Dict[str, Any]] = []
    for card in cards:
        entry = _extract_from_card(card, base_url)
        # A card without a resolvable URL is useless downstream; skip it.
        if entry.get("url"):
            extracted.append(entry)
    return extracted
def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """Helper to extract data from a single card element.

    Args:
        card: A bs4 element for one search-result card (presumably a Tag —
            anything supporting .find/.select_one/.find_all).
        base_url: Base URL used to resolve relative article hrefs.

    Returns:
        Dict with keys: url, title, author ({"name": ...} or None),
        publishingDate, readingTime (float minutes or None), imageUrl.
        Any field may be None when its heuristic finds nothing.
    """
    # 1. URL & Title
    # Look for <a> tags that link to the article
    # Usually the first <h2> inside an <a> is the title
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None
    # Find the link associated with the title or the card
    link_tag = card.find("a", href=True)
    # Prefer the anchor wrapping the title over the first anchor in the card.
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")
    url = None
    if link_tag:
        href = link_tag["href"]
        # Clean up URL (remove query params usually)
        if "?" in href:
            href = href.split("?")[0]
        url = urljoin(base_url, href)
    # 2. Author
    # Heuristic: Look for links that go to a user profile (/@username or /u/username)
    # but aren't the main article link.
    author = None
    # Try specific selectors first
    author_tag = card.select_one('a[data-action="show-user-card"]') or \
        card.select_one('.ds-link') or \
        card.select_one('a[href*="/@"]')
    if author_tag:
        # Verify it's not the title link
        if title_tag and author_tag == title_tag.find_parent("a"):
            pass  # It's the title
        else:
            author = author_tag.get_text(strip=True)
    # Fallback: Look for a <p> or <span> that contains the author name
    # Usually it's the first piece of text in the card meta area
    if not author:
        # Find the meta div (often has date/read time)
        # We look for text that is NOT the date or read time
        for p in card.find_all(["p", "span"]):
            txt = p.get_text(strip=True)
            # Skip empty, date-like, or read-time strings
            if not txt or "min read" in txt or any(m in txt for m in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago"]):
                continue
            # Skip title
            if title and txt in title:
                continue
            # If it looks like a name (2-3 words, capitalized), take it
            if 0 < len(txt.split()) <= 3 and txt[0].isupper():
                author = txt
                break
    # 3. Date / Reading Time
    # Often spans
    spans = card.find_all("span")
    pub_date = None
    reading_time = None
    for s in spans:
        txt = s.get_text(strip=True)
        # Reading time usually ends with "min read"
        if "min read" in txt:
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass
        # Date heuristic: "Nov 7" or "2 days ago"
        # Hard to parse perfectly without regex, but we can grab it if it looks like a date
        # For now, we might skip complex date parsing or just take the first span that isn't reading time
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
            # Very rough heuristic
            pub_date = txt
    # 4. Image URL
    # Priority:
    # 1. <img src="..." class="..."/> inside the card (often has specific classes for covers)
    # 2. First <img> tag in the card
    # Note: Search results don't always have og:image tags (those are in the head), so we must rely on the card's HTML.
    image_url = None
    # Try to find the main article image (often has specific classes or sizes)
    # Medium uses responsive images, often in <picture> or <img> with srcset.
    # We'll look for the largest image or the first one that isn't an avatar.
    images = card.find_all("img")
    for img in images:
        src = img.get("src", "")
        # Skip small avatars (often 20x20 or similar in URL)
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src:  # Common default avatar
            continue
        if "resize:fill:20:20" in src:  # Tiny thumbnail
            continue
        # If it's a valid image, take it.
        # Medium images often have 'cdn-images-1.medium.com'
        if src:
            image_url = src
            break
    if not image_url:
        # Fallback to any img
        img_tag = card.find("img")
        if img_tag and img_tag.get("src"):
            image_url = img_tag["src"]
    # Upgrade image URL to high resolution.
    # NOTE(review): image_url may still be None here — presumably the util
    # handles None; verify. Also, MEDIUM_IMAGE_DEFAULT_WIDTH is imported at
    # module level but unused — confirm whether 1400 should be that constant.
    image_url = upgrade_medium_image_url(image_url, target_width=1400)
    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }
# Page titles that indicate a Cloudflare challenge or a generic Medium page
# rather than a real article title.
_GENERIC_TITLES = ("Just a moment...", "medium.com", "Medium")
# Medium appends a 12-character article ID hash to URL slugs.
_MEDIUM_HASH_LEN = 12


def _title_from_url(url: str) -> Optional[str]:
    """Derive a readable title from a Medium article URL slug, or None.

    Medium URLs look like https://medium.com/@author/article-title-slug-hash
    (or /publication/article-title-slug-hash); the last path segment is the
    slug, optionally suffixed with a 12-char article hash.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) < 2:
            return None
        slug = path_parts[-1]
        head, sep, tail = slug.rpartition("-")
        if sep and len(tail) == _MEDIUM_HASH_LEN:
            slug = head
        # Convert slug to title: replace-hyphens-with-spaces
        return slug.replace("-", " ").title()
    except Exception:
        return None


def _extract_title(soup: BeautifulSoup, url: Optional[str]) -> Optional[str]:
    """Best-effort title: <h1>, then og:title, then URL slug, then <title>."""
    title = None
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)
    if not title:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            title = og_title.get("content")
    # Cloudflare/homepage pages yield generic titles; try the URL slug then.
    if (not title or title in _GENERIC_TITLES) and url:
        derived = _title_from_url(url)
        if derived is not None:
            title = derived
    # Last resort: the <title> element, unless it is a generic page title.
    if not title:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in _GENERIC_TITLES:
                title = page_title
    return title


def _extract_author(soup: BeautifulSoup) -> Optional[Dict[str, str]]:
    """Author from the reliable <meta name="author">, else byline selectors."""
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        return {"name": meta_author.get("content")}
    author_tag = soup.select_one('a[data-action="show-user-card"]') or soup.select_one('.ds-link')
    if author_tag:
        author_text = author_tag.get_text(strip=True)
        if author_text:  # Only report an author if we got actual text
            return {"name": author_text}
    return None


def _apply_url_metadata(content_data: Dict[str, Any], url: str) -> None:
    """Fill author (from /@username) or publication (from /pub-slug) off the URL.

    Mutates content_data in place; never raises.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) >= 1:
            first_part = path_parts[0]
            if first_part.startswith("@"):
                # Personal blog URL: only use the username if we have no author yet.
                if not content_data["author"]:
                    username = first_part[1:]
                    content_data["author"] = {"name": username.replace("-", " ").title()}
            else:
                # Publication slug (e.g. "ai-in-plain-english"); kept separate
                # from author on purpose.
                content_data["publication"] = first_part.replace("-", " ").title()
    except Exception:
        pass


def _parse_claps(soup: BeautifulSoup) -> Optional[int]:
    """Clap count as an int ("1.2K" -> 1200), or None if absent/unparseable."""
    clap_el = soup.select_one('button[data-testid="clapCount"]') or soup.select_one('.clapCount')
    if not clap_el:
        return None
    # Strip thousands separators ("1,234") before numeric parsing.
    txt = clap_el.get_text(strip=True).replace(",", "")
    try:
        if "K" in txt:
            return int(float(txt.replace("K", "")) * 1000)
        return int(txt)
    except ValueError:
        return None


def _parse_responses(soup: BeautifulSoup) -> Optional[int]:
    """Response count as an int, or None if absent/unparseable."""
    resp_el = soup.select_one('button[data-testid="responsesCount"]') or soup.select_one('.responsesCount')
    if not resp_el:
        return None
    try:
        return int(resp_el.get_text(strip=True).replace(",", ""))
    except ValueError:
        return None


def _extract_markdown(soup: BeautifulSoup) -> Optional[str]:
    """Convert the main <article>/<section> element to Markdown, if present.

    NOTE: decomposes clutter elements, mutating the parsed tree in place.
    """
    article = soup.find("article") or soup.find("section")
    if not article:
        return None
    # Remove interactive chrome that pollutes the Markdown output.
    for tag in article.select("button, .speechify-btn, .metabar, footer"):
        tag.decompose()
    return md(str(article), heading_style="ATX")


def _intro_paragraphs(soup: BeautifulSoup) -> Optional[str]:
    """Join up to 3 substantial <p> texts (pre-paywall intro), or None."""
    intro_text: List[str] = []
    for p in soup.find_all("p")[:10]:  # Check first 10 paragraphs only
        text = p.get_text(strip=True)
        # Skip short/meta paragraphs ("7 min read", "3 days ago", ...).
        if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
            intro_text.append(text)
            if len(intro_text) >= 3:  # Got enough intro paragraphs
                break
    return "\n\n".join(intro_text) if intro_text else None


def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts full content, claps, and responses from an article page.
    If extraction fails (Cloudflare/paywall), falls back to URL parsing
    and meta-tag descriptions.

    Args:
        soup: Parsed article page.
        url: Canonical article URL; used for slug/username fallbacks.

    Returns:
        Dict with keys markdownContent, claps, responses, title, author,
        publication — any of which may be None.
    """
    content_data: Dict[str, Any] = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None,  # Tracked separately from author
    }

    content_data["title"] = _extract_title(soup, url)
    content_data["author"] = _extract_author(soup)
    if url:
        _apply_url_metadata(content_data, url)

    # Pre-extract og:description for fallback (before the tree is mutated
    # by the main content extraction).
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    content_data["claps"] = _parse_claps(soup)
    content_data["responses"] = _parse_responses(soup)
    content_data["markdownContent"] = _extract_markdown(soup)

    # Fallback 1: intro paragraphs that loaded before a paywall, when the
    # main extraction failed or produced something suspiciously short.
    existing = content_data["markdownContent"]
    if not existing or len(existing) < 100:
        intro = _intro_paragraphs(soup)
        if intro:
            content_data["markdownContent"] = f"{existing}\n\n{intro}" if existing else intro

    # Fallback 2: meta descriptions, if content is still missing/tiny.
    current = content_data["markdownContent"]
    if not current or len(current) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            content_data["markdownContent"] = f"{desc_text}\n\n{current}" if current else desc_text
        else:
            # Last resort: plain name="description" meta tag.
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"

    return content_data