Spaces:
Sleeping
Sleeping
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Import centralized image URL utilities from utils
from src.utils import upgrade_medium_image_url, get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extract article metadata from search result cards.

    Several CSS selectors are tried in priority order because Medium's DOM
    changes frequently; the first selector that matches any element wins.
    Cards for which no URL can be resolved are dropped.
    """
    # Candidate card selectors, most current layout first.
    card_selectors = (
        "article",
        'div[role="article"]',
        ".postArticle",
        ".js-block",
    )
    cards = []
    for selector in card_selectors:
        cards = soup.select(selector)
        if cards:
            break

    extracted = []
    for card in cards:
        record = _extract_from_card(card, base_url)
        # Keep only cards we could resolve a URL for.
        if record.get("url"):
            extracted.append(record)
    return extracted
# Month abbreviations / relative-time marker used to reject date-like strings
# when hunting for an author name in card text.
_DATE_MARKERS = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago",
)


def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """
    Extract metadata from a single search-result card element.

    All extraction is heuristic because Medium's markup is unstable; any
    field that cannot be determined is returned as None.

    Args:
        card: BeautifulSoup Tag for one result card.
        base_url: Base URL used to resolve relative article links.

    Returns:
        Dict with keys: url, title, author ({"name": ...} or None),
        publishingDate, readingTime, imageUrl.
    """
    title, url = _card_title_and_url(card, base_url)
    author = _card_author(card, title)
    pub_date, reading_time = _card_date_and_reading_time(card)
    image_url = _card_image_url(card)

    # Upgrade to a high-resolution variant. Guarded so the helper is never
    # handed None when the card had no usable image.
    if image_url:
        image_url = upgrade_medium_image_url(image_url, target_width=1400)

    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }


def _card_title_and_url(card, base_url: str):
    """Return (title, absolute_url) for a card; either may be None."""
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None

    # Prefer the link wrapping the title; fall back to the card's first link.
    link_tag = card.find("a", href=True)
    title_link = title_tag.find_parent("a") if title_tag else None
    if title_link:
        link_tag = title_link

    url = None
    if link_tag:
        href = link_tag["href"]
        # Strip tracking/query parameters before resolving.
        href = href.split("?", 1)[0]
        url = urljoin(base_url, href)
    return title, url


def _card_author(card, title: Optional[str]) -> Optional[str]:
    """Best-effort author-name extraction from a card; None when not found."""
    title_tag = card.find("h2")

    # Try author-specific selectors first.
    author_tag = (
        card.select_one('a[data-action="show-user-card"]')
        or card.select_one('.ds-link')
        or card.select_one('a[href*="/@"]')
    )
    if author_tag:
        # Skip it if it is actually the link wrapping the title.
        # (bs4 '==' compares tag content, matching the original behavior.)
        is_title_link = bool(title_tag) and author_tag == title_tag.find_parent("a")
        if not is_title_link:
            name = author_tag.get_text(strip=True)
            if name:
                return name

    # Fallback: scan text nodes for something that looks like a person's
    # name, skipping read times, date-like strings and the title itself.
    for node in card.find_all(["p", "span"]):
        text = node.get_text(strip=True)
        if not text or "min read" in text:
            continue
        if any(marker in text for marker in _DATE_MARKERS):
            continue
        if title and text in title:
            continue
        # Heuristic: 1-3 words starting with a capital reads like a name.
        if 0 < len(text.split()) <= 3 and text[0].isupper():
            return text
    return None


def _card_date_and_reading_time(card):
    """Return (pub_date, reading_time) parsed from the card's spans."""
    pub_date = None
    reading_time = None
    for span in card.find_all("span"):
        text = span.get_text(strip=True)
        if "min read" in text:
            # Reading time spans look like "5 min read".
            try:
                reading_time = float(text.replace("min read", "").strip())
            except ValueError:
                pass
        elif not pub_date and len(text) < 15 and any(c.isdigit() for c in text):
            # Very rough date heuristic: short span containing a digit,
            # e.g. "Nov 7" or "2 days ago".
            pub_date = text
    return pub_date, reading_time


def _card_image_url(card) -> Optional[str]:
    """Return the card's cover-image URL, or None if nothing usable."""
    for img in card.find_all("img"):
        src = img.get("src", "")
        # Skip the common default avatar and tiny 20x20 thumbnails.
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src or "resize:fill:20:20" in src:
            continue
        if src:
            return src
    # Fallback: first <img> with any src (even if it is an avatar).
    fallback = card.find("img")
    if fallback and fallback.get("src"):
        return fallback["src"]
    return None
# Titles that indicate a Cloudflare challenge page or the Medium homepage
# rather than a real article.
_GENERIC_TITLES = ("Just a moment...", "medium.com", "Medium")


def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extract full content, claps, responses and metadata from an article page.

    If extraction fails (Cloudflare challenge / paywall), falls back to
    reconstructing the title and author/publication from the URL, and the
    content from intro paragraphs or meta descriptions.

    Args:
        soup: Parsed article page.
        url: Original article URL, used for slug-based fallbacks.

    Returns:
        Dict with keys: markdownContent, claps, responses, title, author,
        publication. Fields that cannot be determined are None.
    """
    content_data: Dict[str, Any] = {
        "markdownContent": None,
        "claps": None,
        "responses": None,
        "title": None,
        "author": None,
        "publication": None,  # publication tracked separately from author
    }

    content_data["title"] = _extract_title(soup, url)
    _fill_author_and_publication(content_data, soup, url)

    # Capture og:description up front for the content fallback path.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    content_data["claps"] = _extract_claps(soup)
    content_data["responses"] = _extract_responses(soup)
    _fill_markdown_content(content_data, soup, fallback_description)
    return content_data


def _extract_title(soup: BeautifulSoup, url: Optional[str]) -> Optional[str]:
    """Extract the article title: h1 -> og:title -> URL slug -> <title>."""
    title = None
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)
    if not title:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            title = og_title.get("content")

    # A missing or generic title means a blocked page was served; try to
    # reconstruct the title from the URL slug instead.
    if (not title or title in _GENERIC_TITLES) and url:
        slug_title = _title_from_url(url)
        if slug_title:
            title = slug_title

    # Last resort: the <title> element (rejected if it is itself generic).
    if not title:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in _GENERIC_TITLES:
                title = page_title
    return title


def _title_from_url(url: str) -> Optional[str]:
    """
    Reconstruct a readable title from a Medium article URL slug.

    Medium URLs look like https://medium.com/@author/article-title-slug-hash
    or https://medium.com/publication/article-title-slug-hash, where the
    trailing hash is 12 characters.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) >= 2:
            slug = path_parts[-1]
            head, sep, tail = slug.rpartition("-")
            if sep and len(tail) == 12:  # drop the 12-char Medium hash
                slug = head
            if slug:
                return slug.replace("-", " ").title()
    except Exception:
        pass
    return None


def _fill_author_and_publication(content_data: Dict[str, Any], soup: BeautifulSoup,
                                 url: Optional[str]) -> None:
    """Populate 'author' and 'publication' in content_data (in place)."""
    # <meta name="author"> is the most reliable source.
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        content_data["author"] = {"name": meta_author.get("content")}
    else:
        author_tag = (soup.select_one('a[data-action="show-user-card"]')
                      or soup.select_one('.ds-link'))
        if author_tag:
            author_text = author_tag.get_text(strip=True)
            if author_text:  # only set if we got actual text
                content_data["author"] = {"name": author_text}

    # Derive author/publication from the URL path. "@username" paths are
    # personal blogs; anything else is a publication slug.
    if url:
        try:
            path_parts = urlparse(url).path.strip("/").split("/")
            first_part = path_parts[0] if path_parts else ""
            if not first_part:
                return  # homepage-style URL: nothing to derive
            if first_part.startswith("@"):
                if not content_data["author"]:
                    formatted = first_part[1:].replace("-", " ").title()
                    content_data["author"] = {"name": formatted}
            else:
                # e.g. "ai-in-plain-english" -> "Ai In Plain English".
                # Deliberately NOT used as author - better than nothing for
                # blocked pages, but it is a publication, not a person.
                content_data["publication"] = first_part.replace("-", " ").title()
        except Exception:
            pass


def _extract_claps(soup: BeautifulSoup) -> Optional[int]:
    """Parse the clap count; handles '1.2K' shorthand and comma grouping."""
    try:
        clap_el = (soup.select_one('button[data-testid="clapCount"]')
                   or soup.select_one('.clapCount'))
        if clap_el:
            txt = clap_el.get_text(strip=True).replace(",", "")
            if "K" in txt:
                return int(float(txt.replace("K", "")) * 1000)
            return int(txt)
    except Exception:
        pass
    return None


def _extract_responses(soup: BeautifulSoup) -> Optional[int]:
    """Parse the response count, tolerating surrounding text ('5 responses')."""
    try:
        resp_el = (soup.select_one('button[data-testid="responsesCount"]')
                   or soup.select_one('.responsesCount'))
        if resp_el:
            txt = resp_el.get_text(strip=True).replace(",", "")
            match = re.search(r"\d+", txt)
            if match:
                return int(match.group())
    except Exception:
        pass
    return None


def _fill_markdown_content(content_data: Dict[str, Any], soup: BeautifulSoup,
                           fallback_description: Optional[str]) -> None:
    """Populate 'markdownContent', with paragraph and description fallbacks."""
    article = soup.find("article") or soup.find("section")
    if article:
        # Strip interactive chrome before markdown conversion.
        for tag in article.select("button, .speechify-btn, .metabar, footer"):
            tag.decompose()
        content_data["markdownContent"] = md(str(article), heading_style="ATX")

    # Fallback 1: salvage intro paragraphs that loaded before the paywall.
    content = content_data["markdownContent"]
    if not content or len(content) < 100:
        intro_text = []
        for p in soup.find_all("p")[:10]:  # check first 10 paragraphs
            text = p.get_text(strip=True)
            # Skip short paragraphs (likely meta info) and read-time/date text.
            if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
                intro_text.append(text)
            if len(intro_text) >= 3:  # enough intro paragraphs
                break
        if intro_text:
            combined_intro = "\n\n".join(intro_text)
            if content:
                # Append intro to existing content that was too short.
                content_data["markdownContent"] = content + "\n\n" + combined_intro
            else:
                content_data["markdownContent"] = combined_intro

    # Fallback 2: meta description as a bare summary.
    content = content_data["markdownContent"]
    if not content or len(content) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            if content:
                content_data["markdownContent"] = desc_text + "\n\n" + content
            else:
                content_data["markdownContent"] = desc_text
        else:
            # Last resort: try <meta name="description">.
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                content_data["markdownContent"] = f"Summary: {meta_desc.get('content', '')}"