# Medium-MCP / src/parser.py
# Author: Nikhil Pravin Pise
# Commit 60742a2 — fix: Upgrade Medium images to high resolution (1400px) across entire app
import re
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Import centralized image URL utilities from utils
from src.utils import upgrade_medium_image_url, get_medium_image_url, MEDIUM_IMAGE_DEFAULT_WIDTH
def extract_search_results(soup: BeautifulSoup, base_url: str) -> List[Dict[str, Any]]:
    """
    Extracts article metadata from search result cards.

    Medium's DOM changes over time, so several card selectors are tried
    in order until one matches; only cards that yield a resolvable URL
    are included in the result.
    """
    # Candidate selectors, roughly newest markup first.
    card_selectors = ("article", 'div[role="article"]', ".postArticle", ".js-block")

    cards: List[Any] = []
    for selector in card_selectors:
        cards = soup.select(selector)
        if cards:
            break

    extracted = [_extract_from_card(card, base_url) for card in cards]
    # Drop cards where no article URL could be determined.
    return [item for item in extracted if item.get("url")]
def _extract_from_card(card, base_url: str) -> Dict[str, Any]:
    """Helper to extract data from a single search-result card element.

    Uses layered heuristics (Medium's markup is unstable) to pull out the
    article URL, title, author, publishing date, reading time, and cover
    image. Any field that cannot be determined is returned as None.

    Args:
        card: A BeautifulSoup element for one result card.
        base_url: Base URL used to resolve relative hrefs.

    Returns:
        Dict with keys: url, title, author ({"name": ...} or None),
        publishingDate, readingTime, imageUrl.
    """
    # 1. URL & Title
    # The first <h2> inside the card is usually the title; prefer the <a>
    # that wraps the title over the card's first link.
    title_tag = card.find("h2")
    title = title_tag.get_text(strip=True) if title_tag else None

    link_tag = card.find("a", href=True)
    if title_tag and title_tag.find_parent("a"):
        link_tag = title_tag.find_parent("a")

    url = None
    if link_tag:
        href = link_tag["href"]
        # Strip tracking query params before resolving.
        if "?" in href:
            href = href.split("?")[0]
        url = urljoin(base_url, href)

    # 2. Author
    # Heuristic: links to a user profile (/@username) that aren't the
    # main article link.
    author = None
    author_tag = card.select_one('a[data-action="show-user-card"]') or \
                 card.select_one('.ds-link') or \
                 card.select_one('a[href*="/@"]')
    if author_tag:
        # Ignore the match if it is actually the title link.
        if not (title_tag and author_tag == title_tag.find_parent("a")):
            author = author_tag.get_text(strip=True)

    # Fallback: scan <p>/<span> text for something that looks like a name
    # (short, capitalized, not a date/read-time/title fragment).
    if not author:
        for p in card.find_all(["p", "span"]):
            txt = p.get_text(strip=True)
            # Skip empty, date-like, or read-time strings.
            if not txt or "min read" in txt or any(m in txt for m in ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "ago"]):
                continue
            # Skip fragments of the title itself.
            if title and txt in title:
                continue
            # Names are typically 1-3 capitalized words.
            if 0 < len(txt.split()) <= 3 and txt[0].isupper():
                author = txt
                break

    # 3. Date / Reading Time — both usually live in <span>s.
    pub_date = None
    reading_time = None
    for s in card.find_all("span"):
        txt = s.get_text(strip=True)
        if "min read" in txt:
            # e.g. "5 min read" -> 5.0
            try:
                reading_time = float(txt.replace("min read", "").strip())
            except ValueError:
                pass
        elif not pub_date and len(txt) < 15 and any(c.isdigit() for c in txt):
            # Rough date heuristic: short text with a digit ("Nov 7",
            # "2 days ago"). Kept as raw text — no date parsing here.
            pub_date = txt

    # 4. Image URL
    # Prefer the first <img> that is not an avatar/tiny thumbnail; search
    # result pages have no og:image, so the card HTML is all we have.
    image_url = None
    for img in card.find_all("img"):
        src = img.get("src", "")
        if "1*dmbNkD5D-u45r44go_cf0g.png" in src:  # Common default avatar
            continue
        if "resize:fill:20:20" in src:  # Tiny thumbnail
            continue
        if src:
            image_url = src
            break
    if not image_url:
        # Fallback to any img at all.
        img_tag = card.find("img")
        if img_tag and img_tag.get("src"):
            image_url = img_tag["src"]

    # Upgrade image URL to high resolution. Use the centralized width
    # constant (previously hard-coded to 1400, drifting from src.utils),
    # and only call the helper when we actually found an image.
    if image_url:
        image_url = upgrade_medium_image_url(image_url, target_width=MEDIUM_IMAGE_DEFAULT_WIDTH)

    return {
        "url": url,
        "title": title,
        "author": {"name": author} if author else None,
        "publishingDate": pub_date,
        "readingTime": reading_time,
        "imageUrl": image_url,
    }
# Page titles that indicate a Cloudflare challenge or a generic Medium page
# rather than a real article title.
_GENERIC_TITLES = ("Just a moment...", "medium.com", "Medium")


def extract_article_content(soup: BeautifulSoup, url: Optional[str] = None) -> Dict[str, Any]:
    """
    Extracts full content, claps, and responses from an article page.

    If extraction fails (Cloudflare/paywall), falls back to URL parsing
    and meta-tag descriptions so callers always receive best-effort data.

    Args:
        soup: Parsed article page.
        url: Original article URL; enables slug-based title/author fallbacks.

    Returns:
        Dict with keys markdownContent, claps, responses, title, author
        ({"name": ...}), and publication — any value may be None.
    """
    author, publication = _extract_author_and_publication(soup, url)
    return {
        "markdownContent": _extract_markdown_content(soup),
        "claps": _extract_claps(soup),
        "responses": _extract_responses(soup),
        "title": _extract_title(soup, url),
        "author": author,
        "publication": publication,  # Tracked separately from author
    }


def _extract_title(soup: BeautifulSoup, url: Optional[str]) -> Optional[str]:
    """Extract the article title with h1 -> og:title -> URL slug -> <title> fallbacks."""
    title = None
    h1 = soup.find("h1")
    if h1:
        title = h1.get_text(strip=True)

    if not title:
        og_title = soup.find("meta", property="og:title")
        if og_title and og_title.get("content"):
            title = og_title.get("content")

    # Generic titles mean we hit a Cloudflare/landing page; try the slug.
    if (not title or title in _GENERIC_TITLES) and url:
        slug_title = _title_from_slug(url)
        if slug_title:
            title = slug_title

    # Last resort: the <title> element, unless it is itself generic.
    if not title:
        title_elem = soup.find("title")
        if title_elem:
            page_title = title_elem.get_text(strip=True)
            if page_title and page_title not in _GENERIC_TITLES:
                title = page_title
    return title


def _title_from_slug(url: str) -> Optional[str]:
    """Derive a human-readable title from a Medium URL article slug.

    Medium URLs look like https://medium.com/@author/article-title-slug-hash
    where the trailing hash is 12 lowercase hex chars.
    """
    try:
        path_parts = urlparse(url).path.strip("/").split("/")
        if len(path_parts) < 2:
            return None
        slug = path_parts[-1]
        # Strip the trailing hash only if it really looks like one (12 hex
        # chars) — a plain 12-letter final word must NOT be removed.
        stem, _, tail = slug.rpartition("-")
        if stem and len(tail) == 12 and all(c in "0123456789abcdef" for c in tail):
            slug = stem
        return slug.replace("-", " ").title()
    except Exception:
        return None


def _extract_author_and_publication(soup: BeautifulSoup, url: Optional[str]):
    """Extract author dict and publication name (either may be None)."""
    author = None
    # <meta name="author"> is the most reliable source when present.
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        author = {"name": meta_author.get("content")}
    else:
        author_tag = soup.select_one('a[data-action="show-user-card"]') or soup.select_one('.ds-link')
        if author_tag:
            author_text = author_tag.get_text(strip=True)
            if author_text:  # Only set if we got actual text
                author = {"name": author_text}

    publication = None
    if url:
        try:
            path_parts = urlparse(url).path.strip("/").split("/")
            if path_parts and path_parts[0]:
                first_part = path_parts[0]
                if first_part.startswith("@"):
                    # Personal blog: /@username/... — use as author fallback.
                    if not author:
                        author = {"name": first_part[1:].replace("-", " ").title()}
                else:
                    # Otherwise the first segment is a publication slug
                    # (e.g. "ai-in-plain-english").
                    publication = first_part.replace("-", " ").title()
        except Exception:
            pass
    return author, publication


def _extract_claps(soup: BeautifulSoup) -> Optional[int]:
    """Extract the clap count; handles "1.2K"-style abbreviations."""
    try:
        clap_el = soup.select_one('button[data-testid="clapCount"]') or soup.select_one('.clapCount')
        if clap_el:
            txt = clap_el.get_text(strip=True)
            if "K" in txt:
                return int(float(txt.replace("K", "")) * 1000)
            return int(txt)
    except Exception:
        pass
    return None


def _extract_responses(soup: BeautifulSoup) -> Optional[int]:
    """Extract the response (comment) count."""
    try:
        resp_el = soup.select_one('button[data-testid="responsesCount"]') or soup.select_one('.responsesCount')
        if resp_el:
            return int(resp_el.get_text(strip=True))
    except Exception:
        pass
    return None


def _extract_markdown_content(soup: BeautifulSoup) -> Optional[str]:
    """Extract article body as Markdown, with paywall/meta fallbacks."""
    # Pre-extract og:description before touching the tree.
    og_description = soup.find("meta", property="og:description")
    fallback_description = og_description.get("content") if og_description else None

    markdown = None
    article = soup.find("article") or soup.find("section")
    if article:
        # Remove interactive clutter before conversion.
        for tag in article.select("button, .speechify-btn, .metabar, footer"):
            tag.decompose()
        markdown = md(str(article), heading_style="ATX")

    # Fallback 1: grab intro paragraphs that loaded before the paywall.
    if not markdown or len(markdown) < 100:
        intro = _collect_intro_paragraphs(soup)
        if intro:
            markdown = intro if not markdown else markdown + "\n\n" + intro

    # Fallback 2: meta description if content is still missing/too short.
    if not markdown or len(markdown) < 50:
        if fallback_description:
            desc_text = f"Summary: {fallback_description}"
            markdown = desc_text if not markdown else desc_text + "\n\n" + markdown
        else:
            meta_desc = soup.find("meta", attrs={"name": "description"})
            if meta_desc:
                markdown = f"Summary: {meta_desc.get('content', '')}"
    return markdown


def _collect_intro_paragraphs(soup: BeautifulSoup) -> Optional[str]:
    """Join up to 3 substantial paragraphs from the first 10 on the page."""
    intro_text: List[str] = []
    for p in soup.find_all("p")[:10]:
        text = p.get_text(strip=True)
        # Skip short/meta-ish paragraphs ("5 min read", "3 days ago").
        if len(text) > 50 and "min read" not in text.lower() and "ago" not in text:
            intro_text.append(text)
            if len(intro_text) >= 3:
                break
    return "\n\n".join(intro_text) if intro_text else None