# Medium-MCP / src/extractor.py
# Author: Nikhil Pravin Pise
# feat: comprehensive migration - merge Scraper + MCP Server (commit ae588db)
import json
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
def extract_from_apollo_state(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from ``window.__APOLLO_STATE__``.

    This is the "Gold Mine" - the raw JSON state used by Medium's React app.
    The state is a flat map of ``"TypeName:id"`` keys to entity objects;
    entities reference each other through ``{"__ref": "<key>"}`` wrappers.

    Args:
        html: Full HTML of a Medium article page.

    Returns:
        Dict with title, author, markdownContent, comments, recommended, etc.,
        or None if the state blob is absent or unusable.
    """
    try:
        match = re.search(r'window\.__APOLLO_STATE__\s*=\s*', html)
        if not match:
            return None

        # raw_decode parses exactly one JSON value starting at match.end(),
        # which sidesteps regex issues with nested braces / trailing ";".
        try:
            data, _ = json.JSONDecoder().raw_decode(html[match.end():])
        except ValueError as e:
            print(f"JSON Decode Error: {e}")
            return None
        if not isinstance(data, dict):
            return None

        post_key = _find_main_post_key(data)
        if not post_key:
            return None
        post = data[post_key]

        article_data: Dict[str, Any] = {
            "title": post.get("title"),
            "id": post.get("id"),
            "firstPublishedAt": post.get("firstPublishedAt"),
            "readingTime": post.get("readingTime"),
        }

        author = _extract_author(post, data)
        if author is not None:
            article_data["author"] = author

        paragraphs = _resolve_paragraphs(post, data)
        article_data["markdownContent"] = _paragraphs_to_markdown(
            article_data["title"], article_data.get("author"), paragraphs
        )
        article_data["source"] = "apollo"
        article_data["json_state"] = post  # raw post entity for downstream use

        # Phase 2: deep graph extraction (responses + other posts in state).
        article_data["comments"] = _extract_comments(data, article_data["id"])[:10]
        article_data["recommended"] = _extract_recommended(data, post_key)[:5]
        return article_data
    except Exception as e:
        # Defensive: a malformed state should degrade to "not found", not crash.
        print(f"Error extracting Apollo state: {e}")
        return None


def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, else {} (defensive nested access)."""
    return value if isinstance(value, dict) else {}


def _find_main_post_key(data: Dict[str, Any]) -> Optional[str]:
    """Locate the main Post entity: the first one with both title and content."""
    for key, value in data.items():
        if (key.startswith("Post:") and isinstance(value, dict)
                and value.get("title") and value.get("content")):
            return key
    return None


def _extract_author(post: Dict[str, Any], data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Resolve the post's creator ref into an author dict, or None."""
    ref = _as_dict(post.get("creator")).get("__ref")
    if not ref or ref not in data:
        return None
    creator = _as_dict(data[ref])
    return {
        "name": creator.get("name"),
        "username": creator.get("username"),
        "bio": creator.get("bio"),
        "id": creator.get("id"),
        "followerCount": _as_dict(creator.get("socialStats")).get("followerCount"),
        "imageId": creator.get("imageId"),
    }


def _resolve_paragraphs(post: Dict[str, Any], data: Dict[str, Any]) -> list:
    """Return the post's paragraph list, following a content ``__ref`` if present."""
    content = post.get("content")
    ref = _as_dict(content).get("__ref") or content
    if isinstance(ref, str) and ref in data:
        body_model = _as_dict(_as_dict(data[ref]).get("bodyModel"))
        return body_model.get("paragraphs", []) or []
    if isinstance(content, dict):
        # Content stored inline rather than as a ref.
        return _as_dict(content.get("bodyModel")).get("paragraphs", []) or []
    return []


def _paragraphs_to_markdown(title: Any, author: Optional[Dict[str, Any]], paragraphs: list) -> str:
    """Rebuild a Markdown document from Medium paragraph entities.

    Known types: P (paragraph), H3/H4 (headers), IMG (image), CODE (code
    block), PQ (pull quote). Unknown types fall through as plain text.
    """
    lines = [f"# {title}"]
    if author:
        lines.append(f"**By {author['name']}**")
    lines.append("")
    for para in paragraphs:
        if not isinstance(para, dict):
            continue
        text = para.get("text", "")
        ptype = para.get("type")
        if ptype == "H3":
            lines.append(f"## {text}")
        elif ptype == "H4":
            lines.append(f"### {text}")
        elif ptype == "IMG":
            img_id = _as_dict(para.get("metadata")).get("id")
            if img_id:
                url = f"https://miro.medium.com/v2/resize:fit:1400/{img_id}"
                lines.append(f"![Image]({url})")
            if text:
                lines.append(f"*{text}*")  # image caption
        elif ptype == "CODE":
            lines.append(f"```\n{text}\n```")
        elif ptype == "PQ":  # pull quote
            lines.append(f"> {text}")
        else:
            lines.append(text)
        lines.append("")  # blank line between paragraphs
    return "\n".join(lines)


def _extract_comments(data: Dict[str, Any], article_id: Any) -> list:
    """Collect responses (comments) whose inResponseToPostId matches *article_id*."""
    if not article_id:
        # Without an id, "inResponseToPostId is None" would match every
        # regular post and flood the comment list.
        return []
    comments = []
    for key, value in data.items():
        if not (key.startswith("Post:") and isinstance(value, dict)):
            continue
        if value.get("inResponseToPostId") != article_id:
            continue
        comment_text = ""
        ref = _as_dict(value.get("content")).get("__ref")
        if ref and ref in data:
            paras = _as_dict(_as_dict(data[ref]).get("bodyModel")).get("paragraphs", [])
            comment_text = "\n".join(_as_dict(p).get("text", "") for p in paras)
        comments.append({
            "id": value.get("id"),
            "authorId": _as_dict(value.get("creator")).get("__ref"),
            "text": comment_text,
            "claps": _as_dict(value.get("virtuals")).get("totalClapCount"),
        })
    return comments


def _extract_recommended(data: Dict[str, Any], post_key: str) -> list:
    """Collect other titled, non-response Post entities as recommended reads."""
    recommended = []
    for key, value in data.items():
        if (key.startswith("Post:") and key != post_key
                and isinstance(value, dict)
                and value.get("title") and not value.get("inResponseToPostId")):
            recommended.append({
                "id": value.get("id"),
                "title": value.get("title"),
                "url": f"https://medium.com/p/{value.get('id')}",  # constructed URL
            })
    return recommended
def extract_from_json_ld(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from JSON-LD structured data
    (``<script type="application/ld+json">`` blocks).

    JSON-LD rarely carries the full body: ``markdownContent`` is the
    ``articleBody`` field when present, else a headline + description stub.

    Args:
        html: Full HTML of the page.

    Returns:
        Dict with title, description, author, etc. for the first
        Article/NewsArticle/BlogPosting found, or None.
    """
    article_types = {"Article", "NewsArticle", "BlogPosting"}
    try:
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            raw = script.string
            if not raw:
                # <script> with no text node: json.loads(None) would raise.
                continue
            try:
                payload = json.loads(raw)
            except ValueError:
                continue
            # A single script may hold one object or a list of objects.
            candidates = payload if isinstance(payload, list) else [payload]
            for data in candidates:
                if not isinstance(data, dict):
                    continue
                type_ = data.get("@type")
                if isinstance(type_, list):
                    is_article = any(t in article_types for t in type_)
                else:
                    is_article = type_ in article_types
                if not is_article:
                    continue
                article_data = {
                    "title": data.get("headline") or data.get("name"),
                    "description": data.get("description"),
                    # JSON-LD "author" may be an object, list, or string.
                    "author": {"name": _jsonld_author_name(data.get("author"))},
                    "datePublished": data.get("datePublished"),
                    "image": data.get("image"),
                    "source": "json-ld",
                    "json_state": data,
                }
                if data.get("articleBody"):
                    article_data["markdownContent"] = data["articleBody"]
                else:
                    # Fallback: description is usually all JSON-LD offers.
                    article_data["markdownContent"] = (
                        f"# {article_data['title']}\n\n{article_data['description']}"
                    )
                return article_data
    except Exception:
        # Parse failures degrade to "nothing found".
        pass
    return None


def _jsonld_author_name(author: Any) -> Optional[str]:
    """Normalize the polymorphic JSON-LD 'author' field to a plain name."""
    if isinstance(author, list) and author:
        author = author[0]
    if isinstance(author, dict):
        return author.get("name")
    if isinstance(author, str):
        return author
    return None
def extract_from_graphql_response(response: dict) -> Optional[Dict[str, Any]]:
    """
    Extract article data from a direct GraphQL API response.

    Backs Tier 1.5 (Direct GraphQL API), which queries medium.com/_/graphql
    directly; delegates the heavy lifting to paragraph_parser, which handles
    all 13 paragraph types and 5 markup types.

    Args:
        response: The raw GraphQL API response.

    Returns:
        Dict with title, author, markdownContent, etc., or None on failure.
    """
    try:
        from src.paragraph_parser import (
            parse_graphql_response_to_markdown,
            extract_article_metadata,
        )

        markdown_body, meta = parse_graphql_response_to_markdown(response)

        # Reject empty or trivially short bodies (stubs, parse failures).
        if not markdown_body or len(markdown_body) < 100:
            return None

        # Shape matches what the rest of the pipeline expects.
        result: Dict[str, Any] = {
            "title": meta.get("title", ""),
            "author": meta.get("author", {}),
            "publication": meta.get("publication", ""),
            "markdownContent": markdown_body,
            "source": "graphql_api",
            "json_state": response,
            "firstPublishedAt": meta.get("firstPublishedAt"),
            "readingTime": meta.get("readingTime", 0),
            "mediumUrl": meta.get("mediumUrl", ""),
            "canonicalUrl": meta.get("canonicalUrl", ""),
            "clapCount": meta.get("clapCount", 0),
            "isLocked": meta.get("isLocked", False),
            "tags": meta.get("tags", []),
            "detectedLanguage": meta.get("detectedLanguage", "en"),
        }
        return result
    except ImportError as e:
        print(f"Error importing paragraph_parser: {e}")
        return None
    except Exception as e:
        print(f"Error extracting from GraphQL response: {e}")
        return None