# Medium-MCP / src/extractor.py
# Author: Nikhil Pravin Pise
# feat: comprehensive migration - merge Scraper + MCP Server (commit ae588db)
import json
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup
def extract_from_apollo_state(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from ``window.__APOLLO_STATE__``.

    This is the "Gold Mine" - the raw JSON state used by Medium's React app.
    The state is a flat map of ``"TypeName:id"`` keys to entity objects;
    entities reference each other through ``{"__ref": "<key>"}`` wrappers.

    Args:
        html: Full HTML of a Medium article page.

    Returns:
        Dict with title, author, markdownContent, comments, recommended, etc.,
        or None if the state blob is absent or unusable.
    """
    try:
        match = re.search(r'window\.__APOLLO_STATE__\s*=\s*', html)
        if not match:
            return None

        # raw_decode parses exactly one JSON value starting at match.end(),
        # which sidesteps regex issues with nested braces / trailing ";".
        try:
            data, _ = json.JSONDecoder().raw_decode(html[match.end():])
        except ValueError as e:
            print(f"JSON Decode Error: {e}")
            return None
        if not isinstance(data, dict):
            return None

        post_key = _find_main_post_key(data)
        if not post_key:
            return None
        post = data[post_key]

        article_data: Dict[str, Any] = {
            "title": post.get("title"),
            "id": post.get("id"),
            "firstPublishedAt": post.get("firstPublishedAt"),
            "readingTime": post.get("readingTime"),
        }

        author = _extract_author(post, data)
        if author is not None:
            article_data["author"] = author

        paragraphs = _resolve_paragraphs(post, data)
        article_data["markdownContent"] = _paragraphs_to_markdown(
            article_data["title"], article_data.get("author"), paragraphs
        )
        article_data["source"] = "apollo"
        article_data["json_state"] = post  # raw post entity for downstream use

        # Phase 2: deep graph extraction (responses + other posts in state).
        article_data["comments"] = _extract_comments(data, article_data["id"])[:10]
        article_data["recommended"] = _extract_recommended(data, post_key)[:5]
        return article_data
    except Exception as e:
        # Defensive: a malformed state should degrade to "not found", not crash.
        print(f"Error extracting Apollo state: {e}")
        return None


def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* if it is a dict, else {} (defensive nested access)."""
    return value if isinstance(value, dict) else {}


def _find_main_post_key(data: Dict[str, Any]) -> Optional[str]:
    """Locate the main Post entity: the first one with both title and content."""
    for key, value in data.items():
        if (key.startswith("Post:") and isinstance(value, dict)
                and value.get("title") and value.get("content")):
            return key
    return None


def _extract_author(post: Dict[str, Any], data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Resolve the post's creator ref into an author dict, or None."""
    ref = _as_dict(post.get("creator")).get("__ref")
    if not ref or ref not in data:
        return None
    creator = _as_dict(data[ref])
    return {
        "name": creator.get("name"),
        "username": creator.get("username"),
        "bio": creator.get("bio"),
        "id": creator.get("id"),
        "followerCount": _as_dict(creator.get("socialStats")).get("followerCount"),
        "imageId": creator.get("imageId"),
    }


def _resolve_paragraphs(post: Dict[str, Any], data: Dict[str, Any]) -> list:
    """Return the post's paragraph list, following a content ``__ref`` if present."""
    content = post.get("content")
    ref = _as_dict(content).get("__ref") or content
    if isinstance(ref, str) and ref in data:
        body_model = _as_dict(_as_dict(data[ref]).get("bodyModel"))
        return body_model.get("paragraphs", []) or []
    if isinstance(content, dict):
        # Content stored inline rather than as a ref.
        return _as_dict(content.get("bodyModel")).get("paragraphs", []) or []
    return []


def _paragraphs_to_markdown(title: Any, author: Optional[Dict[str, Any]], paragraphs: list) -> str:
    """Rebuild a Markdown document from Medium paragraph entities.

    Known types: P (paragraph), H3/H4 (headers), IMG (image), CODE (code
    block), PQ (pull quote). Unknown types fall through as plain text.
    """
    lines = [f"# {title}"]
    if author:
        lines.append(f"**By {author['name']}**")
    lines.append("")
    for para in paragraphs:
        if not isinstance(para, dict):
            continue
        text = para.get("text", "")
        ptype = para.get("type")
        if ptype == "H3":
            lines.append(f"## {text}")
        elif ptype == "H4":
            lines.append(f"### {text}")
        elif ptype == "IMG":
            img_id = _as_dict(para.get("metadata")).get("id")
            if img_id:
                url = f"https://miro.medium.com/v2/resize:fit:1400/{img_id}"
                lines.append(f"![Image]({url})")
            if text:
                lines.append(f"*{text}*")  # image caption
        elif ptype == "CODE":
            lines.append(f"```\n{text}\n```")
        elif ptype == "PQ":  # pull quote
            lines.append(f"> {text}")
        else:
            lines.append(text)
        lines.append("")  # blank line between paragraphs
    return "\n".join(lines)


def _extract_comments(data: Dict[str, Any], article_id: Any) -> list:
    """Collect responses (comments) whose inResponseToPostId matches *article_id*."""
    if not article_id:
        # Without an id, "inResponseToPostId is None" would match every
        # regular post and flood the comment list.
        return []
    comments = []
    for key, value in data.items():
        if not (key.startswith("Post:") and isinstance(value, dict)):
            continue
        if value.get("inResponseToPostId") != article_id:
            continue
        comment_text = ""
        ref = _as_dict(value.get("content")).get("__ref")
        if ref and ref in data:
            paras = _as_dict(_as_dict(data[ref]).get("bodyModel")).get("paragraphs", [])
            comment_text = "\n".join(_as_dict(p).get("text", "") for p in paras)
        comments.append({
            "id": value.get("id"),
            "authorId": _as_dict(value.get("creator")).get("__ref"),
            "text": comment_text,
            "claps": _as_dict(value.get("virtuals")).get("totalClapCount"),
        })
    return comments


def _extract_recommended(data: Dict[str, Any], post_key: str) -> list:
    """Collect other titled, non-response Post entities as recommended reads."""
    recommended = []
    for key, value in data.items():
        if (key.startswith("Post:") and key != post_key
                and isinstance(value, dict)
                and value.get("title") and not value.get("inResponseToPostId")):
            recommended.append({
                "id": value.get("id"),
                "title": value.get("title"),
                "url": f"https://medium.com/p/{value.get('id')}",  # constructed URL
            })
    return recommended
def extract_from_json_ld(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from JSON-LD structured data
    (``<script type="application/ld+json">`` blocks).

    JSON-LD rarely carries the full body: ``markdownContent`` is the
    ``articleBody`` field when present, else a headline + description stub.

    Args:
        html: Full HTML of the page.

    Returns:
        Dict with title, description, author, etc. for the first
        Article/NewsArticle/BlogPosting found, or None.
    """
    article_types = {"Article", "NewsArticle", "BlogPosting"}
    try:
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            raw = script.string
            if not raw:
                # <script> with no text node: json.loads(None) would raise.
                continue
            try:
                payload = json.loads(raw)
            except ValueError:
                continue
            # A single script may hold one object or a list of objects.
            candidates = payload if isinstance(payload, list) else [payload]
            for data in candidates:
                if not isinstance(data, dict):
                    continue
                type_ = data.get("@type")
                if isinstance(type_, list):
                    is_article = any(t in article_types for t in type_)
                else:
                    is_article = type_ in article_types
                if not is_article:
                    continue
                article_data = {
                    "title": data.get("headline") or data.get("name"),
                    "description": data.get("description"),
                    # JSON-LD "author" may be an object, list, or string.
                    "author": {"name": _jsonld_author_name(data.get("author"))},
                    "datePublished": data.get("datePublished"),
                    "image": data.get("image"),
                    "source": "json-ld",
                    "json_state": data,
                }
                if data.get("articleBody"):
                    article_data["markdownContent"] = data["articleBody"]
                else:
                    # Fallback: description is usually all JSON-LD offers.
                    article_data["markdownContent"] = (
                        f"# {article_data['title']}\n\n{article_data['description']}"
                    )
                return article_data
    except Exception:
        # Parse failures degrade to "nothing found".
        pass
    return None


def _jsonld_author_name(author: Any) -> Optional[str]:
    """Normalize the polymorphic JSON-LD 'author' field to a plain name."""
    if isinstance(author, list) and author:
        author = author[0]
    if isinstance(author, dict):
        return author.get("name")
    if isinstance(author, str):
        return author
    return None
def extract_from_graphql_response(response: dict) -> Optional[Dict[str, Any]]:
    """
    Extract article data from a direct GraphQL API response.

    Backs Tier 1.5 (Direct GraphQL API), which queries medium.com/_/graphql
    directly; delegates the heavy lifting to paragraph_parser, which handles
    all 13 paragraph types and 5 markup types.

    Args:
        response: The raw GraphQL API response.

    Returns:
        Dict with title, author, markdownContent, etc., or None on failure.
    """
    try:
        from src.paragraph_parser import (
            parse_graphql_response_to_markdown,
            extract_article_metadata,
        )

        markdown_body, meta = parse_graphql_response_to_markdown(response)

        # Reject empty or trivially short bodies (stubs, parse failures).
        if not markdown_body or len(markdown_body) < 100:
            return None

        # Shape matches what the rest of the pipeline expects.
        result: Dict[str, Any] = {
            "title": meta.get("title", ""),
            "author": meta.get("author", {}),
            "publication": meta.get("publication", ""),
            "markdownContent": markdown_body,
            "source": "graphql_api",
            "json_state": response,
            "firstPublishedAt": meta.get("firstPublishedAt"),
            "readingTime": meta.get("readingTime", 0),
            "mediumUrl": meta.get("mediumUrl", ""),
            "canonicalUrl": meta.get("canonicalUrl", ""),
            "clapCount": meta.get("clapCount", 0),
            "isLocked": meta.get("isLocked", False),
            "tags": meta.get("tags", []),
            "detectedLanguage": meta.get("detectedLanguage", "en"),
        }
        return result
    except ImportError as e:
        print(f"Error importing paragraph_parser: {e}")
        return None
    except Exception as e:
        print(f"Error extracting from GraphQL response: {e}")
        return None