# Medium-MCP / src/medium_api.py
# Nikhil Pravin Pise
# feat: comprehensive migration - merge Scraper + MCP Server
# ae588db
"""
Medium GraphQL API Client
Direct GraphQL API client for Medium articles using Chrome TLS fingerprinting.
Ported from Freedium's medium-parser/api.py
"""
import hashlib
import logging
import random
import secrets
from datetime import datetime
from typing import List, Optional
try:
from curl_cffi.requests import AsyncSession
HAS_CURL_CFFI = True
except ImportError:
HAS_CURL_CFFI = False
import httpx
logger = logging.getLogger("MediumAPI")
def generate_random_sha256_hash() -> str:
    """Return the hex SHA-256 digest of freshly drawn random bytes.

    The result is a 64-character lowercase hexadecimal string; every call
    hashes a new random input, so successive calls yield different values.
    """
    # token_bytes() without an argument draws the secrets module's default
    # number of random bytes.
    return hashlib.sha256(secrets.token_bytes()).hexdigest()
def get_unix_ms() -> int:
    """Return the current time as whole milliseconds since the Unix epoch."""
    seconds_since_epoch = datetime.now().timestamp()
    # Scale to milliseconds, then truncate the fractional part.
    return int(seconds_since_epoch * 1000)
# Full GraphQL query (operation "FullPostQuery") that fetches a complete post
# together with its metering info.
# Source: Freedium's medium-parser/api.py
FULL_POST_QUERY = """query FullPostQuery($postId: ID!, $postMeteringOptions: PostMeteringOptions) { post(id: $postId) { __typename id ...FullPostData } meterPost(postId: $postId, postMeteringOptions: $postMeteringOptions) { __typename ...MeteringInfoData } } fragment UserFollowData on User { id socialStats { followingCount followerCount } viewerEdge { isFollowing } } fragment NewsletterData on NewsletterV3 { id viewerEdge { id isSubscribed } } fragment UserNewsletterData on User { id newsletterV3 { __typename ...NewsletterData } } fragment ImageMetadataData on ImageMetadata { id originalWidth originalHeight focusPercentX focusPercentY alt } fragment CollectionFollowData on Collection { id subscriberCount viewerEdge { isFollowing } } fragment CollectionNewsletterData on Collection { id newsletterV3 { __typename ...NewsletterData } } fragment BylineData on Post { id readingTime creator { __typename id imageId username name bio tippingLink viewerEdge { isUser } ...UserFollowData ...UserNewsletterData } collection { __typename id name avatar { __typename id ...ImageMetadataData } ...CollectionFollowData ...CollectionNewsletterData } isLocked firstPublishedAt latestPublishedVersion } fragment ResponseCountData on Post { postResponses { count } } fragment InResponseToPost on Post { id title creator { name } clapCount responsesCount isLocked } fragment PostVisibilityData on Post { id collection { viewerEdge { isEditor canEditPosts canEditOwnPosts } } creator { id } isLocked visibility } fragment PostMenuData on Post { id title creator { __typename ...UserFollowData } collection { __typename ...CollectionFollowData } } fragment PostMetaData on Post { __typename id title visibility ...ResponseCountData clapCount viewerEdge { clapCount } detectedLanguage mediumUrl readingTime updatedAt isLocked allowResponses isProxyPost latestPublishedVersion isSeries firstPublishedAt previewImage { id } inResponseToPostResult { __typename ...InResponseToPost } inResponseToMediaResource { 
mediumQuote { startOffset endOffset paragraphs { text type markups { type start end anchorType } } } } inResponseToEntityType canonicalUrl collection { id slug name shortDescription avatar { __typename id ...ImageMetadataData } viewerEdge { isFollowing isEditor canEditPosts canEditOwnPosts isMuting } } creator { id isFollowing name bio imageId mediumMemberAt twitterScreenName viewerEdge { isBlocking isMuting isUser } } previewContent { subtitle } pinnedByCreatorAt ...PostVisibilityData ...PostMenuData } fragment LinkMetadataList on Post { linkMetadataList { url alts { type url } } } fragment MediaResourceData on MediaResource { id iframeSrc thumbnailUrl iframeHeight iframeWidth title } fragment IframeData on Iframe { iframeHeight iframeWidth mediaResource { __typename ...MediaResourceData } } fragment MarkupData on Markup { name type start end href title rel type anchorType userId creatorIds } fragment CatalogSummaryData on Catalog { id name description type visibility predefined responsesLocked creator { id name username imageId bio viewerEdge { isUser } } createdAt version itemsLastInsertedAt postItemsCount } fragment CatalogPreviewData on Catalog { __typename ...CatalogSummaryData id itemsConnection(pagingOptions: { limit: 10 } ) { items { entity { __typename ... 
on Post { id previewImage { id } } } } paging { count } } } fragment MixtapeMetadataData on MixtapeMetadata { mediaResourceId href thumbnailImageId mediaResource { mediumCatalog { __typename ...CatalogPreviewData } } } fragment ParagraphData on Paragraph { id name href text iframe { __typename ...IframeData } layout markups { __typename ...MarkupData } metadata { __typename ...ImageMetadataData } mixtapeMetadata { __typename ...MixtapeMetadataData } type hasDropCap dropCapImage { __typename ...ImageMetadataData } codeBlockMetadata { lang mode } } fragment QuoteData on Quote { id postId userId startOffset endOffset paragraphs { __typename id ...ParagraphData } quoteType } fragment HighlightsData on Post { id highlights { __typename ...QuoteData } } fragment PostFooterCountData on Post { __typename id clapCount viewerEdge { clapCount } ...ResponseCountData responsesLocked mediumUrl title collection { id viewerEdge { isMuting isFollowing } } creator { id viewerEdge { isMuting isFollowing } } } fragment TagNoViewerEdgeData on Tag { id normalizedTagSlug displayTitle followerCount postCount } fragment VideoMetadataData on VideoMetadata { videoId previewImageId originalWidth originalHeight } fragment SectionData on Section { name startIndex textLayout imageLayout videoLayout backgroundImage { __typename ...ImageMetadataData } backgroundVideo { __typename ...VideoMetadataData } } fragment PostBodyData on RichText { sections { __typename ...SectionData } paragraphs { __typename id ...ParagraphData } } fragment FullPostData on Post { __typename ...BylineData ...PostMetaData ...LinkMetadataList ...HighlightsData ...PostFooterCountData tags { __typename id ...TagNoViewerEdgeData } content(postMeteringOptions: $postMeteringOptions) { bodyModel { __typename ...PostBodyData } validatedShareKey } } fragment MeteringInfoData on MeteringInfo { maxUnlockCount unlocksRemaining postIds }"""
class MediumGraphQLApi:
    """
    Direct GraphQL API client for Medium articles.

    Uses Chrome TLS fingerprinting via curl_cffi to bypass bot detection.
    Falls back to httpx if curl_cffi is not available.

    Attributes:
        auth_cookies: Value sent verbatim as the ``Cookie`` header, or None.
        proxy_list: Proxy URLs; one is chosen at random for each request.
        timeout: Request timeout in seconds, passed to the HTTP client.
    """

    __slots__ = ("auth_cookies", "proxy_list", "timeout")

    # Endpoint used by both transports.
    GRAPHQL_URL = "https://medium.com/_/graphql"

    def __init__(
        self,
        auth_cookies: Optional[str] = None,
        proxy_list: Optional[List[str]] = None,
        timeout: int = 3,
    ):
        """
        Initialize the API client.

        Args:
            auth_cookies: Optional Medium authentication cookies
            proxy_list: Optional list of proxy URLs
            timeout: Request timeout in seconds
        """
        self.auth_cookies = auth_cookies
        self.proxy_list = proxy_list
        self.timeout = timeout

    def _get_headers(self) -> dict:
        """
        Build the request headers for one GraphQL call.

        A fresh operation ID and client timestamp are generated on every
        call. The ``Cookie`` header is added only when ``auth_cookies`` is
        not None.
        """
        headers = {
            "X-APOLLO-OPERATION-ID": generate_random_sha256_hash(),
            "X-APOLLO-OPERATION-NAME": "FullPostQuery",
            "Accept": "multipart/mixed; deferSpec=20220824, application/json, application/json",
            "Accept-Language": "en-US",
            "X-Obvious-CID": "android",
            "X-Xsrf-Token": "1",
            "X-Client-Date": str(get_unix_ms()),
            # User-Agent mimicking Yandex Mobile Bot (less likely to be blocked).
            # NOTE(review): the final parenthesis is unbalanced; the string is
            # kept verbatim because it is sent on the wire -- confirm against
            # the upstream source before changing it.
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0;",
            "Cache-Control": "public, max-age=-1",
            "Content-Type": "application/json",
            "Connection": "Keep-Alive",
        }
        if self.auth_cookies is not None:
            headers["Cookie"] = self.auth_cookies
        return headers

    def _get_graphql_payload(self, post_id: str) -> dict:
        """Build the JSON body for the ``FullPostQuery`` operation on ``post_id``."""
        return {
            "operationName": "FullPostQuery",
            "variables": {
                "postId": post_id,
                "postMeteringOptions": {},
            },
            "query": FULL_POST_QUERY,
        }

    @staticmethod
    def _httpx_proxy_kwargs(client_factory, proxy: Optional[str]) -> dict:
        """
        Return the keyword argument that routes ``client_factory`` via ``proxy``.

        httpx renamed the client's ``proxies`` argument to ``proxy`` and later
        removed the old name, so the keyword is chosen from the callable's
        actual signature. An empty dict is returned when no proxy is
        requested, so no proxy keyword is passed at all.

        Args:
            client_factory: Callable whose signature is inspected
                (``httpx.AsyncClient`` in production).
            proxy: Proxy URL, or None/empty for a direct connection.

        Returns:
            ``{}``, ``{"proxy": proxy}`` or ``{"proxies": proxy}``.
        """
        import inspect

        if not proxy:
            return {}
        try:
            accepted = inspect.signature(client_factory).parameters
        except (TypeError, ValueError):
            # Signature not introspectable: keep the historical keyword.
            return {"proxies": proxy}
        return {"proxy" if "proxy" in accepted else "proxies": proxy}

    @staticmethod
    def _read_json_response(post_id: str, response) -> Optional[dict]:
        """
        Turn an HTTP response into the decoded JSON body.

        Returns None (after logging an error) for any status other than 200.
        Exceptions raised by ``response.json()`` propagate to the caller.
        """
        if response.status_code != 200:
            logger.error(
                "Failed to fetch post %s: status=%s", post_id, response.status_code
            )
            return None
        logger.debug("Request completed successfully")
        return response.json()

    async def query_post_by_id(self, post_id: str, retries: int = 2) -> Optional[dict]:
        """
        Query a Medium post by its ID with retry logic.

        Uses exponential backoff for retries (Freedium-style): the wait
        before retry ``k`` (0-based) is ``2 ** k`` seconds.

        Args:
            post_id: The 8-12 character hexadecimal post ID
            retries: Number of retry attempts (default: 2)

        Returns:
            Dict containing the GraphQL response data, or None if failed
        """
        import asyncio

        for attempt in range(retries + 1):
            try:
                result = await self.query_post_graphql(post_id)
                # Validate response
                if result and isinstance(result, dict):
                    if result.get("error"):
                        logger.warning(
                            "GraphQL error for %s: %s", post_id, result.get("error")
                        )
                    else:
                        # "data" can be present but null (or not an object);
                        # treat that as "no post" instead of letting .get()
                        # raise on a non-dict value.
                        data = result.get("data")
                        post = data.get("post") if isinstance(data, dict) else None
                        if post:
                            logger.debug(
                                "Successfully queried post %s on attempt %s",
                                post_id,
                                attempt + 1,
                            )
                            return result
                        logger.debug("No post data in response for %s", post_id)
            except Exception as e:
                # Any transport or decoding failure counts as a failed attempt
                # and is retried below.
                logger.warning("Attempt %s failed for %s: %s", attempt + 1, post_id, e)
            # Exponential backoff before retry
            if attempt < retries:
                wait_time = 2 ** attempt
                logger.debug("Retrying in %ss...", wait_time)
                await asyncio.sleep(wait_time)
        logger.error("All %s attempts failed for post %s", retries + 1, post_id)
        return None

    async def query_post_graphql(self, post_id: str) -> Optional[dict]:
        """
        Execute GraphQL query to fetch post data.

        Uses curl_cffi with Chrome impersonation if available,
        falls back to httpx otherwise. When ``proxy_list`` is non-empty a
        proxy is picked at random for this request.

        Returns:
            The decoded JSON response, or None on a non-200 status.

        Raises:
            Exception: whatever the underlying HTTP client raises.
        """
        logger.debug("Starting GraphQL request for post %s", post_id)
        proxy = None
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            logger.debug("Using proxy: %s", proxy)
        headers = self._get_headers()
        graphql_data = self._get_graphql_payload(post_id)
        logger.debug("Request started...")
        if HAS_CURL_CFFI:
            return await self._query_with_curl_cffi(post_id, headers, graphql_data, proxy)
        return await self._query_with_httpx(post_id, headers, graphql_data, proxy)

    async def _query_with_curl_cffi(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """
        Execute query using curl_cffi with Chrome TLS fingerprinting.

        Returns the decoded JSON body, or None on a non-200 status. Any
        exception is logged and re-raised.
        """
        try:
            async with AsyncSession() as session:
                response = await session.post(
                    self.GRAPHQL_URL,
                    headers=headers,
                    json=graphql_data,
                    proxies={"http": proxy, "https": proxy} if proxy else None,
                    timeout=self.timeout,
                    impersonate="chrome110",
                )
                return self._read_json_response(post_id, response)
        except Exception as ex:
            logger.error("curl_cffi request failed for post %s: %s", post_id, ex)
            raise

    async def _query_with_httpx(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """
        Execute query using httpx (fallback without TLS fingerprinting).

        Returns the decoded JSON body, or None on a non-200 status. Any
        exception is logged and re-raised.
        """
        logger.warning("curl_cffi not available, using httpx (no TLS fingerprinting)")
        try:
            client_kwargs = {"timeout": self.timeout, "follow_redirects": True}
            # Only pass a proxy keyword when a proxy is set, and use the name
            # the installed httpx accepts ("proxy" on newer releases,
            # "proxies" on older ones).
            client_kwargs.update(self._httpx_proxy_kwargs(httpx.AsyncClient, proxy))
            async with httpx.AsyncClient(**client_kwargs) as client:
                response = await client.post(
                    self.GRAPHQL_URL,
                    headers=headers,
                    json=graphql_data,
                )
                return self._read_json_response(post_id, response)
        except Exception as ex:
            logger.error("httpx request failed for post %s: %s", post_id, ex)
            raise