""" Medium GraphQL API Client Direct GraphQL API client for Medium articles using Chrome TLS fingerprinting. Ported from Freedium's medium-parser/api.py """ import hashlib import logging import random import secrets from datetime import datetime from typing import List, Optional try: from curl_cffi.requests import AsyncSession HAS_CURL_CFFI = True except ImportError: HAS_CURL_CFFI = False import httpx logger = logging.getLogger("MediumAPI") def generate_random_sha256_hash() -> str: """Generate a random SHA256 hash for operation ID.""" random_input_bytes = secrets.token_bytes() sha256_hash = hashlib.sha256() sha256_hash.update(random_input_bytes) return sha256_hash.hexdigest() def get_unix_ms() -> int: """Get current Unix timestamp in milliseconds.""" return int(datetime.now().timestamp() * 1000) # Full GraphQL query for fetching complete post data # Source: medium-parser/api.py FULL_POST_QUERY = """query FullPostQuery($postId: ID!, $postMeteringOptions: PostMeteringOptions) { post(id: $postId) { __typename id ...FullPostData } meterPost(postId: $postId, postMeteringOptions: $postMeteringOptions) { __typename ...MeteringInfoData } } fragment UserFollowData on User { id socialStats { followingCount followerCount } viewerEdge { isFollowing } } fragment NewsletterData on NewsletterV3 { id viewerEdge { id isSubscribed } } fragment UserNewsletterData on User { id newsletterV3 { __typename ...NewsletterData } } fragment ImageMetadataData on ImageMetadata { id originalWidth originalHeight focusPercentX focusPercentY alt } fragment CollectionFollowData on Collection { id subscriberCount viewerEdge { isFollowing } } fragment CollectionNewsletterData on Collection { id newsletterV3 { __typename ...NewsletterData } } fragment BylineData on Post { id readingTime creator { __typename id imageId username name bio tippingLink viewerEdge { isUser } ...UserFollowData ...UserNewsletterData } collection { __typename id name avatar { __typename id ...ImageMetadataData } ...CollectionFollowData ...CollectionNewsletterData } isLocked firstPublishedAt latestPublishedVersion } fragment ResponseCountData on Post { postResponses { count } } fragment InResponseToPost on Post { id title creator { name } clapCount responsesCount isLocked } fragment PostVisibilityData on Post { id collection { viewerEdge { isEditor canEditPosts canEditOwnPosts } } creator { id } isLocked visibility } fragment PostMenuData on Post { id title creator { __typename ...UserFollowData } collection { __typename ...CollectionFollowData } } fragment PostMetaData on Post { __typename id title visibility ...ResponseCountData clapCount viewerEdge { clapCount } detectedLanguage mediumUrl readingTime updatedAt isLocked allowResponses isProxyPost latestPublishedVersion isSeries firstPublishedAt previewImage { id } inResponseToPostResult { __typename ...InResponseToPost } inResponseToMediaResource { mediumQuote { startOffset endOffset paragraphs { text type markups { type start end anchorType } } } } inResponseToEntityType canonicalUrl collection { id slug name shortDescription avatar { __typename id ...ImageMetadataData } viewerEdge { isFollowing isEditor canEditPosts canEditOwnPosts isMuting } } creator { id isFollowing name bio imageId mediumMemberAt twitterScreenName viewerEdge { isBlocking isMuting isUser } } previewContent { subtitle } pinnedByCreatorAt ...PostVisibilityData ...PostMenuData } fragment LinkMetadataList on Post { linkMetadataList { url alts { type url } } } fragment MediaResourceData on MediaResource { id iframeSrc thumbnailUrl iframeHeight iframeWidth title } fragment IframeData on Iframe { iframeHeight iframeWidth mediaResource { __typename ...MediaResourceData } } fragment MarkupData on Markup { name type start end href title rel type anchorType userId creatorIds } fragment CatalogSummaryData on Catalog { id name description type visibility predefined responsesLocked creator { id name username imageId bio viewerEdge { isUser } } createdAt version itemsLastInsertedAt postItemsCount } fragment CatalogPreviewData on Catalog { __typename ...CatalogSummaryData id itemsConnection(pagingOptions: { limit: 10 } ) { items { entity { __typename ... on Post { id previewImage { id } } } } paging { count } } } fragment MixtapeMetadataData on MixtapeMetadata { mediaResourceId href thumbnailImageId mediaResource { mediumCatalog { __typename ...CatalogPreviewData } } } fragment ParagraphData on Paragraph { id name href text iframe { __typename ...IframeData } layout markups { __typename ...MarkupData } metadata { __typename ...ImageMetadataData } mixtapeMetadata { __typename ...MixtapeMetadataData } type hasDropCap dropCapImage { __typename ...ImageMetadataData } codeBlockMetadata { lang mode } } fragment QuoteData on Quote { id postId userId startOffset endOffset paragraphs { __typename id ...ParagraphData } quoteType } fragment HighlightsData on Post { id highlights { __typename ...QuoteData } } fragment PostFooterCountData on Post { __typename id clapCount viewerEdge { clapCount } ...ResponseCountData responsesLocked mediumUrl title collection { id viewerEdge { isMuting isFollowing } } creator { id viewerEdge { isMuting isFollowing } } } fragment TagNoViewerEdgeData on Tag { id normalizedTagSlug displayTitle followerCount postCount } fragment VideoMetadataData on VideoMetadata { videoId previewImageId originalWidth originalHeight } fragment SectionData on Section { name startIndex textLayout imageLayout videoLayout backgroundImage { __typename ...ImageMetadataData } backgroundVideo { __typename ...VideoMetadataData } } fragment PostBodyData on RichText { sections { __typename ...SectionData } paragraphs { __typename id ...ParagraphData } } fragment FullPostData on Post { __typename ...BylineData ...PostMetaData ...LinkMetadataList ...HighlightsData ...PostFooterCountData tags { __typename id ...TagNoViewerEdgeData } content(postMeteringOptions: $postMeteringOptions) { bodyModel { __typename ...PostBodyData } validatedShareKey } } fragment MeteringInfoData on MeteringInfo { maxUnlockCount unlocksRemaining postIds }""" class MediumGraphQLApi: """ Direct GraphQL API client for Medium articles. Uses Chrome TLS fingerprinting via curl_cffi to bypass bot detection. Falls back to httpx if curl_cffi is not available. """ __slots__ = ("auth_cookies", "proxy_list", "timeout") def __init__( self, auth_cookies: Optional[str] = None, proxy_list: Optional[List[str]] = None, timeout: int = 3, ): """ Initialize the API client. Args: auth_cookies: Optional Medium authentication cookies proxy_list: Optional list of proxy URLs timeout: Request timeout in seconds """ self.auth_cookies = auth_cookies self.proxy_list = proxy_list self.timeout = timeout def _get_headers(self) -> dict: """Build request headers with spoofed values.""" headers = { "X-APOLLO-OPERATION-ID": generate_random_sha256_hash(), "X-APOLLO-OPERATION-NAME": "FullPostQuery", "Accept": "multipart/mixed; deferSpec=20220824, application/json, application/json", "Accept-Language": "en-US", "X-Obvious-CID": "android", "X-Xsrf-Token": "1", "X-Client-Date": str(get_unix_ms()), # User-Agent mimicking Yandex Mobile Bot (less likely to be blocked) "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0;", "Cache-Control": "public, max-age=-1", "Content-Type": "application/json", "Connection": "Keep-Alive", } if self.auth_cookies is not None: headers["Cookie"] = self.auth_cookies return headers def _get_graphql_payload(self, post_id: str) -> dict: """Build GraphQL request payload.""" return { "operationName": "FullPostQuery", "variables": { "postId": post_id, "postMeteringOptions": {}, }, "query": FULL_POST_QUERY, } async def query_post_by_id(self, post_id: str, retries: int = 2) -> Optional[dict]: """ Query a Medium post by its ID with retry logic. Uses exponential backoff for retries (Freedium-style). Args: post_id: The 8-12 character hexadecimal post ID retries: Number of retry attempts (default: 2) Returns: Dict containing the GraphQL response data, or None if failed """ import asyncio for attempt in range(retries + 1): try: result = await self.query_post_graphql(post_id) # Validate response if result and isinstance(result, dict): if result.get("error"): logger.warning(f"GraphQL error for {post_id}: {result.get('error')}") elif result.get("data", {}).get("post"): logger.debug(f"Successfully queried post {post_id} on attempt {attempt + 1}") return result else: logger.debug(f"No post data in response for {post_id}") except Exception as e: logger.warning(f"Attempt {attempt + 1} failed for {post_id}: {e}") # Exponential backoff before retry if attempt < retries: wait_time = 2 ** attempt logger.debug(f"Retrying in {wait_time}s...") await asyncio.sleep(wait_time) logger.error(f"All {retries + 1} attempts failed for post {post_id}") return None async def query_post_graphql(self, post_id: str) -> Optional[dict]: """ Execute GraphQL query to fetch post data. Uses curl_cffi with Chrome impersonation if available, falls back to httpx otherwise. """ logger.debug(f"Starting GraphQL request for post {post_id}") proxy = None if self.proxy_list: proxy = random.choice(self.proxy_list) logger.debug(f"Using proxy: {proxy}") headers = self._get_headers() graphql_data = self._get_graphql_payload(post_id) logger.debug("Request started...") if HAS_CURL_CFFI: return await self._query_with_curl_cffi(post_id, headers, graphql_data, proxy) else: return await self._query_with_httpx(post_id, headers, graphql_data, proxy) async def _query_with_curl_cffi( self, post_id: str, headers: dict, graphql_data: dict, proxy: Optional[str] ) -> Optional[dict]: """Execute query using curl_cffi with Chrome TLS fingerprinting.""" try: async with AsyncSession() as session: response = await session.post( "https://medium.com/_/graphql", headers=headers, json=graphql_data, proxies={"http": proxy, "https": proxy} if proxy else None, timeout=self.timeout, impersonate="chrome110", ) if response.status_code != 200: logger.error( f"Failed to fetch post {post_id}: status={response.status_code}" ) return None logger.debug("Request completed successfully") return response.json() except Exception as ex: logger.error(f"curl_cffi request failed for post {post_id}: {ex}") raise async def _query_with_httpx( self, post_id: str, headers: dict, graphql_data: dict, proxy: Optional[str] ) -> Optional[dict]: """Execute query using httpx (fallback without TLS fingerprinting).""" logger.warning("curl_cffi not available, using httpx (no TLS fingerprinting)") try: async with httpx.AsyncClient( proxies=proxy, timeout=self.timeout, follow_redirects=True ) as client: response = await client.post( "https://medium.com/_/graphql", headers=headers, json=graphql_data, ) if response.status_code != 200: logger.error( f"Failed to fetch post {post_id}: status={response.status_code}" ) return None logger.debug("Request completed successfully") return response.json() except Exception as ex: logger.error(f"httpx request failed for post {post_id}: {ex}") raise