Spaces:
Sleeping
Sleeping
| """ | |
| Medium GraphQL API Client | |
| Direct GraphQL API client for Medium articles using Chrome TLS fingerprinting. | |
| Ported from Freedium's medium-parser/api.py | |
| """ | |
| import hashlib | |
| import logging | |
| import random | |
| import secrets | |
| from datetime import datetime | |
| from typing import List, Optional | |
| try: | |
| from curl_cffi.requests import AsyncSession | |
| HAS_CURL_CFFI = True | |
| except ImportError: | |
| HAS_CURL_CFFI = False | |
| import httpx | |
| logger = logging.getLogger("MediumAPI") | |
def generate_random_sha256_hash() -> str:
    """Return a random SHA-256 hex digest for use as an operation ID.

    Fresh bytes are drawn from ``secrets.token_bytes()`` (library default
    length) and hashed, so every call yields a new 64-character lowercase
    hexadecimal string.
    """
    entropy = secrets.token_bytes()
    return hashlib.sha256(entropy).hexdigest()
def get_unix_ms() -> int:
    """Return the current time as whole milliseconds since the Unix epoch."""
    epoch_seconds = datetime.now().timestamp()
    epoch_millis = epoch_seconds * 1000
    # int() truncates the fractional millisecond.
    return int(epoch_millis)
# FULL_POST_QUERY is the GraphQL document placed in the "query" field of the
# request payload. It declares one operation, FullPostQuery, with two
# variables: $postId (ID!) and $postMeteringOptions (PostMeteringOptions).
# The operation selects two root fields:
#   * post(id: $postId)      -> expanded through the FullPostData fragment
#   * meterPost(postId: ...) -> expanded through the MeteringInfoData fragment
# Every fragment referenced by those selections is defined further along in
# this same string, so the document is self-contained.
| # Full GraphQL query for fetching complete post data | |
| # Source: medium-parser/api.py | |
| FULL_POST_QUERY = """query FullPostQuery($postId: ID!, $postMeteringOptions: PostMeteringOptions) { post(id: $postId) { __typename id ...FullPostData } meterPost(postId: $postId, postMeteringOptions: $postMeteringOptions) { __typename ...MeteringInfoData } } fragment UserFollowData on User { id socialStats { followingCount followerCount } viewerEdge { isFollowing } } fragment NewsletterData on NewsletterV3 { id viewerEdge { id isSubscribed } } fragment UserNewsletterData on User { id newsletterV3 { __typename ...NewsletterData } } fragment ImageMetadataData on ImageMetadata { id originalWidth originalHeight focusPercentX focusPercentY alt } fragment CollectionFollowData on Collection { id subscriberCount viewerEdge { isFollowing } } fragment CollectionNewsletterData on Collection { id newsletterV3 { __typename ...NewsletterData } } fragment BylineData on Post { id readingTime creator { __typename id imageId username name bio tippingLink viewerEdge { isUser } ...UserFollowData ...UserNewsletterData } collection { __typename id name avatar { __typename id ...ImageMetadataData } ...CollectionFollowData ...CollectionNewsletterData } isLocked firstPublishedAt latestPublishedVersion } fragment ResponseCountData on Post { postResponses { count } } fragment InResponseToPost on Post { id title creator { name } clapCount responsesCount isLocked } fragment PostVisibilityData on Post { id collection { viewerEdge { isEditor canEditPosts canEditOwnPosts } } creator { id } isLocked visibility } fragment PostMenuData on Post { id title creator { __typename ...UserFollowData } collection { __typename ...CollectionFollowData } } fragment PostMetaData on Post { __typename id title visibility ...ResponseCountData clapCount viewerEdge { clapCount } detectedLanguage mediumUrl readingTime updatedAt isLocked allowResponses isProxyPost latestPublishedVersion isSeries firstPublishedAt previewImage { id } inResponseToPostResult { __typename ...InResponseToPost } inResponseToMediaResource { 
mediumQuote { startOffset endOffset paragraphs { text type markups { type start end anchorType } } } } inResponseToEntityType canonicalUrl collection { id slug name shortDescription avatar { __typename id ...ImageMetadataData } viewerEdge { isFollowing isEditor canEditPosts canEditOwnPosts isMuting } } creator { id isFollowing name bio imageId mediumMemberAt twitterScreenName viewerEdge { isBlocking isMuting isUser } } previewContent { subtitle } pinnedByCreatorAt ...PostVisibilityData ...PostMenuData } fragment LinkMetadataList on Post { linkMetadataList { url alts { type url } } } fragment MediaResourceData on MediaResource { id iframeSrc thumbnailUrl iframeHeight iframeWidth title } fragment IframeData on Iframe { iframeHeight iframeWidth mediaResource { __typename ...MediaResourceData } } fragment MarkupData on Markup { name type start end href title rel type anchorType userId creatorIds } fragment CatalogSummaryData on Catalog { id name description type visibility predefined responsesLocked creator { id name username imageId bio viewerEdge { isUser } } createdAt version itemsLastInsertedAt postItemsCount } fragment CatalogPreviewData on Catalog { __typename ...CatalogSummaryData id itemsConnection(pagingOptions: { limit: 10 } ) { items { entity { __typename ... 
on Post { id previewImage { id } } } } paging { count } } } fragment MixtapeMetadataData on MixtapeMetadata { mediaResourceId href thumbnailImageId mediaResource { mediumCatalog { __typename ...CatalogPreviewData } } } fragment ParagraphData on Paragraph { id name href text iframe { __typename ...IframeData } layout markups { __typename ...MarkupData } metadata { __typename ...ImageMetadataData } mixtapeMetadata { __typename ...MixtapeMetadataData } type hasDropCap dropCapImage { __typename ...ImageMetadataData } codeBlockMetadata { lang mode } } fragment QuoteData on Quote { id postId userId startOffset endOffset paragraphs { __typename id ...ParagraphData } quoteType } fragment HighlightsData on Post { id highlights { __typename ...QuoteData } } fragment PostFooterCountData on Post { __typename id clapCount viewerEdge { clapCount } ...ResponseCountData responsesLocked mediumUrl title collection { id viewerEdge { isMuting isFollowing } } creator { id viewerEdge { isMuting isFollowing } } } fragment TagNoViewerEdgeData on Tag { id normalizedTagSlug displayTitle followerCount postCount } fragment VideoMetadataData on VideoMetadata { videoId previewImageId originalWidth originalHeight } fragment SectionData on Section { name startIndex textLayout imageLayout videoLayout backgroundImage { __typename ...ImageMetadataData } backgroundVideo { __typename ...VideoMetadataData } } fragment PostBodyData on RichText { sections { __typename ...SectionData } paragraphs { __typename id ...ParagraphData } } fragment FullPostData on Post { __typename ...BylineData ...PostMetaData ...LinkMetadataList ...HighlightsData ...PostFooterCountData tags { __typename id ...TagNoViewerEdgeData } content(postMeteringOptions: $postMeteringOptions) { bodyModel { __typename ...PostBodyData } validatedShareKey } } fragment MeteringInfoData on MeteringInfo { maxUnlockCount unlocksRemaining postIds }""" | |
class MediumGraphQLApi:
    """Direct GraphQL API client for Medium articles.

    Uses Chrome TLS fingerprinting via curl_cffi to bypass bot detection and
    falls back to httpx if curl_cffi is not available (``HAS_CURL_CFFI``).

    Attributes (declared through ``__slots__``):
        auth_cookies: Value sent verbatim as the ``Cookie`` header, or None.
        proxy_list: Proxy URLs; one is chosen at random for each request.
        timeout: Request timeout in seconds.
    """

    __slots__ = ("auth_cookies", "proxy_list", "timeout")

    # Endpoint used by both transports.
    _GRAPHQL_URL = "https://medium.com/_/graphql"

    def __init__(
        self,
        auth_cookies: Optional[str] = None,
        proxy_list: Optional[List[str]] = None,
        timeout: int = 3,
    ):
        """Initialize the API client.

        Args:
            auth_cookies: Optional Medium authentication cookies.
            proxy_list: Optional list of proxy URLs.
            timeout: Request timeout in seconds.
        """
        self.auth_cookies = auth_cookies
        self.proxy_list = proxy_list
        self.timeout = timeout

    def _get_headers(self) -> dict:
        """Build request headers with spoofed values.

        A fresh operation ID and client timestamp are generated on every
        call. The ``Cookie`` header is added only when ``auth_cookies`` is
        not None.
        """
        headers = {
            "X-APOLLO-OPERATION-ID": generate_random_sha256_hash(),
            "X-APOLLO-OPERATION-NAME": "FullPostQuery",
            "Accept": "multipart/mixed; deferSpec=20220824, application/json, application/json",
            "Accept-Language": "en-US",
            "X-Obvious-CID": "android",
            "X-Xsrf-Token": "1",
            "X-Client-Date": str(get_unix_ms()),
            # User-Agent mimicking Yandex Mobile Bot (less likely to be blocked)
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0;",
            "Cache-Control": "public, max-age=-1",
            "Content-Type": "application/json",
            "Connection": "Keep-Alive",
        }
        if self.auth_cookies is not None:
            headers["Cookie"] = self.auth_cookies
        return headers

    def _get_graphql_payload(self, post_id: str) -> dict:
        """Build the GraphQL request payload for ``post_id``."""
        return {
            "operationName": "FullPostQuery",
            "variables": {
                "postId": post_id,
                "postMeteringOptions": {},
            },
            "query": FULL_POST_QUERY,
        }

    async def query_post_by_id(self, post_id: str, retries: int = 2) -> Optional[dict]:
        """Query a Medium post by its ID with retry logic.

        Uses exponential backoff between attempts (1s, 2s, 4s, ...). An
        attempt counts as failed when the request raises, when the response
        carries a truthy top-level ``error``, or when ``data.post`` is
        missing or empty (including ``"data": null``).

        Args:
            post_id: The 8-12 character hexadecimal post ID.
            retries: Number of retry attempts after the first try
                (default: 2). Negative values are treated as 0, so at least
                one attempt is always made.

        Returns:
            Dict containing the GraphQL response, or None if every attempt
            failed.
        """
        # Local import: asyncio is not part of the module's import block.
        import asyncio

        total_attempts = max(retries, 0) + 1
        for attempt in range(total_attempts):
            try:
                result = await self.query_post_graphql(post_id)
            except Exception as e:
                logger.warning("Attempt %d failed for %s: %s", attempt + 1, post_id, e)
            else:
                if result and isinstance(result, dict):
                    if result.get("error"):
                        logger.warning(
                            "GraphQL error for %s: %s", post_id, result.get("error")
                        )
                    else:
                        # "data" may be present but null; guard before .get().
                        data = result.get("data")
                        if isinstance(data, dict) and data.get("post"):
                            logger.debug(
                                "Successfully queried post %s on attempt %d",
                                post_id,
                                attempt + 1,
                            )
                            return result
                        if result.get("errors"):
                            logger.warning(
                                "GraphQL errors for %s: %s",
                                post_id,
                                result.get("errors"),
                            )
                        logger.debug("No post data in response for %s", post_id)

            # Exponential backoff before retry
            if attempt < total_attempts - 1:
                wait_time = 2 ** attempt
                logger.debug("Retrying in %ds...", wait_time)
                await asyncio.sleep(wait_time)

        logger.error("All %d attempts failed for post %s", total_attempts, post_id)
        return None

    async def query_post_graphql(self, post_id: str) -> Optional[dict]:
        """Execute the GraphQL query to fetch post data (single attempt).

        Picks a random proxy when ``proxy_list`` is non-empty, then uses
        curl_cffi with Chrome impersonation if available and httpx otherwise.

        Returns:
            The decoded JSON response, or None on a non-200 status.

        Raises:
            Exception: Whatever the underlying transport raises; it is
                logged and re-raised.
        """
        logger.debug("Starting GraphQL request for post %s", post_id)
        proxy = None
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            logger.debug("Using proxy: %s", proxy)
        headers = self._get_headers()
        graphql_data = self._get_graphql_payload(post_id)
        logger.debug("Request started...")
        if HAS_CURL_CFFI:
            return await self._query_with_curl_cffi(post_id, headers, graphql_data, proxy)
        return await self._query_with_httpx(post_id, headers, graphql_data, proxy)

    @staticmethod
    def _read_response(post_id: str, response) -> Optional[dict]:
        """Return the decoded JSON body, or None when the status is not 200."""
        if response.status_code != 200:
            logger.error(
                "Failed to fetch post %s: status=%s", post_id, response.status_code
            )
            return None
        logger.debug("Request completed successfully")
        return response.json()

    async def _query_with_curl_cffi(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """Execute query using curl_cffi with Chrome TLS fingerprinting."""
        try:
            async with AsyncSession() as session:
                response = await session.post(
                    self._GRAPHQL_URL,
                    headers=headers,
                    json=graphql_data,
                    proxies={"http": proxy, "https": proxy} if proxy else None,
                    timeout=self.timeout,
                    impersonate="chrome110",
                )
                return self._read_response(post_id, response)
        except Exception as ex:
            logger.error("curl_cffi request failed for post %s: %s", post_id, ex)
            raise

    async def _query_with_httpx(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """Execute query using httpx (fallback without TLS fingerprinting).

        The proxy keyword is only passed when a proxy was selected. Current
        httpx spells it ``proxy=``; older releases only know ``proxies=``,
        so a TypeError from the constructor triggers one retry with the
        legacy keyword.
        """
        logger.warning("curl_cffi not available, using httpx (no TLS fingerprinting)")
        client_kwargs = {"timeout": self.timeout, "follow_redirects": True}
        if proxy:
            client_kwargs["proxy"] = proxy
        try:
            try:
                client = httpx.AsyncClient(**client_kwargs)
            except TypeError:
                if "proxy" not in client_kwargs:
                    raise
                # Older httpx: fall back to the legacy plural keyword.
                client_kwargs["proxies"] = client_kwargs.pop("proxy")
                client = httpx.AsyncClient(**client_kwargs)
            async with client:
                response = await client.post(
                    self._GRAPHQL_URL,
                    headers=headers,
                    json=graphql_data,
                )
                return self._read_response(post_id, response)
        except Exception as ex:
            logger.error("httpx request failed for post %s: %s", post_id, ex)
            raise