"""
Medium GraphQL API Client
Direct GraphQL API client for Medium articles using Chrome TLS fingerprinting.
Ported from Freedium's medium-parser/api.py
"""
import hashlib
import logging
import random
import secrets
from datetime import datetime
from typing import List, Optional
try:
from curl_cffi.requests import AsyncSession
HAS_CURL_CFFI = True
except ImportError:
HAS_CURL_CFFI = False
import httpx
logger = logging.getLogger("MediumAPI")
def generate_random_sha256_hash() -> str:
    """Return the hex SHA-256 digest of freshly drawn random bytes.

    The input bytes come from ``secrets.token_bytes()`` at its default
    length, so every call yields a different 64-character hex string.
    """
    entropy = secrets.token_bytes()
    return hashlib.sha256(entropy).hexdigest()
def get_unix_ms() -> int:
    """Return the current time as whole milliseconds since the Unix epoch."""
    seconds_since_epoch = datetime.now().timestamp()
    return int(seconds_since_epoch * 1000)
# Full GraphQL query for fetching complete post data
# Source: medium-parser/api.py
FULL_POST_QUERY = """query FullPostQuery($postId: ID!, $postMeteringOptions: PostMeteringOptions) { post(id: $postId) { __typename id ...FullPostData } meterPost(postId: $postId, postMeteringOptions: $postMeteringOptions) { __typename ...MeteringInfoData } } fragment UserFollowData on User { id socialStats { followingCount followerCount } viewerEdge { isFollowing } } fragment NewsletterData on NewsletterV3 { id viewerEdge { id isSubscribed } } fragment UserNewsletterData on User { id newsletterV3 { __typename ...NewsletterData } } fragment ImageMetadataData on ImageMetadata { id originalWidth originalHeight focusPercentX focusPercentY alt } fragment CollectionFollowData on Collection { id subscriberCount viewerEdge { isFollowing } } fragment CollectionNewsletterData on Collection { id newsletterV3 { __typename ...NewsletterData } } fragment BylineData on Post { id readingTime creator { __typename id imageId username name bio tippingLink viewerEdge { isUser } ...UserFollowData ...UserNewsletterData } collection { __typename id name avatar { __typename id ...ImageMetadataData } ...CollectionFollowData ...CollectionNewsletterData } isLocked firstPublishedAt latestPublishedVersion } fragment ResponseCountData on Post { postResponses { count } } fragment InResponseToPost on Post { id title creator { name } clapCount responsesCount isLocked } fragment PostVisibilityData on Post { id collection { viewerEdge { isEditor canEditPosts canEditOwnPosts } } creator { id } isLocked visibility } fragment PostMenuData on Post { id title creator { __typename ...UserFollowData } collection { __typename ...CollectionFollowData } } fragment PostMetaData on Post { __typename id title visibility ...ResponseCountData clapCount viewerEdge { clapCount } detectedLanguage mediumUrl readingTime updatedAt isLocked allowResponses isProxyPost latestPublishedVersion isSeries firstPublishedAt previewImage { id } inResponseToPostResult { __typename ...InResponseToPost } inResponseToMediaResource { 
mediumQuote { startOffset endOffset paragraphs { text type markups { type start end anchorType } } } } inResponseToEntityType canonicalUrl collection { id slug name shortDescription avatar { __typename id ...ImageMetadataData } viewerEdge { isFollowing isEditor canEditPosts canEditOwnPosts isMuting } } creator { id isFollowing name bio imageId mediumMemberAt twitterScreenName viewerEdge { isBlocking isMuting isUser } } previewContent { subtitle } pinnedByCreatorAt ...PostVisibilityData ...PostMenuData } fragment LinkMetadataList on Post { linkMetadataList { url alts { type url } } } fragment MediaResourceData on MediaResource { id iframeSrc thumbnailUrl iframeHeight iframeWidth title } fragment IframeData on Iframe { iframeHeight iframeWidth mediaResource { __typename ...MediaResourceData } } fragment MarkupData on Markup { name type start end href title rel type anchorType userId creatorIds } fragment CatalogSummaryData on Catalog { id name description type visibility predefined responsesLocked creator { id name username imageId bio viewerEdge { isUser } } createdAt version itemsLastInsertedAt postItemsCount } fragment CatalogPreviewData on Catalog { __typename ...CatalogSummaryData id itemsConnection(pagingOptions: { limit: 10 } ) { items { entity { __typename ... 
on Post { id previewImage { id } } } } paging { count } } } fragment MixtapeMetadataData on MixtapeMetadata { mediaResourceId href thumbnailImageId mediaResource { mediumCatalog { __typename ...CatalogPreviewData } } } fragment ParagraphData on Paragraph { id name href text iframe { __typename ...IframeData } layout markups { __typename ...MarkupData } metadata { __typename ...ImageMetadataData } mixtapeMetadata { __typename ...MixtapeMetadataData } type hasDropCap dropCapImage { __typename ...ImageMetadataData } codeBlockMetadata { lang mode } } fragment QuoteData on Quote { id postId userId startOffset endOffset paragraphs { __typename id ...ParagraphData } quoteType } fragment HighlightsData on Post { id highlights { __typename ...QuoteData } } fragment PostFooterCountData on Post { __typename id clapCount viewerEdge { clapCount } ...ResponseCountData responsesLocked mediumUrl title collection { id viewerEdge { isMuting isFollowing } } creator { id viewerEdge { isMuting isFollowing } } } fragment TagNoViewerEdgeData on Tag { id normalizedTagSlug displayTitle followerCount postCount } fragment VideoMetadataData on VideoMetadata { videoId previewImageId originalWidth originalHeight } fragment SectionData on Section { name startIndex textLayout imageLayout videoLayout backgroundImage { __typename ...ImageMetadataData } backgroundVideo { __typename ...VideoMetadataData } } fragment PostBodyData on RichText { sections { __typename ...SectionData } paragraphs { __typename id ...ParagraphData } } fragment FullPostData on Post { __typename ...BylineData ...PostMetaData ...LinkMetadataList ...HighlightsData ...PostFooterCountData tags { __typename id ...TagNoViewerEdgeData } content(postMeteringOptions: $postMeteringOptions) { bodyModel { __typename ...PostBodyData } validatedShareKey } } fragment MeteringInfoData on MeteringInfo { maxUnlockCount unlocksRemaining postIds }"""
class MediumGraphQLApi:
    """
    Direct GraphQL API client for Medium articles.

    Uses Chrome TLS fingerprinting via curl_cffi to bypass bot detection.
    Falls back to httpx if curl_cffi is not available.
    """

    __slots__ = ("auth_cookies", "proxy_list", "timeout")

    def __init__(
        self,
        auth_cookies: Optional[str] = None,
        proxy_list: Optional[List[str]] = None,
        timeout: int = 3,
    ):
        """
        Initialize the API client.

        Args:
            auth_cookies: Optional Medium authentication cookies, sent verbatim
                as the ``Cookie`` header.
            proxy_list: Optional list of proxy URLs; one is picked at random
                for each request.
            timeout: Request timeout in seconds.
        """
        self.auth_cookies = auth_cookies
        self.proxy_list = proxy_list
        self.timeout = timeout

    def _get_headers(self) -> dict:
        """Build request headers with spoofed values.

        A fresh operation ID and client timestamp are generated on every call.
        The ``Cookie`` header is only added when ``auth_cookies`` was provided.
        """
        headers = {
            "X-APOLLO-OPERATION-ID": generate_random_sha256_hash(),
            "X-APOLLO-OPERATION-NAME": "FullPostQuery",
            "Accept": "multipart/mixed; deferSpec=20220824, application/json, application/json",
            "Accept-Language": "en-US",
            "X-Obvious-CID": "android",
            "X-Xsrf-Token": "1",
            "X-Client-Date": str(get_unix_ms()),
            # User-Agent mimicking Yandex Mobile Bot (less likely to be blocked)
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0;",
            "Cache-Control": "public, max-age=-1",
            "Content-Type": "application/json",
            "Connection": "Keep-Alive",
        }
        if self.auth_cookies is not None:
            headers["Cookie"] = self.auth_cookies
        return headers

    def _get_graphql_payload(self, post_id: str) -> dict:
        """Build the GraphQL request payload for ``FullPostQuery``."""
        return {
            "operationName": "FullPostQuery",
            "variables": {
                "postId": post_id,
                "postMeteringOptions": {},
            },
            "query": FULL_POST_QUERY,
        }

    @staticmethod
    def _response_has_post(result) -> bool:
        """Return True when *result* is a GraphQL response carrying a post.

        GraphQL servers may answer with ``"data": null`` when a query fails,
        so ``data`` is checked to be a dict before it is inspected; any other
        shape is treated as "no post" instead of raising.
        """
        if not isinstance(result, dict):
            return False
        data = result.get("data")
        return isinstance(data, dict) and bool(data.get("post"))

    async def query_post_by_id(self, post_id: str, retries: int = 2) -> Optional[dict]:
        """
        Query a Medium post by its ID with retry logic.

        Uses exponential backoff for retries (Freedium-style): the wait before
        retry ``k`` is ``2 ** k`` seconds, starting at 1s.

        Args:
            post_id: The 8-12 character hexadecimal post ID
            retries: Number of retry attempts (default: 2)

        Returns:
            Dict containing the GraphQL response data, or None if every
            attempt failed or returned no post.
        """
        import asyncio

        for attempt in range(retries + 1):
            try:
                result = await self.query_post_graphql(post_id)
            except Exception as e:
                # Transport/decoding failures are retried rather than propagated.
                logger.warning(f"Attempt {attempt + 1} failed for {post_id}: {e}")
            else:
                # Validate response
                if result and isinstance(result, dict):
                    if result.get("error"):
                        logger.warning(f"GraphQL error for {post_id}: {result.get('error')}")
                    elif self._response_has_post(result):
                        logger.debug(f"Successfully queried post {post_id} on attempt {attempt + 1}")
                        return result
                    elif result.get("errors"):
                        # Standard GraphQL error list, typically paired with
                        # "data": null -- surface it instead of hiding it.
                        logger.warning(f"GraphQL errors for {post_id}: {result.get('errors')}")
                    else:
                        logger.debug(f"No post data in response for {post_id}")

            # Exponential backoff before retry
            if attempt < retries:
                wait_time = 2 ** attempt
                logger.debug(f"Retrying in {wait_time}s...")
                await asyncio.sleep(wait_time)

        logger.error(f"All {retries + 1} attempts failed for post {post_id}")
        return None

    async def query_post_graphql(self, post_id: str) -> Optional[dict]:
        """
        Execute GraphQL query to fetch post data.

        Uses curl_cffi with Chrome impersonation if available,
        falls back to httpx otherwise.

        Returns:
            The decoded JSON response, or None on a non-200 status.

        Raises:
            Exception: whatever the underlying HTTP client raises.
        """
        logger.debug(f"Starting GraphQL request for post {post_id}")

        proxy = None
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            # NOTE(review): proxy URLs may embed credentials; this is only
            # emitted at debug level -- confirm that is acceptable.
            logger.debug(f"Using proxy: {proxy}")

        headers = self._get_headers()
        graphql_data = self._get_graphql_payload(post_id)

        logger.debug("Request started...")
        if HAS_CURL_CFFI:
            return await self._query_with_curl_cffi(post_id, headers, graphql_data, proxy)
        else:
            return await self._query_with_httpx(post_id, headers, graphql_data, proxy)

    async def _query_with_curl_cffi(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """Execute query using curl_cffi with Chrome TLS fingerprinting.

        Returns the decoded JSON body, or None when the status is not 200.
        Any client exception is logged and re-raised.
        """
        try:
            async with AsyncSession() as session:
                response = await session.post(
                    "https://medium.com/_/graphql",
                    headers=headers,
                    json=graphql_data,
                    proxies={"http": proxy, "https": proxy} if proxy else None,
                    timeout=self.timeout,
                    impersonate="chrome110",
                )
                if response.status_code != 200:
                    logger.error(
                        f"Failed to fetch post {post_id}: status={response.status_code}"
                    )
                    return None
                logger.debug("Request completed successfully")
                return response.json()
        except Exception as ex:
            logger.error(f"curl_cffi request failed for post {post_id}: {ex}")
            raise

    async def _query_with_httpx(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """Execute query using httpx (fallback without TLS fingerprinting).

        Returns the decoded JSON body, or None when the status is not 200.
        Any client exception is logged and re-raised.
        """
        logger.warning("curl_cffi not available, using httpx (no TLS fingerprinting)")
        client_kwargs = {"timeout": self.timeout, "follow_redirects": True}
        try:
            try:
                # Current httpx releases take a single proxy URL via `proxy=`.
                client = httpx.AsyncClient(proxy=proxy, **client_kwargs)
            except TypeError:
                # Older httpx releases only understand the legacy `proxies=`.
                client = httpx.AsyncClient(proxies=proxy, **client_kwargs)
            async with client:
                response = await client.post(
                    "https://medium.com/_/graphql",
                    headers=headers,
                    json=graphql_data,
                )
                if response.status_code != 200:
                    logger.error(
                        f"Failed to fetch post {post_id}: status={response.status_code}"
                    )
                    return None
                logger.debug("Request completed successfully")
                return response.json()
        except Exception as ex:
            logger.error(f"httpx request failed for post {post_id}: {ex}")
            raise
|