File size: 13,378 Bytes
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
"""
Medium GraphQL API Client

Direct GraphQL API client for Medium articles using Chrome TLS fingerprinting.
Ported from Freedium's medium-parser/api.py
"""

import hashlib
import logging
import random
import secrets
from datetime import datetime
from typing import List, Optional

try:
    from curl_cffi.requests import AsyncSession
    HAS_CURL_CFFI = True
except ImportError:
    HAS_CURL_CFFI = False
    import httpx

logger = logging.getLogger("MediumAPI")


def generate_random_sha256_hash() -> str:
    """Return the hex SHA-256 digest of freshly generated random bytes.

    The result is a 64-character lowercase hexadecimal string that is used
    as an opaque operation ID.
    """
    entropy = secrets.token_bytes()
    return hashlib.sha256(entropy).hexdigest()


def get_unix_ms() -> int:
    """Return the current time as whole milliseconds since the Unix epoch."""
    seconds_since_epoch = datetime.now().timestamp()
    return int(seconds_since_epoch * 1000)


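# Full GraphQL query (operation "FullPostQuery") for fetching complete post data.
# It takes the $postId and $postMeteringOptions variables and is sent verbatim as
# the "query" field of the request payload.
# Source: Freedium's medium-parser/api.py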
FULL_POST_QUERY = """query FullPostQuery($postId: ID!, $postMeteringOptions: PostMeteringOptions) { post(id: $postId) { __typename id ...FullPostData } meterPost(postId: $postId, postMeteringOptions: $postMeteringOptions) { __typename ...MeteringInfoData } }  fragment UserFollowData on User { id socialStats { followingCount followerCount } viewerEdge { isFollowing } }  fragment NewsletterData on NewsletterV3 { id viewerEdge { id isSubscribed } }  fragment UserNewsletterData on User { id newsletterV3 { __typename ...NewsletterData } }  fragment ImageMetadataData on ImageMetadata { id originalWidth originalHeight focusPercentX focusPercentY alt }  fragment CollectionFollowData on Collection { id subscriberCount viewerEdge { isFollowing } }  fragment CollectionNewsletterData on Collection { id newsletterV3 { __typename ...NewsletterData } }  fragment BylineData on Post { id readingTime creator { __typename id imageId username name bio tippingLink viewerEdge { isUser } ...UserFollowData ...UserNewsletterData } collection { __typename id name avatar { __typename id ...ImageMetadataData } ...CollectionFollowData ...CollectionNewsletterData } isLocked firstPublishedAt latestPublishedVersion }  fragment ResponseCountData on Post { postResponses { count } }  fragment InResponseToPost on Post { id title creator { name } clapCount responsesCount isLocked }  fragment PostVisibilityData on Post { id collection { viewerEdge { isEditor canEditPosts canEditOwnPosts } } creator { id } isLocked visibility }  fragment PostMenuData on Post { id title creator { __typename ...UserFollowData } collection { __typename ...CollectionFollowData } }  fragment PostMetaData on Post { __typename id title visibility ...ResponseCountData clapCount viewerEdge { clapCount } detectedLanguage mediumUrl readingTime updatedAt isLocked allowResponses isProxyPost latestPublishedVersion isSeries firstPublishedAt previewImage { id } inResponseToPostResult { __typename ...InResponseToPost } 
inResponseToMediaResource { mediumQuote { startOffset endOffset paragraphs { text type markups { type start end anchorType } } } } inResponseToEntityType canonicalUrl collection { id slug name shortDescription avatar { __typename id ...ImageMetadataData } viewerEdge { isFollowing isEditor canEditPosts canEditOwnPosts isMuting } } creator { id isFollowing name bio imageId mediumMemberAt twitterScreenName viewerEdge { isBlocking isMuting isUser } } previewContent { subtitle } pinnedByCreatorAt ...PostVisibilityData ...PostMenuData }  fragment LinkMetadataList on Post { linkMetadataList { url alts { type url } } }  fragment MediaResourceData on MediaResource { id iframeSrc thumbnailUrl iframeHeight iframeWidth title }  fragment IframeData on Iframe { iframeHeight iframeWidth mediaResource { __typename ...MediaResourceData } }  fragment MarkupData on Markup { name type start end href title rel type anchorType userId creatorIds }  fragment CatalogSummaryData on Catalog { id name description type visibility predefined responsesLocked creator { id name username imageId bio viewerEdge { isUser } } createdAt version itemsLastInsertedAt postItemsCount }  fragment CatalogPreviewData on Catalog { __typename ...CatalogSummaryData id itemsConnection(pagingOptions: { limit: 10 } ) { items { entity { __typename ... 
on Post { id previewImage { id } } } } paging { count } } }  fragment MixtapeMetadataData on MixtapeMetadata { mediaResourceId href thumbnailImageId mediaResource { mediumCatalog { __typename ...CatalogPreviewData } } }  fragment ParagraphData on Paragraph { id name href text iframe { __typename ...IframeData } layout markups { __typename ...MarkupData } metadata { __typename ...ImageMetadataData } mixtapeMetadata { __typename ...MixtapeMetadataData } type hasDropCap dropCapImage { __typename ...ImageMetadataData } codeBlockMetadata { lang mode } }  fragment QuoteData on Quote { id postId userId startOffset endOffset paragraphs { __typename id ...ParagraphData } quoteType }  fragment HighlightsData on Post { id highlights { __typename ...QuoteData } }  fragment PostFooterCountData on Post { __typename id clapCount viewerEdge { clapCount } ...ResponseCountData responsesLocked mediumUrl title collection { id viewerEdge { isMuting isFollowing } } creator { id viewerEdge { isMuting isFollowing } } }  fragment TagNoViewerEdgeData on Tag { id normalizedTagSlug displayTitle followerCount postCount }  fragment VideoMetadataData on VideoMetadata { videoId previewImageId originalWidth originalHeight }  fragment SectionData on Section { name startIndex textLayout imageLayout videoLayout backgroundImage { __typename ...ImageMetadataData } backgroundVideo { __typename ...VideoMetadataData } }  fragment PostBodyData on RichText { sections { __typename ...SectionData } paragraphs { __typename id ...ParagraphData } }  fragment FullPostData on Post { __typename ...BylineData ...PostMetaData ...LinkMetadataList ...HighlightsData ...PostFooterCountData tags { __typename id ...TagNoViewerEdgeData } content(postMeteringOptions: $postMeteringOptions) { bodyModel { __typename ...PostBodyData } validatedShareKey } }  fragment MeteringInfoData on MeteringInfo { maxUnlockCount unlocksRemaining postIds }"""


class MediumGraphQLApi:
    """
    Direct GraphQL API client for Medium articles.

    Uses Chrome TLS fingerprinting via curl_cffi to bypass bot detection.
    Falls back to httpx if curl_cffi is not available.
    """

    __slots__ = ("auth_cookies", "proxy_list", "timeout")

    def __init__(
        self,
        auth_cookies: Optional[str] = None,
        proxy_list: Optional[List[str]] = None,
        timeout: int = 3,
    ) -> None:
        """
        Initialize the API client.

        Args:
            auth_cookies: Optional Medium authentication cookies, sent as-is
                in the ``Cookie`` header
            proxy_list: Optional list of proxy URLs; one is picked at random
                for every request
            timeout: Request timeout in seconds
        """
        self.auth_cookies = auth_cookies
        self.proxy_list = proxy_list
        self.timeout = timeout

    def _get_headers(self) -> dict:
        """
        Build request headers with spoofed values.

        A fresh operation ID and client timestamp are generated on every
        call, so the returned dict must not be cached between requests.
        """
        headers = {
            "X-APOLLO-OPERATION-ID": generate_random_sha256_hash(),
            "X-APOLLO-OPERATION-NAME": "FullPostQuery",
            "Accept": "multipart/mixed; deferSpec=20220824, application/json, application/json",
            "Accept-Language": "en-US",
            "X-Obvious-CID": "android",
            "X-Xsrf-Token": "1",
            "X-Client-Date": str(get_unix_ms()),
            # User-Agent mimicking Yandex Mobile Bot (less likely to be blocked)
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 15_4_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Mobile/15E148 Safari/604.1 (compatible; YandexMobileBot/3.0;",
            "Cache-Control": "public, max-age=-1",
            "Content-Type": "application/json",
            "Connection": "Keep-Alive",
        }

        if self.auth_cookies is not None:
            headers["Cookie"] = self.auth_cookies

        return headers

    def _get_graphql_payload(self, post_id: str) -> dict:
        """Build the GraphQL request payload for ``post_id``."""
        return {
            "operationName": "FullPostQuery",
            "variables": {
                "postId": post_id,
                "postMeteringOptions": {},
            },
            "query": FULL_POST_QUERY,
        }

    async def query_post_by_id(self, post_id: str, retries: int = 2) -> Optional[dict]:
        """
        Query a Medium post by its ID with retry logic.

        Uses exponential backoff for retries (Freedium-style): the wait
        before retry ``k`` (0-based) is ``2 ** k`` seconds.

        Args:
            post_id: The 8-12 character hexadecimal post ID
            retries: Number of retry attempts (default: 2)

        Returns:
            Dict containing the GraphQL response data, or None if every
            attempt failed, reported an error, or carried no post data.
            Exceptions raised by the transport are logged and never
            propagate to the caller.
        """
        import asyncio

        for attempt in range(retries + 1):
            try:
                result = await self.query_post_graphql(post_id)

                # Validate response
                if result and isinstance(result, dict):
                    # "data" can be present but null; read it once and check
                    # its type so that case is reported as "no post data"
                    # instead of surfacing as an AttributeError.
                    data = result.get("data")
                    if result.get("error"):
                        logger.warning(f"GraphQL error for {post_id}: {result.get('error')}")
                    elif isinstance(data, dict) and data.get("post"):
                        logger.debug(f"Successfully queried post {post_id} on attempt {attempt + 1}")
                        return result
                    else:
                        logger.debug(f"No post data in response for {post_id}")

            except Exception as e:
                # Deliberately broad: any transport/decoding failure counts
                # as a failed attempt and is retried.
                logger.warning(f"Attempt {attempt + 1} failed for {post_id}: {e}")

            # Exponential backoff before retry
            if attempt < retries:
                wait_time = 2 ** attempt
                logger.debug(f"Retrying in {wait_time}s...")
                await asyncio.sleep(wait_time)

        logger.error(f"All {retries + 1} attempts failed for post {post_id}")
        return None

    async def query_post_graphql(self, post_id: str) -> Optional[dict]:
        """
        Execute GraphQL query to fetch post data.

        Uses curl_cffi with Chrome impersonation if available,
        falls back to httpx otherwise.

        Returns:
            The decoded JSON response, or None when the server answered with
            a non-200 status.

        Raises:
            Exception: whatever the underlying HTTP client raises (after it
                has been logged).
        """
        logger.debug(f"Starting GraphQL request for post {post_id}")

        proxy = None
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            logger.debug(f"Using proxy: {proxy}")

        headers = self._get_headers()
        graphql_data = self._get_graphql_payload(post_id)

        logger.debug("Request started...")

        if HAS_CURL_CFFI:
            return await self._query_with_curl_cffi(post_id, headers, graphql_data, proxy)
        else:
            return await self._query_with_httpx(post_id, headers, graphql_data, proxy)

    @staticmethod
    def _parse_response(post_id: str, response) -> Optional[dict]:
        """Return the decoded JSON body, or None (after logging) on a non-200 status."""
        if response.status_code != 200:
            logger.error(
                f"Failed to fetch post {post_id}: status={response.status_code}"
            )
            return None

        logger.debug("Request completed successfully")
        return response.json()

    async def _query_with_curl_cffi(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """
        Execute query using curl_cffi with Chrome TLS fingerprinting.

        Returns the decoded JSON response, or None on a non-200 status.
        Any exception is logged and re-raised.
        """
        try:
            async with AsyncSession() as session:
                response = await session.post(
                    "https://medium.com/_/graphql",
                    headers=headers,
                    json=graphql_data,
                    proxies={"http": proxy, "https": proxy} if proxy else None,
                    timeout=self.timeout,
                    impersonate="chrome110",
                )
                return self._parse_response(post_id, response)

        except Exception as ex:
            logger.error(f"curl_cffi request failed for post {post_id}: {ex}")
            raise

    def _new_httpx_client(self, proxy: Optional[str]):
        """
        Create an httpx.AsyncClient that works across httpx versions.

        httpx 0.26 introduced the ``proxy`` keyword and deprecated
        ``proxies``; httpx 0.28 removed ``proxies``. No proxy keyword is
        passed when no proxy is configured, and ``proxy`` is tried before
        falling back to the legacy ``proxies`` keyword.
        """
        options = {"timeout": self.timeout, "follow_redirects": True}
        if not proxy:
            return httpx.AsyncClient(**options)
        try:
            return httpx.AsyncClient(proxy=proxy, **options)
        except TypeError:
            # Older httpx releases only understand the legacy keyword.
            return httpx.AsyncClient(proxies=proxy, **options)

    async def _query_with_httpx(
        self,
        post_id: str,
        headers: dict,
        graphql_data: dict,
        proxy: Optional[str]
    ) -> Optional[dict]:
        """
        Execute query using httpx (fallback without TLS fingerprinting).

        Returns the decoded JSON response, or None on a non-200 status.
        Any exception is logged and re-raised.
        """
        logger.warning("curl_cffi not available, using httpx (no TLS fingerprinting)")

        try:
            async with self._new_httpx_client(proxy) as client:
                response = await client.post(
                    "https://medium.com/_/graphql",
                    headers=headers,
                    json=graphql_data,
                )
                return self._parse_response(post_id, response)

        except Exception as ex:
            logger.error(f"httpx request failed for post {post_id}: {ex}")
            raise