File size: 10,617 Bytes
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import json
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup

def _find_main_post_key(state: Dict[str, Any]) -> Optional[str]:
    """Return the key of the Post object that has both a title and content."""
    for key, value in state.items():
        if (key.startswith("Post:") and isinstance(value, dict)
                and value.get("title") and value.get("content")):
            return key
    return None


def _content_ref(obj: Dict[str, Any]) -> Optional[str]:
    """Return the state key that *obj*'s "content" field points at, if any.

    "content" is normally a {"__ref": "..."} pointer, but may also appear as
    a bare string ref; anything else (inline dict, None) yields None.
    """
    content = obj.get("content")
    if isinstance(content, dict):
        return content.get("__ref")
    if isinstance(content, str):
        return content
    return None


def _resolve_paragraphs(post: Dict[str, Any], state: Dict[str, Any]) -> list:
    """Resolve the post's paragraph list, whether content is a ref or inline."""
    ref = _content_ref(post)
    if ref and ref in state:
        body_model = (state[ref] or {}).get("bodyModel") or {}
        return body_model.get("paragraphs", [])
    content = post.get("content")
    if isinstance(content, dict):
        # Inline content object (no __ref indirection).
        return (content.get("bodyModel") or {}).get("paragraphs", [])
    return []


def _paragraph_to_markdown(p: Dict[str, Any]) -> list:
    """Render one Apollo paragraph object as a list of markdown lines.

    Handled types: H3/H4 (headers), IMG (image + optional caption),
    CODE (fenced block), PQ (pull quote); anything else is plain text.
    """
    text = p.get("text", "")
    type_ = p.get("type")
    if type_ == "H3":
        return [f"## {text}"]
    if type_ == "H4":
        return [f"### {text}"]
    if type_ == "IMG":
        lines = []
        img_id = (p.get("metadata") or {}).get("id")
        if img_id:
            lines.append(f"![Image](https://miro.medium.com/v2/resize:fit:1400/{img_id})")
        if text:
            lines.append(f"*{text}*")  # image caption
        return lines
    if type_ == "CODE":
        return [f"```\n{text}\n```"]
    if type_ == "PQ":  # Pull Quote
        return [f"> {text}"]
    return [text]


def _extract_comments(state: Dict[str, Any], post_id: Optional[str]) -> list:
    """Collect responses: Post objects whose inResponseToPostId == *post_id*.

    Returns an empty list when *post_id* is None — otherwise every
    non-response post (whose field is also None) would falsely match.
    """
    if post_id is None:
        return []
    comments = []
    for key, value in state.items():
        if not (key.startswith("Post:") and isinstance(value, dict)):
            continue
        if value.get("inResponseToPostId") != post_id:
            continue
        # Simplified content extraction: flatten paragraph texts.
        text = ""
        ref = _content_ref(value)
        if ref and ref in state:
            paras = ((state[ref] or {}).get("bodyModel") or {}).get("paragraphs", [])
            text = "\n".join(p.get("text", "") for p in paras)
        creator = value.get("creator")
        comments.append({
            "id": value.get("id"),
            "authorId": creator.get("__ref") if isinstance(creator, dict) else None,
            "text": text,
            "claps": (value.get("virtuals") or {}).get("totalClapCount"),
        })
    return comments


def _extract_recommended(state: Dict[str, Any], main_key: str) -> list:
    """Other titled Post objects in the state (not the article, not responses).

    These are typically the page's related/recommended articles.
    """
    recommended = []
    for key, value in state.items():
        if (key.startswith("Post:") and key != main_key
                and isinstance(value, dict)
                and value.get("title")
                and not value.get("inResponseToPostId")):
            recommended.append({
                "id": value.get("id"),
                "title": value.get("title"),
                "url": f"https://medium.com/p/{value.get('id')}",  # constructed URL
            })
    return recommended


def extract_from_apollo_state(html: str) -> Optional[Dict[str, Any]]:
    """
    Extracts article data from window.__APOLLO_STATE__.
    This is the "Gold Mine" - raw JSON data used by Medium's React app.

    The Apollo state is a flat map of "Type:id" -> object; the article lives
    under a "Post:*" key carrying both a title and content, with author and
    body reachable through "__ref" pointers.

    Args:
        html: Raw HTML of the article page.

    Returns:
        Dict with title, author, markdownContent, comments, recommended, etc.,
        or None when the state blob is absent or cannot be parsed.
    """
    try:
        # Find the start of the object
        match = re.search(r'window\.__APOLLO_STATE__\s*=\s*', html)
        if not match:
            return None

        # raw_decode parses exactly one JSON value starting at match.end(),
        # avoiding regex trouble with nested braces or trailing semicolons.
        try:
            data, _ = json.JSONDecoder().raw_decode(html[match.end():])
        except ValueError as e:
            print(f"JSON Decode Error: {e}")
            return None

        post_key = _find_main_post_key(data)
        if not post_key:
            return None
        post = data[post_key]

        article_data: Dict[str, Any] = {
            "title": post.get("title"),
            "id": post.get("id"),
            "firstPublishedAt": post.get("firstPublishedAt"),
            "readingTime": post.get("readingTime"),
        }

        # Author: "creator" is normally a {"__ref": "User:..."} pointer.
        creator_field = post.get("creator")
        creator_id = creator_field.get("__ref") if isinstance(creator_field, dict) else None
        if creator_id and creator_id in data:
            creator = data[creator_id]
            article_data["author"] = {
                "name": creator.get("name"),
                "username": creator.get("username"),
                "bio": creator.get("bio"),
                "id": creator.get("id"),
                "followerCount": (creator.get("socialStats") or {}).get("followerCount"),
                "imageId": creator.get("imageId"),
            }

        # Reconstruct Markdown from the paragraph list.
        markdown = [f"# {article_data['title']}"]
        if article_data.get("author"):
            markdown.append(f"**By {article_data['author']['name']}**")
        markdown.append("")
        for p in _resolve_paragraphs(post, data):
            markdown.extend(_paragraph_to_markdown(p))
            markdown.append("")

        article_data["markdownContent"] = "\n".join(markdown)
        article_data["source"] = "apollo"
        article_data["json_state"] = post  # Store raw post data

        # Phase 2: Deep Graph Extraction — responses and related posts.
        article_data["comments"] = _extract_comments(data, article_data["id"])[:10]  # Top 10
        article_data["recommended"] = _extract_recommended(data, post_key)[:5]

        return article_data

    except Exception as e:
        # Best-effort extractor: any unexpected structure falls back to None
        # so the caller can try the next extraction tier.
        print(f"Error extracting Apollo state: {e}")
        return None

def _json_ld_author_name(author: Any) -> Optional[str]:
    """Pull a display name out of a JSON-LD "author" field.

    In the wild this field may be a dict, a list of dicts, or a bare string.
    """
    if isinstance(author, dict):
        return author.get("name")
    if isinstance(author, list) and author:
        return _json_ld_author_name(author[0])
    if isinstance(author, str):
        return author
    return None


def extract_from_json_ld(html: str) -> Optional[Dict[str, Any]]:
    """
    Extracts article data from JSON-LD structured data
    (<script type="application/ld+json"> blocks).

    Args:
        html: Raw HTML of the page.

    Returns:
        Dict with title, description, author, etc., or None when no
        Article-like JSON-LD block is present or parseable.
    """
    article_types = {"Article", "NewsArticle", "BlogPosting"}
    try:
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                # script.string is None for empty/oddly-nested tags.
                data = json.loads(script.string or "")
            except ValueError:
                continue
            if not isinstance(data, dict):
                continue

            # "@type" may be a single string or a list of types.
            type_ = data.get("@type")
            types = set(type_) if isinstance(type_, list) else {type_}
            if not types & article_types:
                continue

            article_data = {
                "title": data.get("headline") or data.get("name"),
                "description": data.get("description"),
                "author": {"name": _json_ld_author_name(data.get("author"))},
                "datePublished": data.get("datePublished"),
                "image": data.get("image"),
                "source": "json-ld",
                "json_state": data,
            }

            # JSON-LD usually doesn't have full body text, mostly just
            # description — but sometimes "articleBody" is present.
            if data.get("articleBody"):
                article_data["markdownContent"] = data["articleBody"]
            else:
                article_data["markdownContent"] = (
                    f"# {article_data['title']}\n\n{article_data['description']}"
                )

            return article_data

    except Exception:
        # Best-effort: fall through to None on any parser failure.
        pass
    return None


def extract_from_graphql_response(response: dict) -> Optional[Dict[str, Any]]:
    """
    Extract article data from direct GraphQL API response.

    Used with the new Tier 1.5 (Direct GraphQL API) that queries
    medium.com/_/graphql directly. Content parsing is delegated to
    src.paragraph_parser, which handles all 13 paragraph types and
    5 markup types.

    Args:
        response: The raw GraphQL API response

    Returns:
        Dict with title, author, markdownContent, etc. or None if failed
    """
    try:
        from src.paragraph_parser import (
            parse_graphql_response_to_markdown,
            extract_article_metadata,
        )

        markdown_content, metadata = parse_graphql_response_to_markdown(response)

        # Reject missing or trivially short content.
        if not markdown_content or len(markdown_content) < 100:
            return None

        # Assemble the structure expected by the rest of the pipeline.
        article_data: Dict[str, Any] = {
            "markdownContent": markdown_content,
            "source": "graphql_api",
            "json_state": response,
            "firstPublishedAt": metadata.get("firstPublishedAt"),
        }

        # Metadata fields copied over with their fallback values.
        defaults: Dict[str, Any] = {
            "title": "",
            "author": {},
            "publication": "",
            "readingTime": 0,
            "mediumUrl": "",
            "canonicalUrl": "",
            "clapCount": 0,
            "isLocked": False,
            "tags": [],
            "detectedLanguage": "en",
        }
        for field, fallback in defaults.items():
            article_data[field] = metadata.get(field, fallback)

        return article_data

    except ImportError as e:
        print(f"Error importing paragraph_parser: {e}")
        return None
    except Exception as e:
        print(f"Error extracting from GraphQL response: {e}")
        return None