import json
import re
from typing import Any, Dict, List, Optional

# NOTE: BeautifulSoup is imported lazily inside extract_from_json_ld() so the
# Apollo / GraphQL extraction tiers keep working in environments without bs4.

# Matches the assignment prefix only; the JSON payload that follows is parsed
# with raw_decode, which sidesteps regex problems with nested braces and
# trailing semicolons.
_APOLLO_STATE_PATTERN = re.compile(r'window\.__APOLLO_STATE__\s*=\s*')


def _paragraphs_to_markdown(paragraphs: List[Dict[str, Any]]) -> List[str]:
    """Render Medium paragraph objects to markdown lines.

    One blank line is appended after every paragraph. Types handled:
    H3 -> "##", H4 -> "###", IMG (miro.medium.com URL + italic caption),
    CODE (fenced block), PQ (pull quote, "> "); everything else is plain text.
    """
    lines: List[str] = []
    for p in paragraphs:
        text = p.get("text", "")
        type_ = p.get("type")
        # Basic markup (bold/links) application could go here, but raw text
        # is often enough for downstream consumers.
        if type_ == "H3":
            lines.append(f"## {text}")
        elif type_ == "H4":
            lines.append(f"### {text}")
        elif type_ == "IMG":
            img_id = p.get("metadata", {}).get("id")
            if img_id:
                url = f"https://miro.medium.com/v2/resize:fit:1400/{img_id}"
                lines.append(f"![Image]({url})")
            if text:
                lines.append(f"*{text}*")  # image caption
        elif type_ == "CODE":
            lines.append(f"```\n{text}\n```")
        elif type_ == "PQ":  # Pull Quote
            lines.append(f"> {text}")
        else:
            lines.append(text)
        lines.append("")
    return lines


def _extract_comments(data: Dict[str, Any], article_id: Any) -> List[Dict[str, Any]]:
    """Collect responses: Post entries whose inResponseToPostId == article_id."""
    comments: List[Dict[str, Any]] = []
    for key, value in data.items():
        if not (key.startswith("Post:") and value.get("inResponseToPostId") == article_id):
            continue
        # Simplified content extraction: follow the content ref if present.
        comment_text = ""
        content = value.get("content")
        c_ref = content.get("__ref") if isinstance(content, dict) else None
        if c_ref and c_ref in data:
            c_paragraphs = data[c_ref].get("bodyModel", {}).get("paragraphs", [])
            comment_text = "\n".join(p.get("text", "") for p in c_paragraphs)
        creator = value.get("creator")
        comments.append({
            "id": value.get("id"),
            "authorId": creator.get("__ref") if isinstance(creator, dict) else None,
            "text": comment_text,
            "claps": value.get("virtuals", {}).get("totalClapCount"),
        })
    return comments


def _extract_recommended(data: Dict[str, Any], post_key: str) -> List[Dict[str, Any]]:
    """Collect other titled, non-response Post entries as recommended articles."""
    recommended: List[Dict[str, Any]] = []
    for key, value in data.items():
        if (key.startswith("Post:") and key != post_key
                and value.get("title") and not value.get("inResponseToPostId")):
            recommended.append({
                "id": value.get("id"),
                "title": value.get("title"),
                "url": f"https://medium.com/p/{value.get('id')}",  # Construct URL
            })
    return recommended


def extract_from_apollo_state(html: str) -> Optional[Dict[str, Any]]:
    """
    Extracts article data from window.__APOLLO_STATE__.
    This is the "Gold Mine" - raw JSON data used by Medium's React app.

    Returns a dict with title, author, markdownContent, comments (top 10),
    recommended (top 5) and the raw Post object under "json_state",
    or None when the state blob or a main Post entry cannot be found.
    """
    try:
        match = _APOLLO_STATE_PATTERN.search(html)
        if not match:
            return None

        # raw_decode parses exactly one JSON object starting at match.end().
        try:
            data, _ = json.JSONDecoder().raw_decode(html[match.end():])
        except Exception as e:
            print(f"JSON Decode Error: {e}")
            return None

        # The Apollo state is a flat map of "TypeName:id" -> object.
        # The main article is the Post entry that has both title and content.
        post_key = next(
            (k for k, v in data.items()
             if k.startswith("Post:") and v.get("title") and v.get("content")),
            None,
        )
        if not post_key:
            return None
        post = data[post_key]

        article_data: Dict[str, Any] = {
            "title": post.get("title"),
            "id": post.get("id"),
            "firstPublishedAt": post.get("firstPublishedAt"),
            "readingTime": post.get("readingTime"),
        }

        # Author: "creator" is normally an Apollo ref into the flat map.
        creator_field = post.get("creator")
        creator_id = creator_field.get("__ref") if isinstance(creator_field, dict) else None
        if creator_id and creator_id in data:
            creator = data[creator_id]
            article_data["author"] = {
                "name": creator.get("name"),
                "username": creator.get("username"),
                "bio": creator.get("bio"),
                "id": creator.get("id"),
                "followerCount": creator.get("socialStats", {}).get("followerCount"),
                "imageId": creator.get("imageId"),
            }

        # Content paragraphs: either a ref into the map, or inlined under
        # content -> bodyModel -> paragraphs (newer Medium payloads).
        content = post.get("content")
        paragraphs: List[Dict[str, Any]] = []
        if isinstance(content, dict):
            ref = content.get("__ref")
            if isinstance(ref, str) and ref in data:
                paragraphs = data[ref].get("bodyModel", {}).get("paragraphs", [])
            else:
                paragraphs = content.get("bodyModel", {}).get("paragraphs", [])

        # Reconstruct Markdown: title, optional byline, then the paragraphs.
        markdown = [f"# {article_data['title']}"]
        if article_data.get("author"):
            markdown.append(f"**By {article_data['author']['name']}**")
        markdown.append("")
        markdown.extend(_paragraphs_to_markdown(paragraphs))

        article_data["markdownContent"] = "\n".join(markdown)
        article_data["source"] = "apollo"
        article_data["json_state"] = post  # Store raw post data

        # Phase 2: Deep Graph Extraction from the rest of the flat map.
        article_data["comments"] = _extract_comments(data, article_data["id"])[:10]  # Top 10
        article_data["recommended"] = _extract_recommended(data, post_key)[:5]

        return article_data
    except Exception as e:
        # Best-effort extractor: callers fall back to other tiers on None.
        print(f"Error extracting Apollo state: {e}")
        return None


def extract_from_json_ld(html: str) -> Optional[Dict[str, Any]]:
    """
    Extracts article data from JSON-LD structured data
    (<script type="application/ld+json"> blocks).

    JSON-LD usually carries only metadata; when "articleBody" is absent the
    markdown falls back to a title + description stub. Returns None when no
    Article-like object is found (or bs4 is unavailable).
    """
    try:
        # Imported lazily so the other extractors work without bs4 installed.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                if not script.string:  # empty or comment-only script tag
                    continue
                data = json.loads(script.string)

                # Accept Article / NewsArticle / BlogPosting; @type may be a
                # single string or a list of types.
                type_ = data.get("@type")
                types = set(type_) if isinstance(type_, list) else {type_}
                if not (types & {"Article", "NewsArticle", "BlogPosting"}):
                    continue

                # Per schema.org, "author" may be an object, a list, or a
                # bare string — normalize to a dict before reading "name".
                author = data.get("author")
                if isinstance(author, list):
                    author = author[0] if author else {}
                if isinstance(author, str):
                    author = {"name": author}
                if not isinstance(author, dict):
                    author = {}

                article_data = {
                    "title": data.get("headline") or data.get("name"),
                    "description": data.get("description"),
                    "author": {"name": author.get("name")},
                    "datePublished": data.get("datePublished"),
                    "image": data.get("image"),
                    "source": "json-ld",
                    "json_state": data,
                }

                if data.get("articleBody"):
                    article_data["markdownContent"] = data["articleBody"]
                else:
                    # Fallback to description
                    article_data["markdownContent"] = (
                        f"# {article_data['title']}\n\n{article_data['description']}"
                    )
                return article_data
            except Exception:
                continue  # malformed block: try the next script tag
    except Exception:
        pass
    return None


def extract_from_graphql_response(response: dict) -> Optional[Dict[str, Any]]:
    """
    Extract article data from direct GraphQL API response.

    This is used with the new Tier 1.5 (Direct GraphQL API) that queries
    medium.com/_/graphql directly. Uses paragraph_parser for rich content
    extraction with all 13 paragraph types and 5 markup types.

    Args:
        response: The raw GraphQL API response

    Returns:
        Dict with title, author, markdownContent, etc. or None if failed
        (including when the parsed content is implausibly short).
    """
    try:
        from src.paragraph_parser import parse_graphql_response_to_markdown

        # Parse content and metadata
        markdown_content, metadata = parse_graphql_response_to_markdown(response)
        # Under 100 chars is treated as a failed/blocked parse.
        if not markdown_content or len(markdown_content) < 100:
            return None

        # Shape mirrors the other extractors so callers are tier-agnostic.
        return {
            "title": metadata.get("title", ""),
            "author": metadata.get("author", {}),
            "publication": metadata.get("publication", ""),
            "markdownContent": markdown_content,
            "source": "graphql_api",
            "json_state": response,
            "firstPublishedAt": metadata.get("firstPublishedAt"),
            "readingTime": metadata.get("readingTime", 0),
            "mediumUrl": metadata.get("mediumUrl", ""),
            "canonicalUrl": metadata.get("canonicalUrl", ""),
            "clapCount": metadata.get("clapCount", 0),
            "isLocked": metadata.get("isLocked", False),
            "tags": metadata.get("tags", []),
            "detectedLanguage": metadata.get("detectedLanguage", "en"),
        }
    except ImportError as e:
        print(f"Error importing paragraph_parser: {e}")
        return None
    except Exception as e:
        print(f"Error extracting from GraphQL response: {e}")
        return None