Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| from typing import Dict, Any, Optional | |
| from bs4 import BeautifulSoup | |
def extract_from_apollo_state(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from the ``window.__APOLLO_STATE__`` blob.

    This is the "Gold Mine": the raw, normalized JSON cache used by
    Medium's React app. The state is a flat map of ``"TypeName:id"`` keys
    to objects; cross-references are stored as ``{"__ref": "<key>"}``.

    Args:
        html: Raw HTML of a Medium article page.

    Returns:
        Dict with title, author, markdownContent, comments, recommended,
        source, and the raw post object under ``json_state`` — or None if
        the state is absent or cannot be parsed.
    """
    try:
        # Locate where the serialized state object begins.
        match = re.search(r'window\.__APOLLO_STATE__\s*=\s*', html)
        if not match:
            return None

        # raw_decode parses exactly one JSON value starting at match.end(),
        # which avoids regex-matching nested braces or trailing
        # semicolons / </script> tags.
        try:
            data, _ = json.JSONDecoder().raw_decode(html[match.end():])
        except json.JSONDecodeError as e:
            print(f"JSON Decode Error: {e}")
            return None

        # Find the main Post object: the "Post:<id>" entry that carries
        # both a title and content (comments/stubs usually lack one).
        post_key = None
        for key, value in data.items():
            if not isinstance(value, dict):
                continue  # e.g. metadata scalars; skip defensively
            if key.startswith("Post:") and value.get("title") and value.get("content"):
                post_key = key
                break
        if not post_key:
            return None
        post = data[post_key]

        article_data: Dict[str, Any] = {
            "title": post.get("title"),
            "id": post.get("id"),
            "firstPublishedAt": post.get("firstPublishedAt"),
            "readingTime": post.get("readingTime"),
        }

        # Author: "creator" is a {"__ref": "User:<id>"} pointer into the
        # flat map. `or {}` also guards an explicit null creator.
        creator_id = (post.get("creator") or {}).get("__ref")
        if creator_id and creator_id in data:
            creator = data[creator_id]
            article_data["author"] = {
                "name": creator.get("name"),
                "username": creator.get("username"),
                "bio": creator.get("bio"),
                "id": creator.get("id"),
                "followerCount": creator.get("socialStats", {}).get("followerCount"),
                "imageId": creator.get("imageId"),
            }

        # Content: either a ref string or an inline object, with the
        # paragraph list under content -> bodyModel -> paragraphs.
        content_ref = post.get("content", {}).get("__ref") or post.get("content")
        paragraphs = []
        if isinstance(content_ref, str) and content_ref in data:
            # It's a ref into the flat map.
            body_model = data[content_ref].get("bodyModel")
            if body_model:
                paragraphs = body_model.get("paragraphs", [])
        elif isinstance(post.get("content"), dict):
            # It's inline on the post itself.
            paragraphs = post.get("content", {}).get("bodyModel", {}).get("paragraphs", [])

        # Reconstruct Markdown from the paragraph list.
        # Types seen: P (body), H3 (header), H4 (subheader), IMG (image),
        # CODE (code block), PQ (pull quote).
        markdown = [f"# {article_data['title']}"]
        if article_data.get("author"):
            markdown.append(f"**By {article_data['author']['name']}**")
        markdown.append("")
        for p in paragraphs:
            text = p.get("text", "")
            type_ = p.get("type")
            if type_ == "H3":
                markdown.append(f"## {text}")
            elif type_ == "H4":
                markdown.append(f"### {text}")
            elif type_ == "IMG":
                img_id = p.get("metadata", {}).get("id")
                if img_id:
                    # FIX: the URL was previously built but never emitted
                    # (an empty f-string was appended), dropping every image.
                    url = f"https://miro.medium.com/v2/resize:fit:1400/{img_id}"
                    markdown.append(f"![Image]({url})")
                if text:
                    markdown.append(f"*{text}*")  # image caption, italicized
            elif type_ == "CODE":
                markdown.append(f"```\n{text}\n```")
            elif type_ == "PQ":  # Pull Quote
                markdown.append(f"> {text}")
            else:
                markdown.append(text)
            markdown.append("")

        article_data["markdownContent"] = "\n".join(markdown)
        article_data["source"] = "apollo"
        article_data["json_state"] = post  # raw post object for downstream use

        # Phase 2: Deep Graph Extraction
        # 1. Comments (responses): posts whose inResponseToPostId points
        #    at this article. Skip the scan entirely when the article has
        #    no id — otherwise `None == None` would misclassify every
        #    cached post as a comment.
        comments = []
        post_id = article_data["id"]
        if post_id is not None:
            for key, value in data.items():
                if not (isinstance(value, dict) and key.startswith("Post:")):
                    continue
                if value.get("inResponseToPostId") != post_id:
                    continue
                # Simplified content extraction for comments.
                comment_text = ""
                c_content_ref = (value.get("content") or {}).get("__ref")
                if c_content_ref and c_content_ref in data:
                    c_paragraphs = data[c_content_ref].get("bodyModel", {}).get("paragraphs", [])
                    comment_text = "\n".join(p.get("text", "") for p in c_paragraphs)
                comments.append({
                    "id": value.get("id"),
                    "authorId": (value.get("creator") or {}).get("__ref"),
                    "text": comment_text,
                    "claps": value.get("virtuals", {}).get("totalClapCount"),
                })
        article_data["comments"] = comments[:10]  # Top 10

        # 2. Recommended articles: any other full Post object in the
        #    state that is not the main post and not a response.
        recommended = []
        for key, value in data.items():
            if (isinstance(value, dict) and key.startswith("Post:")
                    and key != post_key and value.get("title")
                    and not value.get("inResponseToPostId")):
                recommended.append({
                    "id": value.get("id"),
                    "title": value.get("title"),
                    "url": f"https://medium.com/p/{value.get('id')}",  # Construct URL
                })
        article_data["recommended"] = recommended[:5]

        return article_data
    except Exception as e:
        # Last-resort guard: malformed state shapes should degrade to
        # "no data", not crash the caller.
        print(f"Error extracting Apollo state: {e}")
        return None
def extract_from_json_ld(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from JSON-LD structured data
    (``<script type="application/ld+json">`` blocks).

    JSON-LD usually lacks the full body text, so ``markdownContent``
    falls back to ``# title\\n\\ndescription`` unless ``articleBody``
    is present.

    Args:
        html: Raw HTML of the page.

    Returns:
        Dict with title, description, author, datePublished, etc., or
        None when no Article/NewsArticle/BlogPosting block is found.
    """
    article_types = ("Article", "NewsArticle", "BlogPosting")
    try:
        soup = BeautifulSoup(html, "html.parser")
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                raw = script.string
                if not raw:
                    # FIX: script.string can be None; json.loads(None)
                    # would raise TypeError.
                    continue
                data = json.loads(raw)
                if not isinstance(data, dict):
                    continue  # JSON-LD may be a bare list/scalar; skip

                # "@type" may be a single string or a list of types.
                type_ = data.get("@type")
                if isinstance(type_, list):
                    if not any(t in type_ for t in article_types):
                        continue
                elif type_ not in article_types:
                    continue

                # FIX: "author" may be an object, a list of objects, or a
                # bare string; the old `.get("name")` raised on the latter
                # two and silently skipped valid articles.
                author = data.get("author")
                if isinstance(author, list) and author:
                    author = author[0]
                if isinstance(author, dict):
                    author_name = author.get("name")
                elif isinstance(author, str):
                    author_name = author
                else:
                    author_name = None

                article_data = {
                    "title": data.get("headline") or data.get("name"),
                    "description": data.get("description"),
                    "author": {"name": author_name},
                    "datePublished": data.get("datePublished"),
                    "image": data.get("image"),
                    "source": "json-ld",
                    "json_state": data,
                }
                # JSON-LD usually only has a description, but sometimes
                # "articleBody" carries the full text.
                if data.get("articleBody"):
                    article_data["markdownContent"] = data["articleBody"]
                else:
                    # Fallback to description
                    article_data["markdownContent"] = (
                        f"# {article_data['title']}\n\n{article_data['description']}"
                    )
                return article_data
            except (json.JSONDecodeError, TypeError, AttributeError):
                # FIX: was a bare `except:`; only swallow malformed-block
                # errors and move on to the next script tag.
                continue
    except Exception:
        # Parser-level failure: degrade to "no data" for the caller.
        pass
    return None
def extract_from_graphql_response(response: dict) -> Optional[Dict[str, Any]]:
    """
    Extract article data from direct GraphQL API response.
    This is used with the new Tier 1.5 (Direct GraphQL API) that queries
    medium.com/_/graphql directly.
    Delegates rich content extraction (all 13 paragraph types and 5
    markup types) to src.paragraph_parser.
    Args:
        response: The raw GraphQL API response
    Returns:
        Dict with title, author, markdownContent, etc. or None if failed
    """
    try:
        from src.paragraph_parser import (
            parse_graphql_response_to_markdown,
            extract_article_metadata,
        )

        # Parse the response into markdown plus a metadata mapping.
        body_md, meta = parse_graphql_response_to_markdown(response)

        # Reject empty or near-empty parses.
        if not body_md or len(body_md) < 100:
            return None

        # Assemble the result in the shape the rest of the pipeline
        # expects; key order mirrors the legacy layout.
        result: Dict[str, Any] = {}
        for field, default in (("title", ""), ("author", {}), ("publication", "")):
            result[field] = meta.get(field, default)
        result["markdownContent"] = body_md
        result["source"] = "graphql_api"
        result["json_state"] = response
        result["firstPublishedAt"] = meta.get("firstPublishedAt")
        for field, default in (
            ("readingTime", 0),
            ("mediumUrl", ""),
            ("canonicalUrl", ""),
            ("clapCount", 0),
            ("isLocked", False),
            ("tags", []),
            ("detectedLanguage", "en"),
        ):
            result[field] = meta.get(field, default)
        return result
    except ImportError as e:
        print(f"Error importing paragraph_parser: {e}")
        return None
    except Exception as e:
        print(f"Error extracting from GraphQL response: {e}")
        return None