# Spaces: T0X1N / Medium-MCP (Running)
#
# File size: 10,617 Bytes
#
# ae588db

import json
import re
from typing import Dict, Any, Optional
from bs4 import BeautifulSoup

def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict.

    Fields in the decoded state may be present but ``null``;
    ``dict.get(key, {})`` does not protect against that, so nested lookups
    are funnelled through this helper.
    """
    return value if isinstance(value, dict) else {}


def _apollo_paragraphs(data: Dict[str, Any], content: Any) -> list:
    """Return the list of paragraph dicts behind a Post's ``content`` field.

    ``content`` may be a ``{"__ref": key}`` pointer into *data*, a bare key
    string, or an inline object carrying ``bodyModel`` itself.
    """
    ref = content.get("__ref") if isinstance(content, dict) else content
    if isinstance(ref, str) and ref in data:
        holder = data[ref]
    elif isinstance(content, dict):
        holder = content
    else:
        return []

    paragraphs = _as_dict(_as_dict(holder).get("bodyModel")).get("paragraphs")
    if not isinstance(paragraphs, list):
        return []
    return [p for p in paragraphs if isinstance(p, dict)]


def _paragraph_to_markdown(paragraph: Dict[str, Any]) -> list:
    """Render one paragraph dict as a list of Markdown lines (markups are ignored)."""
    text = paragraph.get("text")
    if not isinstance(text, str):
        # A null/non-string text would otherwise break the final "\n".join.
        text = ""
    kind = paragraph.get("type")

    if kind == "H3":
        return [f"## {text}"]
    if kind == "H4":
        return [f"### {text}"]
    if kind == "IMG":
        lines = []
        img_id = _as_dict(paragraph.get("metadata")).get("id")
        if img_id:
            url = f"https://miro.medium.com/v2/resize:fit:1400/{img_id}"
            lines.append(f"![Image]({url})")
        if text:
            # The paragraph text of an image is rendered as an italic caption.
            lines.append(f"*{text}*")
        return lines
    if kind == "CODE":
        return [f"```\n{text}\n```"]
    if kind == "PQ":  # Pull quote
        return [f"> {text}"]
    return [text]


def extract_from_apollo_state(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from the ``window.__APOLLO_STATE__`` JSON in *html*.

    The state is treated as a flat map of ``ID -> object``. The first
    ``Post:*`` entry that has both a title and content is taken as the
    article; its author, paragraphs, responses and sibling posts are then
    resolved from the same map.

    Args:
        html: Raw page HTML.

    Returns:
        A dict with ``title``, ``id``, ``firstPublishedAt``, ``readingTime``,
        optionally ``author``, plus ``markdownContent``, ``source``
        (``"apollo"``), ``json_state`` (the raw post object), ``comments``
        (first 10 responses found) and ``recommended`` (first 5 other titled
        posts). ``None`` when the state is absent, cannot be decoded, or
        contains no suitable post. Errors are printed, never raised.
    """
    try:
        match = re.search(r'window\.__APOLLO_STATE__\s*=\s*', html)
        if not match:
            return None

        # raw_decode parses exactly one JSON value and ignores whatever
        # follows it, so nested braces and trailing semicolons are harmless.
        try:
            data, _ = json.JSONDecoder().raw_decode(html[match.end():])
        except ValueError as e:
            print(f"JSON Decode Error: {e}")
            return None

        if not isinstance(data, dict):
            return None

        posts = {
            key: value
            for key, value in data.items()
            if key.startswith("Post:") and isinstance(value, dict)
        }

        # The main article is the first Post with both a title and content.
        post_key = next(
            (k for k, v in posts.items() if v.get("title") and v.get("content")),
            None,
        )
        if post_key is None:
            return None
        post = posts[post_key]

        article_data: Dict[str, Any] = {
            "title": post.get("title"),
            "id": post.get("id"),
            "firstPublishedAt": post.get("firstPublishedAt"),
            "readingTime": post.get("readingTime"),
        }

        # Author: resolved through the creator reference, if it points into data.
        creator_id = _as_dict(post.get("creator")).get("__ref")
        if isinstance(creator_id, str) and creator_id in data:
            creator = _as_dict(data[creator_id])
            article_data["author"] = {
                "name": creator.get("name"),
                "username": creator.get("username"),
                "bio": creator.get("bio"),
                "id": creator.get("id"),
                "followerCount": _as_dict(creator.get("socialStats")).get("followerCount"),
                "imageId": creator.get("imageId"),
            }

        # Reconstruct Markdown: title, optional byline, then one entry per
        # paragraph, each followed by a blank line.
        markdown = [f"# {article_data['title']}"]
        author_name = _as_dict(article_data.get("author")).get("name")
        if author_name:
            # Skip the byline rather than emitting "By None".
            markdown.append(f"**By {author_name}**")
        markdown.append("")

        for paragraph in _apollo_paragraphs(data, post.get("content")):
            markdown.extend(_paragraph_to_markdown(paragraph))
            markdown.append("")

        article_data["markdownContent"] = "\n".join(markdown)
        article_data["source"] = "apollo"
        article_data["json_state"] = post  # Store raw post data

        # Comments: posts whose inResponseToPostId equals this article's id.
        # Without an id there is nothing to match on; comparing against None
        # would wrongly select every post that lacks the field.
        comments = []
        article_id = article_data["id"]
        if article_id is not None:
            for value in posts.values():
                if value.get("inResponseToPostId") != article_id:
                    continue
                texts = []
                for p in _apollo_paragraphs(data, value.get("content")):
                    p_text = p.get("text")
                    texts.append(p_text if isinstance(p_text, str) else "")
                comments.append({
                    "id": value.get("id"),
                    "authorId": _as_dict(value.get("creator")).get("__ref"),
                    "text": "\n".join(texts),
                    "claps": _as_dict(value.get("virtuals")).get("totalClapCount"),
                })
        article_data["comments"] = comments[:10]  # First 10 in state order

        # Recommended: any other titled Post in the state that is not a response.
        recommended = [
            {
                "id": value.get("id"),
                "title": value.get("title"),
                "url": f"https://medium.com/p/{value.get('id')}",  # Construct URL
            }
            for key, value in posts.items()
            if key != post_key
            and value.get("title")
            and not value.get("inResponseToPostId")
        ]
        article_data["recommended"] = recommended[:5]

        return article_data

    except Exception as e:
        # Best-effort extractor: report and let the caller fall back.
        print(f"Error extracting Apollo state: {e}")
        return None

def _json_ld_nodes(data: Any):
    """Yield every dict node found in a decoded JSON-LD payload.

    Covers a single object, a top-level array of objects, and objects that
    wrap their nodes in an ``@graph`` array.
    """
    if isinstance(data, list):
        for item in data:
            yield from _json_ld_nodes(item)
    elif isinstance(data, dict):
        yield data
        graph = data.get("@graph")
        if isinstance(graph, list):
            yield from _json_ld_nodes(graph)


def _json_ld_author_name(author: Any) -> Optional[str]:
    """Return an author name from a JSON-LD ``author`` value.

    Accepts a plain string, an object with ``name``, or a list of either
    (the first entry that yields a name wins). Returns ``None`` otherwise.
    """
    if isinstance(author, str):
        return author
    if isinstance(author, dict):
        return author.get("name")
    if isinstance(author, list):
        for entry in author:
            name = _json_ld_author_name(entry)
            if name:
                return name
    return None


def extract_from_json_ld(html: str) -> Optional[Dict[str, Any]]:
    """
    Extract article data from JSON-LD structured data embedded in *html*.

    Every ``<script type="application/ld+json">`` element is decoded and the
    first node whose ``@type`` is (or includes) ``Article``, ``NewsArticle``
    or ``BlogPosting`` is returned.

    Args:
        html: Raw page HTML.

    Returns:
        A dict with ``title``, ``description``, ``author`` (``{"name": ...}``),
        ``datePublished``, ``image``, ``source`` (``"json-ld"``),
        ``json_state`` (the matched node) and ``markdownContent``
        (``articleBody`` when present, otherwise a title + description
        fallback). ``None`` when no article node is found. Errors are
        printed, never raised.
    """
    article_types = {"Article", "NewsArticle", "BlogPosting"}

    try:
        soup = BeautifulSoup(html, "html.parser")

        for script in soup.find_all("script", type="application/ld+json"):
            raw = script.string
            if not raw:
                # Empty script element, or one without a single string child.
                continue

            try:
                payload = json.loads(raw)
            except ValueError:
                # Malformed JSON in one script must not hide the others.
                continue

            for node in _json_ld_nodes(payload):
                declared = node.get("@type")
                types = declared if isinstance(declared, list) else [declared]
                if not any(isinstance(t, str) and t in article_types for t in types):
                    continue

                article_data = {
                    "title": node.get("headline") or node.get("name"),
                    "description": node.get("description"),
                    "author": {"name": _json_ld_author_name(node.get("author"))},
                    "datePublished": node.get("datePublished"),
                    "image": node.get("image"),
                    "source": "json-ld",
                    "json_state": node,
                }

                # JSON-LD usually carries only a description; use the full
                # body when the publisher includes it.
                if node.get("articleBody"):
                    article_data["markdownContent"] = node["articleBody"]
                else:
                    # Fallback built only from the parts that exist, so a
                    # missing field never renders as the literal "None".
                    parts = []
                    if article_data["title"]:
                        parts.append(f"# {article_data['title']}")
                    if article_data["description"]:
                        parts.append(str(article_data["description"]))
                    article_data["markdownContent"] = "\n\n".join(parts)

                return article_data

    except Exception as e:
        # Best-effort extractor: report and let the caller fall back.
        print(f"Error extracting JSON-LD: {e}")
    return None


def extract_from_graphql_response(response: dict) -> Optional[Dict[str, Any]]:
    """
    Build the article dict from a direct GraphQL API response.

    Rendering and metadata extraction are delegated to
    ``src.paragraph_parser.parse_graphql_response_to_markdown``; this
    function only validates the rendered body and reshapes the metadata
    into the structure the other extractors produce.

    Args:
        response: The raw GraphQL API response.

    Returns:
        Dict with title, author, markdownContent, etc., or None when the
        parser cannot be imported, the rendered Markdown is empty or shorter
        than 100 characters, or any other error occurs (errors are printed).
    """
    try:
        # Imported lazily so a missing parser module only disables this path.
        from src.paragraph_parser import (
            parse_graphql_response_to_markdown,
            extract_article_metadata,
        )

        body, meta = parse_graphql_response_to_markdown(response)

        # Anything empty or under 100 characters is treated as a failed parse.
        if not (body and len(body) >= 100):
            return None

        field = meta.get

        # Same key layout as the other extractors, with defaults for gaps.
        return {
            "title": field("title", ""),
            "author": field("author", {}),
            "publication": field("publication", ""),
            "markdownContent": body,
            "source": "graphql_api",
            "json_state": response,
            "firstPublishedAt": field("firstPublishedAt"),
            "readingTime": field("readingTime", 0),
            "mediumUrl": field("mediumUrl", ""),
            "canonicalUrl": field("canonicalUrl", ""),
            "clapCount": field("clapCount", 0),
            "isLocked": field("isLocked", False),
            "tags": field("tags", []),
            "detectedLanguage": field("detectedLanguage", "en"),
        }

    except ImportError as err:
        print(f"Error importing paragraph_parser: {err}")
        return None
    except Exception as err:
        print(f"Error extracting from GraphQL response: {err}")
        return None