# Spaces: T0X1N / Medium-MCP (Sleeping)
# File size: 23,867 Bytes

"""
Medium Article HTML Renderer

Renders article data to beautiful HTML matching Medium's styling.
Based on Freedium's medium-parser/core.py template rendering.
"""

import difflib
import html
import logging
from typing import Dict, List, Any, Optional

# Import centralized image URL utilities
from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH

# Module-level logger; registered under the fixed name "HTMLRenderer".
logger = logging.getLogger("HTMLRenderer")

# Base HTML template for a standalone page, meant to be filled with str.format().
# Placeholders: {title} (used inside <title>) and {content} (placed inside <body>).
# Every literal CSS brace is doubled ({{ / }}) so str.format() emits a single brace.
# NOTE(review): the <html>/<body> class attributes look like utility-framework
# class names, but this template only ships the inline <style> below - confirm
# whether an external stylesheet is expected.
BASE_TEMPLATE = """<!DOCTYPE html>
<html lang="en" class="dark">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title} | Medium Scraper</title>
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@400;700&family=Inter:wght@300;400;600&family=JetBrains+Mono:wght@400;600&display=swap');
        
        :root {{
            --bg-color: #121212;
            --text-color: #e5e5e5;
            --accent-color: #6366f1;
            --code-bg: #1e1e1e;
        }}
        
        body {{
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 0;
        }}

        /* Container for PDF and Web consistency */
        .container {{
            max-width: 100%;
            margin: 0 auto;
            padding: 40px;
        }}

        /* Typography */
        h1, h2, h3, h4 {{
            font-family: 'Playfair Display', serif;
            color: #ffffff;
            margin-top: 2em;
            margin-bottom: 0.5em;
            line-height: 1.25;
        }}
        
        h1 {{ font-size: 2.5rem; border-bottom: 1px solid rgba(255,255,255,0.1); padding-bottom: 20px; }}
        h2 {{ font-size: 1.8rem; }}
        h3 {{ font-size: 1.5rem; }}
        
        p {{ margin-bottom: 1.5em; font-size: 1.1rem; }}
        
        a {{ color: var(--accent-color); text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}

        /* Code Blocks */
        pre {{
            background: var(--code-bg);
            padding: 20px;
            border-radius: 8px;
            overflow-x: auto;
            border: 1px solid rgba(255,255,255,0.1);
            margin: 2em 0;
        }}
        
        code {{
            font-family: 'JetBrains Mono', monospace;
            font-size: 0.9em;
            color: #efefef;
        }}
        
        p code {{
            background: rgba(255,255,255,0.1);
            padding: 2px 6px;
            border-radius: 4px;
        }}

        /* Blockquotes */
        blockquote {{
            border-left: 4px solid var(--accent-color);
            margin: 2em 0;
            padding-left: 20px;
            font-style: italic;
            color: #a1a1aa;
        }}

        /* Images */
        img {{
            max-width: 100%;
            height: auto;
            border-radius: 8px;
            margin: 2em auto;
            display: block;
        }}
        
        /* Lists */
        ul, ol {{ margin: 1.5em 0; padding-left: 2em; }}
        li {{ margin-bottom: 0.5em; }}
        
        /* Tables */
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 2em 0;
        }}
        th, td {{
            padding: 12px;
            border-bottom: 1px solid rgba(255,255,255,0.1);
            text-align: left;
        }}
        th {{ font-weight: 600; color: #fff; }}

        /* Author Card */
        .author-card {{
            background: rgba(255,255,255,0.05);
            padding: 20px;
            border-radius: 12px;
            margin-bottom: 40px;
            display: flex;
            align-items: center;
            gap: 20px;
        }}
        .author-card img {{ margin: 0; width: 64px; height: 64px; border-radius: 50%; }}
        
        /* Print Overrides */
        @media print {{
            body {{ background: white; color: black; }}
            h1, h2, h3 {{ color: black; }}
            pre {{ background: #f5f5f5; border: 1px solid #ddd; color: black; }}
            code {{ color: black; }}
            a {{ color: #000; text-decoration: underline; }}
            .container {{ padding: 0; }}
        }}
    </style>
</head>
<body class="bg-gray-900 text-gray-100">
    {content}
</body>
</html>"""

# Article content template (an HTML fragment, not a full page), filled with
# str.format(). Placeholders: {url}, {preview_image}, {title}, {subtitle_html},
# {author_card}, {content}, {tags_html}.
# Values are inserted verbatim, so anything that is not already HTML must be
# escaped by the caller before formatting.
ARTICLE_TEMPLATE = """
<div class="container w-full pt-20 mx-auto text-gray-100 break-words bg-gray-800 max-w-none">
    <div class="w-full px-4 text-xl leading-normal md:px-6" style="font-family:Georgia,serif;">
        <div class="font-sans">
            <p class="pb-3 text-base font-bold text-green-500 md:text-sm">
                <a href="{url}#bypass" class="text-sm font-bold text-green-500 no-underline md:text-sm hover:underline">&lt; Go to the original</a>
            </p>
            {preview_image}
            <h1 class="pt-6 pb-2 font-sans text-3xl font-bold text-gray-100 break-normal md:text-4xl">{title}</h1>
            {subtitle_html}
        </div>
        {author_card}
        <div class="mt-8 main-content">
            {content}
        </div>
        <div class="flex flex-wrap gap-2 mt-5">
            {tags_html}
        </div>
        <div class="container w-full pt-12 mx-auto"></div>
    </div>
</div>
"""

# Author card template, filled with str.format(). Placeholders: {username}
# (appended to https://medium.com/@ in three links), {image_id} (appended to a
# miro.medium.com resize URL), {name}, {collection_html}, {reading_time} and
# {free_access}.
# Values are inserted verbatim into markup and attribute values, so the caller
# is responsible for escaping them.
AUTHOR_CARD_TEMPLATE = """
<div class="m-2 mt-5 bg-gray-700 border border-gray-600">
    <div class="flex items-center p-4 space-x-4">
        <div class="flex-shrink-0">
            <a href="https://medium.com/@{username}" target="_blank" class="relative block">
                <img src="https://miro.medium.com/v2/resize:fill:88:88/{image_id}" 
                     alt="{name}" loading="eager" referrerpolicy="no-referrer" 
                     class="rounded-full h-11 w-11">
            </a>
        </div>
        <div class="flex-grow">
            <a href="https://medium.com/@{username}" target="_blank" 
               class="block font-semibold text-white">{name}</a>
            <button class="px-3 py-1 mt-1 text-sm text-white bg-green-600 rounded-lg">
                <a href="https://medium.com/@{username}" target="_blank" class="text-sm text-white">Follow</a>
            </button>
        </div>
    </div>
    <div class="px-4 pb-2">
        <div class="flex flex-wrap items-center space-x-2 text-sm text-gray-400">
            {collection_html}
            <span>~{reading_time} min read</span>
            <span>·</span>
            <span class="text-yellow-400">Free: {free_access}</span>
        </div>
    </div>
</div>
"""


def escape_html(text: str) -> str:
    """Return ``text`` with HTML-special characters replaced by entities.

    Any falsy input (``None``, ``""``, ``0`` ...) yields an empty string;
    every other value is converted with ``str()`` before being escaped.
    """
    return html.escape(str(text)) if text else ""


def render_markup(text: str, markups: List[Dict]) -> str:
    """Apply inline markups (bold, italic, code, links) to ``text``.

    Args:
        text: Raw (unescaped) paragraph text.
        markups: Dicts with a ``type`` (``STRONG``, ``EM``, ``CODE`` or ``A``)
            and ``start``/``end`` offsets into ``text``. Link markups also
            read ``href`` or, when ``anchorType`` is ``USER``, ``userId``.
            Entries with any other ``type`` are ignored.

    Returns:
        The HTML-escaped text with the markup tags inserted. Falsy ``text``
        yields ``""``.
    """
    if not text:
        return ""
    text = str(text)
    if not markups:
        return html.escape(text)

    simple_tags = {
        "STRONG": ("<strong>", "</strong>"),
        "EM": ("<em>", "</em>"),
        "CODE": ('<code class="p-1.5 bg-gray-600 rounded">', "</code>"),
    }

    length = len(text)
    # Each event is (position, is_opening, nesting_key, tie_break, tag).
    events = []

    for index, markup in enumerate(markups):
        # NOTE(review): offsets are used as Python string indexes; if the
        # upstream data counts UTF-16 code units, text containing astral
        # characters would need an offset conversion - confirm.
        start = max(0, min(markup.get("start") or 0, length))
        end = markup.get("end")
        end = length if end is None else max(0, min(end, length))
        if end <= start:
            # Empty or inverted ranges cannot wrap any text.
            continue

        markup_type = markup.get("type", "")
        if markup_type in simple_tags:
            open_tag, close_tag = simple_tags[markup_type]
        elif markup_type == "A":
            if markup.get("anchorType") == "USER":
                user_id = html.escape(str(markup.get("userId") or ""))
                href = f"https://medium.com/u/{user_id}"
            else:
                href = html.escape(str(markup.get("href") or ""))
            # In-page anchors stay in the same tab.
            target = "" if href.startswith("#") else ' target="_blank"'
            open_tag = f'<a href="{href}"{target} class="underline text-blue-400">'
            close_tag = "</a>"
        else:
            continue

        # At one position: closing tags come first (0 < 1). Among openers the
        # longest range opens first; among closers the most recently opened
        # range closes first. The index breaks ties for identical ranges so
        # they still nest (open in list order, close in reverse order).
        # Ranges that partially overlap are emitted as given and may still
        # produce crossing tags.
        events.append((start, 1, -end, index, open_tag))
        events.append((end, 0, -start, -index, close_tag))

    events.sort(key=lambda event: event[:4])

    # Escape each slice of the *raw* text separately: the offsets refer to the
    # unescaped string, so slicing an already-escaped copy would misplace
    # tags after any '&', '<', '>' or quote character.
    parts = []
    cursor = 0
    for position, _is_opening, _nesting, _tie, tag in events:
        if position > cursor:
            parts.append(html.escape(text[cursor:position]))
            cursor = position
        parts.append(tag)
    parts.append(html.escape(text[cursor:]))
    return "".join(parts)


def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
    """Render a single paragraph dict to an HTML fragment.

    Args:
        paragraph: Paragraph data. Reads ``type`` (default ``"P"``), ``text``
            and ``markups`` and, depending on the type, ``hasDropCap``,
            ``metadata``, ``codeBlockMetadata``, ``iframe`` or
            ``mixtapeMetadata``.
        is_code: When true, ``markups`` are ignored and the text is only
            HTML-escaped.

    Returns:
        The HTML fragment. ``ULI``/``OLI`` return a bare ``<li>`` (no list
        wrapper); an ``IFRAME`` without a source returns ``""``; unknown types
        are logged and rendered as a plain ``<p>``.
    """
    para_type = paragraph.get("type", "P")
    text = paragraph.get("text", "")
    markups = paragraph.get("markups", [])

    # Code text is escaped verbatim; everything else gets inline markup.
    formatted_text = escape_html(text) if is_code else render_markup(text, markups)

    if para_type == "H2":
        return f'<h2 class="pt-12 font-bold font-sans break-normal text-gray-100 text-2xl">{formatted_text}</h2>'

    if para_type == "H3":
        return f'<h3 class="pt-12 font-bold font-sans break-normal text-gray-100 text-2xl">{formatted_text}</h3>'

    if para_type == "H4":
        return f'<h4 class="pt-8 font-bold font-sans break-normal text-gray-100 text-xl">{formatted_text}</h4>'

    if para_type == "P":
        css_class = "leading-8 mt-7"
        if paragraph.get("hasDropCap"):
            css_class += " first-letter:text-7xl first-letter:float-left first-letter:mr-2"
        return f'<p class="{css_class}">{formatted_text}</p>'

    if para_type == "IMG":
        metadata = paragraph.get("metadata") or {}
        # Both values land inside attribute values, so they are escaped to
        # keep scraped data from breaking out of the attribute.
        image_id = escape_html(metadata.get("id", ""))
        alt = escape_html(metadata.get("alt", ""))

        img_html = f'''
        <div class="mt-7">
            <img loading="eager" alt="{alt}" class="pt-5 m-auto" 
                 referrerpolicy="no-referrer" 
                 src="https://miro.medium.com/v2/resize:fit:1400/{image_id}">
        </div>
        '''
        # The paragraph text, when present, is the image caption.
        if formatted_text:
            img_html += f'<figcaption class="mt-3 text-sm text-center text-gray-400">{formatted_text}</figcaption>'
        return img_html

    if para_type == "PRE":
        code_meta = paragraph.get("codeBlockMetadata") or {}
        lang = code_meta.get("lang") or ""
        # The language name becomes part of a class attribute, so escape it.
        lang_class = f"language-{escape_html(lang)}" if lang else "nohighlight"
        return f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{escape_html(text)}</code></pre>'

    if para_type == "BQ":
        return f'''
        <blockquote style="box-shadow: inset 3px 0 0 0 rgb(209 207 239);" class="px-5 pt-3 pb-3 mt-5">
            <p class="font-italic">{formatted_text}</p>
        </blockquote>
        '''

    if para_type == "PQ":
        return f'<blockquote class="ml-5 text-2xl text-gray-300 mt-7"><p>{formatted_text}</p></blockquote>'

    if para_type in ("ULI", "OLI"):
        # Ordered and unordered items share the same <li> markup.
        return f'<li class="mt-3">{formatted_text}</li>'

    if para_type == "IFRAME":
        iframe_data = paragraph.get("iframe") or {}
        media_resource = iframe_data.get("mediaResource") or {}
        src = media_resource.get("iframeSrc", "")
        if not src:
            return ""
        # str() first so numeric sizes (including 0) survive escape_html,
        # which maps falsy values to "".
        width = escape_html(str(media_resource.get("iframeWidth", "100%")))
        height = escape_html(str(media_resource.get("iframeHeight", "400")))
        return f'''
            <div class="mt-7">
                <iframe class="w-full" src="{escape_html(src)}" 
                        width="{width}" height="{height}" 
                        allowfullscreen frameborder="0"></iframe>
            </div>
            '''

    if para_type == "MIXTAPE_EMBED":
        mixtape = paragraph.get("mixtapeMetadata") or {}
        href = escape_html(mixtape.get("href", ""))
        # Escaped because it is interpolated into a style attribute.
        thumbnail = escape_html(mixtape.get("thumbnailImageId", ""))

        # First line of the text is the embed title, second the description.
        parts = text.split("\n") if text else ["", ""]
        embed_title = parts[0] if len(parts) > 0 else ""
        embed_desc = parts[1] if len(parts) > 1 else ""

        return f'''
        <div class="items-center p-2 overflow-hidden border border-gray-600 mt-7">
            <a rel="noopener follow" href="{href}" target="_blank">
                <div class="flex flex-row justify-between p-2 overflow-hidden">
                    <div class="flex flex-col justify-center p-2">
                        <h2 class="text-base font-bold text-gray-100">{escape_html(embed_title)}</h2>
                        <div class="block mt-2">
                            <h3 class="text-sm text-gray-400">{escape_html(embed_desc)}</h3>
                        </div>
                    </div>
                    <div class="relative flex h-40 flew-row w-60">
                        <div class="absolute inset-0 bg-center bg-cover" 
                             style="background-image: url('https://miro.medium.com/v2/resize:fit:800/{thumbnail}');">
                        </div>
                    </div>
                </div>
            </a>
        </div>
        '''

    logger.warning("Unknown paragraph type: %s", para_type)
    return f'<p class="mt-7">{formatted_text}</p>'


def render_paragraphs(paragraphs: List[Dict], title: str = "", subtitle: str = "", preview_image_id: str = "") -> str:
    """Render a list of paragraph dicts to HTML content.

    Consecutive ``ULI``/``OLI`` paragraphs are wrapped in a single
    ``<ul>``/``<ol>``, and consecutive ``PRE`` paragraphs are merged into one
    code block (using the language of the first one). Within the first four
    paragraphs, headings similar to ``title``, ``H4``/``P`` paragraphs similar
    to ``subtitle`` and the image whose id equals ``preview_image_id`` are
    skipped.

    Args:
        paragraphs: Paragraph dicts; each is read via ``type``, ``text`` and
            type-specific keys.
        title: Unescaped article title used for duplicate detection.
        subtitle: Unescaped article subtitle used for duplicate detection.
        preview_image_id: Id of the preview image; empty disables the image
            check.

    Returns:
        The rendered fragments joined with newlines, or ``""`` when there are
        no paragraphs.
    """
    if not paragraphs:
        return ""

    list_wrappers = {
        "ULI": ('<ul class="pl-8 mt-2 list-disc">', "</ul>"),
        "OLI": ('<ol class="pl-8 mt-2 list-decimal">', "</ol>"),
    }

    out_parts = []
    total = len(paragraphs)
    i = 0

    while i < total:
        para = paragraphs[i]
        para_type = para.get("type", "")
        para_text = para.get("text", "")

        # Skip duplicate title/subtitle/preview image in the first 4 paragraphs.
        if i < 4:
            if para_type in ("H3", "H4", "H2") and title and _similarity(para_text, title) > 0.8:
                i += 1
                continue
            if para_type in ("H4", "P") and subtitle and _similarity(para_text, subtitle) > 0.8:
                i += 1
                continue
            if para_type == "IMG" and preview_image_id:
                # Only compare when a preview id exists; otherwise an image
                # with an empty id would equal the empty default.
                metadata = para.get("metadata") or {}
                if metadata.get("id") == preview_image_id:
                    i += 1
                    continue

        # Grouped elements: collect the whole run of same-typed paragraphs.
        if para_type in list_wrappers or para_type == "PRE":
            run_end = i
            while run_end < total and paragraphs[run_end].get("type") == para_type:
                run_end += 1
            run = paragraphs[i:run_end]
            i = run_end

            if para_type == "PRE":
                code_meta = para.get("codeBlockMetadata") or {}
                lang = code_meta.get("lang") or ""
                # Escaped because it becomes part of a class attribute.
                lang_class = f"language-{escape_html(lang)}" if lang else "nohighlight"
                joined_code = "\n".join(escape_html(item.get("text", "")) for item in run)
                out_parts.append(f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{joined_code}</code></pre>')
            else:
                open_tag, close_tag = list_wrappers[para_type]
                items = "".join(render_paragraph(item) for item in run)
                out_parts.append(f"{open_tag}{items}{close_tag}")
            continue

        # Regular paragraph
        out_parts.append(render_paragraph(para))
        i += 1

    return "\n".join(out_parts)


def _similarity(s1: str, s2: str) -> float:
    """Calculate similarity ratio between two strings."""
    if not s1 or not s2:
        return 0.0
    s1, s2 = s1.lower(), s2.lower()
    if s1 == s2:
        return 1.0
    # Simple character overlap
    common = len(set(s1) & set(s2))
    total = len(set(s1) | set(s2))
    return common / total if total > 0 else 0.0


def render_article_html(article_data: Dict[str, Any]) -> str:
    """
    Render article data to HTML content (not full page).

    Args:
        article_data: Dict read via the keys ``title``, ``subtitle``, ``url``,
            ``author`` (dict or plain name string), ``publication`` /
            ``collection`` (dict or plain name string), ``readingTime``,
            ``isLocked``, ``previewImageId``, ``tags`` (dicts or plain
            values), ``paragraphs`` and ``markdownContent``.

    Returns:
        HTML string for article content
    """
    # Keep the raw title for duplicate-heading detection; only the escaped
    # copy goes into the markup.
    raw_title = article_data.get("title") or "Untitled"
    title = escape_html(raw_title)
    subtitle = article_data.get("subtitle") or ""
    url = escape_html(article_data.get("url", ""))

    # Author info
    author = article_data.get("author") or {}
    if isinstance(author, str):
        author = {"name": author}

    author_name = escape_html(author.get("name") or "Unknown")
    author_username = escape_html(author.get("username", ""))
    author_image = escape_html(author.get("imageId") or "1*dmbNkD5D-u45r44go_cf0g.png")

    # Collection/publication
    collection = article_data.get("publication") or article_data.get("collection") or {}
    if isinstance(collection, str):
        collection = {"name": collection}
    collection_html = ""
    if collection and isinstance(collection, dict) and collection.get("name"):
        collection_html = f'''
        <a href="https://medium.com/{escape_html(collection.get('slug', ''))}" target="_blank" class="flex items-center space-x-1">
            <p>{escape_html(collection.get('name', ''))}</p>
        </a>
        <span>·</span>
        '''

    # Reading time: whole minutes, escaped in case the value is a string.
    reading_time = article_data.get("readingTime")
    if reading_time is None:
        reading_time = 5
    if isinstance(reading_time, float):
        reading_time = int(reading_time)
    reading_time = escape_html(str(reading_time))

    # Free access
    free_access = "No" if article_data.get("isLocked", False) else "Yes"

    # Preview image (raw id is kept for comparison against paragraph images)
    preview_image_id = article_data.get("previewImageId") or ""
    preview_image_html = ""
    if preview_image_id:
        preview_image_html = f'''
        <img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto" 
             loading="eager" referrerpolicy="no-referrer" 
             src="https://miro.medium.com/v2/resize:fit:1400/{escape_html(preview_image_id)}">
        '''

    # Subtitle
    subtitle_html = ""
    if subtitle:
        subtitle_html = f'<h2 class="pt-1 font-sans font-medium text-gray-400 break-normal text-1xl">{escape_html(subtitle)}</h2>'

    # Tags (at most 10). Dict tags fall back between slug and display title
    # so a missing key never renders the dict itself.
    tag_links = []
    for tag in (article_data.get("tags") or [])[:10]:
        if isinstance(tag, dict):
            tag_slug = tag.get("normalizedTagSlug") or ""
            tag_display = tag.get("displayTitle") or tag_slug
            tag_slug = tag_slug or tag_display
        else:
            tag_slug = tag_display = str(tag)
        if not tag_slug:
            continue
        tag_links.append(f'''
        <a title="{escape_html(tag_display)}" target="_blank" href="https://medium.com/tag/{escape_html(tag_slug)}">
            <span class="px-2 py-1 text-xs text-green-400 bg-green-900 rounded-full">#{escape_html(tag_slug)}</span>
        </a>
        ''')
    tags_html = "".join(tag_links)

    # Author card
    author_card = AUTHOR_CARD_TEMPLATE.format(
        username=author_username,
        image_id=author_image,
        name=author_name,
        collection_html=collection_html,
        reading_time=reading_time,
        free_access=free_access
    )

    # Content - structured paragraphs are preferred; markdown is used when
    # there are none, or when the paragraphs look like raw markdown dumps and
    # real markdown content is available to render instead.
    paragraphs = article_data.get("paragraphs") or []
    markdown_content = article_data.get("markdownContent") or ""

    use_markdown_renderer = not paragraphs
    if not use_markdown_renderer and markdown_content and _is_likely_markdown(paragraphs):
        logger.info("Detected raw markdown in paragraphs - switching to Markdown Renderer")
        use_markdown_renderer = True

    if use_markdown_renderer:
        # markdown_content can only be empty here when there are no
        # paragraphs either, so there is nothing to rebuild it from.
        content_html = _markdown_to_html(markdown_content)
    else:
        content_html = render_paragraphs(paragraphs, raw_title, subtitle, preview_image_id)

    # Build article HTML
    return ARTICLE_TEMPLATE.format(
        url=url,
        preview_image=preview_image_html,
        title=title,
        subtitle_html=subtitle_html,
        author_card=author_card,
        content=content_html,
        tags_html=tags_html
    )


def _is_likely_markdown(paragraphs: List[Dict]) -> bool:
    """
    Detect if paragraphs are actually just containers for raw markdown.
    This happens when the scraper falls back to dumping markdown tokens into the text field.
    """
    if not paragraphs:
        return False
    
    # Check the first few paragraphs for tell-tale markdown syntax
    # that shouldn't appear in clean text
    sample_text = "\n".join([p.get("text", "") for p in paragraphs[:8]])
    
    triggers = [
        "#### ",      # Headers
        "![",         # Images
        "](http",     # Links
        "```",        # Code blocks
        "** ",        # Bold at start
        "---",        # HR
    ]
    
    return any(trigger in sample_text for trigger in triggers)


def render_full_page(article_data: Dict[str, Any]) -> str:
    """
    Build a complete standalone HTML page for an article.

    The escaped title (``"Untitled"`` when the key is absent) and the article
    fragment produced by ``render_article_html`` are substituted into
    ``BASE_TEMPLATE``.

    Args:
        article_data: Dict with title, author, markdownContent, etc.

    Returns:
        Complete HTML page string
    """
    return BASE_TEMPLATE.format(
        title=escape_html(article_data.get("title", "Untitled")),
        content=render_article_html(article_data),
    )


import markdown as md_lib

def _markdown_to_html(markdown_text: str) -> str:
    """Robust markdown to HTML conversion using library."""
    if not markdown_text:
        return ""
    
    # Use extra extension for tables, code blocks, etc.
    html_content = md_lib.markdown(
        markdown_text,
        extensions=['extra', 'codehilite', 'nl2br', 'sane_lists', 'fenced_code'],
        output_format='html5'
    )
    
    # Post-process for Tailwind/Prose styling matching our CSS
    # Enhance headers
    html_content = html_content.replace('<h1>', '<h1 class="pt-12 font-bold text-3xl">')
    html_content = html_content.replace('<h2>', '<h2 class="pt-12 font-bold text-2xl">')
    html_content = html_content.replace('<h3>', '<h3 class="pt-8 font-bold text-xl">')
    html_content = html_content.replace('<h4>', '<h4 class="pt-6 font-bold text-lg">')
    
    # Enhance paragraphs (add margin/leading)
    html_content = html_content.replace('<p>', '<p class="mt-4 leading-8">')
    
    # Enhance lists
    html_content = html_content.replace('<ul>', '<ul class="pl-8 mt-2 list-disc">')
    html_content = html_content.replace('<ol>', '<ol class="pl-8 mt-2 list-decimal">')
    html_content = html_content.replace('<li>', '<li class="ml-4 mt-1">')
    
    # Enhance blockquotes
    html_content = html_content.replace('blockquote>', 'blockquote class="px-5 py-3 mt-5 border-l-4 border-gray-500">')
    
    # Enhance pre/code
    html_content = html_content.replace('<pre>', '<pre class="mt-7 border border-gray-700 bg-gray-900 p-4 rounded overflow-x-auto">')
    
    return html_content