# Medium-MCP / src/html_renderer.py
# Nikhil Pravin Pise
# fix: Upgrade Medium images to high resolution (1400px) across entire app
# 60742a2
"""
Medium Article HTML Renderer
Renders article data to beautiful HTML matching Medium's styling.
Based on Freedium's medium-parser/core.py template rendering.
"""
import html
from typing import Dict, List, Any, Optional
import logging
# Import centralized image URL utilities
from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH
logger = logging.getLogger("HTMLRenderer")
# Base HTML template for a standalone page.
# Consumed with str.format(): the only placeholders are {title} (inserted into
# the <title> element) and {content} (inserted into <body>). Every literal CSS
# brace is doubled ({{ / }}) so that str.format() emits a single brace.
BASE_TEMPLATE = """<!DOCTYPE html>
<html lang="en" class="dark">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title} | Medium Scraper</title>
<style>
@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@400;700&family=Inter:wght@300;400;600&family=JetBrains+Mono:wght@400;600&display=swap');
:root {{
--bg-color: #121212;
--text-color: #e5e5e5;
--accent-color: #6366f1;
--code-bg: #1e1e1e;
}}
body {{
background-color: var(--bg-color);
color: var(--text-color);
font-family: 'Inter', sans-serif;
line-height: 1.6;
margin: 0;
padding: 0;
}}
/* Container for PDF and Web consistency */
.container {{
max-width: 100%;
margin: 0 auto;
padding: 40px;
}}
/* Typography */
h1, h2, h3, h4 {{
font-family: 'Playfair Display', serif;
color: #ffffff;
margin-top: 2em;
margin-bottom: 0.5em;
line-height: 1.25;
}}
h1 {{ font-size: 2.5rem; border-bottom: 1px solid rgba(255,255,255,0.1); padding-bottom: 20px; }}
h2 {{ font-size: 1.8rem; }}
h3 {{ font-size: 1.5rem; }}
p {{ margin-bottom: 1.5em; font-size: 1.1rem; }}
a {{ color: var(--accent-color); text-decoration: none; }}
a:hover {{ text-decoration: underline; }}
/* Code Blocks */
pre {{
background: var(--code-bg);
padding: 20px;
border-radius: 8px;
overflow-x: auto;
border: 1px solid rgba(255,255,255,0.1);
margin: 2em 0;
}}
code {{
font-family: 'JetBrains Mono', monospace;
font-size: 0.9em;
color: #efefef;
}}
p code {{
background: rgba(255,255,255,0.1);
padding: 2px 6px;
border-radius: 4px;
}}
/* Blockquotes */
blockquote {{
border-left: 4px solid var(--accent-color);
margin: 2em 0;
padding-left: 20px;
font-style: italic;
color: #a1a1aa;
}}
/* Images */
img {{
max-width: 100%;
height: auto;
border-radius: 8px;
margin: 2em auto;
display: block;
}}
/* Lists */
ul, ol {{ margin: 1.5em 0; padding-left: 2em; }}
li {{ margin-bottom: 0.5em; }}
/* Tables */
table {{
width: 100%;
border-collapse: collapse;
margin: 2em 0;
}}
th, td {{
padding: 12px;
border-bottom: 1px solid rgba(255,255,255,0.1);
text-align: left;
}}
th {{ font-weight: 600; color: #fff; }}
/* Author Card */
.author-card {{
background: rgba(255,255,255,0.05);
padding: 20px;
border-radius: 12px;
margin-bottom: 40px;
display: flex;
align-items: center;
gap: 20px;
}}
.author-card img {{ margin: 0; width: 64px; height: 64px; border-radius: 50%; }}
/* Print Overrides */
@media print {{
body {{ background: white; color: black; }}
h1, h2, h3 {{ color: black; }}
pre {{ background: #f5f5f5; border: 1px solid #ddd; color: black; }}
code {{ color: black; }}
a {{ color: #000; text-decoration: underline; }}
.container {{ padding: 0; }}
}}
</style>
</head>
<body class="bg-gray-900 text-gray-100">
{content}
</body>
</html>"""
# Article content template (the fragment placed inside the page body).
# Consumed with str.format(); placeholders: {url}, {preview_image}, {title},
# {subtitle_html}, {author_card}, {content}, {tags_html}. str.format() inserts
# values verbatim, so every value must already be HTML-safe when it is passed in.
ARTICLE_TEMPLATE = """
<div class="container w-full pt-20 mx-auto text-gray-100 break-words bg-gray-800 max-w-none">
<div class="w-full px-4 text-xl leading-normal md:px-6" style="font-family:Georgia,serif;">
<div class="font-sans">
<p class="pb-3 text-base font-bold text-green-500 md:text-sm">
<a href="{url}#bypass" class="text-sm font-bold text-green-500 no-underline md:text-sm hover:underline">&lt; Go to the original</a>
</p>
{preview_image}
<h1 class="pt-6 pb-2 font-sans text-3xl font-bold text-gray-100 break-normal md:text-4xl">{title}</h1>
{subtitle_html}
</div>
{author_card}
<div class="mt-8 main-content">
{content}
</div>
<div class="flex flex-wrap gap-2 mt-5">
{tags_html}
</div>
<div class="container w-full pt-12 mx-auto"></div>
</div>
</div>
"""
# Author card template (avatar, name, follow link, publication and read time).
# Consumed with str.format(); placeholders: {username}, {image_id}, {name},
# {collection_html}, {reading_time}, {free_access}. {username} is used in three
# profile links and {name} in both the avatar alt text and the visible label.
# str.format() inserts values verbatim, so callers must escape them first.
AUTHOR_CARD_TEMPLATE = """
<div class="m-2 mt-5 bg-gray-700 border border-gray-600">
<div class="flex items-center p-4 space-x-4">
<div class="flex-shrink-0">
<a href="https://medium.com/@{username}" target="_blank" class="relative block">
<img src="https://miro.medium.com/v2/resize:fill:88:88/{image_id}"
alt="{name}" loading="eager" referrerpolicy="no-referrer"
class="rounded-full h-11 w-11">
</a>
</div>
<div class="flex-grow">
<a href="https://medium.com/@{username}" target="_blank"
class="block font-semibold text-white">{name}</a>
<button class="px-3 py-1 mt-1 text-sm text-white bg-green-600 rounded-lg">
<a href="https://medium.com/@{username}" target="_blank" class="text-sm text-white">Follow</a>
</button>
</div>
</div>
<div class="px-4 pb-2">
<div class="flex flex-wrap items-center space-x-2 text-sm text-gray-400">
{collection_html}
<span>~{reading_time} min read</span>
<span>·</span>
<span class="text-yellow-400">Free: {free_access}</span>
</div>
</div>
</div>
"""
def escape_html(text: str) -> str:
    """Escape HTML special characters.

    Falsy input (``None``, ``""``, ``0``) yields an empty string. Anything
    else is converted with ``str()`` and passed to ``html.escape``, which also
    escapes both quote characters, so the result can be placed inside a
    quoted attribute value.
    """
    if not text:
        return ""
    return html.escape(str(text))


# Open/close tag pairs for markup types that carry no attributes of their own.
_SIMPLE_MARKUP_TAGS = {
    "STRONG": ("<strong>", "</strong>"),
    "EM": ("<em>", "</em>"),
    "CODE": ('<code class="p-1.5 bg-gray-600 rounded">', "</code>"),
}


def render_markup(text: str, markups: List[Dict]) -> str:
    """Apply markups (bold, italic, links, code) to text.

    Args:
        text: Raw (unescaped) paragraph text.
        markups: Dicts with ``type`` (``STRONG``, ``EM``, ``CODE`` or ``A``)
            and ``start``/``end`` offsets into ``text``. ``A`` markups also
            read ``href``, or ``anchorType``/``userId`` for user mentions.
            Unknown types and empty spans are ignored.

    Returns:
        HTML in which every text segment is escaped and the markup tags are
        inserted at their offsets.
    """
    if not text or not markups:
        return escape_html(text)

    text_len = len(text)

    # Normalise offsets: missing values span to the text boundary and
    # out-of-range values are clamped so slicing below stays well defined.
    spans = []
    for markup in markups:
        start = markup.get("start")
        end = markup.get("end")
        start = 0 if start is None else max(0, min(start, text_len))
        end = text_len if end is None else max(0, min(end, text_len))
        if end > start:
            spans.append((start, end, markup))

    # Outermost first: earlier start wins, and for equal starts the longer span.
    spans.sort(key=lambda span: (span[0], -span[1]))

    # Each event is (position, phase, tie_break, tag). Phase 0 (closing) sorts
    # before phase 1 (opening) at the same position, so adjacent markups do not
    # interleave. Openings keep outer-to-inner order (rank ascending) and
    # closings run inner-to-outer (rank descending), so nested markups nest.
    events = []
    for rank, (start, end, markup) in enumerate(spans):
        markup_type = markup.get("type", "")
        if markup_type in _SIMPLE_MARKUP_TAGS:
            open_tag, close_tag = _SIMPLE_MARKUP_TAGS[markup_type]
        elif markup_type == "A":
            if markup.get("anchorType") == "USER":
                # The user id comes from scraped data too, so escape it.
                href = "https://medium.com/u/" + escape_html(markup.get("userId", ""))
            else:
                href = escape_html(markup.get("href", ""))
            # In-page anchors stay in the same tab.
            target = "" if href.startswith("#") else ' target="_blank"'
            open_tag = f'<a href="{href}"{target} class="underline text-blue-400">'
            close_tag = "</a>"
        else:
            continue
        events.append((start, 1, rank, open_tag))
        events.append((end, 0, -rank, close_tag))

    events.sort(key=lambda event: event[:3])

    # Offsets refer to the raw text, so slice the raw text and escape each
    # segment separately; slicing an already-escaped string would shift every
    # tag that follows an escaped character.
    parts = []
    cursor = 0
    for position, _phase, _tie_break, tag in events:
        if position > cursor:
            parts.append(escape_html(text[cursor:position]))
            cursor = position
        parts.append(tag)
    parts.append(escape_html(text[cursor:]))
    return "".join(parts)
def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
    """Render a single paragraph to HTML.

    Args:
        paragraph: Paragraph dict. ``type`` selects the output element
            (defaults to ``"P"``); ``text`` and ``markups`` supply the
            content. Type-specific keys read here: ``hasDropCap`` (P),
            ``metadata`` (IMG), ``codeBlockMetadata`` (PRE), ``iframe``
            (IFRAME) and ``mixtapeMetadata`` (MIXTAPE_EMBED).
        is_code: When true, markups are ignored and the text is only escaped.

    Returns:
        An HTML fragment. An IFRAME paragraph without a source URL yields
        ``""``; unknown types are logged and rendered as a plain ``<p>``.
    """
    para_type = paragraph.get("type", "P")
    text = paragraph.get("text", "")
    markups = paragraph.get("markups", [])

    formatted_text = escape_html(text) if is_code else render_markup(text, markups)

    if para_type == "H2":
        return f'<h2 class="pt-12 font-bold font-sans break-normal text-gray-100 text-2xl">{formatted_text}</h2>'

    if para_type == "H3":
        return f'<h3 class="pt-12 font-bold font-sans break-normal text-gray-100 text-2xl">{formatted_text}</h3>'

    if para_type == "H4":
        return f'<h4 class="pt-8 font-bold font-sans break-normal text-gray-100 text-xl">{formatted_text}</h4>'

    if para_type == "P":
        css_class = "leading-8 mt-7"
        if paragraph.get("hasDropCap"):
            css_class += " first-letter:text-7xl first-letter:float-left first-letter:mr-2"
        return f'<p class="{css_class}">{formatted_text}</p>'

    if para_type == "IMG":
        metadata = paragraph.get("metadata") or {}
        # The id lands inside a quoted attribute, so it must be escaped.
        image_id = escape_html(metadata.get("id", ""))
        alt = escape_html(metadata.get("alt", ""))
        img_html = (
            "\n"
            '<div class="mt-7">\n'
            f'<img loading="eager" alt="{alt}" class="pt-5 m-auto"\n'
            'referrerpolicy="no-referrer"\n'
            f'src="https://miro.medium.com/v2/resize:fit:1400/{image_id}">\n'
            "</div>\n"
        )
        # The paragraph text, with markups applied, is the image caption.
        if formatted_text:
            img_html += f'<figcaption class="mt-3 text-sm text-center text-gray-400">{formatted_text}</figcaption>'
        return img_html

    if para_type == "PRE":
        code_meta = paragraph.get("codeBlockMetadata") or {}
        lang = code_meta.get("lang") or ""
        lang_class = f"language-{lang}" if lang else "nohighlight"
        return f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{escape_html(text)}</code></pre>'

    if para_type == "BQ":
        return (
            "\n"
            '<blockquote style="box-shadow: inset 3px 0 0 0 rgb(209 207 239);" class="px-5 pt-3 pb-3 mt-5">\n'
            f'<p class="font-italic">{formatted_text}</p>\n'
            "</blockquote>\n"
        )

    if para_type == "PQ":
        return f'<blockquote class="ml-5 text-2xl text-gray-300 mt-7"><p>{formatted_text}</p></blockquote>'

    if para_type in ("ULI", "OLI"):
        # Only the <li> is produced here; the caller supplies the <ul>/<ol>.
        return f'<li class="mt-3">{formatted_text}</li>'

    if para_type == "IFRAME":
        iframe_data = paragraph.get("iframe") or {}
        media_resource = iframe_data.get("mediaResource") or {}
        src = media_resource.get("iframeSrc", "")
        if not src:
            return ""
        # Width and height may be numbers; str() first so 0 is kept, then
        # escape because they are interpolated into attributes.
        width = html.escape(str(media_resource.get("iframeWidth", "100%")))
        height = html.escape(str(media_resource.get("iframeHeight", "400")))
        return (
            "\n"
            '<div class="mt-7">\n'
            f'<iframe class="w-full" src="{escape_html(src)}"\n'
            f'width="{width}" height="{height}"\n'
            'allowfullscreen frameborder="0"></iframe>\n'
            "</div>\n"
        )

    if para_type == "MIXTAPE_EMBED":
        mixtape = paragraph.get("mixtapeMetadata") or {}
        href = escape_html(mixtape.get("href", ""))
        # Escaping prevents breaking out of the style attribute.
        # NOTE(review): this is HTML escaping only, not CSS/URL encoding.
        thumbnail = escape_html(mixtape.get("thumbnailImageId", ""))
        # The first text line is the embed title, the second its description.
        lines = (text or "").split("\n")
        embed_title = lines[0]
        embed_desc = lines[1] if len(lines) > 1 else ""
        return (
            "\n"
            '<div class="items-center p-2 overflow-hidden border border-gray-600 mt-7">\n'
            f'<a rel="noopener follow" href="{href}" target="_blank">\n'
            '<div class="flex flex-row justify-between p-2 overflow-hidden">\n'
            '<div class="flex flex-col justify-center p-2">\n'
            f'<h2 class="text-base font-bold text-gray-100">{escape_html(embed_title)}</h2>\n'
            '<div class="block mt-2">\n'
            f'<h3 class="text-sm text-gray-400">{escape_html(embed_desc)}</h3>\n'
            "</div>\n"
            "</div>\n"
            '<div class="relative flex h-40 flew-row w-60">\n'
            '<div class="absolute inset-0 bg-center bg-cover"\n'
            f'style="background-image: url(\'https://miro.medium.com/v2/resize:fit:800/{thumbnail}\');">\n'
            "</div>\n"
            "</div>\n"
            "</div>\n"
            "</a>\n"
            "</div>\n"
        )

    logger.warning("Unknown paragraph type: %s", para_type)
    return f'<p class="mt-7">{formatted_text}</p>'
def render_paragraphs(paragraphs: List[Dict], title: str = "", subtitle: str = "", preview_image_id: str = "") -> str:
    """Render a paragraph list to HTML, one fragment per line.

    Among the first four paragraphs, headings that repeat ``title``,
    H4/P paragraphs that repeat ``subtitle`` and the image whose id equals
    ``preview_image_id`` are dropped. Consecutive ULI, OLI and PRE paragraphs
    are merged into a single ``<ul>``, ``<ol>`` or ``<pre>`` element; a merged
    code block takes its language from the first paragraph of the run.
    """
    if not paragraphs:
        return ""

    total = len(paragraphs)

    def run_end(start: int, kind: str) -> int:
        # Index just past the run of same-typed paragraphs beginning at start.
        stop = start
        while stop < total and paragraphs[stop].get("type") == kind:
            stop += 1
        return stop

    list_tags = {
        "ULI": ('<ul class="pl-8 mt-2 list-disc">', "</ul>"),
        "OLI": ('<ol class="pl-8 mt-2 list-decimal">', "</ol>"),
    }

    rendered = []
    index = 0
    while index < total:
        current = paragraphs[index]
        kind = current.get("type", "")
        body = current.get("text", "")

        # Near the top, drop content the article header already shows.
        if index < 4 and (
            (kind in ("H3", "H4", "H2") and title and _similarity(body, title) > 0.8)
            or (kind in ("H4", "P") and subtitle and _similarity(body, subtitle) > 0.8)
            or (kind == "IMG" and (current.get("metadata") or {}).get("id") == preview_image_id)
        ):
            index += 1
            continue

        if kind in ("ULI", "OLI"):
            stop = run_end(index, kind)
            items = "".join([render_paragraph(entry) for entry in paragraphs[index:stop]])
            opener, closer = list_tags[kind]
            rendered.append(f"{opener}{items}{closer}")
            index = stop
            continue

        if kind == "PRE":
            stop = run_end(index, "PRE")
            joined_code = "\n".join(
                [escape_html(entry.get("text", "")) for entry in paragraphs[index:stop]]
            )
            lang = (current.get("codeBlockMetadata") or {}).get("lang") or ""
            lang_class = f"language-{lang}" if lang else "nohighlight"
            rendered.append(f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{joined_code}</code></pre>')
            index = stop
            continue

        rendered.append(render_paragraph(current))
        index += 1

    return "\n".join(rendered)
def _similarity(s1: str, s2: str) -> float:
"""Calculate similarity ratio between two strings."""
if not s1 or not s2:
return 0.0
s1, s2 = s1.lower(), s2.lower()
if s1 == s2:
return 1.0
# Simple character overlap
common = len(set(s1) & set(s2))
total = len(set(s1) | set(s2))
return common / total if total > 0 else 0.0
def render_article_html(article_data: Dict[str, Any]) -> str:
    """
    Render article data to HTML content (not full page).

    Args:
        article_data: Dict of article fields. Keys read here: ``title``,
            ``subtitle``, ``url``, ``author`` (dict with ``name``,
            ``username``, ``imageId``, or a plain name string),
            ``publication`` / ``collection`` (dict with ``name`` and
            ``slug``, or a plain name string), ``readingTime``,
            ``isLocked``, ``previewImageId``, ``tags``, ``paragraphs`` and
            ``markdownContent``.

    Returns:
        HTML string for article content
    """
    raw_title = article_data.get("title", "Untitled")
    title = escape_html(raw_title)
    subtitle = article_data.get("subtitle", "")
    url = escape_html(article_data.get("url", ""))

    # Author info
    author = article_data.get("author") or {}
    if isinstance(author, str):
        author = {"name": author}
    author_name = escape_html(author.get("name", "Unknown"))
    author_username = escape_html(author.get("username", ""))
    # Use the placeholder avatar when imageId is missing or empty, and escape
    # the id because it is interpolated into an attribute.
    author_image = escape_html(author.get("imageId") or "1*dmbNkD5D-u45r44go_cf0g.png")

    # Collection/publication
    collection = article_data.get("publication") or article_data.get("collection") or {}
    if isinstance(collection, str):
        collection = {"name": collection}
    collection_html = ""
    if collection and isinstance(collection, dict) and collection.get("name"):
        collection_html = (
            "\n"
            f'<a href="https://medium.com/{escape_html(collection.get("slug", ""))}" target="_blank" class="flex items-center space-x-1">\n'
            f'<p>{escape_html(collection.get("name", ""))}</p>\n'
            "</a>\n"
            "<span>·</span>\n"
        )

    # Reading time: floats are truncated; str() first so 0 is kept, then
    # escape so a non-numeric value cannot inject markup.
    reading_time = article_data.get("readingTime", 5)
    if isinstance(reading_time, float):
        reading_time = int(reading_time)
    reading_time = html.escape(str(reading_time))

    # Free access
    free_access = "No" if article_data.get("isLocked", False) else "Yes"

    # Preview image. The raw id is kept for comparison against paragraph
    # metadata; only the copy placed in the markup is escaped.
    preview_image_id = article_data.get("previewImageId", "")
    preview_image_html = ""
    if preview_image_id:
        preview_image_html = (
            "\n"
            '<img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto"\n'
            'loading="eager" referrerpolicy="no-referrer"\n'
            f'src="https://miro.medium.com/v2/resize:fit:1400/{escape_html(preview_image_id)}">\n'
        )

    # Subtitle
    subtitle_html = ""
    if subtitle:
        subtitle_html = f'<h2 class="pt-1 font-sans font-medium text-gray-400 break-normal text-1xl">{escape_html(subtitle)}</h2>'

    # Tags: at most ten; entries may be dicts or plain strings, and the key
    # may be present with a None value.
    tag_links = []
    for tag in (article_data.get("tags") or [])[:10]:
        if isinstance(tag, dict):
            tag_slug = tag.get("normalizedTagSlug", tag)
            tag_display = tag.get("displayTitle", tag_slug)
        else:
            tag_slug = str(tag)
            tag_display = tag_slug
        tag_links.append(
            "\n"
            f'<a title="{escape_html(tag_display)}" target="_blank" href="https://medium.com/tag/{escape_html(tag_slug)}">\n'
            f'<span class="px-2 py-1 text-xs text-green-400 bg-green-900 rounded-full">#{escape_html(tag_slug)}</span>\n'
            "</a>\n"
        )
    tags_html = "".join(tag_links)

    # Author card
    author_card = AUTHOR_CARD_TEMPLATE.format(
        username=author_username,
        image_id=author_image,
        name=author_name,
        collection_html=collection_html,
        reading_time=reading_time,
        free_access=free_access,
    )

    # Content: structured paragraphs are preferred. Markdown is used when
    # there are no paragraphs, or when markdownContent exists and the
    # paragraphs look like raw markdown dumped into text fields. In that
    # second case markdownContent is non-empty by construction, so it never
    # has to be rebuilt from paragraph text.
    paragraphs = article_data.get("paragraphs", [])
    markdown_content = article_data.get("markdownContent", "")
    if not paragraphs:
        content_html = _markdown_to_html(markdown_content)
    elif markdown_content and _is_likely_markdown(paragraphs):
        logger.info("Detected raw markdown in paragraphs - switching to Markdown Renderer")
        content_html = _markdown_to_html(markdown_content)
    else:
        # Duplicate-title detection compares against raw paragraph text, so
        # pass the unescaped title; an escaped one would mismatch on & < > ".
        content_html = render_paragraphs(
            paragraphs,
            str(raw_title) if raw_title else "",
            subtitle,
            preview_image_id,
        )

    # Build article HTML
    return ARTICLE_TEMPLATE.format(
        url=url,
        preview_image=preview_image_html,
        title=title,
        subtitle_html=subtitle_html,
        author_card=author_card,
        content=content_html,
        tags_html=tags_html,
    )
def _is_likely_markdown(paragraphs: List[Dict]) -> bool:
"""
Detect if paragraphs are actually just containers for raw markdown.
This happens when the scraper falls back to dumping markdown tokens into the text field.
"""
if not paragraphs:
return False
# Check the first few paragraphs for tell-tale markdown syntax
# that shouldn't appear in clean text
sample_text = "\n".join([p.get("text", "") for p in paragraphs[:8]])
triggers = [
"#### ", # Headers
"![", # Images
"](http", # Links
"```", # Code blocks
"** ", # Bold at start
"---", # HR
]
return any(trigger in sample_text for trigger in triggers)
def render_full_page(article_data: Dict[str, Any]) -> str:
    """
    Build a complete standalone HTML document for an article.

    Args:
        article_data: Article dict; ``title`` (default ``"Untitled"``) fills
            the page title and the whole dict is passed to
            ``render_article_html`` for the body.

    Returns:
        ``BASE_TEMPLATE`` with the escaped title and rendered article inserted.
    """
    return BASE_TEMPLATE.format(
        title=escape_html(article_data.get("title", "Untitled")),
        content=render_article_html(article_data),
    )
import markdown as md_lib
def _markdown_to_html(markdown_text: str) -> str:
    """Convert markdown to HTML and attach the renderer's CSS classes.

    Args:
        markdown_text: Markdown source; falsy input returns ``""``.

    Returns:
        HTML produced by the ``markdown`` library, with a ``class`` attribute
        added to every bare ``<h1>``-``<h4>``, ``<p>``, ``<ul>``, ``<ol>``,
        ``<li>``, ``<blockquote>`` and ``<pre>`` opening tag.
    """
    if not markdown_text:
        return ""

    # NOTE(review): the converter output is returned without sanitising; raw
    # HTML embedded in the markdown presumably passes through. Confirm the
    # input is trusted or sanitise downstream.
    html_content = md_lib.markdown(
        markdown_text,
        extensions=['extra', 'codehilite', 'nl2br', 'sane_lists', 'fenced_code'],
        output_format='html5',
    )

    # Only bare opening tags are rewritten. Matching "<blockquote>" exactly,
    # rather than the suffix "blockquote>", leaves "</blockquote>" untouched.
    tag_classes = (
        ("h1", "pt-12 font-bold text-3xl"),
        ("h2", "pt-12 font-bold text-2xl"),
        ("h3", "pt-8 font-bold text-xl"),
        ("h4", "pt-6 font-bold text-lg"),
        ("p", "mt-4 leading-8"),
        ("ul", "pl-8 mt-2 list-disc"),
        ("ol", "pl-8 mt-2 list-decimal"),
        ("li", "ml-4 mt-1"),
        ("blockquote", "px-5 py-3 mt-5 border-l-4 border-gray-500"),
        ("pre", "mt-7 border border-gray-700 bg-gray-900 p-4 rounded overflow-x-auto"),
    )
    for tag, css_classes in tag_classes:
        html_content = html_content.replace(f"<{tag}>", f'<{tag} class="{css_classes}">')

    return html_content