"""
Medium Article HTML Renderer
Renders article data to beautiful HTML matching Medium's styling.
Based on Freedium's medium-parser/core.py template rendering.
"""
import html
from typing import Dict, List, Any, Optional
import logging
# Import centralized image URL utilities
from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH
logger = logging.getLogger("HTMLRenderer")
# Page shell used by render_full_page(); filled via str.format with the
# keyword arguments `title` and `content`.
# NOTE(review): the literal contains no HTML markup (no doctype, <head> or
# <body>), so the "standalone page" is plain text -- confirm whether the
# markup was lost from this literal.
BASE_TEMPLATE = """
{title} | Medium Scraper
{content}
"""
# Article body layout used by render_article_html(); filled via str.format.
# NOTE(review): the caller also passes `url`, `preview_image`, `title` and
# `subtitle_html`. str.format ignores keyword arguments that have no matching
# placeholder, so with this literal those values never reach the output.
ARTICLE_TEMPLATE = """
{author_card}
{content}
{tags_html}
"""
# Author/metadata line used by render_article_html(); filled via str.format.
# NOTE(review): the caller also passes `username`, `image_id` and `name`,
# which have no placeholder here and are therefore dropped. `collection_html`
# is inserted verbatim, so it must already be escaped by the caller.
AUTHOR_CARD_TEMPLATE = """
{collection_html}
~{reading_time} min read
·
Free: {free_access}
"""
def escape_html(text: str) -> str:
    """Return *text* with HTML-significant characters replaced by entities.

    Any falsy input (``None``, ``""``, ``0``) yields an empty string; every
    other value is converted with ``str()`` before being escaped.
    """
    return html.escape(str(text)) if text else ""
def render_markup(text: str, markups: List[Dict]) -> str:
    """Apply inline markups (bold, italic, code, links) to *text*.

    Markup ``start``/``end`` offsets index the *raw* text, so the text is
    escaped piecewise between tag insertions; escaping first and slicing
    afterwards would shift every tag that follows an ``&``, ``<``, ``>`` or
    quote character.

    Args:
        text: Raw (unescaped) paragraph text.
        markups: Dicts with ``type`` (``STRONG``, ``EM``, ``CODE`` or ``A``),
            ``start`` and ``end``; links additionally use ``href`` or, when
            ``anchorType`` is ``USER``, ``userId``. Other types are ignored.

    Returns:
        HTML-escaped text with the inline tags inserted. Nested and adjacent
        ranges produce well-formed HTML; partially overlapping ranges are
        emitted in position order and may be mis-nested.
    """
    if not text:
        return ""
    text = str(text)
    if not markups:
        return html.escape(text)

    simple_tags = {
        "STRONG": ("<strong>", "</strong>"),
        "EM": ("<em>", "</em>"),
        "CODE": ("<code>", "</code>"),
    }
    length = len(text)

    # Event tuple: (position, phase, range_tiebreak, index_tiebreak, tag).
    # phase 0 = closing, 1 = opening, so at one position tags close before
    # new ones open. The tiebreaks make outer ranges open first / close last.
    events = []
    for index, markup in enumerate(markups):
        start = markup.get("start") or 0
        end = markup.get("end")
        if end is None:
            end = length
        # Clamp to the text so out-of-range offsets cannot reorder output.
        start = max(0, min(start, length))
        end = max(0, min(end, length))
        if start >= end:
            continue  # empty range: nothing to wrap

        markup_type = markup.get("type", "")
        if markup_type in simple_tags:
            open_tag, close_tag = simple_tags[markup_type]
        elif markup_type == "A":
            if markup.get("anchorType") == "USER":
                href = f"https://medium.com/u/{markup.get('userId') or ''}"
            else:
                href = markup.get("href") or ""
            # In-page anchors stay in the same tab; everything else opens a new one.
            target = "" if href.startswith("#") else ' target="_blank"'
            open_tag = f'<a href="{html.escape(str(href))}"{target}>'
            close_tag = "</a>"
        else:
            continue

        events.append((start, 1, -end, index, open_tag))
        events.append((end, 0, -start, -index, close_tag))

    events.sort(key=lambda event: event[:4])

    parts = []
    cursor = 0
    for position, _phase, _range_key, _index_key, tag in events:
        if position > cursor:
            parts.append(html.escape(text[cursor:position]))
            cursor = position
        parts.append(tag)
    parts.append(html.escape(text[cursor:]))
    return "".join(parts)
def _medium_image_url(image_id: str) -> str:
    """Build the image URL for a Medium image id at the default width."""
    # NOTE(review): assumes Medium's miro CDN "resize:fit:<width>/<id>" URL
    # scheme and that MEDIUM_IMAGE_DEFAULT_WIDTH is a pixel width -- confirm
    # against src.utils.
    return (
        "https://miro.medium.com/v2/resize:fit:"
        f"{MEDIUM_IMAGE_DEFAULT_WIDTH}/{escape_html(image_id)}"
    )


def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
    """Render a single Medium paragraph dict to an HTML fragment.

    Every value taken from *paragraph* is escaped before it is placed in
    element content or an attribute.

    Args:
        paragraph: Paragraph dict. Reads ``type``, ``text`` and ``markups``
            and, depending on the type, ``hasDropCap``, ``metadata``,
            ``codeBlockMetadata``, ``iframe`` and ``mixtapeMetadata``.
        is_code: When true, markups are ignored and the text is only escaped.

    Returns:
        HTML for the paragraph. ``ULI``/``OLI`` yield a bare ``<li>`` that the
        caller must wrap in a list element. An ``IFRAME`` paragraph without a
        source URL yields ``""``. Unknown types are logged and rendered as a
        plain ``<p>``.
    """
    para_type = paragraph.get("type", "P")
    text = paragraph.get("text", "")
    markups = paragraph.get("markups", [])
    formatted_text = escape_html(text) if is_code else render_markup(text, markups)

    if para_type in ("H2", "H3", "H4"):
        heading = para_type.lower()
        return f"<{heading}>{formatted_text}</{heading}>\n"

    if para_type == "P":
        css_class = "leading-8 mt-7"
        if paragraph.get("hasDropCap"):
            css_class += " first-letter:text-7xl first-letter:float-left first-letter:mr-2"
        return f'<p class="{css_class}">{formatted_text}</p>\n'

    if para_type == "IMG":
        metadata = paragraph.get("metadata") or {}
        image_url = _medium_image_url(metadata.get("id", ""))
        alt = escape_html(metadata.get("alt", ""))
        figure = [f'<figure>\n<img src="{image_url}" alt="{alt}" loading="lazy">\n']
        if formatted_text:  # the paragraph text is the image caption
            figure.append(f"<figcaption>{formatted_text}</figcaption>\n")
        figure.append("</figure>\n")
        return "".join(figure)

    if para_type == "PRE":
        code_meta = paragraph.get("codeBlockMetadata") or {}
        lang = code_meta.get("lang") or ""
        lang_class = f"language-{escape_html(lang)}" if lang else "nohighlight"
        return f'<pre><code class="{lang_class}">{escape_html(text)}</code></pre>\n'

    if para_type == "BQ":
        return (
            '<blockquote class="px-5 py-3 mt-5 border-l-4 border-gray-500">\n'
            f"{formatted_text}\n"
            "</blockquote>\n"
        )

    if para_type == "PQ":
        return f"<blockquote>{formatted_text}</blockquote>\n"

    if para_type in ("ULI", "OLI"):
        return f"<li>{formatted_text}</li>"

    if para_type == "IFRAME":
        iframe_data = paragraph.get("iframe") or {}
        media_resource = iframe_data.get("mediaResource") or {}
        src = escape_html(media_resource.get("iframeSrc", ""))
        if not src:
            return ""
        width = escape_html(str(media_resource.get("iframeWidth", "100%")))
        height = escape_html(str(media_resource.get("iframeHeight", "400")))
        return (
            f'<iframe src="{src}" width="{width}" height="{height}" '
            'allowfullscreen loading="lazy"></iframe>\n'
        )

    if para_type == "MIXTAPE_EMBED":
        mixtape = paragraph.get("mixtapeMetadata") or {}
        href = escape_html(mixtape.get("href", ""))
        thumbnail = mixtape.get("thumbnailImageId", "")
        # The embed text carries the title on line one, description on line two.
        lines = text.split("\n") if text else []
        embed_title = escape_html(lines[0]) if lines else ""
        embed_desc = escape_html(lines[1]) if len(lines) > 1 else ""
        thumb_html = (
            f'<img src="{_medium_image_url(thumbnail)}" alt="">\n' if thumbnail else ""
        )
        return (
            f'<a href="{href}" target="_blank">\n'
            f"<strong>{embed_title}</strong>\n"
            f"<span>{embed_desc}</span>\n"
            f"{thumb_html}"
            "</a>\n"
        )

    logger.warning(f"Unknown paragraph type: {para_type}")
    return f"<p>{formatted_text}</p>\n"
def _run_end(paragraphs: List[Dict], start: int, para_type: str) -> int:
    """Return the index just past the run of *para_type* paragraphs at *start*."""
    end = start
    while end < len(paragraphs) and paragraphs[end].get("type") == para_type:
        end += 1
    return end


def render_paragraphs(paragraphs: List[Dict], title: str = "", subtitle: str = "", preview_image_id: str = "") -> str:
    """Render a list of Medium paragraph dicts to HTML content.

    Consecutive ``ULI``/``OLI`` paragraphs are wrapped in a single
    ``<ul>``/``<ol>``, and consecutive ``PRE`` paragraphs are merged into one
    code block that uses the language of the first paragraph in the run.

    Args:
        paragraphs: Paragraph dicts in document order.
        title: Article title; a heading among the first four paragraphs that
            closely matches it is skipped as a duplicate.
        subtitle: Article subtitle; an ``H4``/``P`` among the first four
            paragraphs that closely matches it is skipped.
        preview_image_id: Id of the preview image; a matching ``IMG`` among
            the first four paragraphs is skipped.

    Returns:
        The rendered fragments joined by newlines, or ``""`` for no paragraphs.
    """
    if not paragraphs:
        return ""

    out_parts = []
    i = 0
    while i < len(paragraphs):
        para = paragraphs[i]
        para_type = para.get("type", "")
        para_text = para.get("text", "")

        # The page header already shows title, subtitle and preview image, so
        # drop their duplicates when they open the article body.
        if i < 4:
            if para_type in ("H2", "H3", "H4") and title and _similarity(para_text, title) > 0.8:
                i += 1
                continue
            if para_type in ("H4", "P") and subtitle and _similarity(para_text, subtitle) > 0.8:
                i += 1
                continue
            if para_type == "IMG" and preview_image_id:
                metadata = para.get("metadata") or {}
                if metadata.get("id") == preview_image_id:
                    i += 1
                    continue

        if para_type in ("ULI", "OLI"):
            end = _run_end(paragraphs, i, para_type)
            items = "".join(render_paragraph(item) for item in paragraphs[i:end])
            list_tag = "ul" if para_type == "ULI" else "ol"
            out_parts.append(f"<{list_tag}>{items}</{list_tag}>\n")
            i = end
            continue

        if para_type == "PRE":
            end = _run_end(paragraphs, i, "PRE")
            joined_code = "\n".join(
                escape_html(block.get("text", "")) for block in paragraphs[i:end]
            )
            code_meta = para.get("codeBlockMetadata") or {}
            lang = code_meta.get("lang") or ""
            lang_class = f"language-{escape_html(lang)}" if lang else "nohighlight"
            out_parts.append(f'<pre><code class="{lang_class}">{joined_code}</code></pre>\n')
            i = end
            continue

        out_parts.append(render_paragraph(para))
        i += 1

    return "\n".join(out_parts)
def _similarity(s1: str, s2: str) -> float:
"""Calculate similarity ratio between two strings."""
if not s1 or not s2:
return 0.0
s1, s2 = s1.lower(), s2.lower()
if s1 == s2:
return 1.0
# Simple character overlap
common = len(set(s1) & set(s2))
total = len(set(s1) | set(s2))
return common / total if total > 0 else 0.0
def render_article_html(article_data: Dict[str, Any]) -> str:
    """Render article data to an HTML fragment (not a full page).

    Args:
        article_data: Article dict. Reads ``title``, ``subtitle``, ``url``,
            ``author`` (dict or plain name), ``publication``/``collection``
            (dict or plain name), ``readingTime``, ``isLocked``,
            ``previewImageId``, ``tags``, ``paragraphs`` and
            ``markdownContent``.

    Returns:
        HTML string for the article content. Structured ``paragraphs`` are
        preferred; ``markdownContent`` is used when there are no paragraphs
        or when the paragraphs look like raw markdown.
    """
    # The raw title is kept for duplicate detection against raw paragraph
    # text; comparing the escaped form would lower the match on '&', '<', ...
    raw_title = article_data.get("title", "Untitled")
    title = escape_html(raw_title)
    subtitle = article_data.get("subtitle", "")
    url = escape_html(article_data.get("url", ""))

    # Author info
    author = article_data.get("author") or {}
    if isinstance(author, str):
        author = {"name": author}
    author_name = escape_html(author.get("name", "Unknown"))
    author_username = escape_html(author.get("username", ""))
    author_image = escape_html(author.get("imageId") or "1*dmbNkD5D-u45r44go_cf0g.png")

    # Collection/publication
    collection = article_data.get("publication") or article_data.get("collection") or {}
    if isinstance(collection, str):
        collection = {"name": collection}
    collection_html = ""
    if isinstance(collection, dict) and collection.get("name"):
        collection_html = f'\n<span>{escape_html(collection.get("name", ""))}</span>\n·\n'

    # Reading time
    reading_time = article_data.get("readingTime", 5)
    if reading_time is None:
        reading_time = 5
    if isinstance(reading_time, float):
        reading_time = int(reading_time)
    reading_time = escape_html(str(reading_time))

    # Free access
    free_access = "No" if article_data.get("isLocked", False) else "Yes"

    # Preview image
    preview_image_id = article_data.get("previewImageId", "")
    preview_image_html = ""
    if preview_image_id:
        # NOTE(review): assumes Medium's miro CDN "resize:fit:<width>/<id>"
        # URL scheme -- confirm against src.utils.
        preview_src = (
            "https://miro.medium.com/v2/resize:fit:"
            f"{MEDIUM_IMAGE_DEFAULT_WIDTH}/{escape_html(preview_image_id)}"
        )
        preview_image_html = f'\n<img src="{preview_src}" alt="">\n'

    # Subtitle
    subtitle_html = f"<h2>{escape_html(subtitle)}</h2>\n" if subtitle else ""

    # Tags (at most ten)
    tag_parts = []
    for tag in (article_data.get("tags") or [])[:10]:
        if isinstance(tag, dict):
            tag_slug = tag.get("normalizedTagSlug") or ""
            tag_display = tag.get("displayTitle") or tag_slug
        else:
            tag_slug = tag_display = str(tag)
        if not tag_slug:
            continue
        tag_parts.append(
            f'\n<span title="{escape_html(tag_display)}">#{escape_html(tag_slug)}</span>\n'
        )
    tags_html = "".join(tag_parts)

    # Author card
    author_card = AUTHOR_CARD_TEMPLATE.format(
        username=author_username,
        image_id=author_image,
        name=author_name,
        collection_html=collection_html,
        reading_time=reading_time,
        free_access=free_access,
    )

    # Content: structured paragraphs first, markdown as fallback. Paragraphs
    # that merely wrap raw markdown (e.g. from a scraper fallback) are routed
    # to the markdown renderer when markdown content is available.
    paragraphs = article_data.get("paragraphs") or []
    markdown_content = article_data.get("markdownContent", "")
    if not paragraphs:
        use_markdown_renderer = True
    elif markdown_content and _is_likely_markdown(paragraphs):
        logger.info("Detected raw markdown in paragraphs - switching to Markdown Renderer")
        use_markdown_renderer = True
    else:
        use_markdown_renderer = False

    if use_markdown_renderer:
        # Markdown is only chosen when there are no paragraphs or when
        # markdown_content is non-empty, so there is never paragraph text to
        # rebuild it from here.
        content_html = _markdown_to_html(markdown_content)
    else:
        content_html = render_paragraphs(
            paragraphs, raw_title or "", subtitle, preview_image_id
        )

    return ARTICLE_TEMPLATE.format(
        url=url,
        preview_image=preview_image_html,
        title=title,
        subtitle_html=subtitle_html,
        author_card=author_card,
        content=content_html,
        tags_html=tags_html,
    )
def _is_likely_markdown(paragraphs: List[Dict]) -> bool:
"""
Detect if paragraphs are actually just containers for raw markdown.
This happens when the scraper falls back to dumping markdown tokens into the text field.
"""
if not paragraphs:
return False
# Check the first few paragraphs for tell-tale markdown syntax
# that shouldn't appear in clean text
sample_text = "\n".join([p.get("text", "") for p in paragraphs[:8]])
triggers = [
"#### ", # Headers
"
def render_full_page(article_data: Dict[str, Any]) -> str:
    """Build a complete standalone page for one article.

    Args:
        article_data: Article dict; ``title`` is read here and the whole dict
            is handed to ``render_article_html`` for the body.

    Returns:
        ``BASE_TEMPLATE`` filled with the escaped title (``"Untitled"`` when
        the key is absent) and the rendered article HTML.
    """
    return BASE_TEMPLATE.format(
        title=escape_html(article_data.get("title", "Untitled")),
        content=render_article_html(article_data),
    )
import markdown as md_lib
def _markdown_to_html(markdown_text: str) -> str:
"""Robust markdown to HTML conversion using library."""
if not markdown_text:
return ""
# Use extra extension for tables, code blocks, etc.
html_content = md_lib.markdown(
markdown_text,
extensions=['extra', 'codehilite', 'nl2br', 'sane_lists', 'fenced_code'],
output_format='html5'
)
# Post-process for Tailwind/Prose styling matching our CSS
# Enhance headers
html_content = html_content.replace('', '')
html_content = html_content.replace('', '')
html_content = html_content.replace('', '')
html_content = html_content.replace('', '')
# Enhance paragraphs (add margin/leading)
html_content = html_content.replace('
', '
')
# Enhance lists
html_content = html_content.replace('
', '')
html_content = html_content.replace('', '')
html_content = html_content.replace('- ', '
- ')
# Enhance blockquotes
html_content = html_content.replace('blockquote>', 'blockquote class="px-5 py-3 mt-5 border-l-4 border-gray-500">')
# Enhance pre/code
html_content = html_content.replace('
', '')
return html_content