Spaces:

T0X1N
/

Medium-MCP

Sleeping

Medium-MCP / src /html_renderer.py

Nikhil Pravin Pise

fix: Upgrade Medium images to high resolution (1400px) across entire app

60742a2 3 months ago

23.9 kB

	"""
	Medium Article HTML Renderer

	Renders article data to beautiful HTML matching Medium's styling.
	Based on Freedium's medium-parser/core.py template rendering.
	"""

	import html
	from typing import Dict, List, Any, Optional
	import logging

	# Import centralized image URL utilities
	from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH

	logger = logging.getLogger("HTMLRenderer")

	# Base HTML template for a standalone page.
	# Filled with str.format(title=..., content=...); literal CSS braces are
	# therefore doubled ("{{" / "}}") so that only {title} and {content} are
	# treated as placeholders.
	BASE_TEMPLATE = """<!DOCTYPE html>
	<html lang="en" class="dark">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>{title} | Medium Scraper</title>
	<style>
	@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@400;700&family=Inter:wght@300;400;600&family=JetBrains+Mono:wght@400;600&display=swap');

	:root {{
	--bg-color: #121212;
	--text-color: #e5e5e5;
	--accent-color: #6366f1;
	--code-bg: #1e1e1e;
	}}

	body {{
	background-color: var(--bg-color);
	color: var(--text-color);
	font-family: 'Inter', sans-serif;
	line-height: 1.6;
	margin: 0;
	padding: 0;
	}}

	/* Container for PDF and Web consistency */
	.container {{
	max-width: 100%;
	margin: 0 auto;
	padding: 40px;
	}}

	/* Typography */
	h1, h2, h3, h4 {{
	font-family: 'Playfair Display', serif;
	color: #ffffff;
	margin-top: 2em;
	margin-bottom: 0.5em;
	line-height: 1.25;
	}}

	h1 {{ font-size: 2.5rem; border-bottom: 1px solid rgba(255,255,255,0.1); padding-bottom: 20px; }}
	h2 {{ font-size: 1.8rem; }}
	h3 {{ font-size: 1.5rem; }}

	p {{ margin-bottom: 1.5em; font-size: 1.1rem; }}

	a {{ color: var(--accent-color); text-decoration: none; }}
	a:hover {{ text-decoration: underline; }}

	/* Code Blocks */
	pre {{
	background: var(--code-bg);
	padding: 20px;
	border-radius: 8px;
	overflow-x: auto;
	border: 1px solid rgba(255,255,255,0.1);
	margin: 2em 0;
	}}

	code {{
	font-family: 'JetBrains Mono', monospace;
	font-size: 0.9em;
	color: #efefef;
	}}

	p code {{
	background: rgba(255,255,255,0.1);
	padding: 2px 6px;
	border-radius: 4px;
	}}

	/* Blockquotes */
	blockquote {{
	border-left: 4px solid var(--accent-color);
	margin: 2em 0;
	padding-left: 20px;
	font-style: italic;
	color: #a1a1aa;
	}}

	/* Images */
	img {{
	max-width: 100%;
	height: auto;
	border-radius: 8px;
	margin: 2em auto;
	display: block;
	}}

	/* Lists */
	ul, ol {{ margin: 1.5em 0; padding-left: 2em; }}
	li {{ margin-bottom: 0.5em; }}

	/* Tables */
	table {{
	width: 100%;
	border-collapse: collapse;
	margin: 2em 0;
	}}
	th, td {{
	padding: 12px;
	border-bottom: 1px solid rgba(255,255,255,0.1);
	text-align: left;
	}}
	th {{ font-weight: 600; color: #fff; }}

	/* Author Card */
	.author-card {{
	background: rgba(255,255,255,0.05);
	padding: 20px;
	border-radius: 12px;
	margin-bottom: 40px;
	display: flex;
	align-items: center;
	gap: 20px;
	}}
	.author-card img {{ margin: 0; width: 64px; height: 64px; border-radius: 50%; }}

	/* Print Overrides */
	@media print {{
	body {{ background: white; color: black; }}
	h1, h2, h3 {{ color: black; }}
	pre {{ background: #f5f5f5; border: 1px solid #ddd; color: black; }}
	code {{ color: black; }}
	a {{ color: #000; text-decoration: underline; }}
	.container {{ padding: 0; }}
	}}
	</style>
	</head>
	<body class="bg-gray-900 text-gray-100">
	{content}
	</body>
	</html>"""

	# Article content template (HTML fragment, not a full page).
	# Filled by render_article_html() via str.format with the placeholders
	# url, preview_image, title, subtitle_html, author_card, content and
	# tags_html. Values are inserted verbatim, so callers must pass text that
	# is already HTML-escaped or HTML that is already rendered.
	ARTICLE_TEMPLATE = """
	<div class="container w-full pt-20 mx-auto text-gray-100 break-words bg-gray-800 max-w-none">
	<div class="w-full px-4 text-xl leading-normal md:px-6" style="font-family:Georgia,serif;">
	<div class="font-sans">
	<p class="pb-3 text-base font-bold text-green-500 md:text-sm">
	<a href="{url}#bypass" class="text-sm font-bold text-green-500 no-underline md:text-sm hover:underline">< Go to the original</a>
	</p>
	{preview_image}
	<h1 class="pt-6 pb-2 font-sans text-3xl font-bold text-gray-100 break-normal md:text-4xl">{title}</h1>
	{subtitle_html}
	</div>
	{author_card}
	<div class="mt-8 main-content">
	{content}
	</div>
	<div class="flex flex-wrap gap-2 mt-5">
	{tags_html}
	</div>
	<div class="container w-full pt-12 mx-auto"></div>
	</div>
	</div>
	"""

	# Author card template (avatar, name, follow link, publication, reading time).
	# Filled by render_article_html() via str.format with the placeholders
	# username, image_id, name, collection_html, reading_time and free_access.
	# Values are inserted verbatim into attributes and text, so they must be
	# HTML-safe before formatting.
	AUTHOR_CARD_TEMPLATE = """
	<div class="m-2 mt-5 bg-gray-700 border border-gray-600">
	<div class="flex items-center p-4 space-x-4">
	<div class="flex-shrink-0">
	<a href="https://medium.com/@{username}" target="_blank" class="relative block">
	<img src="https://miro.medium.com/v2/resize:fill:88:88/{image_id}"
	alt="{name}" loading="eager" referrerpolicy="no-referrer"
	class="rounded-full h-11 w-11">
	</a>
	</div>
	<div class="flex-grow">
	<a href="https://medium.com/@{username}" target="_blank"
	class="block font-semibold text-white">{name}</a>
	<button class="px-3 py-1 mt-1 text-sm text-white bg-green-600 rounded-lg">
	<a href="https://medium.com/@{username}" target="_blank" class="text-sm text-white">Follow</a>
	</button>
	</div>
	</div>
	<div class="px-4 pb-2">
	<div class="flex flex-wrap items-center space-x-2 text-sm text-gray-400">
	{collection_html}
	<span>~{reading_time} min read</span>
	<span>·</span>
	<span class="text-yellow-400">Free: {free_access}</span>
	</div>
	</div>
	</div>
	"""


	def escape_html(text: str) -> str:
	    """Return *text* with HTML special characters escaped.

	    Any falsy input (``None``, ``""``, ``0`` ...) yields an empty string;
	    every other value is stringified first and then passed to
	    ``html.escape`` (which also escapes both quote characters).
	    """
	    return html.escape(str(text)) if text else ""


	def render_markup(text: str, markups: List[Dict]) -> str:
	    """Apply inline markups (bold, italic, code, links) to *text*.

	    Args:
	        text: Raw, unescaped paragraph text.
	        markups: Dicts with ``type`` (``STRONG``, ``EM``, ``CODE`` or ``A``)
	            plus ``start``/``end`` offsets into *text*. ``A`` markups also
	            carry ``href``, or ``anchorType == "USER"`` with ``userId``.
	            Unknown types are ignored.

	    Returns:
	        HTML in which every text segment is escaped and the markup tags are
	        inserted at the requested offsets. Falsy *text* gives ``""``.

	    Offsets refer to the raw text, so the text is sliced first and each
	    slice is escaped afterwards; slicing the escaped string would shift
	    every tag that follows an ``&``, ``<``, ``>`` or quote character.
	    Properly nested and adjacent markups produce well-formed HTML;
	    partially overlapping ranges are emitted as given.
	    """
	    if not text:
	        return ""
	    text = str(text)
	    if not markups:
	        return html.escape(text)

	    simple_tags = {
	        "STRONG": ("<strong>", "</strong>"),
	        "EM": ("<em>", "</em>"),
	        "CODE": ('<code class="p-1.5 bg-gray-600 rounded">', "</code>"),
	    }
	    length = len(text)

	    # Each event is (position, kind, nesting_key, tie_breaker, tag).
	    # kind 0 = closing, 1 = opening, so at one position closers come first.
	    events = []
	    for index, markup in enumerate(markups):
	        raw_end = markup.get("end")
	        start = max(0, min(int(markup.get("start") or 0), length))
	        end = length if raw_end is None else max(0, min(int(raw_end), length))
	        if end <= start:
	            # Empty or inverted range: nothing to wrap.
	            continue

	        markup_type = markup.get("type", "")
	        if markup_type in simple_tags:
	            open_tag, close_tag = simple_tags[markup_type]
	        elif markup_type == "A":
	            if markup.get("anchorType") == "USER":
	                href = f"https://medium.com/u/{markup.get('userId') or ''}"
	            else:
	                href = markup.get("href") or ""
	            # Escape after building the URL so the user id is covered too.
	            href = html.escape(str(href))
	            target = "" if href.startswith("#") else ' target="_blank"'
	            open_tag = f'<a href="{href}"{target} class="underline text-blue-400">'
	            close_tag = "</a>"
	        else:
	            continue

	        # Openers: the range ending last opens first (outermost).
	        events.append((start, 1, -end, index, open_tag))
	        # Closers: the range that started last closes first (innermost).
	        events.append((end, 0, -start, -index, close_tag))

	    events.sort()

	    parts = []
	    cursor = 0
	    for position, _kind, _nesting, _tie, tag in events:
	        if position > cursor:
	            parts.append(html.escape(text[cursor:position]))
	            cursor = position
	        parts.append(tag)
	    parts.append(html.escape(text[cursor:]))
	    return "".join(parts)


	def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
	    """Render a single paragraph dict to an HTML fragment.

	    Args:
	        paragraph: Dict with ``type`` (defaults to ``"P"``), ``text``,
	            ``markups`` and, depending on the type, ``metadata``,
	            ``codeBlockMetadata``, ``iframe`` or ``mixtapeMetadata``.
	        is_code: When true, the text is escaped verbatim and inline
	            markups are not applied.

	    Returns:
	        The HTML for the paragraph. ``ULI``/``OLI`` produce a bare ``<li>``
	        (the caller supplies the list wrapper), an ``IFRAME`` without a
	        source produces ``""``, and unknown types are logged and rendered
	        as a plain ``<p>``.

	    All values taken from the paragraph are escaped before being placed in
	    markup, because the data originates from scraped articles.
	    """
	    para_type = paragraph.get("type", "P")
	    text = paragraph.get("text", "")
	    markups = paragraph.get("markups", [])

	    # Code content is escaped as-is; everything else gets inline markup.
	    formatted_text = escape_html(text) if is_code else render_markup(text, markups)

	    heading_classes = {
	        "H2": "pt-12 font-bold font-sans break-normal text-gray-100 text-2xl",
	        "H3": "pt-12 font-bold font-sans break-normal text-gray-100 text-2xl",
	        "H4": "pt-8 font-bold font-sans break-normal text-gray-100 text-xl",
	    }
	    if para_type in heading_classes:
	        tag = para_type.lower()
	        return f'<{tag} class="{heading_classes[para_type]}">{formatted_text}</{tag}>'

	    if para_type == "P":
	        css_class = "leading-8 mt-7"
	        if paragraph.get("hasDropCap"):
	            css_class += " first-letter:text-7xl first-letter:float-left first-letter:mr-2"
	        return f'<p class="{css_class}">{formatted_text}</p>'

	    if para_type == "IMG":
	        metadata = paragraph.get("metadata") or {}
	        image_id = escape_html(metadata.get("id", ""))
	        alt = escape_html(metadata.get("alt", ""))

	        img_html = f'''
	<div class="mt-7">
	<img loading="eager" alt="{alt}" class="pt-5 m-auto"
	referrerpolicy="no-referrer"
	src="https://miro.medium.com/v2/resize:fit:1400/{image_id}">
	</div>
	'''
	        # The paragraph text, when present, is the image caption.
	        if formatted_text:
	            img_html += f'<figcaption class="mt-3 text-sm text-center text-gray-400">{formatted_text}</figcaption>'
	        return img_html

	    if para_type == "PRE":
	        code_meta = paragraph.get("codeBlockMetadata") or {}
	        lang = escape_html(code_meta.get("lang") or "")
	        lang_class = f"language-{lang}" if lang else "nohighlight"
	        return f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{escape_html(text)}</code></pre>'

	    if para_type == "BQ":
	        return f'''
	<blockquote style="box-shadow: inset 3px 0 0 0 rgb(209 207 239);" class="px-5 pt-3 pb-3 mt-5">
	<p class="font-italic">{formatted_text}</p>
	</blockquote>
	'''

	    if para_type == "PQ":
	        return f'<blockquote class="ml-5 text-2xl text-gray-300 mt-7"><p>{formatted_text}</p></blockquote>'

	    if para_type in ("ULI", "OLI"):
	        # Both list kinds share the item markup; only the wrapper differs.
	        return f'<li class="mt-3">{formatted_text}</li>'

	    if para_type == "IFRAME":
	        iframe_data = paragraph.get("iframe") or {}
	        media_resource = iframe_data.get("mediaResource") or {}
	        src = media_resource.get("iframeSrc", "")
	        if not src:
	            return ""
	        # Dimensions may be numbers or strings; stringify, then escape.
	        width = html.escape(str(media_resource.get("iframeWidth", "100%")))
	        height = html.escape(str(media_resource.get("iframeHeight", "400")))
	        return f'''
	<div class="mt-7">
	<iframe class="w-full" src="{escape_html(src)}"
	width="{width}" height="{height}"
	allowfullscreen frameborder="0"></iframe>
	</div>
	'''

	    if para_type == "MIXTAPE_EMBED":
	        mixtape = paragraph.get("mixtapeMetadata") or {}
	        href = escape_html(mixtape.get("href", ""))
	        thumbnail = escape_html(mixtape.get("thumbnailImageId", ""))

	        # The first text line is used as the title, the second as the description.
	        parts = text.split("\n") if text else ["", ""]
	        embed_title = parts[0]
	        embed_desc = parts[1] if len(parts) > 1 else ""

	        return f'''
	<div class="items-center p-2 overflow-hidden border border-gray-600 mt-7">
	<a rel="noopener follow" href="{href}" target="_blank">
	<div class="flex flex-row justify-between p-2 overflow-hidden">
	<div class="flex flex-col justify-center p-2">
	<h2 class="text-base font-bold text-gray-100">{escape_html(embed_title)}</h2>
	<div class="block mt-2">
	<h3 class="text-sm text-gray-400">{escape_html(embed_desc)}</h3>
	</div>
	</div>
	<div class="relative flex h-40 flew-row w-60">
	<div class="absolute inset-0 bg-center bg-cover"
	style="background-image: url('https://miro.medium.com/v2/resize:fit:800/{thumbnail}');">
	</div>
	</div>
	</div>
	</a>
	</div>
	'''

	    logger.warning("Unknown paragraph type: %s", para_type)
	    return f'<p class="mt-7">{formatted_text}</p>'


	def render_paragraphs(paragraphs: List[Dict], title: str = "", subtitle: str = "", preview_image_id: str = "") -> str:
	    """Render a list of paragraph dicts to HTML content.

	    Args:
	        paragraphs: Paragraph dicts as accepted by ``render_paragraph``.
	        title: Article title; a heading among the first four paragraphs
	            that is more than 0.8 similar to it is skipped.
	        subtitle: Article subtitle; an ``H4``/``P`` among the first four
	            paragraphs that is more than 0.8 similar to it is skipped.
	        preview_image_id: An ``IMG`` among the first four paragraphs whose
	            metadata id equals this value is skipped.

	    Returns:
	        The rendered blocks joined with newlines, or ``""`` for no input.
	        Consecutive ``ULI`` items are wrapped in one ``<ul>``, consecutive
	        ``OLI`` items in one ``<ol>``, and consecutive ``PRE`` paragraphs
	        are merged into a single ``<pre><code>`` block that uses the
	        language of the first paragraph in the run.
	    """
	    if not paragraphs:
	        return ""

	    list_wrappers = {
	        "ULI": ('<ul class="pl-8 mt-2 list-disc">', "</ul>"),
	        "OLI": ('<ol class="pl-8 mt-2 list-decimal">', "</ol>"),
	    }

	    out_parts = []
	    total = len(paragraphs)
	    i = 0

	    while i < total:
	        para = paragraphs[i]
	        para_type = para.get("type", "")

	        # Skip duplicate title / subtitle / preview image near the top.
	        if i < 4:
	            para_text = para.get("text", "")
	            is_duplicate = (
	                (para_type in ("H2", "H3", "H4") and title
	                 and _similarity(para_text, title) > 0.8)
	                or (para_type in ("H4", "P") and subtitle
	                    and _similarity(para_text, subtitle) > 0.8)
	                or (para_type == "IMG"
	                    and (para.get("metadata") or {}).get("id") == preview_image_id)
	            )
	            if is_duplicate:
	                i += 1
	                continue

	        # Grouped elements: consume the whole run of same-typed paragraphs.
	        if para_type in list_wrappers or para_type == "PRE":
	            run_end = i
	            while run_end < total and paragraphs[run_end].get("type") == para_type:
	                run_end += 1
	            run = paragraphs[i:run_end]

	            if para_type == "PRE":
	                code_meta = para.get("codeBlockMetadata") or {}
	                # The language ends up in a class attribute, so escape it.
	                lang = escape_html(code_meta.get("lang") or "")
	                lang_class = f"language-{lang}" if lang else "nohighlight"
	                joined_code = "\n".join(escape_html(p.get("text", "")) for p in run)
	                out_parts.append(f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{joined_code}</code></pre>')
	            else:
	                open_tag, close_tag = list_wrappers[para_type]
	                items = "".join(render_paragraph(p) for p in run)
	                out_parts.append(f"{open_tag}{items}{close_tag}")

	            i = run_end
	            continue

	        # Regular paragraph
	        out_parts.append(render_paragraph(para))
	        i += 1

	    return "\n".join(out_parts)


	def _similarity(s1: str, s2: str) -> float:
	"""Calculate similarity ratio between two strings."""
	if not s1 or not s2:
	return 0.0
	s1, s2 = s1.lower(), s2.lower()
	if s1 == s2:
	return 1.0
	# Simple character overlap
	common = len(set(s1) & set(s2))
	total = len(set(s1) \| set(s2))
	return common / total if total > 0 else 0.0


	def render_article_html(article_data: Dict[str, Any]) -> str:
	    """Render article data to an HTML fragment (not a full page).

	    Args:
	        article_data: Dict that may contain ``title``, ``subtitle``, ``url``,
	            ``author`` (dict or plain name), ``publication``/``collection``
	            (dict or plain name), ``readingTime``, ``isLocked``,
	            ``previewImageId``, ``tags``, ``paragraphs`` and
	            ``markdownContent``.

	    Returns:
	        ``ARTICLE_TEMPLATE`` filled with the rendered pieces.

	    The body is rendered from ``paragraphs`` unless there are none, or
	    ``markdownContent`` is present and the paragraphs look like raw
	    markdown; in those cases the markdown renderer is used instead.
	    Every value interpolated into markup is escaped first.
	    """
	    # Keep the raw title for duplicate detection; escape a copy for output.
	    title_text = str(article_data.get("title", "Untitled") or "")
	    title = escape_html(title_text)
	    subtitle = article_data.get("subtitle", "")
	    url = escape_html(article_data.get("url", ""))

	    # Author info (a bare string is treated as the author's name)
	    author = article_data.get("author") or {}
	    if isinstance(author, str):
	        author = {"name": author}

	    author_name = escape_html(author.get("name", "Unknown"))
	    author_username = escape_html(author.get("username", ""))
	    # Fall back to the default avatar when the id is missing or empty.
	    author_image = escape_html(author.get("imageId") or "1*dmbNkD5D-u45r44go_cf0g.png")

	    # Collection/publication
	    collection = article_data.get("publication") or article_data.get("collection") or {}
	    if isinstance(collection, str):
	        collection = {"name": collection}
	    collection_html = ""
	    if collection and isinstance(collection, dict) and collection.get("name"):
	        collection_slug = escape_html(collection.get("slug", ""))
	        collection_name = escape_html(collection.get("name", ""))
	        collection_html = f'''
	<a href="https://medium.com/{collection_slug}" target="_blank" class="flex items-center space-x-1">
	<p>{collection_name}</p>
	</a>
	<span>·</span>
	'''

	    # Reading time (floats are truncated to whole minutes)
	    reading_time = article_data.get("readingTime", 5)
	    if isinstance(reading_time, float):
	        reading_time = int(reading_time)
	    reading_time = html.escape(str(reading_time))

	    # Free access
	    free_access = "No" if article_data.get("isLocked", False) else "Yes"

	    # Preview image: the raw id is kept for comparisons, an escaped copy is output.
	    preview_image_id = article_data.get("previewImageId", "")
	    preview_image_html = ""
	    if preview_image_id:
	        preview_image_html = f'''
	<img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto"
	loading="eager" referrerpolicy="no-referrer"
	src="https://miro.medium.com/v2/resize:fit:1400/{escape_html(preview_image_id)}">
	'''

	    # Subtitle
	    subtitle_html = ""
	    if subtitle:
	        subtitle_html = f'<h2 class="pt-1 font-sans font-medium text-gray-400 break-normal text-1xl">{escape_html(subtitle)}</h2>'

	    # Tags (at most ten; entries may be dicts or plain strings)
	    tag_parts = []
	    for tag in (article_data.get("tags") or [])[:10]:
	        if isinstance(tag, dict):
	            tag_slug = tag.get("normalizedTagSlug", tag)
	            tag_display = tag.get("displayTitle", tag_slug)
	        else:
	            tag_slug = str(tag)
	            tag_display = tag_slug
	        tag_parts.append(f'''
	<a title="{escape_html(tag_display)}" target="_blank" href="https://medium.com/tag/{escape_html(tag_slug)}">
	<span class="px-2 py-1 text-xs text-green-400 bg-green-900 rounded-full">#{escape_html(tag_slug)}</span>
	</a>
	''')
	    tags_html = "".join(tag_parts)

	    # Author card
	    author_card = AUTHOR_CARD_TEMPLATE.format(
	        username=author_username,
	        image_id=author_image,
	        name=author_name,
	        collection_html=collection_html,
	        reading_time=reading_time,
	        free_access=free_access,
	    )

	    # Content - try paragraphs first, fall back to markdown
	    paragraphs = article_data.get("paragraphs", [])
	    markdown_content = article_data.get("markdownContent", "")

	    use_markdown_renderer = False
	    if not paragraphs:
	        use_markdown_renderer = True
	    elif markdown_content and _is_likely_markdown(paragraphs):
	        logger.info("Detected raw markdown in paragraphs - switching to Markdown Renderer")
	        use_markdown_renderer = True

	    if use_markdown_renderer:
	        # If markdownContent is missing but paragraphs exist, rebuild it from their text.
	        if not markdown_content and paragraphs:
	            markdown_content = "\n\n".join(str(p.get("text") or "") for p in paragraphs)
	        content_html = _markdown_to_html(markdown_content)
	    else:
	        # Pass the unescaped title: it is compared against raw paragraph text.
	        content_html = render_paragraphs(paragraphs, title_text, subtitle, preview_image_id)

	    # Build article HTML
	    return ARTICLE_TEMPLATE.format(
	        url=url,
	        preview_image=preview_image_html,
	        title=title,
	        subtitle_html=subtitle_html,
	        author_card=author_card,
	        content=content_html,
	        tags_html=tags_html,
	    )


	def _is_likely_markdown(paragraphs: List[Dict]) -> bool:
	"""
	Detect if paragraphs are actually just containers for raw markdown.
	This happens when the scraper falls back to dumping markdown tokens into the text field.
	"""
	if not paragraphs:
	return False

	# Check the first few paragraphs for tell-tale markdown syntax
	# that shouldn't appear in clean text
	sample_text = "\n".join([p.get("text", "") for p in paragraphs[:8]])

	triggers = [
	"#### ", # Headers
	"![", # Images
	"](http", # Links
	"```", # Code blocks
	"** ", # Bold at start
	"---", # HR
	]

	return any(trigger in sample_text for trigger in triggers)


	def render_full_page(article_data: Dict[str, Any]) -> str:
	    """Build a complete standalone HTML page for an article.

	    Args:
	        article_data: Article dict; it is handed unchanged to
	            ``render_article_html`` and its ``title`` (default
	            ``"Untitled"``) is used for the page title.

	    Returns:
	        ``BASE_TEMPLATE`` filled with the escaped title and the rendered
	        article fragment.
	    """
	    page_title = escape_html(article_data.get("title", "Untitled"))
	    body_html = render_article_html(article_data)
	    return BASE_TEMPLATE.format(title=page_title, content=body_html)


	import markdown as md_lib

	def _markdown_to_html(markdown_text: str) -> str:
	    """Convert markdown to HTML and attach the renderer's CSS classes.

	    Args:
	        markdown_text: Markdown source; falsy input yields ``""``.

	    Returns:
	        HTML produced by the ``markdown`` library, in which bare opening
	        tags (headings, paragraphs, lists, blockquotes, ``<pre>``) are
	        replaced by the same tag carrying utility classes. Tags that
	        already have attributes are left untouched.
	    """
	    if not markdown_text:
	        return ""

	    html_content = md_lib.markdown(
	        markdown_text,
	        extensions=['extra', 'codehilite', 'nl2br', 'sane_lists', 'fenced_code'],
	        output_format='html5'
	    )

	    # (bare opening tag, styled replacement). Only full opening tags are
	    # matched: matching 'blockquote>' would also rewrite the closing
	    # '</blockquote>' into an invalid closing tag with attributes.
	    styled_tags = (
	        ('<h1>', '<h1 class="pt-12 font-bold text-3xl">'),
	        ('<h2>', '<h2 class="pt-12 font-bold text-2xl">'),
	        ('<h3>', '<h3 class="pt-8 font-bold text-xl">'),
	        ('<h4>', '<h4 class="pt-6 font-bold text-lg">'),
	        ('<p>', '<p class="mt-4 leading-8">'),
	        ('<ul>', '<ul class="pl-8 mt-2 list-disc">'),
	        ('<ol>', '<ol class="pl-8 mt-2 list-decimal">'),
	        ('<li>', '<li class="ml-4 mt-1">'),
	        ('<blockquote>', '<blockquote class="px-5 py-3 mt-5 border-l-4 border-gray-500">'),
	        ('<pre>', '<pre class="mt-7 border border-gray-700 bg-gray-900 p-4 rounded overflow-x-auto">'),
	    )
	    for bare_tag, styled_tag in styled_tags:
	        html_content = html_content.replace(bare_tag, styled_tag)

	    return html_content