""" Medium Article HTML Renderer Renders article data to beautiful HTML matching Medium's styling. Based on Freedium's medium-parser/core.py template rendering. """ import html from typing import Dict, List, Any, Optional import logging # Import centralized image URL utilities from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH logger = logging.getLogger("HTMLRenderer") # Base HTML template for standalone page BASE_TEMPLATE = """ {title} | Medium Scraper {content} """ # Article content template ARTICLE_TEMPLATE = """

{preview_image}

{title}

{subtitle_html}

{author_card}

{content}

{tags_html}

""" # Author card template AUTHOR_CARD_TEMPLATE = """

{name}

{collection_html} ~{reading_time} min read · Free: {free_access}

def escape_html(text: str) -> str:
    """Escape HTML special characters.

    Args:
        text: Value to escape. Falsy values (``None``, ``""``) yield ``""``.

    Returns:
        The text with ``&``, ``<``, ``>`` and both quote characters replaced
        by HTML entities, safe for element content and quoted attributes.
    """
    if not text:
        return ""
    return html.escape(str(text))


def _markup_tags(markup: Dict) -> Optional[tuple]:
    """Return the ``(opening_tag, closing_tag)`` pair for one markup dict.

    Returns ``None`` for markup types this renderer does not handle.
    """
    # NOTE(review): any attributes (CSS classes, rel, style) the original
    # opening tags carried are not visible in this file; plain tags are
    # emitted here - confirm against the stylesheet before shipping.
    markup_type = markup.get("type", "")
    if markup_type == "STRONG":
        return "<strong>", "</strong>"
    if markup_type == "EM":
        return "<em>", "</em>"
    if markup_type == "CODE":
        return "<code>", "</code>"
    if markup_type == "A":
        href = escape_html(markup.get("href", ""))
        if markup.get("anchorType") == "USER":
            # The user id comes from scraped data, so it is escaped before it
            # is placed inside the quoted attribute.
            href = f"https://medium.com/u/{escape_html(markup.get('userId', ''))}"
        # In-page anchors stay in the same tab; everything else opens a new one.
        target = "" if href.startswith("#") else ' target="_blank"'
        return f'<a href="{href}"{target}>', "</a>"
    return None


def render_markup(text: str, markups: List[Dict]) -> str:
    """Apply markups (bold, italic, links, code) to text.

    Args:
        text: Raw, unescaped paragraph text.
        markups: Dicts with ``type`` plus ``start``/``end`` offsets into
            ``text``. ``A`` markups may also carry ``href``, ``anchorType``
            and ``userId``.

    Returns:
        HTML in which every text segment is escaped and each supported markup
        is wrapped in its tag pair. Unknown markup types and empty ranges are
        ignored; offsets outside the text are clamped to its bounds.
    """
    if not text or not markups:
        return escape_html(text)

    length = len(text)
    # Each event is (position, kind, nesting_key, tie_break, tag).
    # kind 0 = closing, 1 = opening, so at a shared position closings come
    # first and adjacent markups do not interleave.
    events = []
    for index, markup in enumerate(markups):
        start = markup.get("start")
        end = markup.get("end")
        start = 0 if start is None else max(0, min(int(start), length))
        end = length if end is None else max(0, min(int(end), length))
        if start >= end:
            continue
        tags = _markup_tags(markup)
        if tags is None:
            continue
        opening, closing = tags
        # Openings at one position: the longer (outer) range first.
        events.append((start, 1, -end, index, opening))
        # Closings at one position: the later-started (inner) range first.
        events.append((end, 0, -start, -index, closing))

    events.sort(key=lambda event: event[:4])

    # Slice the RAW text at the markup offsets and escape each slice on its
    # own; slicing an already-escaped string would shift every offset that
    # follows an entity such as "&amp;".
    parts = []
    cursor = 0
    for position, _kind, _nesting, _tie, tag in events:
        parts.append(escape_html(text[cursor:position]))
        parts.append(tag)
        cursor = position
    parts.append(escape_html(text[cursor:]))
    # Ranges that partially cross each other (neither contains the other)
    # still produce crossing tags; only containment and adjacency are
    # normalised here.
    return "".join(parts)

{formatted_text}

' elif para_type == "H3": return f'

{formatted_text}

' elif para_type == "H4": return f'

{formatted_text}

' elif para_type == "P": css_class = "leading-8 mt-7" if paragraph.get("hasDropCap"): css_class += " first-letter:text-7xl first-letter:float-left first-letter:mr-2" return f'

{formatted_text}

' elif para_type == "IMG": metadata = paragraph.get("metadata") or {} image_id = metadata.get("id", "") alt = escape_html(metadata.get("alt", "")) caption = formatted_text img_html = f'''

''' if caption: img_html += f'

{caption}

' return img_html elif para_type == "PRE": code_meta = paragraph.get("codeBlockMetadata") or {} lang = code_meta.get("lang") or "" lang_class = f"language-{lang}" if lang else "nohighlight" return f'

{escape_html(text)}

' elif para_type == "BQ": return f'''

{formatted_text}

''' elif para_type == "PQ": return f'

{formatted_text}

' elif para_type == "ULI": return f'

{formatted_text}

' elif para_type == "OLI": return f'

{formatted_text}

' elif para_type == "IFRAME": iframe_data = paragraph.get("iframe") or {} media_resource = iframe_data.get("mediaResource") or {} src = media_resource.get("iframeSrc", "") width = media_resource.get("iframeWidth", "100%") height = media_resource.get("iframeHeight", "400") if src: return f'''

''' return "" elif para_type == "MIXTAPE_EMBED": mixtape = paragraph.get("mixtapeMetadata") or {} href = escape_html(mixtape.get("href", "")) thumbnail = mixtape.get("thumbnailImageId", "") # Parse title/description from markups parts = text.split("\n") if text else ["", ""] embed_title = parts[0] if len(parts) > 0 else "" embed_desc = parts[1] if len(parts) > 1 else "" return f'''

{escape_html(embed_title)}

{escape_html(embed_desc)}

''' else: logger.warning(f"Unknown paragraph type: {para_type}") return f'

{formatted_text}

' def render_paragraphs(paragraphs: List[Dict], title: str = "", subtitle: str = "", preview_image_id: str = "") -> str: """Render all paragraphs to HTML content.""" if not paragraphs: return "" out_parts = [] i = 0 while i < len(paragraphs): para = paragraphs[i] para_type = para.get("type", "") para_text = para.get("text", "") # Skip duplicate title/subtitle in first 4 paragraphs if i < 4: if para_type in ["H3", "H4", "H2"] and title and _similarity(para_text, title) > 0.8: i += 1 continue if para_type in ["H4", "P"] and subtitle and _similarity(para_text, subtitle) > 0.8: i += 1 continue if para_type == "IMG": metadata = para.get("metadata") or {} if metadata.get("id") == preview_image_id: i += 1 continue # Handle grouped elements (lists, code blocks) if para_type == "ULI": list_items = [] while i < len(paragraphs) and paragraphs[i].get("type") == "ULI": list_items.append(render_paragraph(paragraphs[i])) i += 1 out_parts.append(f'

{"".join(list_items)}') continue if para_type == "OLI": list_items = [] while i < len(paragraphs) and paragraphs[i].get("type") == "OLI": list_items.append(render_paragraph(paragraphs[i])) i += 1 out_parts.append(f'

{"".join(list_items)}') continue if para_type == "PRE": code_blocks = [] while i < len(paragraphs) and paragraphs[i].get("type") == "PRE": code_blocks.append(escape_html(paragraphs[i].get("text", ""))) i += 1 code_meta = para.get("codeBlockMetadata") or {} lang = code_meta.get("lang") or "" lang_class = f"language-{lang}" if lang else "nohighlight" joined_code = "\n".join(code_blocks) out_parts.append(f'

{joined_code}

') continue # Regular paragraph out_parts.append(render_paragraph(para)) i += 1 return "\n".join(out_parts) def _similarity(s1: str, s2: str) -> float: """Calculate similarity ratio between two strings.""" if not s1 or not s2: return 0.0 s1, s2 = s1.lower(), s2.lower() if s1 == s2: return 1.0 # Simple character overlap common = len(set(s1) & set(s2)) total = len(set(s1) | set(s2)) return common / total if total > 0 else 0.0 def render_article_html(article_data: Dict[str, Any]) -> str: """ Render article data to HTML content (not full page). Args: article_data: Dict with title, author, markdownContent, etc. Returns: HTML string for article content """ title = escape_html(article_data.get("title", "Untitled")) subtitle = article_data.get("subtitle", "") url = escape_html(article_data.get("url", "")) # Author info author = article_data.get("author") or {} if isinstance(author, str): author = {"name": author} author_name = escape_html(author.get("name", "Unknown")) author_username = escape_html(author.get("username", "")) author_image = author.get("imageId", "1*dmbNkD5D-u45r44go_cf0g.png") # Collection/publication collection = article_data.get("publication") or article_data.get("collection") or {} if isinstance(collection, str): collection = {"name": collection} collection_html = "" if collection and isinstance(collection, dict) and collection.get("name"): collection_html = f'''

{escape_html(collection.get('name', ''))}

· ''' # Reading time reading_time = article_data.get("readingTime", 5) if isinstance(reading_time, float): reading_time = int(reading_time) # Free access is_locked = article_data.get("isLocked", False) free_access = "No" if is_locked else "Yes" # Preview image preview_image_id = article_data.get("previewImageId", "") preview_image_html = "" if preview_image_id: preview_image_html = f'''

''' # Subtitle subtitle_html = "" if subtitle: subtitle_html = f'

{escape_html(subtitle)}

' # Tags tags = article_data.get("tags", []) tags_html = "" for tag in tags[:10]: tag_slug = tag.get("normalizedTagSlug", tag) if isinstance(tag, dict) else str(tag) tag_display = tag.get("displayTitle", tag_slug) if isinstance(tag, dict) else tag_slug tags_html += f''' #{escape_html(tag_slug)} ''' # Author card author_card = AUTHOR_CARD_TEMPLATE.format( username=author_username, image_id=author_image, name=author_name, collection_html=collection_html, reading_time=reading_time, free_access=free_access ) # Content - try paragraphs first, fallback to markdown paragraphs = article_data.get("paragraphs", []) markdown_content = article_data.get("markdownContent", "") # Smart Detection: If paragraphs contain raw markdown syntax (e.g. from V2 fallback), # switch to Robust Markdown Rendering for better quality. use_markdown_renderer = False if not paragraphs: use_markdown_renderer = True elif markdown_content and _is_likely_markdown(paragraphs): logger.info("Detected raw markdown in paragraphs - switching to Markdown Renderer") use_markdown_renderer = True if use_markdown_renderer: # Convert markdown to robust HTML using V8 engine # If markdownContent missing but paragraphs present, reconstruct from text if not markdown_content and paragraphs: markdown_content = "\n\n".join([p.get("text", "") for p in paragraphs]) content_html = _markdown_to_html(markdown_content) else: content_html = render_paragraphs(paragraphs, title, subtitle, preview_image_id) # Build article HTML article_html = ARTICLE_TEMPLATE.format( url=url, preview_image=preview_image_html, title=title, subtitle_html=subtitle_html, author_card=author_card, content=content_html, tags_html=tags_html ) return article_html def _is_likely_markdown(paragraphs: List[Dict]) -> bool: """ Detect if paragraphs are actually just containers for raw markdown. This happens when the scraper falls back to dumping markdown tokens into the text field. 
""" if not paragraphs: return False # Check the first few paragraphs for tell-tale markdown syntax # that shouldn't appear in clean text sample_text = "\n".join([p.get("text", "") for p in paragraphs[:8]]) triggers = [ "#### ", # Headers "![", # Images "](http", # Links "```", # Code blocks "** ", # Bold at start "---", # HR ] return any(trigger in sample_text for trigger in triggers) def render_full_page(article_data: Dict[str, Any]) -> str: """ Render article data to a complete standalone HTML page. Args: article_data: Dict with title, author, markdownContent, etc. Returns: Complete HTML page string """ title = escape_html(article_data.get("title", "Untitled")) content = render_article_html(article_data) return BASE_TEMPLATE.format(title=title, content=content) import markdown as md_lib def _markdown_to_html(markdown_text: str) -> str: """Robust markdown to HTML conversion using library.""" if not markdown_text: return "" # Use extra extension for tables, code blocks, etc. html_content = md_lib.markdown( markdown_text, extensions=['extra', 'codehilite', 'nl2br', 'sane_lists', 'fenced_code'], output_format='html5' ) # Post-process for Tailwind/Prose styling matching our CSS # Enhance headers html_content = html_content.replace('

', '

') html_content = html_content.replace('

', '

') html_content = html_content.replace('

', '

') html_content = html_content.replace('

', '

') # Enhance paragraphs (add margin/leading) html_content = html_content.replace('

', '

') # Enhance lists html_content = html_content.replace('

', '
') # Enhance blockquotes html_content = html_content.replace('blockquote>', 'blockquote class="px-5 py-3 mt-5 border-l-4 border-gray-500">') # Enhance pre/code html_content = html_content.replace('
```
', '')
    
    return html_content
```