File size: 23,867 Bytes
ae588db
 
 
 
 
 
 
 
 
 
 
60742a2
 
 
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60742a2
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60742a2
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60742a2
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
"""
Medium Article HTML Renderer

Renders article data to beautiful HTML matching Medium's styling.
Based on Freedium's medium-parser/core.py template rendering.
"""

import difflib
import html
import logging
from typing import Dict, List, Any, Optional

# Import centralized image URL utilities
from src.utils import MEDIUM_IMAGE_DEFAULT_WIDTH

logger = logging.getLogger("HTMLRenderer")

# Base HTML template for a standalone page.
# Filled in with str.format(): {title} and {content} are the only placeholders.
# Every literal CSS brace is doubled ({{ / }}) so that format() emits a single
# brace instead of treating it as a replacement field.
# NOTE(review): <html>/<body> carry utility classes ("dark", "bg-gray-900", ...)
# but this template links no utility-class stylesheet; only the inline <style>
# rules below are defined here.
BASE_TEMPLATE = """<!DOCTYPE html>
<html lang="en" class="dark">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title} | Medium Scraper</title>
    <style>
        @import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@400;700&family=Inter:wght@300;400;600&family=JetBrains+Mono:wght@400;600&display=swap');
        
        :root {{
            --bg-color: #121212;
            --text-color: #e5e5e5;
            --accent-color: #6366f1;
            --code-bg: #1e1e1e;
        }}
        
        body {{
            background-color: var(--bg-color);
            color: var(--text-color);
            font-family: 'Inter', sans-serif;
            line-height: 1.6;
            margin: 0;
            padding: 0;
        }}

        /* Container for PDF and Web consistency */
        .container {{
            max-width: 100%;
            margin: 0 auto;
            padding: 40px;
        }}

        /* Typography */
        h1, h2, h3, h4 {{
            font-family: 'Playfair Display', serif;
            color: #ffffff;
            margin-top: 2em;
            margin-bottom: 0.5em;
            line-height: 1.25;
        }}
        
        h1 {{ font-size: 2.5rem; border-bottom: 1px solid rgba(255,255,255,0.1); padding-bottom: 20px; }}
        h2 {{ font-size: 1.8rem; }}
        h3 {{ font-size: 1.5rem; }}
        
        p {{ margin-bottom: 1.5em; font-size: 1.1rem; }}
        
        a {{ color: var(--accent-color); text-decoration: none; }}
        a:hover {{ text-decoration: underline; }}

        /* Code Blocks */
        pre {{
            background: var(--code-bg);
            padding: 20px;
            border-radius: 8px;
            overflow-x: auto;
            border: 1px solid rgba(255,255,255,0.1);
            margin: 2em 0;
        }}
        
        code {{
            font-family: 'JetBrains Mono', monospace;
            font-size: 0.9em;
            color: #efefef;
        }}
        
        p code {{
            background: rgba(255,255,255,0.1);
            padding: 2px 6px;
            border-radius: 4px;
        }}

        /* Blockquotes */
        blockquote {{
            border-left: 4px solid var(--accent-color);
            margin: 2em 0;
            padding-left: 20px;
            font-style: italic;
            color: #a1a1aa;
        }}

        /* Images */
        img {{
            max-width: 100%;
            height: auto;
            border-radius: 8px;
            margin: 2em auto;
            display: block;
        }}
        
        /* Lists */
        ul, ol {{ margin: 1.5em 0; padding-left: 2em; }}
        li {{ margin-bottom: 0.5em; }}
        
        /* Tables */
        table {{
            width: 100%;
            border-collapse: collapse;
            margin: 2em 0;
        }}
        th, td {{
            padding: 12px;
            border-bottom: 1px solid rgba(255,255,255,0.1);
            text-align: left;
        }}
        th {{ font-weight: 600; color: #fff; }}

        /* Author Card */
        .author-card {{
            background: rgba(255,255,255,0.05);
            padding: 20px;
            border-radius: 12px;
            margin-bottom: 40px;
            display: flex;
            align-items: center;
            gap: 20px;
        }}
        .author-card img {{ margin: 0; width: 64px; height: 64px; border-radius: 50%; }}
        
        /* Print Overrides */
        @media print {{
            body {{ background: white; color: black; }}
            h1, h2, h3 {{ color: black; }}
            pre {{ background: #f5f5f5; border: 1px solid #ddd; color: black; }}
            code {{ color: black; }}
            a {{ color: #000; text-decoration: underline; }}
            .container {{ padding: 0; }}
        }}
    </style>
</head>
<body class="bg-gray-900 text-gray-100">
    {content}
</body>
</html>"""

# Article content template.
# str.format() placeholders: {url}, {preview_image}, {title}, {subtitle_html},
# {author_card}, {content}, {tags_html}. Values are inserted verbatim, so the
# caller is responsible for HTML-escaping plain-text fields before formatting.
ARTICLE_TEMPLATE = """
<div class="container w-full pt-20 mx-auto text-gray-100 break-words bg-gray-800 max-w-none">
    <div class="w-full px-4 text-xl leading-normal md:px-6" style="font-family:Georgia,serif;">
        <div class="font-sans">
            <p class="pb-3 text-base font-bold text-green-500 md:text-sm">
                <a href="{url}#bypass" class="text-sm font-bold text-green-500 no-underline md:text-sm hover:underline">&lt; Go to the original</a>
            </p>
            {preview_image}
            <h1 class="pt-6 pb-2 font-sans text-3xl font-bold text-gray-100 break-normal md:text-4xl">{title}</h1>
            {subtitle_html}
        </div>
        {author_card}
        <div class="mt-8 main-content">
            {content}
        </div>
        <div class="flex flex-wrap gap-2 mt-5">
            {tags_html}
        </div>
        <div class="container w-full pt-12 mx-auto"></div>
    </div>
</div>
"""

# Author card template.
# str.format() placeholders: {username} (used three times in medium.com/@ links),
# {image_id} (appended to a miro.medium.com resize URL), {name},
# {collection_html}, {reading_time} and {free_access}. Values are inserted
# verbatim, so plain-text fields must be HTML-escaped by the caller.
AUTHOR_CARD_TEMPLATE = """
<div class="m-2 mt-5 bg-gray-700 border border-gray-600">
    <div class="flex items-center p-4 space-x-4">
        <div class="flex-shrink-0">
            <a href="https://medium.com/@{username}" target="_blank" class="relative block">
                <img src="https://miro.medium.com/v2/resize:fill:88:88/{image_id}" 
                     alt="{name}" loading="eager" referrerpolicy="no-referrer" 
                     class="rounded-full h-11 w-11">
            </a>
        </div>
        <div class="flex-grow">
            <a href="https://medium.com/@{username}" target="_blank" 
               class="block font-semibold text-white">{name}</a>
            <button class="px-3 py-1 mt-1 text-sm text-white bg-green-600 rounded-lg">
                <a href="https://medium.com/@{username}" target="_blank" class="text-sm text-white">Follow</a>
            </button>
        </div>
    </div>
    <div class="px-4 pb-2">
        <div class="flex flex-wrap items-center space-x-2 text-sm text-gray-400">
            {collection_html}
            <span>~{reading_time} min read</span>
            <span>·</span>
            <span class="text-yellow-400">Free: {free_access}</span>
        </div>
    </div>
</div>
"""


def escape_html(text: str) -> str:
    """Return ``text`` with HTML special characters escaped.

    Any falsy input (``None``, ``""``, ``0``, ...) yields an empty string;
    every other value is converted with ``str()`` and passed to
    ``html.escape`` (which also escapes both quote characters).
    """
    return html.escape(str(text)) if text else ""


def _markup_tags(markup: Dict) -> Optional[tuple]:
    """Return the ``(opening, closing)`` HTML tags for one markup, or None if its type is unsupported."""
    markup_type = markup.get("type", "")
    if markup_type == "STRONG":
        return "<strong>", "</strong>"
    if markup_type == "EM":
        return "<em>", "</em>"
    if markup_type == "CODE":
        return '<code class="p-1.5 bg-gray-600 rounded">', "</code>"
    if markup_type == "A":
        if markup.get("anchorType") == "USER":
            href = f"https://medium.com/u/{markup.get('userId', '')}"
        else:
            href = markup.get("href") or ""
        # Escape after the URL is assembled so the user id is covered as well.
        href = html.escape(str(href))
        target = "" if href.startswith("#") else ' target="_blank"'
        return f'<a href="{href}"{target} class="underline text-blue-400">', "</a>"
    return None


def render_markup(text: str, markups: List[Dict]) -> str:
    """Apply inline markups (bold, italic, code, links) to ``text``.

    Args:
        text: Raw (unescaped) paragraph text.
        markups: Dicts with ``type`` (``STRONG``, ``EM``, ``CODE`` or ``A``)
            and optional ``start``/``end`` offsets into ``text``. Links also
            read ``href`` or, for ``anchorType == "USER"``, ``userId``.
            ``start`` defaults to 0 and ``end`` to ``len(text)``.

    Returns:
        HTML in which the text is escaped and the supported markups are
        wrapped in tags. Markups of unknown type and markups covering no
        characters are ignored. Falsy ``text`` yields ``""``.

    Offsets are applied to the raw text and each slice is escaped on its own,
    so characters that expand when escaped (``&``, ``<``, quotes) cannot shift
    or split a tag. Tags are always emitted properly nested: when a span ends
    while spans opened after it are still active, those are closed first and
    reopened right after.
    """
    if not text:
        return ""
    text = str(text)
    if not markups:
        return html.escape(text)

    # NOTE(review): offsets are used as Python string indices. If the data
    # source reports them in UTF-16 code units, text containing characters
    # outside the BMP (e.g. emoji) would need an offset translation - confirm.
    length = len(text)
    spans = []  # (start, end, opening_tag, closing_tag)
    for markup in markups:
        tags = _markup_tags(markup)
        if tags is None:
            continue
        start = markup.get("start")
        end = markup.get("end")
        start = 0 if start is None else max(0, min(int(start), length))
        end = length if end is None else max(0, min(int(end), length))
        if start < end:
            spans.append((start, end, tags[0], tags[1]))

    # Outer spans first: earlier start, and for equal starts the longer span.
    spans.sort(key=lambda span: (span[0], -span[1]))

    parts = []
    open_spans = []  # currently open spans, outermost first
    cursor = 0
    next_span = 0
    for pos in sorted({edge for span in spans for edge in span[:2]}):
        parts.append(html.escape(text[cursor:pos]))
        cursor = pos

        # Close spans ending here before opening new ones, so adjacent
        # markups do not interleave.
        ending = [i for i, span in enumerate(open_spans) if span[1] == pos]
        if ending:
            unwound = open_spans[ending[0]:]
            del open_spans[ending[0]:]
            parts.extend(span[3] for span in reversed(unwound))
            for span in unwound:
                if span[1] != pos:  # still active: reopen it
                    parts.append(span[2])
                    open_spans.append(span)

        while next_span < len(spans) and spans[next_span][0] == pos:
            span = spans[next_span]
            parts.append(span[2])
            open_spans.append(span)
            next_span += 1

    parts.append(html.escape(text[cursor:]))
    return "".join(parts)


def render_paragraph(paragraph: Dict, is_code: bool = False) -> str:
    """Render a single paragraph dict to an HTML fragment.

    Args:
        paragraph: Paragraph dict. ``type`` (default ``"P"``), ``text`` and
            ``markups`` are always read; depending on the type the function
            also reads ``hasDropCap``, ``metadata``, ``codeBlockMetadata``,
            ``iframe`` or ``mixtapeMetadata``.
        is_code: When true, markups are ignored and the text is only escaped.

    Returns:
        The HTML for the paragraph. An ``IFRAME`` paragraph without a source
        URL renders as ``""``; an unknown type is logged and rendered as a
        plain ``<p>``.

    Every value taken from ``paragraph`` is HTML-escaped before it is placed
    in markup, including ids, dimensions and the code language that end up
    inside attributes.
    """
    para_type = paragraph.get("type", "P")
    text = paragraph.get("text", "")
    markups = paragraph.get("markups", [])

    formatted_text = escape_html(text) if is_code else render_markup(text, markups)

    if para_type == "H2":
        return f'<h2 class="pt-12 font-bold font-sans break-normal text-gray-100 text-2xl">{formatted_text}</h2>'

    if para_type == "H3":
        return f'<h3 class="pt-12 font-bold font-sans break-normal text-gray-100 text-2xl">{formatted_text}</h3>'

    if para_type == "H4":
        return f'<h4 class="pt-8 font-bold font-sans break-normal text-gray-100 text-xl">{formatted_text}</h4>'

    if para_type == "P":
        css_class = "leading-8 mt-7"
        if paragraph.get("hasDropCap"):
            css_class += " first-letter:text-7xl first-letter:float-left first-letter:mr-2"
        return f'<p class="{css_class}">{formatted_text}</p>'

    if para_type == "IMG":
        metadata = paragraph.get("metadata") or {}
        image_id = escape_html(metadata.get("id", ""))
        alt = escape_html(metadata.get("alt", ""))

        img_html = f'''
        <div class="mt-7">
            <img loading="eager" alt="{alt}" class="pt-5 m-auto" 
                 referrerpolicy="no-referrer" 
                 src="https://miro.medium.com/v2/resize:fit:1400/{image_id}">
        </div>
        '''
        # The paragraph text of an image is its caption.
        if formatted_text:
            img_html += f'<figcaption class="mt-3 text-sm text-center text-gray-400">{formatted_text}</figcaption>'
        return img_html

    if para_type == "PRE":
        code_meta = paragraph.get("codeBlockMetadata") or {}
        lang = escape_html(code_meta.get("lang") or "")
        lang_class = f"language-{lang}" if lang else "nohighlight"
        # Code is never run through render_markup; it is escaped verbatim.
        return f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{escape_html(text)}</code></pre>'

    if para_type == "BQ":
        return f'''
        <blockquote style="box-shadow: inset 3px 0 0 0 rgb(209 207 239);" class="px-5 pt-3 pb-3 mt-5">
            <p class="font-italic">{formatted_text}</p>
        </blockquote>
        '''

    if para_type == "PQ":
        return f'<blockquote class="ml-5 text-2xl text-gray-300 mt-7"><p>{formatted_text}</p></blockquote>'

    if para_type in ("ULI", "OLI"):
        # Only the item is rendered here; the caller supplies the <ul>/<ol>.
        return f'<li class="mt-3">{formatted_text}</li>'

    if para_type == "IFRAME":
        iframe_data = paragraph.get("iframe") or {}
        media_resource = iframe_data.get("mediaResource") or {}
        src = media_resource.get("iframeSrc", "")
        if not src:
            return ""
        # Missing or zero dimensions fall back to the defaults.
        width = escape_html(str(media_resource.get("iframeWidth") or "100%"))
        height = escape_html(str(media_resource.get("iframeHeight") or "400"))
        return f'''
            <div class="mt-7">
                <iframe class="w-full" src="{escape_html(src)}" 
                        width="{width}" height="{height}" 
                        allowfullscreen frameborder="0"></iframe>
            </div>
            '''

    if para_type == "MIXTAPE_EMBED":
        mixtape = paragraph.get("mixtapeMetadata") or {}
        href = escape_html(mixtape.get("href", ""))
        thumbnail = escape_html(mixtape.get("thumbnailImageId", ""))

        # First text line is the embed title, second line its description.
        parts = text.split("\n") if text else ["", ""]
        embed_title = parts[0] if len(parts) > 0 else ""
        embed_desc = parts[1] if len(parts) > 1 else ""

        return f'''
        <div class="items-center p-2 overflow-hidden border border-gray-600 mt-7">
            <a rel="noopener follow" href="{href}" target="_blank">
                <div class="flex flex-row justify-between p-2 overflow-hidden">
                    <div class="flex flex-col justify-center p-2">
                        <h2 class="text-base font-bold text-gray-100">{escape_html(embed_title)}</h2>
                        <div class="block mt-2">
                            <h3 class="text-sm text-gray-400">{escape_html(embed_desc)}</h3>
                        </div>
                    </div>
                    <div class="relative flex h-40 flew-row w-60">
                        <div class="absolute inset-0 bg-center bg-cover" 
                             style="background-image: url('https://miro.medium.com/v2/resize:fit:800/{thumbnail}');">
                        </div>
                    </div>
                </div>
            </a>
        </div>
        '''

    logger.warning("Unknown paragraph type: %s", para_type)
    return f'<p class="mt-7">{formatted_text}</p>'


def render_paragraphs(paragraphs: List[Dict], title: str = "", subtitle: str = "", preview_image_id: str = "") -> str:
    """Render a list of paragraph dicts to one HTML string.

    Among the first four paragraphs, headings that resemble ``title``,
    ``H4``/``P`` paragraphs that resemble ``subtitle`` and an image whose
    metadata id equals ``preview_image_id`` are dropped. Consecutive list
    items are wrapped in a single ``<ul>``/``<ol>`` and consecutive ``PRE``
    paragraphs are merged into one code block that takes its language from
    the first of them. The rendered pieces are joined with newlines; an empty
    input yields ``""``.
    """
    if not paragraphs:
        return ""

    count = len(paragraphs)

    def run_end(first: int, kind: str) -> int:
        # Index just past the run of same-typed paragraphs starting at `first`.
        last = first
        while last < count and paragraphs[last].get("type") == kind:
            last += 1
        return last

    html_chunks = []
    pos = 0
    while pos < count:
        current = paragraphs[pos]
        kind = current.get("type", "")
        body = current.get("text", "")

        # The page header already shows title, subtitle and preview image.
        if pos < 4:
            is_duplicate = (
                (kind in ("H3", "H4", "H2") and title and _similarity(body, title) > 0.8)
                or (kind in ("H4", "P") and subtitle and _similarity(body, subtitle) > 0.8)
                or (kind == "IMG" and (current.get("metadata") or {}).get("id") == preview_image_id)
            )
            if is_duplicate:
                pos += 1
                continue

        if kind in ("ULI", "OLI"):
            stop = run_end(pos, kind)
            items = "".join(render_paragraph(entry) for entry in paragraphs[pos:stop])
            tag, marker = ("ul", "list-disc") if kind == "ULI" else ("ol", "list-decimal")
            html_chunks.append(f'<{tag} class="pl-8 mt-2 {marker}">{items}</{tag}>')
            pos = stop
            continue

        if kind == "PRE":
            stop = run_end(pos, kind)
            snippets = [escape_html(entry.get("text", "")) for entry in paragraphs[pos:stop]]
            first_meta = current.get("codeBlockMetadata") or {}
            lang = first_meta.get("lang") or ""
            lang_class = f"language-{lang}" if lang else "nohighlight"
            joined_code = "\n".join(snippets)
            html_chunks.append(f'<pre class="flex flex-col mt-7 border border-gray-700"><code class="p-2 bg-gray-900 overflow-x-auto {lang_class}">{joined_code}</code></pre>')
            pos = stop
            continue

        html_chunks.append(render_paragraph(current))
        pos += 1

    return "\n".join(html_chunks)


def _similarity(s1: str, s2: str) -> float:
    """Calculate similarity ratio between two strings."""
    if not s1 or not s2:
        return 0.0
    s1, s2 = s1.lower(), s2.lower()
    if s1 == s2:
        return 1.0
    # Simple character overlap
    common = len(set(s1) & set(s2))
    total = len(set(s1) | set(s2))
    return common / total if total > 0 else 0.0


def render_article_html(article_data: Dict[str, Any]) -> str:
    """Render article data to the article HTML fragment (not a full page).

    Args:
        article_data: Article dict. Keys read here: ``title``, ``subtitle``,
            ``url``, ``author`` (dict or plain name string), ``publication``
            or ``collection`` (dict or plain name string), ``readingTime``,
            ``isLocked``, ``previewImageId``, ``tags``, ``paragraphs`` and
            ``markdownContent``.

    Returns:
        ``ARTICLE_TEMPLATE`` filled with the header, author card, body and
        tag links.

    The body is rendered from ``paragraphs`` unless there are none, or unless
    ``markdownContent`` is present and the paragraphs look like raw markdown;
    in those cases ``markdownContent`` is converted instead. All values taken
    from ``article_data`` are HTML-escaped before being placed in markup.
    """
    raw_title = article_data.get("title", "Untitled")
    title = escape_html(raw_title)
    subtitle = article_data.get("subtitle", "")
    url = escape_html(article_data.get("url", ""))

    # Author info (a bare string is treated as the author's name).
    author = article_data.get("author") or {}
    if isinstance(author, str):
        author = {"name": author}

    author_name = escape_html(author.get("name", "Unknown"))
    author_username = escape_html(author.get("username", ""))
    # Fall back to the default avatar when the id is missing or empty.
    author_image = escape_html(author.get("imageId") or "1*dmbNkD5D-u45r44go_cf0g.png")

    # Collection/publication
    collection = article_data.get("publication") or article_data.get("collection") or {}
    if isinstance(collection, str):
        collection = {"name": collection}
    collection_html = ""
    if collection and isinstance(collection, dict) and collection.get("name"):
        collection_html = f'''
        <a href="https://medium.com/{escape_html(collection.get('slug', ''))}" target="_blank" class="flex items-center space-x-1">
            <p>{escape_html(collection.get('name', ''))}</p>
        </a>
        <span>·</span>
        '''

    # Reading time: whole minutes, default 5.
    reading_time = article_data.get("readingTime", 5)
    if reading_time is None:
        reading_time = 5
    if isinstance(reading_time, float):
        reading_time = int(reading_time)
    reading_time = escape_html(str(reading_time))

    free_access = "No" if article_data.get("isLocked", False) else "Yes"

    # Preview image: the raw id is kept for duplicate detection in the body.
    preview_image_id = article_data.get("previewImageId", "")
    preview_image_html = ""
    if preview_image_id:
        preview_image_html = f'''
        <img alt="Preview image" style="max-height: 65vh; width: auto; margin: auto" 
             loading="eager" referrerpolicy="no-referrer" 
             src="https://miro.medium.com/v2/resize:fit:1400/{escape_html(preview_image_id)}">
        '''

    subtitle_html = ""
    if subtitle:
        subtitle_html = f'<h2 class="pt-1 font-sans font-medium text-gray-400 break-normal text-1xl">{escape_html(subtitle)}</h2>'

    # Tags: at most ten; entries are dicts or plain strings.
    tag_links = []
    for tag in (article_data.get("tags") or [])[:10]:
        if isinstance(tag, dict):
            tag_slug = tag.get("normalizedTagSlug") or ""
            tag_display = tag.get("displayTitle") or tag_slug
        else:
            tag_slug = tag_display = str(tag)
        if not tag_slug:
            # Without a slug there is nothing to link to.
            continue
        tag_links.append(f'''
        <a title="{escape_html(tag_display)}" target="_blank" href="https://medium.com/tag/{escape_html(tag_slug)}">
            <span class="px-2 py-1 text-xs text-green-400 bg-green-900 rounded-full">#{escape_html(tag_slug)}</span>
        </a>
        ''')
    tags_html = "".join(tag_links)

    author_card = AUTHOR_CARD_TEMPLATE.format(
        username=author_username,
        image_id=author_image,
        name=author_name,
        collection_html=collection_html,
        reading_time=reading_time,
        free_access=free_access,
    )

    paragraphs = article_data.get("paragraphs") or []
    markdown_content = article_data.get("markdownContent") or ""

    # Paragraphs that merely carry raw markdown syntax render better through
    # the markdown converter, provided the markdown source is available.
    use_markdown_renderer = False
    if not paragraphs:
        use_markdown_renderer = True
    elif markdown_content and _is_likely_markdown(paragraphs):
        logger.info("Detected raw markdown in paragraphs - switching to Markdown Renderer")
        use_markdown_renderer = True

    if use_markdown_renderer:
        content_html = _markdown_to_html(markdown_content)
    else:
        # Duplicate detection compares against raw paragraph text, so it must
        # receive the unescaped title.
        plain_title = str(raw_title) if raw_title else ""
        content_html = render_paragraphs(paragraphs, plain_title, subtitle, preview_image_id)

    return ARTICLE_TEMPLATE.format(
        url=url,
        preview_image=preview_image_html,
        title=title,
        subtitle_html=subtitle_html,
        author_card=author_card,
        content=content_html,
        tags_html=tags_html,
    )


def _is_likely_markdown(paragraphs: List[Dict]) -> bool:
    """
    Detect if paragraphs are actually just containers for raw markdown.
    This happens when the scraper falls back to dumping markdown tokens into the text field.
    """
    if not paragraphs:
        return False
    
    # Check the first few paragraphs for tell-tale markdown syntax
    # that shouldn't appear in clean text
    sample_text = "\n".join([p.get("text", "") for p in paragraphs[:8]])
    
    triggers = [
        "#### ",      # Headers
        "![",         # Images
        "](http",     # Links
        "```",        # Code blocks
        "** ",        # Bold at start
        "---",        # HR
    ]
    
    return any(trigger in sample_text for trigger in triggers)


def render_full_page(article_data: Dict[str, Any]) -> str:
    """Build a complete standalone HTML page for an article.

    Args:
        article_data: Article dict; ``title`` (default ``"Untitled"``) is read
            here and the whole dict is handed to ``render_article_html``.

    Returns:
        ``BASE_TEMPLATE`` filled with the escaped title and the rendered
        article fragment.
    """
    page_title = escape_html(article_data.get("title", "Untitled"))
    article_markup = render_article_html(article_data)
    return BASE_TEMPLATE.format(content=article_markup, title=page_title)


import markdown as md_lib

def _markdown_to_html(markdown_text: str) -> str:
    """Robust markdown to HTML conversion using library."""
    if not markdown_text:
        return ""
    
    # Use extra extension for tables, code blocks, etc.
    html_content = md_lib.markdown(
        markdown_text,
        extensions=['extra', 'codehilite', 'nl2br', 'sane_lists', 'fenced_code'],
        output_format='html5'
    )
    
    # Post-process for Tailwind/Prose styling matching our CSS
    # Enhance headers
    html_content = html_content.replace('<h1>', '<h1 class="pt-12 font-bold text-3xl">')
    html_content = html_content.replace('<h2>', '<h2 class="pt-12 font-bold text-2xl">')
    html_content = html_content.replace('<h3>', '<h3 class="pt-8 font-bold text-xl">')
    html_content = html_content.replace('<h4>', '<h4 class="pt-6 font-bold text-lg">')
    
    # Enhance paragraphs (add margin/leading)
    html_content = html_content.replace('<p>', '<p class="mt-4 leading-8">')
    
    # Enhance lists
    html_content = html_content.replace('<ul>', '<ul class="pl-8 mt-2 list-disc">')
    html_content = html_content.replace('<ol>', '<ol class="pl-8 mt-2 list-decimal">')
    html_content = html_content.replace('<li>', '<li class="ml-4 mt-1">')
    
    # Enhance blockquotes
    html_content = html_content.replace('blockquote>', 'blockquote class="px-5 py-3 mt-5 border-l-4 border-gray-500">')
    
    # Enhance pre/code
    html_content = html_content.replace('<pre>', '<pre class="mt-7 border border-gray-700 bg-gray-900 p-4 rounded overflow-x-auto">')
    
    return html_content