"""
Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
Save this file as: src/streamlit_app.py
"""
import streamlit as st
import subprocess
import os
import tempfile
import shutil
from pathlib import Path
import base64
import re
import mimetypes
# Page-level settings for the app: browser tab title, tab icon, and a
# wide (full-width) layout. Kept in one mapping so they are easy to scan.
_PAGE_SETTINGS = {
    "page_title": "HTML to PDF Converter",
    "page_icon": "📄",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
def detect_aspect_ratio(html_content):
    """Guess the page aspect ratio of an HTML document.

    Heuristics are tried in order; the first conclusive one wins:

    1. The first viewport ``<meta>`` tag carrying a ``content`` attribute:
       ``orientation=portrait`` gives "9:16", ``orientation=landscape``
       gives "16:9". Attribute order inside the tag does not matter.
    2. The first CSS ``aspect-ratio: W / H`` declaration with a non-zero
       height: ratio > 1.5 gives "16:9", ratio < 0.7 gives "9:16",
       anything in between gives "1:1".
    3. Presentation keywords (``reveal.js``, ``impress.js``, ``slide``,
       ``presentation``) anywhere in the document give "16:9".

    Args:
        html_content: HTML source text. ``None`` or an empty string is
            treated as a document with no hints.

    Returns:
        One of "16:9", "9:16" or "1:1". Defaults to "9:16" when no
        heuristic matches.
    """
    if not html_content:
        return "9:16"

    # 1. Viewport meta tag. Scan whole <meta ...> tags so that both
    #    name-before-content and content-before-name orderings are handled,
    #    and so that stray "viewport ... content=" text outside a tag is ignored.
    for meta_tag in re.finditer(r'<meta\b[^>]*>', html_content, re.IGNORECASE):
        tag_text = meta_tag.group(0)
        if 'viewport' not in tag_text.lower():
            continue
        content_attr = re.search(
            r'content\s*=\s*(["\'])(.*?)\1', tag_text, re.IGNORECASE | re.DOTALL
        )
        if content_attr is None:
            continue
        viewport = content_attr.group(2).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"
        # Only the first usable viewport tag is consulted; if it carries no
        # orientation hint, fall through to the CSS heuristic.
        break

    # 2. CSS aspect-ratio declaration. A zero height is degenerate and would
    #    divide by zero, so such declarations are skipped.
    for declaration in re.finditer(
        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE
    ):
        width = int(declaration.group(1))
        height = int(declaration.group(2))
        if height == 0:
            continue
        ratio = width / height
        if ratio > 1.5:
            return "16:9"
        if ratio < 0.7:
            return "9:16"
        return "1:1"

    # 3. Slide-deck frameworks and keywords imply a landscape presentation.
    lowered = html_content.lower()
    presentation_keywords = ('reveal.js', 'impress.js', 'slide', 'presentation')
    if any(keyword in lowered for keyword in presentation_keywords):
        return "16:9"

    return "9:16"
def image_to_base64(image_file):
    """Encode an uploaded image as a base64 ``data:`` URL.

    The MIME type is guessed from the file name with ``mimetypes``. When
    that yields nothing, a small extension table is consulted, and
    ``image/png`` is used as the last resort.

    Args:
        image_file: Object exposing ``getvalue()`` (the raw bytes) and a
            ``name`` attribute (the file name).

    Returns:
        A ``data:<mime>;base64,<payload>`` string, or ``None`` if the
        conversion fails (the failure is reported through ``st.error``).
    """
    # Fallback lookup for extensions the mimetypes database does not know.
    fallback_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.svg': 'image/svg+xml',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp'
    }
    try:
        raw = image_file.getvalue()
        content_type = mimetypes.guess_type(image_file.name)[0]
        if not content_type:
            suffix = os.path.splitext(image_file.name)[1].lower()
            content_type = fallback_types.get(suffix, 'image/png')
        payload = base64.b64encode(raw).decode('utf-8')
        return f"data:{content_type};base64,{payload}"
    except Exception as e:
        # UI boundary: report the problem to the user and signal failure
        # with None so the caller can skip this image.
        st.error(f"Error converting {image_file.name} to base64: {str(e)}")
        return None
def embed_images_as_base64(html_content, uploaded_images):
"""Embed all images directly as base64 data URLs in the HTML"""
if not uploaded_images:
return html_content, {}
image_data_urls = {}
for img in uploaded_images:
data_url = image_to_base64(img)
if data_url:
image_data_urls[img.name] = data_url
st.write(f"✓ Converted {img.name} to base64 ({len(data_url)} chars)")
if not image_data_urls:
return html_content, {}
replacements = {}
for filename, data_url in image_data_urls.items():
escaped_name = re.escape(filename)
# Pattern 1: img src attribute
pattern1 = rf'(]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
count1 = len(matches1)
if matches1:
html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
replacements[f"{filename} (img src)"] = count1
# Pattern 2: background-image
pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
count2 = len(matches2)
if matches2:
html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
replacements[f"{filename} (bg-image)"] = count2
# Pattern 3: CSS url()
pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
count3 = len(matches3)
if matches3:
html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
replacements[f"{filename} (url)"] = count3
if replacements:
st.success("✅ Image Replacements:")
for key, count in replacements.items():
st.write(f" • {key}: {count} replacement(s)")
else:
st.warning("⚠️ No image references found in HTML matching uploaded files!")
st.write("Uploaded files:", [img.name for img in uploaded_images])
with st.expander("🔍 Debug: Show HTML image references"):
img_lines = [line for line in html_content.split('\n')
if any(k in line.lower() for k in ['
/* Define page size */
@page {{
size: {page_size};
margin: 0;
}}
/* Reset body */
html, body {{
margin: 0 !important;
padding: 0 !important;
width: 100% !important;
height: 100% !important;
}}
/* Page containers - each should be one page */
.page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
width: 100% !important;
min-height: 100vh !important;
height: 100vh !important;
page-break-after: always !important;
break-after: page !important;
page-break-inside: avoid !important;
break-inside: avoid !important;
position: relative !important;
box-sizing: border-box !important;
overflow: hidden !important;
}}
/* Last page shouldn't force a break */
.page:last-child, .slide:last-child,
section.page:last-child, article.page:last-child {{
page-break-after: auto !important;
break-after: auto !important;
}}
/* If no explicit page class, treat direct body children as pages */
body > section:not(.no-page-break),
body > article:not(.no-page-break),
body > div:not(.no-page-break) {{
page-break-after: always !important;
break-after: page !important;
min-height: 100vh;
}}
body > section:last-child,
body > article:last-child,
body > div:last-child {{
page-break-after: auto !important;
}}
/* Utility classes for manual control */
.page-break, .page-break-after {{
page-break-after: always !important;
break-after: page !important;
}}
.page-break-before {{
page-break-before: always !important;
break-before: page !important;
}}
.no-page-break, .keep-together {{
page-break-inside: avoid !important;
break-inside: avoid !important;
}}
/* Prevent awkward breaks in content */
h1, h2, h3, h4, h5, h6 {{
page-break-after: avoid !important;
break-after: avoid !important;
page-break-inside: avoid !important;
break-inside: avoid !important;
}}
img, figure, table, pre, blockquote {{
page-break-inside: avoid !important;
break-inside: avoid !important;
}}
/* Preserve colors and backgrounds */
* {{
-webkit-print-color-adjust: exact !important;
print-color-adjust: exact !important;
color-adjust: exact !important;
}}
"""
# Inject CSS into HTML
if '' in html_content:
html_content = html_content.replace('', page_css + '')
elif '
Content here...