""" Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks Save this file as: src/streamlit_app.py """ import streamlit as st import subprocess import os import tempfile import shutil from pathlib import Path import base64 import re import mimetypes st.set_page_config( page_title="HTML to PDF Converter", page_icon="📄", layout="wide" ) def detect_aspect_ratio(html_content): """Detect aspect ratio from HTML content""" viewport_match = re.search(r']*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE) if viewport_match: viewport = viewport_match.group(1).lower() if 'orientation=portrait' in viewport: return "9:16" elif 'orientation=landscape' in viewport: return "16:9" aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE) if aspect_match: width = int(aspect_match.group(1)) height = int(aspect_match.group(2)) ratio = width / height if ratio > 1.5: return "16:9" elif ratio < 0.7: return "9:16" else: return "1:1" if any(keyword in html_content.lower() for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']): return "16:9" return "9:16" def image_to_base64(image_file): """Convert uploaded image to base64 data URL""" try: image_bytes = image_file.getvalue() mime_type, _ = mimetypes.guess_type(image_file.name) if not mime_type: ext = os.path.splitext(image_file.name)[1].lower() mime_map = { '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.png': 'image/png', '.gif': 'image/gif', '.svg': 'image/svg+xml', '.webp': 'image/webp', '.bmp': 'image/bmp' } mime_type = mime_map.get(ext, 'image/png') b64_data = base64.b64encode(image_bytes).decode('utf-8') data_url = f"data:{mime_type};base64,{b64_data}" return data_url except Exception as e: st.error(f"Error converting {image_file.name} to base64: {str(e)}") return None def embed_images_as_base64(html_content, uploaded_images): """Embed all images directly as base64 data URLs in the HTML""" if not uploaded_images: return html_content, {} image_data_urls = {} for img in uploaded_images: data_url = image_to_base64(img) if data_url: image_data_urls[img.name] = data_url st.write(f"✓ Converted {img.name} to base64 ({len(data_url)} chars)") if not image_data_urls: return html_content, {} replacements = {} for filename, data_url in image_data_urls.items(): escaped_name = re.escape(filename) # Pattern 1: img src attribute pattern1 = rf'(]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2' matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL)) count1 = len(matches1) if matches1: html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL) replacements[f"{filename} (img src)"] = count1 # Pattern 2: background-image pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))' matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE)) count2 = len(matches2) if matches2: html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE) replacements[f"{filename} (bg-image)"] = count2 # Pattern 3: CSS url() pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))' matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE)) count3 = len(matches3) if matches3: html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE) replacements[f"{filename} (url)"] = count3 if replacements: st.success("✅ Image Replacements:") for key, count in replacements.items(): st.write(f" • {key}: {count} replacement(s)") else: st.warning("⚠️ No image references found in HTML matching uploaded files!") st.write("Uploaded files:", [img.name for img in uploaded_images]) with st.expander("🔍 Debug: Show HTML image references"): img_lines = [line for line in html_content.split('\n') if any(k in line.lower() for k in [' /* Define page size */ @page {{ size: {page_size}; margin: 0; }} /* Reset body */ html, body {{ margin: 0 !important; padding: 0 !important; width: 100% !important; height: 100% !important; }} /* Page containers - each should be one page */ .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{ width: 100% !important; min-height: 100vh !important; height: 100vh !important; page-break-after: always !important; break-after: page !important; page-break-inside: avoid !important; break-inside: avoid !important; position: relative !important; box-sizing: border-box !important; overflow: hidden !important; }} /* Last page shouldn't force a break */ .page:last-child, .slide:last-child, section.page:last-child, article.page:last-child {{ page-break-after: auto !important; break-after: auto !important; }} /* If no explicit page class, treat direct body children as pages */ body > section:not(.no-page-break), body > article:not(.no-page-break), body > div:not(.no-page-break) {{ page-break-after: always !important; break-after: page !important; min-height: 100vh; }} body > section:last-child, body > article:last-child, body > div:last-child {{ page-break-after: auto !important; }} /* Utility classes for manual control */ .page-break, .page-break-after {{ page-break-after: always !important; break-after: page !important; }} .page-break-before {{ page-break-before: always !important; break-before: page !important; }} .no-page-break, .keep-together {{ page-break-inside: avoid !important; break-inside: avoid !important; }} /* Prevent awkward breaks in content */ h1, h2, h3, h4, h5, h6 {{ page-break-after: avoid !important; break-after: avoid !important; page-break-inside: avoid !important; break-inside: avoid !important; }} img, figure, table, pre, blockquote {{ page-break-inside: avoid !important; break-inside: avoid !important; }} /* Preserve colors and backgrounds */ * {{ -webkit-print-color-adjust: exact !important; print-color-adjust: exact !important; color-adjust: exact !important; }} """ # Inject CSS into HTML if '' in html_content: html_content = html_content.replace('', page_css + '') elif '' return iframe_html def render_pdf_preview(pdf_bytes): """Render PDF preview using embedded PDF.js""" b64 = base64.b64encode(pdf_bytes).decode() pdf_viewer_html = f'''
Loading PDF...
''' return pdf_viewer_html def convert_html_to_pdf(html_content, aspect_ratio, temp_dir): """Convert HTML content to PDF using Puppeteer with proper page breaks""" try: # Step 1: Inject page break CSS st.write("🔧 Injecting page break CSS...") html_content = inject_page_breaks(html_content, aspect_ratio) # Save HTML to temp file html_file = os.path.join(temp_dir, "input.html") with open(html_file, 'w', encoding='utf-8') as f: f.write(html_content) st.write(f"📝 Saved HTML: {os.path.getsize(html_file):,} bytes") # Find puppeteer script script_dir = os.path.dirname(os.path.abspath(__file__)) possible_paths = [ os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js'), os.path.join(script_dir, 'puppeteer_pdf.js'), os.path.join(script_dir, '..', 'puppeteer_pdf.js'), 'puppeteer_pdf.js' ] puppeteer_script = None for path in possible_paths: if os.path.exists(path): puppeteer_script = path break if not puppeteer_script: return None, "Error: puppeteer_pdf.js not found" st.write(f"🔧 Using Puppeteer: {puppeteer_script}") # Run conversion result = subprocess.run( ['node', puppeteer_script, html_file, aspect_ratio], capture_output=True, text=True, timeout=60, cwd=os.path.dirname(os.path.abspath(puppeteer_script)) ) if result.returncode != 0: return None, f"PDF conversion failed: {result.stderr}" # Read PDF pdf_file = html_file.replace('.html', '.pdf') if not os.path.exists(pdf_file): return None, "PDF file was not generated" with open(pdf_file, 'rb') as f: pdf_bytes = f.read() st.write(f"✅ PDF generated: {len(pdf_bytes):,} bytes") return pdf_bytes, None except subprocess.TimeoutExpired: return None, "Error: PDF conversion timed out (60 seconds)" except Exception as e: return None, f"Error: {str(e)}" # Main UI st.title("📄 HTML to PDF Converter") st.markdown(""" Convert HTML to PDF with **proper page breaks** and **embedded base64 images**! ✨ Each page in your HTML will be preserved as a separate PDF page. """) # Create tabs tab1, tab2 = st.tabs(["📤 Upload HTML File", "📝 Paste HTML Code"]) # Tab 1: Upload HTML File with tab1: uploaded_file = st.file_uploader( "Choose an HTML file", type=['html', 'htm'], key="file_uploader", help="Upload an HTML file" ) uploaded_images = st.file_uploader( "📷 Upload Images", type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'], key="image_uploader", help="Upload images - they will be embedded as base64 in the HTML", accept_multiple_files=True ) if uploaded_images: st.success(f"✅ {len(uploaded_images)} image(s) uploaded") with st.expander("View uploaded images"): cols = st.columns(min(len(uploaded_images), 4)) for idx, img in enumerate(uploaded_images): with cols[idx % 4]: st.image(img, caption=img.name, use_container_width=True) if uploaded_file: st.success(f"✅ File: {uploaded_file.name}") uploaded_file.seek(0) try: html_content = uploaded_file.getvalue().decode('utf-8') except UnicodeDecodeError: uploaded_file.seek(0) html_content = uploaded_file.getvalue().decode('latin-1') detected_ratio = detect_aspect_ratio(html_content) col1, col2 = st.columns([1, 1]) with col1: st.subheader("⚙️ Settings") auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_file") if auto_detect: aspect_ratio = detected_ratio st.info(f"🔍 Detected: **{detected_ratio}**") else: aspect_ratio = st.radio( "Aspect Ratio", options=["16:9", "1:1", "9:16"], index=["16:9", "1:1", "9:16"].index(detected_ratio), key="aspect_file" ) convert_btn = st.button("🔄 Convert to PDF", key="conv_file", type="primary", use_container_width=True) with col2: st.subheader("👁️ Preview") with st.expander("Show HTML"): st.components.v1.html(render_html_preview(html_content), height=400, scrolling=True) if convert_btn: temp_dir = None try: with st.spinner("Converting..."): temp_dir = tempfile.mkdtemp() # Embed images as base64 processed_html = html_content if uploaded_images: with st.expander("🖼️ Image Processing", expanded=True): processed_html, replacements = embed_images_as_base64(html_content, uploaded_images) if not replacements: st.warning("⚠️ Images uploaded but no matches found in HTML!") st.write("**Tip:** Make sure image filenames in HTML match uploaded files exactly") # Convert to PDF pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir) if error: st.error(f"❌ {error}") else: st.success("✅ PDF generated with proper page breaks!") output_name = uploaded_file.name.replace('.html', '.pdf').replace('.htm', '.pdf') if not output_name.endswith('.pdf'): output_name += '.pdf' col_a, col_b = st.columns(2) with col_a: st.download_button( "⬇️ Download PDF", data=pdf_bytes, file_name=output_name, mime="application/pdf", use_container_width=True ) with col_b: st.info(f"Size: {len(pdf_bytes):,} bytes") st.subheader("📄 PDF Preview") st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True) except Exception as e: st.error(f"❌ Error: {str(e)}") finally: if temp_dir and os.path.exists(temp_dir): shutil.rmtree(temp_dir, ignore_errors=True) # Tab 2: Paste HTML with tab2: html_code = st.text_area( "HTML Content", value="""

Page 1: Hello PDF! 🌍

Page 2: Separate Page! 📄

Page 3: Final Page! ✨

""", height=400, key="html_code" ) uploaded_images_text = st.file_uploader( "📷 Upload Images", type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'], key="image_text", help="Upload images to embed in your HTML", accept_multiple_files=True ) if uploaded_images_text: st.success(f"✅ {len(uploaded_images_text)} image(s) uploaded") with st.expander("View images"): cols = st.columns(min(len(uploaded_images_text), 4)) for idx, img in enumerate(uploaded_images_text): with cols[idx % 4]: st.image(img, caption=img.name, use_container_width=True) if html_code.strip(): detected_ratio_text = detect_aspect_ratio(html_code) auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_text") if auto_detect_text: aspect_ratio_text = detected_ratio_text st.info(f"🔍 Detected: **{detected_ratio_text}**") else: aspect_ratio_text = st.radio( "Aspect Ratio", options=["16:9", "1:1", "9:16"], index=["16:9", "1:1", "9:16"].index(detected_ratio_text), key="aspect_text" ) convert_text_btn = st.button("🔄 Convert", key="conv_text", type="primary", use_container_width=True) if convert_text_btn: temp_dir = None try: with st.spinner("Converting..."): temp_dir = tempfile.mkdtemp() processed_html = html_code if uploaded_images_text: with st.expander("🖼️ Image Processing", expanded=True): processed_html, replacements = embed_images_as_base64(html_code, uploaded_images_text) if not replacements: st.warning("⚠️ Images uploaded but no matches found!") pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir) if error: st.error(f"❌ {error}") else: st.success("✅ PDF generated with proper page breaks!") col_a, col_b = st.columns(2) with col_a: st.download_button( "⬇️ Download PDF", data=pdf_bytes, file_name="converted.pdf", mime="application/pdf", use_container_width=True ) with col_b: st.info(f"Size: {len(pdf_bytes):,} bytes") st.subheader("📄 PDF Preview") st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True) except Exception as e: st.error(f"❌ Error: {str(e)}") finally: if temp_dir and os.path.exists(temp_dir): shutil.rmtree(temp_dir, ignore_errors=True) # Footer st.markdown("---") st.markdown(""" ### 💡 How Page Breaks Work: **Automatic Page Detection:** - Elements with class `page`, `slide`, or `section.page` are treated as separate pages - Each page automatically gets `page-break-after: always` CSS - Last page won't have a trailing break **HTML Structure for Multiple Pages:** ```html
Page 1 content
Page 2 content
Page 3 content
``` **Manual Page Breaks:** - Add class `page-break` to force a break after an element - Add class `page-break-before` to force a break before an element - Add class `no-page-break` to prevent breaks inside an element **Image Embedding:** - Images are converted to base64 and embedded directly in HTML - Ensures images always appear in the PDF - Filename in HTML must match uploaded file exactly ### 📝 Example HTML: ```html

First Page

Logo

Second Page

Content here...

``` Then upload a file named: `logo.png` """)