htmlpdf / src /streamlit_app.py
ABDALLALSWAITI's picture
Rename src/streamlit_app (1).py to src/streamlit_app.py
703de2f verified
"""
Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
Save this file as: src/streamlit_app.py
"""
import streamlit as st
import subprocess
import os
import tempfile
import shutil
from pathlib import Path
import base64
import re
import mimetypes
# Page-level settings: browser-tab title and icon, plus the wide layout.
_PAGE_CONFIG = {
    "page_title": "HTML to PDF Converter",
    "page_icon": "πŸ“„",
    "layout": "wide",
}
st.set_page_config(**_PAGE_CONFIG)
def detect_aspect_ratio(html_content):
    """Guess the output aspect ratio from hints inside the HTML.

    Checks are applied in this order and the first hit wins:

    1. A viewport ``<meta>`` tag whose ``content`` contains
       ``orientation=portrait`` or ``orientation=landscape``.
    2. The first CSS ``aspect-ratio: W / H`` declaration with integer
       operands (wider than 1.5 -> landscape, narrower than 0.7 ->
       portrait, otherwise square).
    3. Presentation-related keywords anywhere in the document.

    Args:
        html_content: The HTML document as text.

    Returns:
        One of ``"16:9"``, ``"1:1"`` or ``"9:16"``; ``"9:16"`` is the
        fallback when no hint is found.
    """
    viewport_match = re.search(
        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
        html_content,
        re.IGNORECASE,
    )
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"

    aspect_match = re.search(
        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE
    )
    if aspect_match:
        width = int(aspect_match.group(1))
        height = int(aspect_match.group(2))
        # A zero height (e.g. "aspect-ratio: 16/0") carries no usable hint;
        # skip it instead of dividing by zero and let the later checks decide.
        if height > 0:
            ratio = width / height
            if ratio > 1.5:
                return "16:9"
            if ratio < 0.7:
                return "9:16"
            return "1:1"

    # Lower-case once rather than once per keyword.
    lowered = html_content.lower()
    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
        return "16:9"
    return "9:16"
def image_to_base64(image_file):
    """Turn an uploaded image into a ``data:<mime>;base64,<payload>`` URL.

    ``image_file`` must expose ``getvalue()`` (the raw bytes) and ``name``
    (the filename). The MIME type is guessed from the filename; if that
    fails, a small extension table is consulted, with ``image/png`` as the
    last resort.

    Returns:
        The data URL string, or ``None`` after reporting the problem with
        ``st.error`` when anything goes wrong.
    """
    fallback_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.svg': 'image/svg+xml',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp'
    }
    try:
        raw = image_file.getvalue()
        guessed, _unused_encoding = mimetypes.guess_type(image_file.name)
        if guessed:
            mime_type = guessed
        else:
            suffix = os.path.splitext(image_file.name)[1].lower()
            mime_type = fallback_types.get(suffix, 'image/png')
        payload = base64.b64encode(raw).decode('utf-8')
        return f"data:{mime_type};base64,{payload}"
    except Exception as e:
        st.error(f"Error converting {image_file.name} to base64: {str(e)}")
        return None
def _data_url_replacer(data_url):
    """Build a ``re.subn`` callback that swaps a matched reference for *data_url*.

    The callback keeps whatever quoting the original reference used (single,
    double, or none for CSS ``url()``), so a reference inside an inline
    ``style="..."`` attribute is not broken by a changed quote character.
    A callable is used instead of a replacement template so the data URL is
    inserted literally and never parsed for backslash escapes.
    """
    def replace(match):
        quote = match.group(2)
        # The <img src> pattern has two groups; the url() patterns have a
        # third one capturing the closing parenthesis.
        closing = match.group(3) if match.re.groups >= 3 else ""
        return f"{match.group(1)}{quote}{data_url}{quote}{closing}"
    return replace


def embed_images_as_base64(html_content, uploaded_images):
    """Embed uploaded images into the HTML as base64 data URLs.

    Every uploaded image is converted with ``image_to_base64``; references
    to its filename (optionally preceded by a path ending in ``/``) are then
    replaced in ``<img src=...>``, ``background-image: url(...)`` and any
    remaining CSS ``url(...)``. Progress and results are reported through
    Streamlit.

    Args:
        html_content: The HTML document as text.
        uploaded_images: Iterable of uploaded files exposing ``name`` and
            ``getvalue()``; may be empty or ``None``.

    Returns:
        ``(html, replacements)`` where ``replacements`` maps
        ``"<filename> (<kind>)"`` to the number of substitutions made. The
        dict is empty when nothing was replaced.
    """
    if not uploaded_images:
        return html_content, {}

    image_data_urls = {}
    for img in uploaded_images:
        data_url = image_to_base64(img)
        if data_url:
            image_data_urls[img.name] = data_url
            st.write(f"βœ“ Converted {img.name} to base64 ({len(data_url)} chars)")
    if not image_data_urls:
        return html_content, {}

    replacements = {}
    for filename, data_url in image_data_urls.items():
        escaped_name = re.escape(filename)
        replacer = _data_url_replacer(data_url)
        # (label, pattern, flags), applied in order; the generic url() rule
        # runs last and only sees what the background-image rule left behind.
        rules = (
            (
                "img src",
                rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
                re.IGNORECASE | re.DOTALL,
            ),
            (
                "bg-image",
                rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
            (
                "url",
                rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
        )
        for label, pattern, flags in rules:
            # subn replaces and counts in a single pass.
            html_content, count = re.subn(pattern, replacer, html_content, flags=flags)
            if count:
                replacements[f"{filename} ({label})"] = count

    if replacements:
        st.success("βœ… Image Replacements:")
        for key, count in replacements.items():
            st.write(f" β€’ {key}: {count} replacement(s)")
    else:
        st.warning("⚠️ No image references found in HTML matching uploaded files!")
        st.write("Uploaded files:", [img.name for img in uploaded_images])
        with st.expander("πŸ” Debug: Show HTML image references"):
            markers = ('<img', 'src=', 'url(', 'background')
            img_lines = [
                line for line in html_content.split('\n')
                if any(marker in line.lower() for marker in markers)
            ]
            if img_lines:
                # Show at most the first ten candidate lines.
                for line in img_lines[:10]:
                    st.code(line.strip(), language='html')
            else:
                st.write("No image-related lines found in HTML")
    return html_content, replacements
def inject_page_breaks(html_content: str, aspect_ratio: str):
    """Insert a ``<style>`` block with page-size and page-break rules.

    The stylesheet sets the ``@page`` size from ``aspect_ratio``, makes
    page/slide containers fill one page each, and adds utility classes for
    manual break control.

    The block is inserted exactly once: before the first ``</head>`` if
    there is one, otherwise before the first ``<body`` tag, otherwise at the
    very start of the document. Tag matching is case-insensitive.

    Args:
        html_content: The HTML document as text.
        aspect_ratio: ``"16:9"`` (A4 landscape), ``"1:1"`` (210mm square);
            any other value is treated as ``"9:16"`` (A4 portrait).

    Returns:
        The HTML with the stylesheet injected.
    """
    if aspect_ratio == "16:9":
        page_size = "A4 landscape"
    elif aspect_ratio == "1:1":
        page_size = "210mm 210mm"
    else:  # 9:16
        page_size = "A4 portrait"

    # Comprehensive page break CSS
    page_css = f"""
<style id="auto-page-breaks">
    /* Define page size */
    @page {{
        size: {page_size};
        margin: 0;
    }}
    /* Reset body */
    html, body {{
        margin: 0 !important;
        padding: 0 !important;
        width: 100% !important;
        height: 100% !important;
    }}
    /* Page containers - each should be one page */
    .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
        width: 100% !important;
        min-height: 100vh !important;
        height: 100vh !important;
        page-break-after: always !important;
        break-after: page !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
        position: relative !important;
        box-sizing: border-box !important;
        overflow: hidden !important;
    }}
    /* Last page shouldn't force a break */
    .page:last-child, .slide:last-child,
    section.page:last-child, article.page:last-child {{
        page-break-after: auto !important;
        break-after: auto !important;
    }}
    /* If no explicit page class, treat direct body children as pages */
    body > section:not(.no-page-break),
    body > article:not(.no-page-break),
    body > div:not(.no-page-break) {{
        page-break-after: always !important;
        break-after: page !important;
        min-height: 100vh;
    }}
    body > section:last-child,
    body > article:last-child,
    body > div:last-child {{
        page-break-after: auto !important;
        break-after: auto !important;
    }}
    /* Utility classes for manual control */
    .page-break, .page-break-after {{
        page-break-after: always !important;
        break-after: page !important;
    }}
    .page-break-before {{
        page-break-before: always !important;
        break-before: page !important;
    }}
    .no-page-break, .keep-together {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}
    /* Prevent awkward breaks in content */
    h1, h2, h3, h4, h5, h6 {{
        page-break-after: avoid !important;
        break-after: avoid !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}
    img, figure, table, pre, blockquote {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}
    /* Preserve colors and backgrounds */
    * {{
        -webkit-print-color-adjust: exact !important;
        print-color-adjust: exact !important;
        color-adjust: exact !important;
    }}
</style>
"""

    # Insert before the first closing head tag only; a later "</head>" (for
    # example inside a script string) must not receive a second copy.
    anchor = re.search(r'</head\s*>', html_content, re.IGNORECASE)
    if anchor is None:
        anchor = re.search(r'<body\b', html_content, re.IGNORECASE)
    if anchor is None:
        return page_css + html_content
    position = anchor.start()
    return html_content[:position] + page_css + html_content[position:]
def render_html_preview(html_content):
    """Return an ``<iframe>`` tag that displays ``html_content``.

    The document is embedded in the iframe's ``src`` as a base64 ``data:``
    URL. The URL declares ``charset=utf-8`` to match the encoding used for
    the payload; without it a document that has no ``<meta charset>`` of
    its own may have its non-ASCII characters decoded incorrectly.

    Args:
        html_content: The HTML document as text.

    Returns:
        The iframe markup as a string.
    """
    b64 = base64.b64encode(html_content.encode("utf-8")).decode("ascii")
    return (
        f'<iframe src="data:text/html;charset=utf-8;base64,{b64}" '
        'width="100%" height="600" '
        'style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
    )
def render_pdf_preview(pdf_bytes: bytes) -> str:
    """Return a standalone HTML page that renders ``pdf_bytes`` with PDF.js.

    The PDF is base64-encoded and inlined into the page's script, which
    decodes it in the browser and draws every page onto its own ``<canvas>``
    at scale 1.5. PDF.js itself (library and worker, version 3.11.174) is
    loaded from the cdnjs CDN, so the viewer needs network access to it.
    A load failure replaces the "Loading PDF..." text with the error message.

    Args:
        pdf_bytes: The raw PDF document.

    Returns:
        The complete HTML document as a string.
    """
    b64 = base64.b64encode(pdf_bytes).decode()
    # Doubled braces below are literal braces in the generated CSS/JS; the
    # only interpolated value is {b64}.
    pdf_viewer_html = f'''
<!DOCTYPE html>
<html>
<head>
<style>
body {{
margin: 0;
padding: 0;
overflow: hidden;
background: #525659;
}}
#pdf-container {{
width: 100%;
height: 100vh;
overflow: auto;
display: flex;
flex-direction: column;
align-items: center;
padding: 20px;
box-sizing: border-box;
}}
canvas {{
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
margin-bottom: 10px;
background: white;
}}
#loading {{
color: white;
font-family: Arial, sans-serif;
font-size: 18px;
padding: 20px;
}}
</style>
</head>
<body>
<div id="pdf-container">
<div id="loading">Loading PDF...</div>
</div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
<script>
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
const pdfData = atob('{b64}');
const pdfContainer = document.getElementById('pdf-container');
const loading = document.getElementById('loading');
const uint8Array = new Uint8Array(pdfData.length);
for (let i = 0; i < pdfData.length; i++) {{
uint8Array[i] = pdfData.charCodeAt(i);
}}
pdfjsLib.getDocument({{data: uint8Array}}).promise.then(function(pdf) {{
loading.style.display = 'none';
const numPages = pdf.numPages;
const promises = [];
for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
promises.push(
pdf.getPage(pageNum).then(function(page) {{
const scale = 1.5;
const viewport = page.getViewport({{scale: scale}});
const canvas = document.createElement('canvas');
const context = canvas.getContext('2d');
canvas.height = viewport.height;
canvas.width = viewport.width;
pdfContainer.appendChild(canvas);
return page.render({{
canvasContext: context,
viewport: viewport
}}).promise;
}})
);
}}
return Promise.all(promises);
}}).catch(function(error) {{
loading.innerHTML = '<div style="color:#ff6b6b;">Error: ' + error.message + '</div>';
}});
</script>
</body>
</html>
'''
    return pdf_viewer_html
def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, timeout=60):
    """Convert HTML to PDF by running the ``puppeteer_pdf.js`` Node script.

    Steps: inject the page-break stylesheet, write the result to
    ``<temp_dir>/input.html``, locate ``puppeteer_pdf.js`` (parent of this
    file's directory, this file's directory, or the working directory), run
    ``node puppeteer_pdf.js <html_file> <aspect_ratio>`` from the script's
    own directory, then read back ``<temp_dir>/input.pdf``. Progress is
    reported through ``st.write``.

    Args:
        html_content: The HTML document as text.
        aspect_ratio: ``"16:9"``, ``"1:1"`` or ``"9:16"``; forwarded to both
            the stylesheet injection and the Node script.
        temp_dir: Existing directory used for the intermediate files. The
            caller owns it and is responsible for removing it.
        timeout: Seconds to wait for the Node process before giving up.

    Returns:
        ``(pdf_bytes, None)`` on success, or ``(None, error_message)`` on
        any failure; exceptions are converted to messages, not raised.
    """
    try:
        # Step 1: Inject page break CSS
        st.write("πŸ”§ Injecting page break CSS...")
        html_content = inject_page_breaks(html_content, aspect_ratio)

        # Save HTML to temp file
        html_file = os.path.join(temp_dir, "input.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)
        st.write(f"πŸ“ Saved HTML: {os.path.getsize(html_file):,} bytes")

        # Find puppeteer script; the first existing candidate wins.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        possible_paths = (
            os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js'),
            os.path.join(script_dir, 'puppeteer_pdf.js'),
            os.path.join(script_dir, '..', 'puppeteer_pdf.js'),
            'puppeteer_pdf.js',
        )
        puppeteer_script = next(
            (path for path in possible_paths if os.path.exists(path)), None
        )
        if not puppeteer_script:
            return None, "Error: puppeteer_pdf.js not found"
        st.write(f"πŸ”§ Using Puppeteer: {puppeteer_script}")

        # Run conversion (argument list, no shell).
        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=timeout,
            cwd=os.path.dirname(os.path.abspath(puppeteer_script))
        )
        if result.returncode != 0:
            # Fall back to stdout when the script reported nothing on stderr.
            details = result.stderr if result.stderr.strip() else result.stdout
            return None, f"PDF conversion failed: {details}"

        # Read PDF. NOTE(review): assumes the script writes the PDF next to
        # the input with the extension swapped to .pdf - confirm against
        # puppeteer_pdf.js. Only the extension is swapped, so a ".html"
        # elsewhere in the path is left alone.
        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
        if not os.path.exists(pdf_file):
            return None, "PDF file was not generated"
        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()
        st.write(f"βœ… PDF generated: {len(pdf_bytes):,} bytes")
        return pdf_bytes, None
    except subprocess.TimeoutExpired:
        return None, f"Error: PDF conversion timed out ({timeout} seconds)"
    except Exception as e:
        return None, f"Error: {str(e)}"
# Main UI: page title and a short introduction.
st.title("πŸ“„ HTML to PDF Converter")
_intro_markdown = """
Convert HTML to PDF with **proper page breaks** and **embedded base64 images**!
✨ Each page in your HTML will be preserved as a separate PDF page.
"""
st.markdown(_intro_markdown)
# Two input modes, one tab each: upload a file, or paste the markup.
_tab_labels = ["πŸ“€ Upload HTML File", "πŸ“ Paste HTML Code"]
tab1, tab2 = st.tabs(_tab_labels)
# Tab 1: Upload HTML File
# Flow: pick an HTML file (plus optional images), choose or auto-detect the
# aspect ratio, preview the HTML, then convert and offer the PDF for download.
with tab1:
    uploaded_file = st.file_uploader(
        "Choose an HTML file",
        type=['html', 'htm'],
        key="file_uploader",
        help="Upload an HTML file"
    )
    uploaded_images = st.file_uploader(
        "πŸ“· Upload Images",
        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
        key="image_uploader",
        help="Upload images - they will be embedded as base64 in the HTML",
        accept_multiple_files=True
    )
    if uploaded_images:
        st.success(f"βœ… {len(uploaded_images)} image(s) uploaded")
        with st.expander("View uploaded images"):
            # Thumbnails in a grid of at most four columns.
            cols = st.columns(min(len(uploaded_images), 4))
            for idx, img in enumerate(uploaded_images):
                with cols[idx % 4]:
                    st.image(img, caption=img.name, use_container_width=True)

    if uploaded_file:
        st.success(f"βœ… File: {uploaded_file.name}")
        # getvalue() returns the whole buffer regardless of the current read
        # position, so no seek is needed before decoding.
        raw_html = uploaded_file.getvalue()
        try:
            html_content = raw_html.decode('utf-8')
        except UnicodeDecodeError:
            # latin-1 maps every byte, so this fallback cannot fail.
            html_content = raw_html.decode('latin-1')
        detected_ratio = detect_aspect_ratio(html_content)

        col1, col2 = st.columns([1, 1])
        with col1:
            st.subheader("βš™οΈ Settings")
            auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_file")
            if auto_detect:
                aspect_ratio = detected_ratio
                st.info(f"πŸ” Detected: **{detected_ratio}**")
            else:
                aspect_ratio = st.radio(
                    "Aspect Ratio",
                    options=["16:9", "1:1", "9:16"],
                    index=["16:9", "1:1", "9:16"].index(detected_ratio),
                    key="aspect_file"
                )
            convert_btn = st.button("πŸ”„ Convert to PDF", key="conv_file", type="primary", use_container_width=True)
        with col2:
            st.subheader("πŸ‘οΈ Preview")
            with st.expander("Show HTML"):
                st.components.v1.html(render_html_preview(html_content), height=400, scrolling=True)

        if convert_btn:
            temp_dir = None
            try:
                with st.spinner("Converting..."):
                    temp_dir = tempfile.mkdtemp()
                    # Embed images as base64
                    processed_html = html_content
                    if uploaded_images:
                        with st.expander("πŸ–ΌοΈ Image Processing", expanded=True):
                            processed_html, replacements = embed_images_as_base64(html_content, uploaded_images)
                            if not replacements:
                                st.warning("⚠️ Images uploaded but no matches found in HTML!")
                                st.write("**Tip:** Make sure image filenames in HTML match uploaded files exactly")
                    # Convert to PDF
                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir)
                    if error:
                        st.error(f"❌ {error}")
                    else:
                        st.success("βœ… PDF generated with proper page breaks!")
                        # Swap only a trailing .html/.htm extension (any
                        # letter case) for .pdf; other names get .pdf appended.
                        stem, extension = os.path.splitext(uploaded_file.name)
                        if extension.lower() in ('.html', '.htm'):
                            output_name = stem + '.pdf'
                        else:
                            output_name = uploaded_file.name + '.pdf'
                        col_a, col_b = st.columns(2)
                        with col_a:
                            st.download_button(
                                "⬇️ Download PDF",
                                data=pdf_bytes,
                                file_name=output_name,
                                mime="application/pdf",
                                use_container_width=True
                            )
                        with col_b:
                            st.info(f"Size: {len(pdf_bytes):,} bytes")
                        st.subheader("πŸ“„ PDF Preview")
                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
            except Exception as e:
                st.error(f"❌ Error: {str(e)}")
            finally:
                # Best-effort removal of the intermediate files.
                if temp_dir and os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir, ignore_errors=True)
# Tab 2: Paste HTML
# Same pipeline as the upload tab, but the markup comes from a text area
# pre-filled with a three-page sample document.
with tab2:
    _sample_html = """<!DOCTYPE html>
<html>
<head>
<style>
body {
font-family: Arial;
margin: 0;
padding: 0;
}
.page {
width: 100%;
height: 100vh;
display: flex;
align-items: center;
justify-content: center;
box-sizing: border-box;
padding: 40px;
}
.page:nth-child(1) {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
.page:nth-child(2) {
background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
color: white;
}
.page:nth-child(3) {
background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
color: white;
}
h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
</style>
</head>
<body>
<div class="page">
<h1>Page 1: Hello PDF! 🌍</h1>
</div>
<div class="page">
<h1>Page 2: Separate Page! πŸ“„</h1>
</div>
<div class="page">
<h1>Page 3: Final Page! ✨</h1>
</div>
</body>
</html>"""
    html_code = st.text_area(
        "HTML Content",
        value=_sample_html,
        height=400,
        key="html_code"
    )
    uploaded_images_text = st.file_uploader(
        "πŸ“· Upload Images",
        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
        key="image_text",
        help="Upload images to embed in your HTML",
        accept_multiple_files=True
    )
    if uploaded_images_text:
        st.success(f"βœ… {len(uploaded_images_text)} image(s) uploaded")
        with st.expander("View images"):
            # Thumbnails wrap around a grid of at most four columns.
            column_count = min(len(uploaded_images_text), 4)
            cols = st.columns(column_count)
            for position, picture in enumerate(uploaded_images_text):
                with cols[position % 4]:
                    st.image(picture, caption=picture.name, use_container_width=True)

    # Nothing below is shown while the text area is blank.
    if html_code.strip():
        detected_ratio_text = detect_aspect_ratio(html_code)
        auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_text")
        if not auto_detect_text:
            _ratio_choices = ["16:9", "1:1", "9:16"]
            aspect_ratio_text = st.radio(
                "Aspect Ratio",
                options=_ratio_choices,
                index=_ratio_choices.index(detected_ratio_text),
                key="aspect_text"
            )
        else:
            aspect_ratio_text = detected_ratio_text
            st.info(f"πŸ” Detected: **{detected_ratio_text}**")
        convert_text_btn = st.button("πŸ”„ Convert", key="conv_text", type="primary", use_container_width=True)

        if convert_text_btn:
            temp_dir = None
            try:
                with st.spinner("Converting..."):
                    temp_dir = tempfile.mkdtemp()
                    processed_html = html_code
                    if uploaded_images_text:
                        with st.expander("πŸ–ΌοΈ Image Processing", expanded=True):
                            processed_html, replacements = embed_images_as_base64(html_code, uploaded_images_text)
                            if not replacements:
                                st.warning("⚠️ Images uploaded but no matches found!")
                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)
                    if not error:
                        st.success("βœ… PDF generated with proper page breaks!")
                        col_a, col_b = st.columns(2)
                        with col_a:
                            st.download_button(
                                "⬇️ Download PDF",
                                data=pdf_bytes,
                                file_name="converted.pdf",
                                mime="application/pdf",
                                use_container_width=True
                            )
                        with col_b:
                            st.info(f"Size: {len(pdf_bytes):,} bytes")
                        st.subheader("πŸ“„ PDF Preview")
                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
                    else:
                        st.error(f"❌ {error}")
            except Exception as e:
                st.error(f"❌ Error: {str(e)}")
            finally:
                # Best-effort removal of the intermediate files.
                if temp_dir and os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir, ignore_errors=True)
# Footer: a divider followed by the usage notes shown under both tabs.
st.markdown("---")
_footer_markdown = """
### πŸ’‘ How Page Breaks Work:
**Automatic Page Detection:**
- Elements with class `page`, `slide`, or `section.page` are treated as separate pages
- Each page automatically gets `page-break-after: always` CSS
- Last page won't have a trailing break
**HTML Structure for Multiple Pages:**
```html
<div class="page">Page 1 content</div>
<div class="page">Page 2 content</div>
<div class="page">Page 3 content</div>
```
**Manual Page Breaks:**
- Add class `page-break` to force a break after an element
- Add class `page-break-before` to force a break before an element
- Add class `no-page-break` to prevent breaks inside an element
**Image Embedding:**
- Images are converted to base64 and embedded directly in HTML
- Ensures images always appear in the PDF
- Filename in HTML must match uploaded file exactly
### πŸ“ Example HTML:
```html
<!DOCTYPE html>
<html>
<body>
<div class="page">
<h1>First Page</h1>
<img src="logo.png" alt="Logo">
</div>
<div class="page">
<h1>Second Page</h1>
<p>Content here...</p>
</div>
</body>
</html>
```
Then upload a file named: `logo.png`
"""
st.markdown(_footer_markdown)