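# htmlpdf / src/streamlit_app.py
# NOTE: this file was captured from a repository file-viewer page. The viewer's
# header (author avatar, commit 69a76c1 "Update src/streamlit_app.py", verified
# badge, raw / history / blame links, size 23.8 kB) is not part of the program.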
import streamlit as st
import subprocess
import os
import tempfile
import shutil
from pathlib import Path
import base64
import re
# Configure the page title, icon and wide layout. This is the first Streamlit
# call the script makes, ahead of any widget or text output.
st.set_page_config(
    page_title="HTML to PDF Converter",
    page_icon="πŸ“„",
    layout="wide"
)
def _classify_ratio(width, height):
    """Map a width/height pair to "16:9", "9:16" or "1:1"; None if height is 0."""
    if height == 0:
        # A zero height carries no usable information (and would divide by zero).
        return None
    ratio = width / height
    if ratio > 1.5:
        return "16:9"
    if ratio < 0.7:
        return "9:16"
    return "1:1"


def detect_aspect_ratio(html_content):
    """
    Guess the most suitable page aspect ratio for an HTML document.

    Heuristics are tried in order and the first one that yields an answer wins:

    1. A viewport ``<meta>`` tag sized to the device width that also carries an
       ``orientation=portrait`` / ``orientation=landscape`` hint.
    2. A CSS ``aspect-ratio: W / H`` declaration (integer operands only).
    3. Presentation keywords (``reveal.js``, ``impress.js``, ``slide``,
       ``presentation``) anywhere in the markup.
    4. Numeric ``width`` / ``height`` values in the ``<body>`` inline style.

    Degenerate dimensions (a height of 0) are ignored instead of raising
    ``ZeroDivisionError``.

    Args:
        html_content: The HTML document as a string.

    Returns:
        "16:9", "1:1", or "9:16". Falls back to "9:16" (portrait) when no
        heuristic matches.
    """
    # 1. Viewport meta tag with an explicit orientation hint
    viewport_match = re.search(
        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
        html_content,
        re.IGNORECASE,
    )
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'width=device-width' in viewport or 'width=100%' in viewport:
            if 'orientation=portrait' in viewport:
                return "9:16"
            elif 'orientation=landscape' in viewport:
                return "16:9"

    # 2. CSS aspect-ratio property
    aspect_match = re.search(
        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE
    )
    if aspect_match:
        label = _classify_ratio(
            int(aspect_match.group(1)), int(aspect_match.group(2))
        )
        if label is not None:
            return label

    # 3. Common presentation frameworks are laid out as landscape slides
    lowered = html_content.lower()
    if any(
        keyword in lowered
        for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']
    ):
        return "16:9"

    # 4. Width/height hints in the body's inline style
    body_match = re.search(
        r'<body[^>]*style=["\']([^"\']*)["\']', html_content, re.IGNORECASE
    )
    if body_match:
        style = body_match.group(1).lower()
        if 'width' in style and 'height' in style:
            width_match = re.search(r'width\s*:\s*(\d+)', style)
            height_match = re.search(r'height\s*:\s*(\d+)', style)
            if width_match and height_match:
                label = _classify_ratio(
                    int(width_match.group(1)), int(height_match.group(1))
                )
                # Only a clearly landscape/portrait body decides the outcome;
                # a near-square body falls through to the default below.
                if label in ("16:9", "9:16"):
                    return label

    # Default to A4 portrait for documents
    return "9:16"
def save_uploaded_images(images, temp_dir):
    """Save uploaded images under ``<temp_dir>/images`` and return a name mapping.

    Args:
        images: Iterable of uploaded-file objects exposing ``.name`` and
            ``.getvalue()`` (the latter returning the file's bytes).
        temp_dir: Working directory; an ``images`` sub-directory is created
            inside it if missing.

    Returns:
        Dict mapping each upload's original ``name`` to the path of the saved
        copy relative to ``temp_dir`` (``"images/<file name>"``). Uploads whose
        name has no usable file-name component are skipped.
    """
    image_mapping = {}
    images_dir = os.path.join(temp_dir, "images")
    os.makedirs(images_dir, exist_ok=True)
    for image in images:
        # The name is supplied by the client: keep only its last path component
        # so a crafted name ("../x", "/etc/x", "..\\x") cannot escape images_dir.
        safe_name = os.path.basename(image.name.replace("\\", "/"))
        if safe_name in ("", ".", ".."):
            continue
        # Save image
        image_path = os.path.join(images_dir, safe_name)
        with open(image_path, 'wb') as f:
            f.write(image.getvalue())
        # Key by the original name, since that is what the HTML references
        image_mapping[image.name] = f"images/{safe_name}"
        print(f"Saved image: {image.name} -> {image_path}")
    return image_mapping
def process_html_with_images(html_content, temp_dir, image_mapping):
    """Rewrite image references in the HTML to absolute ``file://`` URLs.

    For every entry in ``image_mapping`` the bare file name (optionally
    prefixed with ``./``) is replaced, case-insensitively, in three contexts:
    ``src="..."`` attributes, CSS ``url(...)`` values and ``href="..."``
    attributes.

    Args:
        html_content: HTML document as a string.
        temp_dir: Directory the relative paths in ``image_mapping`` refer to.
        image_mapping: Dict of original file name -> path relative to
            ``temp_dir`` (as returned by ``save_uploaded_images``).

    Returns:
        The HTML string with matching references replaced.
    """
    import re
    from pathlib import Path

    for original_name, relative_path in image_mapping.items():
        # Get absolute path for the image and turn it into a well-formed,
        # percent-encoded file URL (handles spaces, '#', '?', drive letters).
        absolute_path = os.path.abspath(os.path.join(temp_dir, relative_path))
        file_url = Path(absolute_path).as_uri()
        name_pattern = re.escape(original_name)

        replacements = (
            # Pattern 1: src="filename" or src='filename'
            (rf'src=["\'](?:\./)?{name_pattern}["\']', f'src="{file_url}"'),
            # Pattern 2: background-image: url(filename)
            (rf'url\(["\']?(?:\./)?{name_pattern}["\']?\)', f'url("{file_url}")'),
            # Pattern 3: href for links
            (rf'href=["\'](?:\./)?{name_pattern}["\']', f'href="{file_url}"'),
        )
        for pattern, replacement in replacements:
            # A callable replacement is inserted literally; a plain string
            # would have its backslashes interpreted as regex escapes.
            html_content = re.sub(
                pattern,
                lambda _match, text=replacement: text,
                html_content,
                flags=re.IGNORECASE,
            )
    return html_content
def render_html_preview(html_content):
    """Return an ``<iframe>`` snippet that previews the given HTML.

    The document is embedded as a base64 ``data:`` URI. The URI declares
    ``charset=utf-8`` explicitly, matching the encoding used here; without it
    a document lacking its own ``<meta charset>`` may be decoded with a
    different default and show garbled non-ASCII text.

    Args:
        html_content: HTML document as a string.

    Returns:
        HTML markup for a 600px-high, full-width iframe.
    """
    # Encode HTML content
    b64 = base64.b64encode(html_content.encode("utf-8")).decode("ascii")
    return (
        f'<iframe src="data:text/html;charset=utf-8;base64,{b64}" '
        'width="100%" height="600" '
        'style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
    )
def render_pdf_preview(pdf_bytes: bytes) -> str:
    """Build a standalone HTML page that renders a PDF with PDF.js.

    The PDF is base64-encoded and inlined into the page's script, which decodes
    it in the browser and draws each page onto its own ``<canvas>`` at scale
    1.5. PDF.js 3.11.174 and its worker are loaded from cdnjs, so the viewer
    needs network access to that CDN. If loading fails, the error message is
    shown in place of the "Loading PDF..." text.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The complete HTML document as a string.
    """
    b64 = base64.b64encode(pdf_bytes).decode()
    # NOTE: this is an f-string, so literal CSS/JS braces are doubled ({{ }});
    # the only interpolated value is {b64} inside the atob() call.
    pdf_viewer_html = f'''
<!DOCTYPE html>
<html>
<head>
<style>
body {{
margin: 0;
padding: 0;
overflow: hidden;
background: #525659;
}}
#pdf-container {{
width: 100%;
height: 100vh;
overflow: auto;
display: flex;
flex-direction: column;
align-items: center;
padding: 20px;
box-sizing: border-box;
}}
canvas {{
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
margin-bottom: 10px;
background: white;
}}
#loading {{
color: white;
font-family: Arial, sans-serif;
font-size: 18px;
padding: 20px;
}}
.error {{
color: #ff6b6b;
font-family: Arial, sans-serif;
padding: 20px;
background: rgba(0,0,0,0.5);
border-radius: 5px;
margin: 20px;
}}
</style>
</head>
<body>
<div id="pdf-container">
<div id="loading">Loading PDF...</div>
</div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
<script>
pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
const pdfData = atob('{b64}');
const pdfContainer = document.getElementById('pdf-container');
const loading = document.getElementById('loading');
const uint8Array = new Uint8Array(pdfData.length);
for (let i = 0; i < pdfData.length; i++) {{
uint8Array[i] = pdfData.charCodeAt(i);
}}
pdfjsLib.getDocument({{data: uint8Array}}).promise.then(function(pdf) {{
loading.style.display = 'none';
const numPages = pdf.numPages;
const promises = [];
for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
promises.push(
pdf.getPage(pageNum).then(function(page) {{
const scale = 1.5;
const viewport = page.getViewport({{scale: scale}});
const canvas = document.createElement('canvas');
const context = canvas.getContext('2d');
canvas.height = viewport.height;
canvas.width = viewport.width;
pdfContainer.appendChild(canvas);
return page.render({{
canvasContext: context,
viewport: viewport
}}).promise;
}})
);
}}
return Promise.all(promises);
}}).catch(function(error) {{
loading.innerHTML = '<div class="error">Error loading PDF: ' + error.message + '</div>';
console.error('Error loading PDF:', error);
}});
</script>
</body>
</html>
'''
    return pdf_viewer_html
def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
    """
    Convert HTML content to PDF by running the ``puppeteer_pdf.js`` Node script.

    A small print stylesheet (zero page margin, exact colour reproduction) is
    injected exactly once: before the first closing ``</head>`` tag, otherwise
    before the first ``<body`` tag, otherwise at the very start of the
    document. Tag matching is case-insensitive. The result is written to
    ``<temp_dir>/input.html`` and handed to the script, which is looked up in
    the parent of this file's directory and run with a 60 second timeout.

    Args:
        html_content: String containing HTML content
        aspect_ratio: One of "16:9", "1:1", or "9:16"
        temp_dir: Temporary directory for processing

    Returns:
        Tuple of (pdf_bytes, error_message). Exactly one of the two is None;
        no exception is propagated to the caller.
    """
    try:
        # Inject CSS to preserve styles better
        style_injection = """
<style>
@page {
margin: 0;
}
* {
-webkit-print-color-adjust: exact !important;
print-color-adjust: exact !important;
color-adjust: exact !important;
}
body {
-webkit-print-color-adjust: exact !important;
print-color-adjust: exact !important;
}
</style>
"""
        # Insert the style once: before the first closing head tag, else
        # before the first body tag, else at the start of the document.
        # Only the first match is used so a stray "</head>" later in the
        # document (e.g. inside a script string) is left untouched.
        anchor = re.search(r'</head\s*>', html_content, re.IGNORECASE)
        if anchor is None:
            anchor = re.search(r'<body', html_content, re.IGNORECASE)
        insert_at = anchor.start() if anchor else 0
        html_content = (
            html_content[:insert_at] + style_injection + html_content[insert_at:]
        )

        # Save HTML content to temporary file
        html_file = os.path.join(temp_dir, "input.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Get the path to puppeteer_pdf.js (one level above this file's folder)
        script_dir = os.path.dirname(os.path.abspath(__file__))
        puppeteer_script = os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js')

        # Run Node.js script to convert HTML to PDF
        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=os.path.dirname(script_dir)
        )
        if result.returncode != 0:
            # Fall back to stdout when the script reported nothing on stderr
            return None, f"PDF conversion failed: {result.stderr or result.stdout}"

        # Get the generated PDF path.
        # NOTE(review): assumes the script writes its output next to the input
        # with ".html" swapped for ".pdf" -- confirm against puppeteer_pdf.js.
        pdf_file = html_file.replace('.html', '.pdf')
        if not os.path.exists(pdf_file):
            return None, "PDF file was not generated"

        # Read PDF file into memory
        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()
        return pdf_bytes, None
    except subprocess.TimeoutExpired:
        return None, "Error: PDF conversion timed out (60 seconds)"
    except Exception as e:
        # Boundary handler: callers expect an error string, not an exception
        # (covers e.g. a missing `node` executable or file-system errors).
        return None, f"Error: {str(e)}"
# Page header: title plus a short description of what the app does
st.title("πŸ“„ HTML to PDF Converter")
st.markdown("""
Convert HTML files or HTML code to PDF using Puppeteer with automatic aspect ratio detection.
✨ **NEW:** Upload images alongside your HTML files!
""")
# Create the two input modes as tabs: file upload (tab1) and pasted code (tab2)
tab1, tab2 = st.tabs(["πŸ“€ Upload HTML File", "πŸ“ Paste HTML Code"])
# Tab 1: Upload HTML File
# Flow: upload an HTML file (plus optional images) -> choose or auto-detect the
# aspect ratio -> convert -> offer the PDF for download and preview it.
with tab1:
    uploaded_file = st.file_uploader(
        "Choose an HTML file",
        type=['html', 'htm'],
        key="file_uploader",
        help="Upload an HTML file (max 200MB)",
        accept_multiple_files=False
    )
    # Image uploader
    uploaded_images = st.file_uploader(
        "πŸ“· Upload Images (optional)",
        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
        key="image_uploader",
        help="Upload images referenced in your HTML",
        accept_multiple_files=True
    )
    if uploaded_images:
        st.success(f"βœ… {len(uploaded_images)} image(s) uploaded")
        with st.expander("View uploaded images"):
            # At most four thumbnails per row; extra images wrap around
            cols = st.columns(min(len(uploaded_images), 4))
            for idx, img in enumerate(uploaded_images):
                with cols[idx % 4]:
                    st.image(img, caption=img.name, use_container_width=True)
    if uploaded_file is not None:
        st.success(f"βœ… File uploaded: {uploaded_file.name} ({uploaded_file.size:,} bytes)")
        # Read file content; fall back to latin-1 (which accepts any byte
        # sequence) when the file is not valid UTF-8
        uploaded_file.seek(0)
        try:
            html_content = uploaded_file.getvalue().decode('utf-8')
        except UnicodeDecodeError:
            uploaded_file.seek(0)
            html_content = uploaded_file.getvalue().decode('latin-1')
        # Auto-detect aspect ratio
        detected_ratio = detect_aspect_ratio(html_content)
        col1, col2 = st.columns([1, 1])
        with col1:
            st.subheader("βš™οΈ Settings")
            auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_detect_file")
            if auto_detect:
                aspect_ratio_file = detected_ratio
                st.info(f"πŸ” Detected: **{detected_ratio}**")
            else:
                # Manual choice, pre-selected to the detected value
                aspect_ratio_file = st.radio(
                    "Aspect Ratio",
                    options=["16:9", "1:1", "9:16"],
                    index=["16:9", "1:1", "9:16"].index(detected_ratio),
                    key="aspect_file",
                    help="Select the page orientation and dimensions"
                )
            st.markdown(f"""
**Selected: {aspect_ratio_file}**
- 16:9 = Landscape (297mm Γ— 210mm)
- 1:1 = Square (210mm Γ— 210mm)
- 9:16 = Portrait (210mm Γ— 297mm)
""")
            convert_file_btn = st.button("πŸ”„ Convert to PDF", key="convert_file", type="primary", width="stretch")
        with col2:
            st.subheader("πŸ‘οΈ HTML Preview")
            with st.expander("Show HTML Preview", expanded=False):
                st.components.v1.html(render_html_preview(html_content), height=600, scrolling=True)
        # Conversion section
        if convert_file_btn:
            try:
                # Create temp directory
                temp_dir = tempfile.mkdtemp()
                try:
                    with st.spinner("Converting HTML to PDF..."):
                        # Process images if uploaded
                        processed_html = html_content
                        if uploaded_images:
                            image_mapping = save_uploaded_images(uploaded_images, temp_dir)
                            processed_html = process_html_with_images(html_content, temp_dir, image_mapping)
                            st.info(f"πŸ“· Processed {len(uploaded_images)} image(s)")
                            # Debug info
                            with st.expander("πŸ” Debug: Image Mapping"):
                                for orig, new in image_mapping.items():
                                    st.text(f"{orig} -> {new}")
                                    full_path = os.path.join(temp_dir, new)
                                    st.text(f"Full path: {full_path}")
                                    st.text(f"Exists: {os.path.exists(full_path)}")
                        # Convert to PDF
                        pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_file, temp_dir)
                finally:
                    # Cleanup runs on every exit path, including interruptions
                    # that are not `Exception` subclasses and would therefore
                    # bypass the handler below. The PDF is already in memory.
                    shutil.rmtree(temp_dir, ignore_errors=True)
                if error:
                    st.error(f"❌ {error}")
                    with st.expander("Show error details"):
                        st.code(error)
                else:
                    st.success("βœ… PDF generated successfully!")
                    col_a, col_b = st.columns([1, 1])
                    with col_a:
                        # Swap only the final extension for ".pdf", whatever its
                        # letter case ("Report.HTML" -> "Report.pdf").
                        output_filename = Path(uploaded_file.name).with_suffix('.pdf').name
                        st.download_button(
                            label="⬇️ Download PDF",
                            data=pdf_bytes,
                            file_name=output_filename,
                            mime="application/pdf",
                            width="stretch",
                            key="download_file_pdf"
                        )
                    with col_b:
                        st.info(f"πŸ“¦ Size: {len(pdf_bytes):,} bytes")
                    # PDF Preview
                    st.subheader("πŸ“„ PDF Preview")
                    st.components.v1.html(render_pdf_preview(pdf_bytes), height=620, scrolling=True)
            except Exception as e:
                st.error(f"❌ Error: {str(e)}")
# Tab 2: Paste HTML Code
# Flow: edit or paste HTML (plus optional images) in the left column, preview it
# in the right column, then convert -> download and preview the PDF.
with tab2:
    col1, col2 = st.columns([1, 1])
    with col1:
        # The text area is pre-filled with a small styled sample document
        html_code = st.text_area(
            "HTML Content",
            value="""<!DOCTYPE html>
<html>
<head>
<title>Sample Document</title>
<style>
body {
font-family: Arial, sans-serif;
margin: 40px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
}
h1 {
font-size: 48px;
margin-bottom: 20px;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
}
p {
font-size: 18px;
line-height: 1.6;
}
.box {
background: rgba(255,255,255,0.1);
padding: 20px;
border-radius: 10px;
margin-top: 20px;
}
</style>
</head>
<body>
<h1>Hello, PDF World! 🌍</h1>
<p>This is a sample HTML document converted to PDF.</p>
<div class="box">
<p>✨ Styles, colors, and gradients are preserved!</p>
</div>
</body>
</html>""",
            height=400,
            key="html_code"
        )
        # Image uploader for text tab
        uploaded_images_text = st.file_uploader(
            "πŸ“· Upload Images (optional)",
            type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
            key="image_uploader_text",
            help="Upload images referenced in your HTML code",
            accept_multiple_files=True
        )
        if uploaded_images_text:
            st.success(f"βœ… {len(uploaded_images_text)} image(s) uploaded")
            with st.expander("View uploaded images"):
                # At most four thumbnails per row; extra images wrap around
                cols = st.columns(min(len(uploaded_images_text), 4))
                for idx, img in enumerate(uploaded_images_text):
                    with cols[idx % 4]:
                        st.image(img, caption=img.name, use_container_width=True)
        if html_code and html_code.strip():
            # Auto-detect aspect ratio
            detected_ratio_text = detect_aspect_ratio(html_code)
            auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_detect_text")
            if auto_detect_text:
                aspect_ratio_text = detected_ratio_text
                st.info(f"πŸ” Detected: **{detected_ratio_text}**")
            else:
                # Manual choice, pre-selected to the detected value
                aspect_ratio_text = st.radio(
                    "Aspect Ratio",
                    options=["16:9", "1:1", "9:16"],
                    index=["16:9", "1:1", "9:16"].index(detected_ratio_text),
                    key="aspect_text",
                    help="Select the page orientation and dimensions"
                )
            convert_text_btn = st.button("πŸ”„ Convert to PDF", key="convert_text", type="primary", width="stretch")
        else:
            # Nothing to convert: no button is drawn for blank input
            convert_text_btn = False
    with col2:
        if html_code and html_code.strip():
            st.subheader("πŸ‘οΈ HTML Preview")
            with st.expander("Show HTML Preview", expanded=False):
                st.components.v1.html(render_html_preview(html_code), height=600, scrolling=True)
    if convert_text_btn and html_code and html_code.strip():
        try:
            # Create temp directory
            temp_dir = tempfile.mkdtemp()
            try:
                with st.spinner("Converting HTML to PDF..."):
                    # Process images if uploaded
                    processed_html = html_code
                    if uploaded_images_text:
                        image_mapping = save_uploaded_images(uploaded_images_text, temp_dir)
                        processed_html = process_html_with_images(html_code, temp_dir, image_mapping)
                        st.info(f"πŸ“· Processed {len(uploaded_images_text)} image(s)")
                        # Debug info
                        with st.expander("πŸ” Debug: Image Mapping"):
                            for orig, new in image_mapping.items():
                                st.text(f"{orig} -> {new}")
                                full_path = os.path.join(temp_dir, new)
                                st.text(f"Full path: {full_path}")
                                st.text(f"Exists: {os.path.exists(full_path)}")
                    # Convert to PDF
                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)
            finally:
                # Cleanup runs on every exit path, including interruptions that
                # are not `Exception` subclasses and would therefore bypass the
                # handler below. The PDF is already in memory.
                shutil.rmtree(temp_dir, ignore_errors=True)
            if error:
                st.error(f"❌ {error}")
                with st.expander("Show error details"):
                    st.code(error)
            else:
                st.success("βœ… PDF generated successfully!")
                col_a, col_b = st.columns([1, 1])
                with col_a:
                    st.download_button(
                        label="⬇️ Download PDF",
                        data=pdf_bytes,
                        file_name="converted.pdf",
                        mime="application/pdf",
                        width="stretch",
                        key="download_text_pdf"
                    )
                with col_b:
                    st.info(f"πŸ“¦ Size: {len(pdf_bytes):,} bytes")
                # PDF Preview
                st.subheader("πŸ“„ PDF Preview")
                st.components.v1.html(render_pdf_preview(pdf_bytes), height=620, scrolling=True)
        except Exception as e:
            st.error(f"❌ Error: {str(e)}")
# Footer with tips: a horizontal rule followed by static usage notes that are
# rendered below both tabs on every run.
st.markdown("---")
st.markdown("""
### πŸ’‘ Tips:
- **Auto-detection** analyzes your HTML to suggest the best aspect ratio
- **16:9** - Best for presentations and landscape documents (297mm Γ— 210mm)
- **1:1** - Square format (210mm Γ— 210mm)
- **9:16** - Portrait format, standard A4 (210mm Γ— 297mm)
- **Image Support** - Upload JPG, PNG, GIF, SVG, WebP, or BMP images
- All CSS styles, colors, gradients, and fonts are preserved
- Use inline CSS or `<style>` tags for best results
- Reference images by filename in your HTML (e.g., `<img src="image.jpg">`)
- External resources should use absolute URLs
- **PDF Preview** renders directly in the browser using PDF.js
### πŸ–ΌοΈ Using Images:
1. Upload your HTML file
2. Upload all images referenced in the HTML
3. Make sure image filenames in HTML match uploaded files exactly
4. The converter will automatically embed images in the PDF
""")