Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

App Files Files Community

htmlpdf / src /streamlit_app.py

ABDALLALSWAITI

Update src/streamlit_app.py

69a76c1 verified 5 months ago

raw

history blame

23.8 kB

	import streamlit as st
	import subprocess
	import os
	import tempfile
	import shutil
	from pathlib import Path
	import base64
	import re

	# Configure the browser tab (title and icon) and use the wide page layout.
	st.set_page_config(page_title="HTML to PDF Converter", page_icon="📄", layout="wide")

	def detect_aspect_ratio(html_content):
	    """
	    Guess the page aspect ratio of an HTML document from hints in its markup.

	    Hints are checked in this order and the first conclusive one wins:

	    1. A viewport ``<meta>`` tag whose content carries an ``orientation=`` hint.
	    2. A CSS ``aspect-ratio: W / H`` declaration anywhere in the document.
	    3. Presentation keywords (reveal.js, impress.js, "slide", "presentation").
	    4. Numeric ``width`` / ``height`` in the ``<body>`` tag's inline style.

	    Args:
	        html_content: The HTML document as a string.

	    Returns:
	        "16:9", "1:1", or "9:16". Falls back to "9:16" (portrait) when no
	        hint is conclusive.
	    """
	    # Check for viewport meta tag. The `*` quantifiers let any attributes or
	    # whitespace sit between `<meta`, `viewport` and `content=`.
	    viewport_match = re.search(
	        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
	        html_content,
	        re.IGNORECASE,
	    )
	    if viewport_match:
	        viewport = viewport_match.group(1).lower()
	        if 'width=device-width' in viewport or 'width=100%' in viewport:
	            # Check for orientation hints
	            if 'orientation=portrait' in viewport:
	                return "9:16"
	            if 'orientation=landscape' in viewport:
	                return "16:9"

	    # Check for CSS aspect-ratio property; whitespace around ':' and '/' is optional.
	    aspect_match = re.search(
	        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE
	    )
	    if aspect_match:
	        label = _classify_ratio(int(aspect_match.group(1)), int(aspect_match.group(2)))
	        if label is not None:
	            return label

	    # Check for common presentation frameworks (lower-case the document once).
	    lowered = html_content.lower()
	    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
	        return "16:9"

	    # Check body style for width/height hints
	    body_match = re.search(
	        r'<body[^>]*style=["\']([^"\']*)["\']', html_content, re.IGNORECASE
	    )
	    if body_match:
	        style = body_match.group(1).lower()
	        # The lookbehind keeps `max-width` / `line-height` from being read as
	        # the `width` / `height` properties.
	        width_match = re.search(r'(?<![\w-])width\s*:\s*(\d+)', style)
	        height_match = re.search(r'(?<![\w-])height\s*:\s*(\d+)', style)
	        if width_match and height_match:
	            label = _classify_ratio(int(width_match.group(1)), int(height_match.group(1)))
	            # A near-square body is not treated as conclusive here; only a
	            # clearly landscape or portrait body decides the result.
	            if label in ("16:9", "9:16"):
	                return label

	    # Default to A4 portrait for documents
	    return "9:16"


	def _classify_ratio(width, height):
	    """Map a width/height pair to "16:9", "9:16" or "1:1"; None if height is 0."""
	    if height == 0:
	        # A zero height carries no usable ratio (and would divide by zero).
	        return None
	    ratio = width / height
	    if ratio > 1.5:
	        return "16:9"
	    if ratio < 0.7:
	        return "9:16"
	    return "1:1"

	def save_uploaded_images(images, temp_dir):
	    """
	    Write uploaded images into ``<temp_dir>/images`` and return a name mapping.

	    Args:
	        images: Iterable of upload objects exposing ``.name`` (the client-supplied
	            filename) and ``.getvalue()`` (the file content as bytes).
	        temp_dir: Directory under which the ``images`` sub-directory is created.

	    Returns:
	        Dict mapping each upload's original ``name`` to the path of the saved
	        file relative to ``temp_dir`` (``"images/<filename>"``). Uploads whose
	        name has no usable filename component are skipped.
	    """
	    image_mapping = {}
	    images_dir = os.path.join(temp_dir, "images")
	    os.makedirs(images_dir, exist_ok=True)

	    for image in images:
	        # The name comes from the client, so keep only its final component:
	        # a name such as "../x.png" must not escape images_dir. Backslashes
	        # are normalised first so Windows-style paths are stripped too.
	        safe_name = os.path.basename(image.name.replace("\\", "/"))
	        if safe_name in ("", ".", ".."):
	            continue

	        # Save image
	        image_path = os.path.join(images_dir, safe_name)
	        with open(image_path, 'wb') as f:
	            f.write(image.getvalue())

	        # Create mapping (keyed by the original name, which is what the HTML references)
	        image_mapping[image.name] = f"images/{safe_name}"
	        print(f"Saved image: {image.name} -> {image_path}")

	    return image_mapping

	def process_html_with_images(html_content, temp_dir, image_mapping):
	    """
	    Rewrite references to uploaded images so they point at absolute file URLs.

	    For every entry in ``image_mapping`` the following reference forms are
	    replaced (case-insensitively, with an optional leading ``./``):

	    * ``src="name"`` / ``src='name'``
	    * ``url(name)`` with or without quotes (e.g. ``background-image``)
	    * ``href="name"`` / ``href='name'``

	    Args:
	        html_content: HTML document as a string.
	        temp_dir: Directory the mapping's relative paths are resolved against.
	        image_mapping: Dict of original filename -> path relative to ``temp_dir``.

	    Returns:
	        The HTML with matching references replaced by ``file://`` URLs.
	    """
	    for original_name, relative_path in image_mapping.items():
	        # Build a correctly percent-encoded file URL (spaces, '#', '?', ...).
	        absolute_path = os.path.abspath(os.path.join(temp_dir, relative_path))
	        file_url = Path(absolute_path).as_uri()

	        name_pattern = rf'(?:\./)?{re.escape(original_name)}'
	        replacements = (
	            # Pattern 1: src="filename" or src='filename'
	            (rf'src=["\']{name_pattern}["\']', f'src="{file_url}"'),
	            # Pattern 2: background-image: url(filename)
	            (rf'url\(["\']?{name_pattern}["\']?\)', f'url("{file_url}")'),
	            # Pattern 3: href for links
	            (rf'href=["\']{name_pattern}["\']', f'href="{file_url}"'),
	        )

	        for pattern, replacement in replacements:
	            # A callable replacement is inserted literally; a plain string would
	            # be parsed as a template, so backslashes in the path could raise
	            # "bad escape" or be read as group references.
	            html_content = re.sub(
	                pattern,
	                lambda _match, text=replacement: text,
	                html_content,
	                flags=re.IGNORECASE,
	            )

	    return html_content

	def render_html_preview(html_content):
	    """
	    Build an ``<iframe>`` snippet that previews the given HTML.

	    The document is embedded as a base64 ``data:`` URL. The URL declares
	    ``charset=utf-8`` because the payload is UTF-8 encoded; without it a
	    browser may decode non-ASCII text (accents, emoji) with a different
	    default charset.

	    Args:
	        html_content: HTML document as a string.

	    Returns:
	        HTML markup for a 600px-high iframe showing the document.
	    """
	    # Encode HTML content
	    b64 = base64.b64encode(html_content.encode("utf-8")).decode("ascii")
	    iframe_html = f'<iframe src="data:text/html;charset=utf-8;base64,{b64}" width="100%" height="600" style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
	    return iframe_html

	def render_pdf_preview(pdf_bytes):
	    """
	    Build a standalone HTML page that renders a PDF with PDF.js.

	    The PDF is embedded as base64 inside the page; PDF.js (loaded from the
	    cdnjs CDN) decodes it in the browser and draws each page onto a canvas.

	    Args:
	        pdf_bytes: The PDF document as bytes.

	    Returns:
	        HTML source of the viewer page, as a string.
	    """
	    b64 = base64.b64encode(pdf_bytes).decode("ascii")
	    # Base64 output never contains the placeholder text, so a plain replace is
	    # safe and avoids doubling every CSS/JS brace as an f-string would require.
	    return _PDF_VIEWER_TEMPLATE.replace("__PDF_BASE64__", b64)


	# Viewer page used by render_pdf_preview; __PDF_BASE64__ is replaced with the
	# base64-encoded document.
	_PDF_VIEWER_TEMPLATE = '''
	<!DOCTYPE html>
	<html>
	<head>
	<style>
	    body {
	        margin: 0;
	        padding: 0;
	        overflow: hidden;
	        background: #525659;
	    }
	    #pdf-container {
	        width: 100%;
	        height: 100vh;
	        overflow: auto;
	        display: flex;
	        flex-direction: column;
	        align-items: center;
	        padding: 20px;
	        box-sizing: border-box;
	    }
	    canvas {
	        box-shadow: 0 2px 8px rgba(0,0,0,0.3);
	        margin-bottom: 10px;
	        background: white;
	    }
	    #loading {
	        color: white;
	        font-family: Arial, sans-serif;
	        font-size: 18px;
	        padding: 20px;
	    }
	    .error {
	        color: #ff6b6b;
	        font-family: Arial, sans-serif;
	        padding: 20px;
	        background: rgba(0,0,0,0.5);
	        border-radius: 5px;
	        margin: 20px;
	    }
	</style>
	</head>
	<body>
	<div id="pdf-container">
	    <div id="loading">Loading PDF...</div>
	</div>

	<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
	<script>
	    pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';

	    const pdfData = atob('__PDF_BASE64__');
	    const pdfContainer = document.getElementById('pdf-container');
	    const loading = document.getElementById('loading');

	    const uint8Array = new Uint8Array(pdfData.length);
	    for (let i = 0; i < pdfData.length; i++) {
	        uint8Array[i] = pdfData.charCodeAt(i);
	    }

	    pdfjsLib.getDocument({data: uint8Array}).promise.then(function(pdf) {
	        loading.style.display = 'none';

	        const numPages = pdf.numPages;
	        const promises = [];

	        for (let pageNum = 1; pageNum <= numPages; pageNum++) {
	            // Append the canvas right away so pages stay in document order
	            // even if the getPage() promises resolve out of order.
	            const canvas = document.createElement('canvas');
	            pdfContainer.appendChild(canvas);

	            promises.push(
	                pdf.getPage(pageNum).then(function(page) {
	                    const scale = 1.5;
	                    const viewport = page.getViewport({scale: scale});

	                    canvas.height = viewport.height;
	                    canvas.width = viewport.width;

	                    return page.render({
	                        canvasContext: canvas.getContext('2d'),
	                        viewport: viewport
	                    }).promise;
	                })
	            );
	        }

	        return Promise.all(promises);
	    }).catch(function(error) {
	        // Use textContent so the error message is shown as text, never parsed
	        // as markup, and un-hide the element in case loading already finished.
	        const box = document.createElement('div');
	        box.className = 'error';
	        box.textContent = 'Error loading PDF: ' + error.message;
	        loading.textContent = '';
	        loading.appendChild(box);
	        loading.style.display = 'block';
	        console.error('Error loading PDF:', error);
	    });
	</script>
	</body>
	</html>
	'''

	def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
	    """
	    Convert HTML content to PDF by running the project's Puppeteer script.

	    Print-fidelity CSS is injected into the document, the result is written to
	    ``<temp_dir>/input.html``, and ``node puppeteer_pdf.js <html> <ratio>`` is
	    run from the directory above this file. The PDF is then read from
	    ``<temp_dir>/input.pdf``.

	    Args:
	        html_content: String containing HTML content
	        aspect_ratio: One of "16:9", "1:1", or "9:16" (passed to the script as-is)
	        temp_dir: Temporary directory for processing

	    Returns:
	        Tuple of (pdf_bytes, error_message). Exactly one of the two is None;
	        failures are reported through error_message rather than raised.
	    """
	    try:
	        html_content = _inject_print_styles(html_content)

	        # Save HTML content to temporary file
	        html_file = os.path.join(temp_dir, "input.html")
	        with open(html_file, 'w', encoding='utf-8') as f:
	            f.write(html_content)

	        # Get the path to puppeteer_pdf.js (one directory above this file)
	        script_dir = os.path.dirname(os.path.abspath(__file__))
	        project_dir = os.path.dirname(script_dir)
	        puppeteer_script = os.path.join(project_dir, 'puppeteer_pdf.js')

	        # Run Node.js script to convert HTML to PDF
	        result = subprocess.run(
	            ['node', puppeteer_script, html_file, aspect_ratio],
	            capture_output=True,
	            text=True,
	            timeout=60,
	            cwd=project_dir
	        )

	        if result.returncode != 0:
	            return None, f"PDF conversion failed: {result.stderr}"

	        # The PDF is expected next to the HTML file with a .pdf extension.
	        # Only the extension is swapped, so a ".html" elsewhere in the path
	        # is left alone.
	        pdf_file = os.path.splitext(html_file)[0] + '.pdf'

	        if not os.path.exists(pdf_file):
	            return None, "PDF file was not generated"

	        # Read PDF file into memory
	        with open(pdf_file, 'rb') as f:
	            pdf_bytes = f.read()

	        return pdf_bytes, None

	    except subprocess.TimeoutExpired:
	        return None, "Error: PDF conversion timed out (60 seconds)"
	    except Exception as e:
	        # Deliberately broad: this function reports every failure (missing
	        # `node`, unwritable temp dir, ...) through its return value.
	        return None, f"Error: {str(e)}"


	# CSS injected so backgrounds/colours print exactly and the page has no margin.
	_PRINT_STYLE_INJECTION = """
	<style>
	@page {
	    margin: 0;
	}
	* {
	    -webkit-print-color-adjust: exact !important;
	    print-color-adjust: exact !important;
	    color-adjust: exact !important;
	}
	body {
	    -webkit-print-color-adjust: exact !important;
	    print-color-adjust: exact !important;
	}
	</style>
	"""


	def _inject_print_styles(html_content):
	    """Insert the print CSS once: before </head>, else before <body, else at the start."""
	    # Tag matching is case-insensitive and only the first match is used, so a
	    # later "</head>" inside a script string does not get a second copy.
	    head_close = re.search(r'</head\s*>', html_content, re.IGNORECASE)
	    if head_close:
	        position = head_close.start()
	    else:
	        body_open = re.search(r'<body\b', html_content, re.IGNORECASE)
	        position = body_open.start() if body_open else 0
	    return html_content[:position] + _PRINT_STYLE_INJECTION + html_content[position:]

	# Page header
	st.title("📄 HTML to PDF Converter")
	# The intro is built from explicit lines so the Markdown never inherits the
	# source file's indentation (indented Markdown renders as a code block).
	st.markdown(
	    "Convert HTML files or HTML code to PDF using Puppeteer with automatic aspect ratio detection.\n"
	    "✨ NEW: Upload images alongside your HTML files!"
	)

	# Create tabs: one for uploading an HTML file, one for pasting HTML source.
	tab1, tab2 = st.tabs(["📤 Upload HTML File", "📝 Paste HTML Code"])

	# Tab 1: Upload HTML File
	# Flow: upload an HTML file (plus optional images), pick or auto-detect the
	# aspect ratio, convert via convert_html_to_pdf, then offer download + preview.
	with tab1:
	    uploaded_file = st.file_uploader(
	        "Choose an HTML file",
	        type=['html', 'htm'],
	        key="file_uploader",
	        help="Upload an HTML file (max 200MB)",
	        accept_multiple_files=False
	    )

	    # Image uploader
	    uploaded_images = st.file_uploader(
	        "📷 Upload Images (optional)",
	        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
	        key="image_uploader",
	        help="Upload images referenced in your HTML",
	        accept_multiple_files=True
	    )

	    if uploaded_images:
	        st.success(f"✅ {len(uploaded_images)} image(s) uploaded")
	        with st.expander("View uploaded images"):
	            # At most four thumbnails per row; further images wrap around.
	            cols = st.columns(min(len(uploaded_images), 4))
	            for idx, img in enumerate(uploaded_images):
	                with cols[idx % 4]:
	                    st.image(img, caption=img.name, use_container_width=True)

	    if uploaded_file is not None:
	        st.success(f"✅ File uploaded: {uploaded_file.name} ({uploaded_file.size:,} bytes)")

	        # Read file content: try UTF-8 first, fall back to latin-1, which
	        # accepts any byte sequence.
	        raw_html = uploaded_file.getvalue()
	        try:
	            html_content = raw_html.decode('utf-8')
	        except UnicodeDecodeError:
	            html_content = raw_html.decode('latin-1')

	        # Auto-detect aspect ratio
	        detected_ratio = detect_aspect_ratio(html_content)

	        col1, col2 = st.columns([1, 1])

	        with col1:
	            st.subheader("⚙️ Settings")

	            auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_detect_file")

	            if auto_detect:
	                aspect_ratio_file = detected_ratio
	                st.info(f"🔍 Detected: {detected_ratio}")
	            else:
	                ratio_options = ["16:9", "1:1", "9:16"]
	                aspect_ratio_file = st.radio(
	                    "Aspect Ratio",
	                    options=ratio_options,
	                    # Pre-select the detected ratio so manual mode starts from it.
	                    index=ratio_options.index(detected_ratio),
	                    key="aspect_file",
	                    help="Select the page orientation and dimensions"
	                )

	            # Built from explicit lines so the Markdown is not indented
	            # (indented Markdown renders as a code block).
	            st.markdown("\n".join([
	                f"Selected: {aspect_ratio_file}",
	                "- 16:9 = Landscape (297mm × 210mm)",
	                "- 1:1 = Square (210mm × 210mm)",
	                "- 9:16 = Portrait (210mm × 297mm)",
	            ]))

	            convert_file_btn = st.button("🔄 Convert to PDF", key="convert_file", type="primary", width="stretch")

	        with col2:
	            st.subheader("👁️ HTML Preview")
	            with st.expander("Show HTML Preview", expanded=False):
	                st.components.v1.html(render_html_preview(html_content), height=600, scrolling=True)

	        # Conversion section
	        if convert_file_btn:
	            temp_dir = None
	            try:
	                with st.spinner("Converting HTML to PDF..."):
	                    # Create temp directory
	                    temp_dir = tempfile.mkdtemp()

	                    # Process images if uploaded
	                    processed_html = html_content
	                    if uploaded_images:
	                        image_mapping = save_uploaded_images(uploaded_images, temp_dir)
	                        processed_html = process_html_with_images(html_content, temp_dir, image_mapping)
	                        st.info(f"📷 Processed {len(uploaded_images)} image(s)")
	                        # Debug info
	                        with st.expander("🔍 Debug: Image Mapping"):
	                            for orig, new in image_mapping.items():
	                                st.text(f"{orig} -> {new}")
	                                full_path = os.path.join(temp_dir, new)
	                                st.text(f"Full path: {full_path}")
	                                st.text(f"Exists: {os.path.exists(full_path)}")

	                    # Convert to PDF
	                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_file, temp_dir)

	                if error:
	                    st.error(f"❌ {error}")
	                    with st.expander("Show error details"):
	                        st.code(error)
	                else:
	                    st.success("✅ PDF generated successfully!")

	                    col_a, col_b = st.columns([1, 1])

	                    with col_a:
	                        # Swap only a trailing .html/.htm extension (any case)
	                        # for .pdf; chained str.replace would also rewrite
	                        # ".html" in the middle of a name and miss ".HTML".
	                        stem, ext = os.path.splitext(uploaded_file.name)
	                        if ext.lower() in ('.html', '.htm'):
	                            output_filename = stem + '.pdf'
	                        else:
	                            output_filename = uploaded_file.name + '.pdf'

	                        st.download_button(
	                            label="⬇️ Download PDF",
	                            data=pdf_bytes,
	                            file_name=output_filename,
	                            mime="application/pdf",
	                            width="stretch",
	                            key="download_file_pdf"
	                        )

	                    with col_b:
	                        st.info(f"📦 Size: {len(pdf_bytes):,} bytes")

	                    # PDF Preview
	                    st.subheader("📄 PDF Preview")
	                    st.components.v1.html(render_pdf_preview(pdf_bytes), height=620, scrolling=True)
	            except Exception as e:
	                st.error(f"❌ Error: {str(e)}")
	            finally:
	                # Cleanup in `finally` so the temp dir is removed on every exit
	                # path, including exits that are not `Exception` subclasses.
	                if temp_dir:
	                    shutil.rmtree(temp_dir, ignore_errors=True)

	# Tab 2: Paste HTML Code
	# Flow: edit HTML in a text area (plus optional images), pick or auto-detect
	# the aspect ratio, convert via convert_html_to_pdf, then offer download + preview.
	with tab2:
	    col1, col2 = st.columns([1, 1])

	    with col1:
	        html_code = st.text_area(
	            "HTML Content",
	            value="""<!DOCTYPE html>
	<html>
	<head>
	<title>Sample Document</title>
	<style>
	body {
	font-family: Arial, sans-serif;
	margin: 40px;
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	}
	h1 {
	font-size: 48px;
	margin-bottom: 20px;
	text-shadow: 2px 2px 4px rgba(0,0,0,0.3);
	}
	p {
	font-size: 18px;
	line-height: 1.6;
	}
	.box {
	background: rgba(255,255,255,0.1);
	padding: 20px;
	border-radius: 10px;
	margin-top: 20px;
	}
	</style>
	</head>
	<body>
	<h1>Hello, PDF World! 🌍</h1>
	<p>This is a sample HTML document converted to PDF.</p>
	<div class="box">
	<p>✨ Styles, colors, and gradients are preserved!</p>
	</div>
	</body>
	</html>""",
	            height=400,
	            key="html_code"
	        )

	        # Image uploader for text tab
	        uploaded_images_text = st.file_uploader(
	            "📷 Upload Images (optional)",
	            type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
	            key="image_uploader_text",
	            help="Upload images referenced in your HTML code",
	            accept_multiple_files=True
	        )

	        if uploaded_images_text:
	            st.success(f"✅ {len(uploaded_images_text)} image(s) uploaded")
	            with st.expander("View uploaded images"):
	                # At most four thumbnails per row; further images wrap around.
	                cols = st.columns(min(len(uploaded_images_text), 4))
	                for idx, img in enumerate(uploaded_images_text):
	                    with cols[idx % 4]:
	                        st.image(img, caption=img.name, use_container_width=True)

	        # Settings and the convert button are only offered for non-blank HTML.
	        has_html = bool(html_code and html_code.strip())

	        if has_html:
	            # Auto-detect aspect ratio
	            detected_ratio_text = detect_aspect_ratio(html_code)

	            auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_detect_text")

	            if auto_detect_text:
	                aspect_ratio_text = detected_ratio_text
	                st.info(f"🔍 Detected: {detected_ratio_text}")
	            else:
	                ratio_options = ["16:9", "1:1", "9:16"]
	                aspect_ratio_text = st.radio(
	                    "Aspect Ratio",
	                    options=ratio_options,
	                    # Pre-select the detected ratio so manual mode starts from it.
	                    index=ratio_options.index(detected_ratio_text),
	                    key="aspect_text",
	                    help="Select the page orientation and dimensions"
	                )

	            convert_text_btn = st.button("🔄 Convert to PDF", key="convert_text", type="primary", width="stretch")
	        else:
	            convert_text_btn = False

	    with col2:
	        if has_html:
	            st.subheader("👁️ HTML Preview")
	            with st.expander("Show HTML Preview", expanded=False):
	                st.components.v1.html(render_html_preview(html_code), height=600, scrolling=True)

	    if convert_text_btn and has_html:
	        temp_dir = None
	        try:
	            with st.spinner("Converting HTML to PDF..."):
	                # Create temp directory
	                temp_dir = tempfile.mkdtemp()

	                # Process images if uploaded
	                processed_html = html_code
	                if uploaded_images_text:
	                    image_mapping = save_uploaded_images(uploaded_images_text, temp_dir)
	                    processed_html = process_html_with_images(html_code, temp_dir, image_mapping)
	                    st.info(f"📷 Processed {len(uploaded_images_text)} image(s)")
	                    # Debug info
	                    with st.expander("🔍 Debug: Image Mapping"):
	                        for orig, new in image_mapping.items():
	                            st.text(f"{orig} -> {new}")
	                            full_path = os.path.join(temp_dir, new)
	                            st.text(f"Full path: {full_path}")
	                            st.text(f"Exists: {os.path.exists(full_path)}")

	                # Convert to PDF
	                pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)

	            if error:
	                st.error(f"❌ {error}")
	                with st.expander("Show error details"):
	                    st.code(error)
	            else:
	                st.success("✅ PDF generated successfully!")

	                col_a, col_b = st.columns([1, 1])

	                with col_a:
	                    st.download_button(
	                        label="⬇️ Download PDF",
	                        data=pdf_bytes,
	                        file_name="converted.pdf",
	                        mime="application/pdf",
	                        width="stretch",
	                        key="download_text_pdf"
	                    )

	                with col_b:
	                    st.info(f"📦 Size: {len(pdf_bytes):,} bytes")

	                # PDF Preview
	                st.subheader("📄 PDF Preview")
	                st.components.v1.html(render_pdf_preview(pdf_bytes), height=620, scrolling=True)
	        except Exception as e:
	            st.error(f"❌ Error: {str(e)}")
	        finally:
	            # Cleanup in `finally` so the temp dir is removed on every exit
	            # path, including exits that are not `Exception` subclasses.
	            if temp_dir:
	                shutil.rmtree(temp_dir, ignore_errors=True)

	# Footer with tips
	st.markdown("---")
	# The text is assembled from explicit lines so no line starts with source
	# indentation; indented Markdown would render the headings and lists as a
	# preformatted code block.
	st.markdown("\n".join([
	    "### 💡 Tips:",
	    "- Auto-detection analyzes your HTML to suggest the best aspect ratio",
	    "- 16:9 - Best for presentations and landscape documents (297mm × 210mm)",
	    "- 1:1 - Square format (210mm × 210mm)",
	    "- 9:16 - Portrait format, standard A4 (210mm × 297mm)",
	    "- Image Support - Upload JPG, PNG, GIF, SVG, WebP, or BMP images",
	    "- All CSS styles, colors, gradients, and fonts are preserved",
	    "- Use inline CSS or `<style>` tags for best results",
	    "- Reference images by filename in your HTML (e.g., `<img src=\"image.jpg\">`)",
	    "- External resources should use absolute URLs",
	    "- PDF Preview renders directly in the browser using PDF.js",
	    "",
	    "### 🖼️ Using Images:",
	    "1. Upload your HTML file",
	    "2. Upload all images referenced in the HTML",
	    "3. Make sure image filenames in HTML match uploaded files exactly",
	    "4. The converter will automatically embed images in the PDF",
	]))