Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

App Files Files Community

htmlpdf / src /streamlit_app.py

ABDALLALSWAITI

Rename src/streamlit_app (1).py to src/streamlit_app.py

703de2f verified 4 months ago

raw

history blame contribute delete

25.5 kB

	"""
	Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
	Save this file as: src/streamlit_app.py
	"""
	import streamlit as st
	import subprocess
	import os
	import tempfile
	import shutil
	from pathlib import Path
	import base64
	import re
	import mimetypes

	# Page-level Streamlit configuration: browser-tab title and icon, plus the
	# "wide" layout so the settings and preview columns get the full width.
	st.set_page_config(
	page_title="HTML to PDF Converter",
	page_icon="📄",
	layout="wide"
	)

	def detect_aspect_ratio(html_content):
	    """Guess the output aspect ratio ("16:9", "1:1" or "9:16") from HTML text.

	    Heuristics are tried in this order; the first one that applies wins:

	    1. An ``orientation=portrait`` / ``orientation=landscape`` hint inside a
	       viewport ``<meta>`` tag's ``content`` attribute.
	    2. A CSS ``aspect-ratio: W / H`` declaration with integer operands:
	       ratio > 1.5 is landscape, ratio < 0.7 is portrait, anything else square.
	    3. Presentation keywords (reveal.js, impress.js, slide, presentation),
	       which imply landscape.

	    Falls back to portrait ("9:16") when nothing matches.

	    Args:
	        html_content: The HTML document as a string.

	    Returns:
	        One of the strings "16:9", "1:1" or "9:16".
	    """
	    # `[^>]*` keeps the scan inside a single tag while allowing any other
	    # attributes between "<meta", "viewport" and "content=".
	    viewport_match = re.search(
	        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
	        html_content,
	        re.IGNORECASE,
	    )
	    if viewport_match:
	        viewport = viewport_match.group(1).lower()
	        if 'orientation=portrait' in viewport:
	            return "9:16"
	        if 'orientation=landscape' in viewport:
	            return "16:9"

	    aspect_match = re.search(
	        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)',
	        html_content,
	        re.IGNORECASE,
	    )
	    if aspect_match:
	        width = int(aspect_match.group(1))
	        height = int(aspect_match.group(2))
	        # A zero denominator is meaningless; skip this heuristic instead of
	        # raising ZeroDivisionError.
	        if height:
	            ratio = width / height
	            if ratio > 1.5:
	                return "16:9"
	            if ratio < 0.7:
	                return "9:16"
	            return "1:1"

	    lowered = html_content.lower()
	    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
	        return "16:9"

	    return "9:16"

	def image_to_base64(image_file):
	    """Build a ``data:`` URL holding the uploaded image's bytes as base64.

	    The MIME type is guessed from the file name; when that fails, a small
	    extension table is consulted, with ``image/png`` as the last resort.

	    Args:
	        image_file: Object exposing ``getvalue()`` (raw bytes) and ``name``.

	    Returns:
	        The data URL string, or ``None`` if anything went wrong (the error is
	        reported through ``st.error``).
	    """
	    extension_types = {
	        '.jpg': 'image/jpeg',
	        '.jpeg': 'image/jpeg',
	        '.png': 'image/png',
	        '.gif': 'image/gif',
	        '.svg': 'image/svg+xml',
	        '.webp': 'image/webp',
	        '.bmp': 'image/bmp'
	    }
	    try:
	        raw = image_file.getvalue()
	        mime_type = mimetypes.guess_type(image_file.name)[0]
	        if not mime_type:
	            suffix = os.path.splitext(image_file.name)[1].lower()
	            mime_type = extension_types.get(suffix, 'image/png')

	        encoded = base64.b64encode(raw).decode('utf-8')
	        return f"data:{mime_type};base64,{encoded}"
	    except Exception as e:
	        st.error(f"Error converting {image_file.name} to base64: {str(e)}")
	        return None

	def _replace_image_refs(html_content, filename, data_url):
	    """Swap every reference to *filename* in the HTML for *data_url*.

	    Returns ``(new_html, counts)`` where ``counts`` maps
	    ``"<filename> (<kind>)"`` to the number of substitutions of that kind.
	    """
	    escaped_name = re.escape(filename)
	    # Every pattern captures: group 1 = text before the URL, group 2 = the
	    # quote character (possibly empty for CSS), and, for CSS rules only,
	    # group 3 = the closing parenthesis. An optional directory prefix ending
	    # in "/" may precede the file name.
	    rules = (
	        (
	            "img src",
	            rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
	            re.IGNORECASE | re.DOTALL,
	        ),
	        (
	            "bg-image",
	            rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
	            re.IGNORECASE,
	        ),
	        (
	            "url",
	            rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
	            re.IGNORECASE,
	        ),
	    )

	    def swap(match):
	        # Reuse the author's own quoting. Forcing double quotes would break
	        # declarations that live inside a double-quoted style="..." attribute,
	        # and a base64 data URL is valid unquoted inside url().
	        quote = match.group(2)
	        closing = match.group(3) if match.re.groups >= 3 else ""
	        return f"{match.group(1)}{quote}{data_url}{quote}{closing}"

	    counts = {}
	    for label, pattern, flags in rules:
	        # A callable replacement keeps the data URL from being parsed as a
	        # regex replacement template.
	        html_content, hits = re.subn(pattern, swap, html_content, flags=flags)
	        if hits:
	            counts[f"{filename} ({label})"] = hits
	    return html_content, counts

	def embed_images_as_base64(html_content, uploaded_images):
	    """Inline uploaded images into the HTML as base64 data URLs.

	    References are matched by file name (case-insensitively, with an optional
	    leading directory path) in ``<img src=...>`` attributes and CSS ``url(...)``
	    values. Progress and diagnostics are written to the Streamlit page.

	    Args:
	        html_content: The HTML document as a string.
	        uploaded_images: Iterable of uploaded files (each with ``name`` and
	            ``getvalue()``); may be empty or ``None``.

	    Returns:
	        ``(html, replacements)``: the rewritten HTML and a dict mapping
	        ``"<filename> (<kind>)"`` to a substitution count. The dict is empty
	        when nothing was uploaded, converted, or matched.
	    """
	    if not uploaded_images:
	        return html_content, {}

	    image_data_urls = {}
	    for img in uploaded_images:
	        data_url = image_to_base64(img)
	        if data_url:
	            image_data_urls[img.name] = data_url
	            st.write(f"✓ Converted {img.name} to base64 ({len(data_url)} chars)")

	    if not image_data_urls:
	        return html_content, {}

	    replacements = {}
	    for filename, data_url in image_data_urls.items():
	        html_content, counts = _replace_image_refs(html_content, filename, data_url)
	        replacements.update(counts)

	    if replacements:
	        st.success("✅ Image Replacements:")
	        for key, count in replacements.items():
	            st.write(f" • {key}: {count} replacement(s)")
	    else:
	        st.warning("⚠️ No image references found in HTML matching uploaded files!")
	        st.write("Uploaded files:", [img.name for img in uploaded_images])

	        # Show the first few image-looking lines so the user can compare the
	        # names used in the HTML with the uploaded file names.
	        with st.expander("🔍 Debug: Show HTML image references"):
	            img_lines = [line for line in html_content.split('\n')
	                         if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
	            if img_lines:
	                for line in img_lines[:10]:
	                    st.code(line.strip(), language='html')
	            else:
	                st.write("No image-related lines found in HTML")

	    return html_content, replacements

	def inject_page_breaks(html_content: str, aspect_ratio: str) -> str:
	    """Insert a print stylesheet that sizes pages and forces page breaks.

	    The ``@page`` size follows the aspect ratio: "16:9" gives A4 landscape,
	    "1:1" a 210mm square, and anything else A4 portrait. The ``<style>`` block
	    is inserted exactly once: before the first ``</head>`` if there is one,
	    otherwise before the first ``<body`` tag, otherwise at the very start.
	    Tag matching is case-insensitive.

	    Args:
	        html_content: The HTML document as a string.
	        aspect_ratio: "16:9", "1:1" or "9:16".

	    Returns:
	        The HTML with the stylesheet added.
	    """
	    page_sizes = {"16:9": "A4 landscape", "1:1": "210mm 210mm"}
	    page_size = page_sizes.get(aspect_ratio, "A4 portrait")  # default: 9:16

	    # Comprehensive page break CSS
	    page_css = f"""
	<style id="auto-page-breaks">
	/* Define page size */
	@page {{
	size: {page_size};
	margin: 0;
	}}

	/* Reset body */
	html, body {{
	margin: 0 !important;
	padding: 0 !important;
	width: 100% !important;
	height: 100% !important;
	}}

	/* Page containers - each should be one page */
	.page, .slide, section.page, article.page, div[class="page"], div[class="slide"] {{
	width: 100% !important;
	min-height: 100vh !important;
	height: 100vh !important;
	page-break-after: always !important;
	break-after: page !important;
	page-break-inside: avoid !important;
	break-inside: avoid !important;
	position: relative !important;
	box-sizing: border-box !important;
	overflow: hidden !important;
	}}

	/* Last page shouldn't force a break */
	.page:last-child, .slide:last-child,
	section.page:last-child, article.page:last-child {{
	page-break-after: auto !important;
	break-after: auto !important;
	}}

	/* If no explicit page class, treat direct body children as pages */
	body > section:not(.no-page-break),
	body > article:not(.no-page-break),
	body > div:not(.no-page-break) {{
	page-break-after: always !important;
	break-after: page !important;
	min-height: 100vh;
	}}

	body > section:last-child,
	body > article:last-child,
	body > div:last-child {{
	page-break-after: auto !important;
	}}

	/* Utility classes for manual control */
	.page-break, .page-break-after {{
	page-break-after: always !important;
	break-after: page !important;
	}}

	.page-break-before {{
	page-break-before: always !important;
	break-before: page !important;
	}}

	.no-page-break, .keep-together {{
	page-break-inside: avoid !important;
	break-inside: avoid !important;
	}}

	/* Prevent awkward breaks in content */
	h1, h2, h3, h4, h5, h6 {{
	page-break-after: avoid !important;
	break-after: avoid !important;
	page-break-inside: avoid !important;
	break-inside: avoid !important;
	}}

	img, figure, table, pre, blockquote {{
	page-break-inside: avoid !important;
	break-inside: avoid !important;
	}}

	/* Preserve colors and backgrounds */
	* {{
	-webkit-print-color-adjust: exact !important;
	print-color-adjust: exact !important;
	color-adjust: exact !important;
	}}
	</style>
	"""

	    # Splice at the first anchor only: a later "</head>" (for example inside a
	    # script string or a comment) must not receive a second copy of the CSS.
	    anchor = (
	        re.search(r'</head\s*>', html_content, re.IGNORECASE)
	        or re.search(r'<body\b', html_content, re.IGNORECASE)
	    )
	    if anchor is None:
	        return page_css + html_content

	    cut = anchor.start()
	    return html_content[:cut] + page_css + html_content[cut:]

	def render_html_preview(html_content):
	    """Return an ``<iframe>`` tag that shows the HTML via a base64 data URL."""
	    encoded = base64.b64encode(html_content.encode()).decode()
	    return (
	        f'<iframe src="data:text/html;base64,{encoded}" width="100%" height="600" '
	        'style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
	    )

	def render_pdf_preview(pdf_bytes):
	    """Return a standalone HTML page that draws the PDF with PDF.js.

	    The PDF bytes are base64-encoded into the page's script, which decodes
	    them in the browser and renders each page onto its own canvas. PDF.js
	    and its worker are loaded from the cdnjs CDN.

	    Args:
	        pdf_bytes: Raw PDF content.

	    Returns:
	        The viewer page as an HTML string.
	    """
	    encoded_pdf = base64.b64encode(pdf_bytes).decode()

	    return f'''
	<!DOCTYPE html>
	<html>
	<head>
	<style>
	body {{
	margin: 0;
	padding: 0;
	overflow: hidden;
	background: #525659;
	}}
	#pdf-container {{
	width: 100%;
	height: 100vh;
	overflow: auto;
	display: flex;
	flex-direction: column;
	align-items: center;
	padding: 20px;
	box-sizing: border-box;
	}}
	canvas {{
	box-shadow: 0 2px 8px rgba(0,0,0,0.3);
	margin-bottom: 10px;
	background: white;
	}}
	#loading {{
	color: white;
	font-family: Arial, sans-serif;
	font-size: 18px;
	padding: 20px;
	}}
	</style>
	</head>
	<body>
	<div id="pdf-container">
	<div id="loading">Loading PDF...</div>
	</div>
	<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
	<script>
	pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
	const pdfData = atob('{encoded_pdf}');
	const pdfContainer = document.getElementById('pdf-container');
	const loading = document.getElementById('loading');
	const uint8Array = new Uint8Array(pdfData.length);
	for (let i = 0; i < pdfData.length; i++) {{
	uint8Array[i] = pdfData.charCodeAt(i);
	}}
	pdfjsLib.getDocument({{data: uint8Array}}).promise.then(function(pdf) {{
	loading.style.display = 'none';
	const numPages = pdf.numPages;
	const promises = [];
	for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
	promises.push(
	pdf.getPage(pageNum).then(function(page) {{
	const scale = 1.5;
	const viewport = page.getViewport({{scale: scale}});
	const canvas = document.createElement('canvas');
	const context = canvas.getContext('2d');
	canvas.height = viewport.height;
	canvas.width = viewport.width;
	pdfContainer.appendChild(canvas);
	return page.render({{
	canvasContext: context,
	viewport: viewport
	}}).promise;
	}})
	);
	}}
	return Promise.all(promises);
	}}).catch(function(error) {{
	loading.innerHTML = '<div style="color:#ff6b6b;">Error: ' + error.message + '</div>';
	}});
	</script>
	</body>
	</html>
	'''

	def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, timeout=60):
	    """Render HTML to PDF by running the ``puppeteer_pdf.js`` Node script.

	    Steps: inject the page-break stylesheet, write the HTML to
	    ``<temp_dir>/input.html``, locate ``puppeteer_pdf.js`` near this file,
	    run it with ``node``, then read back the generated PDF. Progress is
	    written to the Streamlit page.

	    Args:
	        html_content: The HTML document as a string.
	        aspect_ratio: "16:9", "1:1" or "9:16"; passed on to the Node script.
	        temp_dir: Existing directory for the intermediate files. The caller
	            owns it and is responsible for removing it.
	        timeout: Seconds to wait for the Node process (default 60).

	    Returns:
	        ``(pdf_bytes, None)`` on success, or ``(None, error_message)`` on any
	        failure. This function does not raise for failures.
	    """
	    try:
	        # Step 1: Inject page break CSS
	        st.write("🔧 Injecting page break CSS...")
	        html_content = inject_page_breaks(html_content, aspect_ratio)

	        # Save HTML to temp file
	        html_file = os.path.join(temp_dir, "input.html")
	        with open(html_file, 'w', encoding='utf-8') as f:
	            f.write(html_content)

	        st.write(f"📝 Saved HTML: {os.path.getsize(html_file):,} bytes")

	        # Find puppeteer script: parent of this file's directory, the
	        # directory itself, then the current working directory.
	        script_dir = os.path.dirname(os.path.abspath(__file__))
	        possible_paths = [
	            os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js'),
	            os.path.join(script_dir, 'puppeteer_pdf.js'),
	            os.path.join(script_dir, '..', 'puppeteer_pdf.js'),
	            'puppeteer_pdf.js'
	        ]
	        puppeteer_script = next(
	            (path for path in possible_paths if os.path.exists(path)),
	            None,
	        )
	        if not puppeteer_script:
	            return None, "Error: puppeteer_pdf.js not found"

	        st.write(f"🔧 Using Puppeteer: {puppeteer_script}")

	        # Run conversion from the script's own directory.
	        # NOTE(review): presumably so Node resolves its node_modules there.
	        result = subprocess.run(
	            ['node', puppeteer_script, html_file, aspect_ratio],
	            capture_output=True,
	            text=True,
	            timeout=timeout,
	            cwd=os.path.dirname(os.path.abspath(puppeteer_script))
	        )

	        if result.returncode != 0:
	            return None, f"PDF conversion failed: {result.stderr}"

	        # Read PDF. Only the file extension is swapped; str.replace on the
	        # whole path would also rewrite a ".html" inside a directory name.
	        # NOTE(review): assumes the Node script writes the PDF beside the
	        # input file with a .pdf extension - confirm in puppeteer_pdf.js.
	        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
	        if not os.path.exists(pdf_file):
	            return None, "PDF file was not generated"

	        with open(pdf_file, 'rb') as f:
	            pdf_bytes = f.read()

	        st.write(f"✅ PDF generated: {len(pdf_bytes):,} bytes")
	        return pdf_bytes, None

	    except subprocess.TimeoutExpired:
	        return None, f"Error: PDF conversion timed out ({timeout} seconds)"
	    except Exception as e:
	        # Deliberate catch-all: the UI shows the message instead of crashing.
	        return None, f"Error: {str(e)}"

	# Main UI
	# Page heading followed by a short description of what the tool does.
	st.title("📄 HTML to PDF Converter")
	st.markdown("""
	Convert HTML to PDF with proper page breaks and embedded base64 images!
	✨ Each page in your HTML will be preserved as a separate PDF page.
	""")

	# Create tabs
	# Two input modes: upload an HTML file (tab1) or paste markup (tab2).
	tab1, tab2 = st.tabs(["📤 Upload HTML File", "📝 Paste HTML Code"])

	# Tab 1: Upload HTML File
	# Flow: choose an HTML file (plus optional images), pick the aspect ratio,
	# convert, then offer the PDF for download and show an inline preview.
	with tab1:
	    uploaded_file = st.file_uploader(
	        "Choose an HTML file",
	        type=['html', 'htm'],
	        key="file_uploader",
	        help="Upload an HTML file"
	    )

	    uploaded_images = st.file_uploader(
	        "📷 Upload Images",
	        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
	        key="image_uploader",
	        help="Upload images - they will be embedded as base64 in the HTML",
	        accept_multiple_files=True
	    )

	    if uploaded_images:
	        st.success(f"✅ {len(uploaded_images)} image(s) uploaded")
	        with st.expander("View uploaded images"):
	            # Thumbnails wrap across at most four columns.
	            cols = st.columns(min(len(uploaded_images), 4))
	            for idx, img in enumerate(uploaded_images):
	                with cols[idx % 4]:
	                    st.image(img, caption=img.name, use_container_width=True)

	    if uploaded_file:
	        st.success(f"✅ File: {uploaded_file.name}")

	        # Prefer UTF-8; latin-1 maps every byte, so the fallback cannot fail.
	        uploaded_file.seek(0)
	        try:
	            html_content = uploaded_file.getvalue().decode('utf-8')
	        except UnicodeDecodeError:
	            uploaded_file.seek(0)
	            html_content = uploaded_file.getvalue().decode('latin-1')

	        detected_ratio = detect_aspect_ratio(html_content)
	        ratio_options = ["16:9", "1:1", "9:16"]

	        col1, col2 = st.columns([1, 1])

	        with col1:
	            st.subheader("⚙️ Settings")
	            auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_file")

	            if auto_detect:
	                aspect_ratio = detected_ratio
	                st.info(f"🔍 Detected: {detected_ratio}")
	            else:
	                # Pre-select the detected value so opting out of auto-detect
	                # starts from the same choice.
	                aspect_ratio = st.radio(
	                    "Aspect Ratio",
	                    options=ratio_options,
	                    index=ratio_options.index(detected_ratio),
	                    key="aspect_file"
	                )

	            convert_btn = st.button("🔄 Convert to PDF", key="conv_file", type="primary", use_container_width=True)

	        with col2:
	            st.subheader("👁️ Preview")
	            with st.expander("Show HTML"):
	                st.components.v1.html(render_html_preview(html_content), height=400, scrolling=True)

	        if convert_btn:
	            temp_dir = None
	            try:
	                with st.spinner("Converting..."):
	                    temp_dir = tempfile.mkdtemp()

	                    # Embed images as base64
	                    processed_html = html_content
	                    if uploaded_images:
	                        with st.expander("🖼️ Image Processing", expanded=True):
	                            processed_html, replacements = embed_images_as_base64(html_content, uploaded_images)

	                        if not replacements:
	                            st.warning("⚠️ Images uploaded but no matches found in HTML!")
	                            st.write("Tip: Make sure image filenames in HTML match uploaded files exactly")

	                    # Convert to PDF
	                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir)

	                    if error:
	                        st.error(f"❌ {error}")
	                    else:
	                        st.success("✅ PDF generated with proper page breaks!")

	                        # Swap only the real extension. Chained str.replace
	                        # also rewrote ".html"/".htm" in the middle of a name
	                        # and missed upper-case extensions such as ".HTML".
	                        output_name = os.path.splitext(uploaded_file.name)[0] + '.pdf'

	                        col_a, col_b = st.columns(2)
	                        with col_a:
	                            st.download_button(
	                                "⬇️ Download PDF",
	                                data=pdf_bytes,
	                                file_name=output_name,
	                                mime="application/pdf",
	                                use_container_width=True
	                            )
	                        with col_b:
	                            st.info(f"Size: {len(pdf_bytes):,} bytes")

	                        st.subheader("📄 PDF Preview")
	                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
	            except Exception as e:
	                # Top-level UI boundary: report the problem on the page.
	                st.error(f"❌ Error: {str(e)}")
	            finally:
	                # Always remove the scratch directory, even after a failure.
	                if temp_dir and os.path.exists(temp_dir):
	                    shutil.rmtree(temp_dir, ignore_errors=True)

	# Tab 2: Paste HTML
	# Same pipeline as the upload tab, but the markup comes from a text area
	# pre-filled with a three-page sample document.
	with tab2:
	    html_code = st.text_area(
	        "HTML Content",
	        value="""<!DOCTYPE html>
	<html>
	<head>
	<style>
	body {
	font-family: Arial;
	margin: 0;
	padding: 0;
	}
	.page {
	width: 100%;
	height: 100vh;
	display: flex;
	align-items: center;
	justify-content: center;
	box-sizing: border-box;
	padding: 40px;
	}
	.page:nth-child(1) {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
	color: white;
	}
	.page:nth-child(2) {
	background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
	color: white;
	}
	.page:nth-child(3) {
	background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
	color: white;
	}
	h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
	</style>
	</head>
	<body>
	<div class="page">
	<h1>Page 1: Hello PDF! 🌍</h1>
	</div>

	<div class="page">
	<h1>Page 2: Separate Page! 📄</h1>
	</div>

	<div class="page">
	<h1>Page 3: Final Page! ✨</h1>
	</div>
	</body>
	</html>""",
	        height=400,
	        key="html_code"
	    )

	    uploaded_images_text = st.file_uploader(
	        "📷 Upload Images",
	        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
	        key="image_text",
	        help="Upload images to embed in your HTML",
	        accept_multiple_files=True
	    )

	    if uploaded_images_text:
	        image_count = len(uploaded_images_text)
	        st.success(f"✅ {image_count} image(s) uploaded")
	        with st.expander("View images"):
	            # Thumbnails wrap across at most four columns.
	            grid = st.columns(min(image_count, 4))
	            for position, picture in enumerate(uploaded_images_text):
	                with grid[position % 4]:
	                    st.image(picture, caption=picture.name, use_container_width=True)

	    if html_code.strip():
	        detected_ratio_text = detect_aspect_ratio(html_code)
	        auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_text")

	        if not auto_detect_text:
	            # Manual choice, pre-selected to the detected value.
	            ratio_choices = ["16:9", "1:1", "9:16"]
	            aspect_ratio_text = st.radio(
	                "Aspect Ratio",
	                options=ratio_choices,
	                index=ratio_choices.index(detected_ratio_text),
	                key="aspect_text"
	            )
	        else:
	            aspect_ratio_text = detected_ratio_text
	            st.info(f"🔍 Detected: {detected_ratio_text}")

	        convert_text_btn = st.button("🔄 Convert", key="conv_text", type="primary", use_container_width=True)

	        if convert_text_btn:
	            temp_dir = None
	            try:
	                with st.spinner("Converting..."):
	                    temp_dir = tempfile.mkdtemp()

	                    # Inline any uploaded images before rendering.
	                    processed_html = html_code
	                    if uploaded_images_text:
	                        with st.expander("🖼️ Image Processing", expanded=True):
	                            processed_html, replacements = embed_images_as_base64(html_code, uploaded_images_text)

	                        if not replacements:
	                            st.warning("⚠️ Images uploaded but no matches found!")

	                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)

	                    if not error:
	                        st.success("✅ PDF generated with proper page breaks!")

	                        download_col, size_col = st.columns(2)
	                        with download_col:
	                            st.download_button(
	                                "⬇️ Download PDF",
	                                data=pdf_bytes,
	                                file_name="converted.pdf",
	                                mime="application/pdf",
	                                use_container_width=True
	                            )
	                        with size_col:
	                            st.info(f"Size: {len(pdf_bytes):,} bytes")

	                        st.subheader("📄 PDF Preview")
	                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
	                    else:
	                        st.error(f"❌ {error}")
	            except Exception as exc:
	                # Top-level UI boundary: report the problem on the page.
	                st.error(f"❌ Error: {str(exc)}")
	            finally:
	                # Always remove the scratch directory, even after a failure.
	                if temp_dir and os.path.exists(temp_dir):
	                    shutil.rmtree(temp_dir, ignore_errors=True)

	# Footer
	# A horizontal rule, then static help text rendered as Markdown: how page
	# containers and the manual page-break classes are treated, how uploaded
	# images are matched by file name, and a minimal example document.
	st.markdown("---")
	st.markdown("""
	### 💡 How Page Breaks Work:

	Automatic Page Detection:
	- Elements with class `page`, `slide`, or `section.page` are treated as separate pages
	- Each page automatically gets `page-break-after: always` CSS
	- Last page won't have a trailing break

	HTML Structure for Multiple Pages:
	```html
	<div class="page">Page 1 content</div>
	<div class="page">Page 2 content</div>
	<div class="page">Page 3 content</div>
	```

	Manual Page Breaks:
	- Add class `page-break` to force a break after an element
	- Add class `page-break-before` to force a break before an element
	- Add class `no-page-break` to prevent breaks inside an element

	Image Embedding:
	- Images are converted to base64 and embedded directly in HTML
	- Ensures images always appear in the PDF
	- Filename in HTML must match uploaded file exactly

	### 📝 Example HTML:
	```html
	<!DOCTYPE html>
	<html>
	<body>
	<div class="page">
	<h1>First Page</h1>
	<img src="logo.png" alt="Logo">
	</div>

	<div class="page">
	<h1>Second Page</h1>
	<p>Content here...</p>
	</div>
	</body>
	</html>
	```
	Then upload a file named: `logo.png`
	""")