Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 16, 2025

Commit

9f4d1a1

verified ·

1 Parent(s): a75900b

Upload streamlit_app (1).py

Browse files

Files changed (1) hide show

src/streamlit_app (1).py +695 -0

src/streamlit_app (1).py ADDED Viewed

	@@ -0,0 +1,695 @@

+"""
+Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
+Save this file as: src/streamlit_app.py
+"""
+import streamlit as st
+import subprocess
+import os
+import tempfile
+import shutil
+from pathlib import Path
+import base64
+import re
+import mimetypes
+# Page-level Streamlit settings: page title, page icon, and the wide layout.
+st.set_page_config(
+    page_title="HTML to PDF Converter",
+    page_icon="📄",
+    layout="wide"
+)
+def detect_aspect_ratio(html_content):
+    """Detect aspect ratio from HTML content"""
+    viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
+    if viewport_match:
+        viewport = viewport_match.group(1).lower()
+        if 'orientation=portrait' in viewport:
+            return "9:16"
+        elif 'orientation=landscape' in viewport:
+            return "16:9"
+    aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
+    if aspect_match:
+        width = int(aspect_match.group(1))
+        height = int(aspect_match.group(2))
+        ratio = width / height
+        if ratio > 1.5:
+            return "16:9"
+        elif ratio < 0.7:
+            return "9:16"
+        else:
+            return "1:1"
+    if any(keyword in html_content.lower() for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']):
+        return "16:9"
+    return "9:16"
+def image_to_base64(image_file):
+    """Convert uploaded image to base64 data URL"""
+    try:
+        image_bytes = image_file.getvalue()
+        mime_type, _ = mimetypes.guess_type(image_file.name)
+        if not mime_type:
+            ext = os.path.splitext(image_file.name)[1].lower()
+            mime_map = {
+                '.jpg': 'image/jpeg',
+                '.jpeg': 'image/jpeg',
+                '.png': 'image/png',
+                '.gif': 'image/gif',
+                '.svg': 'image/svg+xml',
+                '.webp': 'image/webp',
+                '.bmp': 'image/bmp'
+            }
+            mime_type = mime_map.get(ext, 'image/png')
+        b64_data = base64.b64encode(image_bytes).decode('utf-8')
+        data_url = f"data:{mime_type};base64,{b64_data}"
+        return data_url
+    except Exception as e:
+        st.error(f"Error converting {image_file.name} to base64: {str(e)}")
+        return None
+def embed_images_as_base64(html_content, uploaded_images):
+    """Embed all images directly as base64 data URLs in the HTML"""
+    if not uploaded_images:
+        return html_content, {}
+    image_data_urls = {}
+    for img in uploaded_images:
+        data_url = image_to_base64(img)
+        if data_url:
+            image_data_urls[img.name] = data_url
+            st.write(f"✓ Converted {img.name} to base64 ({len(data_url)} chars)")
+    if not image_data_urls:
+        return html_content, {}
+    replacements = {}
+    for filename, data_url in image_data_urls.items():
+        escaped_name = re.escape(filename)
+        # Pattern 1: img src attribute
+        pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
+        matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
+        count1 = len(matches1)
+        if matches1:
+            html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
+            replacements[f"{filename} (img src)"] = count1
+        # Pattern 2: background-image
+        pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
+        matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
+        count2 = len(matches2)
+        if matches2:
+            html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
+            replacements[f"{filename} (bg-image)"] = count2
+        # Pattern 3: CSS url()
+        pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
+        matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
+        count3 = len(matches3)
+        if matches3:
+            html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
+            replacements[f"{filename} (url)"] = count3
+    if replacements:
+        st.success("✅ Image Replacements:")
+        for key, count in replacements.items():
+            st.write(f"  • {key}: {count} replacement(s)")
+    else:
+        st.warning("⚠️ No image references found in HTML matching uploaded files!")
+        st.write("Uploaded files:", [img.name for img in uploaded_images])
+        with st.expander("🔍 Debug: Show HTML image references"):
+            img_lines = [line for line in html_content.split('\n')
+                        if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
+            if img_lines:
+                for line in img_lines[:10]:
+                    st.code(line.strip(), language='html')
+            else:
+                st.write("No image-related lines found in HTML")
+    return html_content, replacements
+def inject_page_breaks(html_content: str, aspect_ratio: str):
+    """Automatically inject page breaks and page sizing CSS"""
+    # Determine page orientation
+    if aspect_ratio == "16:9":
+        page_size = "A4 landscape"
+        orientation = "landscape"
+    elif aspect_ratio == "1:1":
+        page_size = "210mm 210mm"
+        orientation = "portrait"
+    else:  # 9:16
+        page_size = "A4 portrait"
+        orientation = "portrait"
+    # Comprehensive page break CSS
+    page_css = f"""
+    <style id="auto-page-breaks">
+        /* Define page size */
+        @page {{
+            size: {page_size};
+            margin: 0;
+        }}
+        /* Reset body */
+        html, body {{
+            margin: 0 !important;
+            padding: 0 !important;
+            width: 100% !important;
+            height: 100% !important;
+        }}
+        /* Page containers - each should be one page */
+        .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
+            width: 100% !important;
+            min-height: 100vh !important;
+            height: 100vh !important;
+            page-break-after: always !important;
+            break-after: page !important;
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+            position: relative !important;
+            box-sizing: border-box !important;
+            overflow: hidden !important;
+        }}
+        /* Last page shouldn't force a break */
+        .page:last-child, .slide:last-child,
+        section.page:last-child, article.page:last-child {{
+            page-break-after: auto !important;
+            break-after: auto !important;
+        }}
+        /* If no explicit page class, treat direct body children as pages */
+        body > section:not(.no-page-break),
+        body > article:not(.no-page-break),
+        body > div:not(.no-page-break) {{
+            page-break-after: always !important;
+            break-after: page !important;
+            min-height: 100vh;
+        }}
+        body > section:last-child,
+        body > article:last-child,
+        body > div:last-child {{
+            page-break-after: auto !important;
+        }}
+        /* Utility classes for manual control */
+        .page-break, .page-break-after {{
+            page-break-after: always !important;
+            break-after: page !important;
+        }}
+        .page-break-before {{
+            page-break-before: always !important;
+            break-before: page !important;
+        }}
+        .no-page-break, .keep-together {{
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        /* Prevent awkward breaks in content */
+        h1, h2, h3, h4, h5, h6 {{
+            page-break-after: avoid !important;
+            break-after: avoid !important;
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        img, figure, table, pre, blockquote {{
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        /* Preserve colors and backgrounds */
+        * {{
+            -webkit-print-color-adjust: exact !important;
+            print-color-adjust: exact !important;
+            color-adjust: exact !important;
+        }}
+    </style>
+    """
+    # Inject CSS into HTML
+    if '</head>' in html_content:
+        html_content = html_content.replace('</head>', page_css + '</head>')
+    elif '<body' in html_content:
+        html_content = html_content.replace('<body', page_css + '<body', 1)
+    else:
+        html_content = page_css + html_content
+    return html_content
+def render_html_preview(html_content):
+    """Render HTML preview in an iframe"""
+    b64 = base64.b64encode(html_content.encode()).decode()
+    iframe_html = f'<iframe src="data:text/html;base64,{b64}" width="100%" height="600" style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
+    return iframe_html
+def render_pdf_preview(pdf_bytes):
+    """Render PDF preview using embedded PDF.js"""
+    b64 = base64.b64encode(pdf_bytes).decode()
+    pdf_viewer_html = f'''
+    <!DOCTYPE html>
+    <html>
+    <head>
+        <style>
+            body {{
+                margin: 0;
+                padding: 0;
+                overflow: hidden;
+                background: #525659;
+            }}
+            #pdf-container {{
+                width: 100%;
+                height: 100vh;
+                overflow: auto;
+                display: flex;
+                flex-direction: column;
+                align-items: center;
+                padding: 20px;
+                box-sizing: border-box;
+            }}
+            canvas {{
+                box-shadow: 0 2px 8px rgba(0,0,0,0.3);
+                margin-bottom: 10px;
+                background: white;
+            }}
+            #loading {{
+                color: white;
+                font-family: Arial, sans-serif;
+                font-size: 18px;
+                padding: 20px;
+            }}
+        </style>
+    </head>
+    <body>
+        <div id="pdf-container">
+            <div id="loading">Loading PDF...</div>
+        </div>
+        <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
+        <script>
+            pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
+            const pdfData = atob('{b64}');
+            const pdfContainer = document.getElementById('pdf-container');
+            const loading = document.getElementById('loading');
+            const uint8Array = new Uint8Array(pdfData.length);
+            for (let i = 0; i < pdfData.length; i++) {{
+                uint8Array[i] = pdfData.charCodeAt(i);
+            }}
+            pdfjsLib.getDocument({{data: uint8Array}}).promise.then(function(pdf) {{
+                loading.style.display = 'none';
+                const numPages = pdf.numPages;
+                const promises = [];
+                for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
+                    promises.push(
+                        pdf.getPage(pageNum).then(function(page) {{
+                            const scale = 1.5;
+                            const viewport = page.getViewport({{scale: scale}});
+                            const canvas = document.createElement('canvas');
+                            const context = canvas.getContext('2d');
+                            canvas.height = viewport.height;
+                            canvas.width = viewport.width;
+                            pdfContainer.appendChild(canvas);
+                            return page.render({{
+                                canvasContext: context,
+                                viewport: viewport
+                            }}).promise;
+                        }})
+                    );
+                }}
+                return Promise.all(promises);
+            }}).catch(function(error) {{
+                loading.innerHTML = '<div style="color:#ff6b6b;">Error: ' + error.message + '</div>';
+            }});
+        </script>
+    </body>
+    </html>
+    '''
+    return pdf_viewer_html
+def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
+    """Convert HTML content to PDF using Puppeteer with proper page breaks"""
+    try:
+        # Step 1: Inject page break CSS
+        st.write("🔧 Injecting page break CSS...")
+        html_content = inject_page_breaks(html_content, aspect_ratio)
+        # Save HTML to temp file
+        html_file = os.path.join(temp_dir, "input.html")
+        with open(html_file, 'w', encoding='utf-8') as f:
+            f.write(html_content)
+        st.write(f"📝 Saved HTML: {os.path.getsize(html_file):,} bytes")
+        # Find puppeteer script
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        possible_paths = [
+            os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js'),
+            os.path.join(script_dir, 'puppeteer_pdf.js'),
+            os.path.join(script_dir, '..', 'puppeteer_pdf.js'),
+            'puppeteer_pdf.js'
+        ]
+        puppeteer_script = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                puppeteer_script = path
+                break
+        if not puppeteer_script:
+            return None, "Error: puppeteer_pdf.js not found"
+        st.write(f"🔧 Using Puppeteer: {puppeteer_script}")
+        # Run conversion
+        result = subprocess.run(
+            ['node', puppeteer_script, html_file, aspect_ratio],
+            capture_output=True,
+            text=True,
+            timeout=60,
+            cwd=os.path.dirname(os.path.abspath(puppeteer_script))
+        )
+        if result.returncode != 0:
+            return None, f"PDF conversion failed: {result.stderr}"
+        # Read PDF
+        pdf_file = html_file.replace('.html', '.pdf')
+        if not os.path.exists(pdf_file):
+            return None, "PDF file was not generated"
+        with open(pdf_file, 'rb') as f:
+            pdf_bytes = f.read()
+        st.write(f"✅ PDF generated: {len(pdf_bytes):,} bytes")
+        return pdf_bytes, None
+    except subprocess.TimeoutExpired:
+        return None, "Error: PDF conversion timed out (60 seconds)"
+    except Exception as e:
+        return None, f"Error: {str(e)}"
+# Main UI: page title, short introduction, and the two input tabs.
+st.title("📄 HTML to PDF Converter")
+st.markdown("""
+Convert HTML to PDF with **proper page breaks** and **embedded base64 images**!
+✨ Each page in your HTML will be preserved as a separate PDF page.
+""")
+# Create tabs: tab1 takes an uploaded HTML file, tab2 takes pasted HTML.
+tab1, tab2 = st.tabs(["📤 Upload HTML File", "📝 Paste HTML Code"])
+# Tab 1: Upload HTML File
+# Flow: pick an HTML file (plus optional images), choose the aspect ratio,
+# then convert and offer the PDF for download with an inline preview.
+with tab1:
+    uploaded_file = st.file_uploader(
+        "Choose an HTML file",
+        type=['html', 'htm'],
+        key="file_uploader",
+        help="Upload an HTML file"
+    )
+    uploaded_images = st.file_uploader(
+        "📷 Upload Images",
+        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
+        key="image_uploader",
+        help="Upload images - they will be embedded as base64 in the HTML",
+        accept_multiple_files=True
+    )
+    if uploaded_images:
+        st.success(f"✅ {len(uploaded_images)} image(s) uploaded")
+        with st.expander("View uploaded images"):
+            # At most four columns; images beyond the fourth wrap around.
+            cols = st.columns(min(len(uploaded_images), 4))
+            for idx, img in enumerate(uploaded_images):
+                with cols[idx % 4]:
+                    st.image(img, caption=img.name, use_container_width=True)
+    if uploaded_file:
+        st.success(f"✅ File: {uploaded_file.name}")
+        uploaded_file.seek(0)
+        # Decode as UTF-8 first; fall back to latin-1 when that fails.
+        try:
+            html_content = uploaded_file.getvalue().decode('utf-8')
+        except UnicodeDecodeError:
+            uploaded_file.seek(0)
+            html_content = uploaded_file.getvalue().decode('latin-1')
+        detected_ratio = detect_aspect_ratio(html_content)
+        col1, col2 = st.columns([1, 1])
+        with col1:
+            st.subheader("⚙️ Settings")
+            auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_file")
+            if auto_detect:
+                aspect_ratio = detected_ratio
+                st.info(f"🔍 Detected: **{detected_ratio}**")
+            else:
+                # Manual choice, pre-selected to the detected ratio.
+                aspect_ratio = st.radio(
+                    "Aspect Ratio",
+                    options=["16:9", "1:1", "9:16"],
+                    index=["16:9", "1:1", "9:16"].index(detected_ratio),
+                    key="aspect_file"
+                )
+            convert_btn = st.button("🔄 Convert to PDF", key="conv_file", type="primary", use_container_width=True)
+        with col2:
+            st.subheader("👁️ Preview")
+            with st.expander("Show HTML"):
+                st.components.v1.html(render_html_preview(html_content), height=400, scrolling=True)
+        if convert_btn:
+            # temp_dir starts as None so the finally block can tell whether
+            # a directory was actually created.
+            temp_dir = None
+            try:
+                with st.spinner("Converting..."):
+                    temp_dir = tempfile.mkdtemp()
+                    # Embed images as base64
+                    processed_html = html_content
+                    if uploaded_images:
+                        with st.expander("🖼️ Image Processing", expanded=True):
+                            processed_html, replacements = embed_images_as_base64(html_content, uploaded_images)
+                            if not replacements:
+                                st.warning("⚠️ Images uploaded but no matches found in HTML!")
+                                st.write("**Tip:** Make sure image filenames in HTML match uploaded files exactly")
+                    # Convert to PDF
+                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir)
+                    if error:
+                        st.error(f"❌ {error}")
+                    else:
+                        st.success("✅ PDF generated with proper page breaks!")
+                        # Derive the download name from the uploaded name; append
+                        # ".pdf" when neither replacement produced that suffix.
+                        output_name = uploaded_file.name.replace('.html', '.pdf').replace('.htm', '.pdf')
+                        if not output_name.endswith('.pdf'):
+                            output_name += '.pdf'
+                        col_a, col_b = st.columns(2)
+                        with col_a:
+                            st.download_button(
+                                "⬇️ Download PDF",
+                                data=pdf_bytes,
+                                file_name=output_name,
+                                mime="application/pdf",
+                                use_container_width=True
+                            )
+                        with col_b:
+                            st.info(f"Size: {len(pdf_bytes):,} bytes")
+                        st.subheader("📄 PDF Preview")
+                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
+            except Exception as e:
+                st.error(f"❌ Error: {str(e)}")
+            finally:
+                # Always remove the temporary working directory.
+                if temp_dir and os.path.exists(temp_dir):
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+# Tab 2: Paste HTML
+# Same flow as the upload tab, but the HTML comes from a text area that is
+# pre-filled with a three-page sample document.
+with tab2:
+    html_code = st.text_area(
+        "HTML Content",
+        value="""<!DOCTYPE html>
+<html>
+<head>
+    <style>
+        body {
+            font-family: Arial;
+            margin: 0;
+            padding: 0;
+        }
+        .page {
+            width: 100%;
+            height: 100vh;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            box-sizing: border-box;
+            padding: 40px;
+        }
+        .page:nth-child(1) {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+        }
+        .page:nth-child(2) {
+            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+            color: white;
+        }
+        .page:nth-child(3) {
+            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
+            color: white;
+        }
+        h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
+    </style>
+</head>
+<body>
+    <div class="page">
+        <h1>Page 1: Hello PDF! 🌍</h1>
+    </div>
+    <div class="page">
+        <h1>Page 2: Separate Page! 📄</h1>
+    </div>
+    <div class="page">
+        <h1>Page 3: Final Page! ✨</h1>
+    </div>
+</body>
+</html>""",
+        height=400,
+        key="html_code"
+    )
+    uploaded_images_text = st.file_uploader(
+        "📷 Upload Images",
+        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
+        key="image_text",
+        help="Upload images to embed in your HTML",
+        accept_multiple_files=True
+    )
+    if uploaded_images_text:
+        st.success(f"✅ {len(uploaded_images_text)} image(s) uploaded")
+        with st.expander("View images"):
+            # At most four columns; images beyond the fourth wrap around.
+            cols = st.columns(min(len(uploaded_images_text), 4))
+            for idx, img in enumerate(uploaded_images_text):
+                with cols[idx % 4]:
+                    st.image(img, caption=img.name, use_container_width=True)
+    # Settings and the convert button are shown only for non-blank input.
+    if html_code.strip():
+        detected_ratio_text = detect_aspect_ratio(html_code)
+        auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_text")
+        if auto_detect_text:
+            aspect_ratio_text = detected_ratio_text
+            st.info(f"🔍 Detected: **{detected_ratio_text}**")
+        else:
+            # Manual choice, pre-selected to the detected ratio.
+            aspect_ratio_text = st.radio(
+                "Aspect Ratio",
+                options=["16:9", "1:1", "9:16"],
+                index=["16:9", "1:1", "9:16"].index(detected_ratio_text),
+                key="aspect_text"
+            )
+        convert_text_btn = st.button("🔄 Convert", key="conv_text", type="primary", use_container_width=True)
+        if convert_text_btn:
+            # temp_dir starts as None so the finally block can tell whether
+            # a directory was actually created.
+            temp_dir = None
+            try:
+                with st.spinner("Converting..."):
+                    temp_dir = tempfile.mkdtemp()
+                    processed_html = html_code
+                    if uploaded_images_text:
+                        with st.expander("🖼️ Image Processing", expanded=True):
+                            processed_html, replacements = embed_images_as_base64(html_code, uploaded_images_text)
+                            if not replacements:
+                                st.warning("⚠️ Images uploaded but no matches found!")
+                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)
+                    if error:
+                        st.error(f"❌ {error}")
+                    else:
+                        st.success("✅ PDF generated with proper page breaks!")
+                        col_a, col_b = st.columns(2)
+                        with col_a:
+                            # Pasted HTML has no source filename, so a fixed name is used.
+                            st.download_button(
+                                "⬇️ Download PDF",
+                                data=pdf_bytes,
+                                file_name="converted.pdf",
+                                mime="application/pdf",
+                                use_container_width=True
+                            )
+                        with col_b:
+                            st.info(f"Size: {len(pdf_bytes):,} bytes")
+                        st.subheader("📄 PDF Preview")
+                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
+            except Exception as e:
+                st.error(f"❌ Error: {str(e)}")
+            finally:
+                # Always remove the temporary working directory.
+                if temp_dir and os.path.exists(temp_dir):
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+# Footer: a horizontal rule followed by static help text describing the page
+# classes, manual page-break utility classes, and image filename matching.
+st.markdown("---")
+st.markdown("""
+### 💡 How Page Breaks Work:
+**Automatic Page Detection:**
+- Elements with class `page`, `slide`, or `section.page` are treated as separate pages
+- Each page automatically gets `page-break-after: always` CSS
+- Last page won't have a trailing break
+**HTML Structure for Multiple Pages:**
+```html
+<div class="page">Page 1 content</div>
+<div class="page">Page 2 content</div>
+<div class="page">Page 3 content</div>
+```
+**Manual Page Breaks:**
+- Add class `page-break` to force a break after an element
+- Add class `page-break-before` to force a break before an element
+- Add class `no-page-break` to prevent breaks inside an element
+**Image Embedding:**
+- Images are converted to base64 and embedded directly in HTML
+- Ensures images always appear in the PDF
+- Filename in HTML must match uploaded file exactly
+### 📝 Example HTML:
+```html
+<!DOCTYPE html>
+<html>
+<body>
+    <div class="page">
+        <h1>First Page</h1>
+        <img src="logo.png" alt="Logo">
+    </div>
+    <div class="page">
+        <h1>Second Page</h1>
+        <p>Content here...</p>
+    </div>
+</body>
+</html>
+```
+Then upload a file named: `logo.png`
+""")