htmlpdfs

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 17, 2025

Commit

73d201c

verified ·

1 Parent(s): 8635750

Delete src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +0 -695

src/streamlit_app.py DELETED Viewed

@@ -1,695 +0,0 @@
-"""
-Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
-Save this file as: src/streamlit_app.py
-"""
-import streamlit as st
-import subprocess
-import os
-import tempfile
-import shutil
-from pathlib import Path
-import base64
-import re
-import mimetypes
-# Configure the browser tab title and icon, and use the wide page layout.
-st.set_page_config(
-    page_title="HTML to PDF Converter",
-    page_icon="📄",
-    layout="wide"
-)
-def detect_aspect_ratio(html_content):
-    """Guess the page aspect ratio implied by an HTML document.
-
-    Heuristics are tried in order and the first one that matches wins:
-
-    1. A ``<meta ... viewport ... content="...">`` tag whose content mentions
-       ``orientation=portrait`` or ``orientation=landscape``.
-    2. A CSS ``aspect-ratio: W / H`` declaration with integer operands
-       (wider than 1.5 is landscape, narrower than 0.7 is portrait,
-       anything in between is square).
-    3. Presentation keywords (``reveal.js``, ``impress.js``, ``slide``,
-       ``presentation``) anywhere in the document.
-
-    Args:
-        html_content: Raw HTML source as a string.
-
-    Returns:
-        One of ``"16:9"``, ``"1:1"`` or ``"9:16"``. ``"9:16"`` is the
-        fallback when no heuristic matches.
-    """
-    viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
-    if viewport_match:
-        viewport = viewport_match.group(1).lower()
-        if 'orientation=portrait' in viewport:
-            return "9:16"
-        if 'orientation=landscape' in viewport:
-            return "16:9"
-    aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
-    if aspect_match:
-        width = int(aspect_match.group(1))
-        height = int(aspect_match.group(2))
-        # "aspect-ratio: 16 / 0" is degenerate CSS; ignore it and fall through
-        # to the remaining heuristics instead of raising ZeroDivisionError.
-        if height:
-            ratio = width / height
-            if ratio > 1.5:
-                return "16:9"
-            if ratio < 0.7:
-                return "9:16"
-            return "1:1"
-    # Lower-case once rather than once per keyword.
-    lowered = html_content.lower()
-    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
-        return "16:9"
-    return "9:16"
-def image_to_base64(image_file):
-    """Encode an uploaded image as a ``data:<mime>;base64,...`` URL.
-
-    The MIME type is guessed from the file name with ``mimetypes``. If that
-    yields nothing, a small extension table is consulted, and ``image/png``
-    is used as the last resort.
-
-    Args:
-        image_file: Uploaded file object exposing ``name`` and ``getvalue()``.
-
-    Returns:
-        The data URL string, or ``None`` if encoding failed (the error is
-        shown to the user through ``st.error``).
-    """
-    fallback_mime_types = {
-        '.jpg': 'image/jpeg',
-        '.jpeg': 'image/jpeg',
-        '.png': 'image/png',
-        '.gif': 'image/gif',
-        '.svg': 'image/svg+xml',
-        '.webp': 'image/webp',
-        '.bmp': 'image/bmp'
-    }
-    try:
-        raw = image_file.getvalue()
-        guessed, _encoding = mimetypes.guess_type(image_file.name)
-        if guessed:
-            content_type = guessed
-        else:
-            # mimetypes knows nothing about this name; go by extension.
-            suffix = os.path.splitext(image_file.name)[1].lower()
-            content_type = fallback_mime_types.get(suffix, 'image/png')
-        payload = base64.b64encode(raw).decode('utf-8')
-        return f"data:{content_type};base64,{payload}"
-    except Exception as e:
-        st.error(f"Error converting {image_file.name} to base64: {str(e)}")
-        return None
-def embed_images_as_base64(html_content, uploaded_images):
-    """Replace references to uploaded images with inline base64 data URLs.
-
-    Each uploaded file is encoded with ``image_to_base64``. Every reference
-    to its file name (optionally preceded by a directory path, matched
-    case-insensitively) is then rewritten in three places: ``<img src=...>``
-    attributes, ``background-image: url(...)`` declarations and any other
-    CSS ``url(...)``. Progress and a summary are reported through Streamlit.
-
-    Args:
-        html_content: HTML source to rewrite.
-        uploaded_images: Sequence of uploaded file objects exposing ``name``
-            and ``getvalue()``; may be empty or ``None``.
-
-    Returns:
-        Tuple ``(html, replacements)``. ``replacements`` maps a label such
-        as ``"logo.png (img src)"`` to the number of references rewritten and
-        is empty when nothing was uploaded, encoded or matched.
-    """
-    if not uploaded_images:
-        return html_content, {}
-    image_data_urls = {}
-    for img in uploaded_images:
-        data_url = image_to_base64(img)
-        if data_url:
-            image_data_urls[img.name] = data_url
-            st.write(f"✓ Converted {img.name} to base64 ({len(data_url)} chars)")
-    if not image_data_urls:
-        return html_content, {}
-    replacements = {}
-    for filename, data_url in image_data_urls.items():
-        escaped_name = re.escape(filename)
-        # (label, pattern, flags). In every pattern group 1 is the text before
-        # the reference and group 2 is the quote character the author used
-        # (possibly empty for CSS); the CSS patterns add group 3 for ")".
-        rules = (
-            ("img src",
-             rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
-             re.IGNORECASE | re.DOTALL),
-            ("bg-image",
-             rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
-             re.IGNORECASE),
-            ("url",
-             rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
-             re.IGNORECASE),
-        )
-
-        def _with_data_url(match, data_url=data_url):
-            # Re-use the original quote character. Forcing double quotes here
-            # would terminate an enclosing style="..." attribute early and
-            # corrupt the markup.
-            quote = match.group(2)
-            tail = match.group(3) if match.re.groups > 2 else ''
-            return f"{match.group(1)}{quote}{data_url}{quote}{tail}"
-
-        for label, pattern, flags in rules:
-            # A callable replacement keeps the data URL from being parsed as a
-            # regex template, and subn() counts and replaces in a single pass.
-            html_content, count = re.subn(pattern, _with_data_url, html_content, flags=flags)
-            if count:
-                replacements[f"{filename} ({label})"] = count
-    if replacements:
-        st.success("✅ Image Replacements:")
-        for key, count in replacements.items():
-            st.write(f"  • {key}: {count} replacement(s)")
-    else:
-        st.warning("⚠️ No image references found in HTML matching uploaded files!")
-        st.write("Uploaded files:", [img.name for img in uploaded_images])
-        with st.expander("🔍 Debug: Show HTML image references"):
-            # Show at most ten candidate lines so the user can compare names.
-            img_lines = [line for line in html_content.split('\n')
-                        if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
-            if img_lines:
-                for line in img_lines[:10]:
-                    st.code(line.strip(), language='html')
-            else:
-                st.write("No image-related lines found in HTML")
-    return html_content, replacements
-def inject_page_breaks(html_content: str, aspect_ratio: str) -> str:
-    """Insert a ``<style>`` block that sets the page size and page breaks.
-
-    The page size follows ``aspect_ratio``: ``"16:9"`` gives A4 landscape,
-    ``"1:1"`` gives a 210mm square and anything else gives A4 portrait. The
-    stylesheet makes ``.page``/``.slide`` containers (and, failing that,
-    direct ``section``/``article``/``div`` children of ``<body>``) each fill
-    one page, adds utility classes for manual breaks, discourages breaks
-    inside headings and media, and asks the renderer to keep background
-    colours.
-
-    Args:
-        html_content: HTML source to augment.
-        aspect_ratio: ``"16:9"``, ``"1:1"`` or ``"9:16"``.
-
-    Returns:
-        The HTML with the stylesheet placed immediately before the first
-        ``</head>``, else before the first ``<body``, else prepended to the
-        document. Tag matching is case-insensitive.
-    """
-    # Determine page size
-    if aspect_ratio == "16:9":
-        page_size = "A4 landscape"
-    elif aspect_ratio == "1:1":
-        page_size = "210mm 210mm"
-    else:  # 9:16
-        page_size = "A4 portrait"
-    # Comprehensive page break CSS
-    page_css = f"""
-    <style id="auto-page-breaks">
-        /* Define page size */
-        @page {{
-            size: {page_size};
-            margin: 0;
-        }}
-        /* Reset body */
-        html, body {{
-            margin: 0 !important;
-            padding: 0 !important;
-            width: 100% !important;
-            height: 100% !important;
-        }}
-        /* Page containers - each should be one page */
-        .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
-            width: 100% !important;
-            min-height: 100vh !important;
-            height: 100vh !important;
-            page-break-after: always !important;
-            break-after: page !important;
-            page-break-inside: avoid !important;
-            break-inside: avoid !important;
-            position: relative !important;
-            box-sizing: border-box !important;
-            overflow: hidden !important;
-        }}
-        /* Last page shouldn't force a break */
-        .page:last-child, .slide:last-child,
-        section.page:last-child, article.page:last-child {{
-            page-break-after: auto !important;
-            break-after: auto !important;
-        }}
-        /* If no explicit page class, treat direct body children as pages */
-        body > section:not(.no-page-break),
-        body > article:not(.no-page-break),
-        body > div:not(.no-page-break) {{
-            page-break-after: always !important;
-            break-after: page !important;
-            min-height: 100vh;
-        }}
-        body > section:last-child,
-        body > article:last-child,
-        body > div:last-child {{
-            page-break-after: auto !important;
-            break-after: auto !important;
-        }}
-        /* Utility classes for manual control */
-        .page-break, .page-break-after {{
-            page-break-after: always !important;
-            break-after: page !important;
-        }}
-        .page-break-before {{
-            page-break-before: always !important;
-            break-before: page !important;
-        }}
-        .no-page-break, .keep-together {{
-            page-break-inside: avoid !important;
-            break-inside: avoid !important;
-        }}
-        /* Prevent awkward breaks in content */
-        h1, h2, h3, h4, h5, h6 {{
-            page-break-after: avoid !important;
-            break-after: avoid !important;
-            page-break-inside: avoid !important;
-            break-inside: avoid !important;
-        }}
-        img, figure, table, pre, blockquote {{
-            page-break-inside: avoid !important;
-            break-inside: avoid !important;
-        }}
-        /* Preserve colors and backgrounds */
-        * {{
-            -webkit-print-color-adjust: exact !important;
-            print-color-adjust: exact !important;
-            color-adjust: exact !important;
-        }}
-    </style>
-    """
-    # Inject the CSS exactly once. Slicing at the first match (rather than
-    # str.replace on every "</head>") keeps a literal "</head>" inside a
-    # script string from receiving a second copy, and the case-insensitive
-    # search also handles "</HEAD>" / "<BODY>".
-    head_close = re.search(r'</head\s*>', html_content, re.IGNORECASE)
-    if head_close:
-        cut = head_close.start()
-        return html_content[:cut] + page_css + html_content[cut:]
-    body_open = re.search(r'<body', html_content, re.IGNORECASE)
-    if body_open:
-        cut = body_open.start()
-        return html_content[:cut] + page_css + html_content[cut:]
-    return page_css + html_content
-def render_html_preview(html_content):
-    """Return an ``<iframe>`` snippet that displays ``html_content``.
-
-    The document is base64-encoded into a ``data:text/html`` URL, so the
-    frame needs no separate file or server route.
-    """
-    encoded_page = base64.b64encode(html_content.encode()).decode()
-    return (
-        f'<iframe src="data:text/html;base64,{encoded_page}" width="100%" height="600" '
-        'style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
-    )
-def render_pdf_preview(pdf_bytes):
-    """Build a standalone HTML page that renders a PDF with PDF.js.
-
-    The PDF is embedded as base64, decoded in the browser and drawn one
-    ``<canvas>`` per page at scale 1.5. PDF.js 3.11.174 is loaded from
-    cdnjs, so the viewer needs network access in the browser.
-
-    Args:
-        pdf_bytes: Raw bytes of the PDF document.
-
-    Returns:
-        The HTML document as a string.
-    """
-    b64 = base64.b64encode(pdf_bytes).decode()
-    # Two details of the script below are deliberate:
-    # * each canvas is created and appended inside the page loop, before its
-    #   page is requested, so canvases stay in page order even if the
-    #   getPage() promises resolve out of order;
-    # * errors are written with textContent (not innerHTML) and the status
-    #   element is made visible again, because it is hidden once loading
-    #   succeeds and a later render failure would otherwise be invisible.
-    pdf_viewer_html = f'''
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <style>
-            body {{
-                margin: 0;
-                padding: 0;
-                overflow: hidden;
-                background: #525659;
-            }}
-            #pdf-container {{
-                width: 100%;
-                height: 100vh;
-                overflow: auto;
-                display: flex;
-                flex-direction: column;
-                align-items: center;
-                padding: 20px;
-                box-sizing: border-box;
-            }}
-            canvas {{
-                box-shadow: 0 2px 8px rgba(0,0,0,0.3);
-                margin-bottom: 10px;
-                background: white;
-            }}
-            #loading {{
-                color: white;
-                font-family: Arial, sans-serif;
-                font-size: 18px;
-                padding: 20px;
-            }}
-        </style>
-    </head>
-    <body>
-        <div id="pdf-container">
-            <div id="loading">Loading PDF...</div>
-        </div>
-        <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
-        <script>
-            pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
-            const pdfData = atob('{b64}');
-            const pdfContainer = document.getElementById('pdf-container');
-            const loading = document.getElementById('loading');
-            const uint8Array = new Uint8Array(pdfData.length);
-            for (let i = 0; i < pdfData.length; i++) {{
-                uint8Array[i] = pdfData.charCodeAt(i);
-            }}
-            pdfjsLib.getDocument({{data: uint8Array}}).promise.then(function(pdf) {{
-                loading.style.display = 'none';
-                const numPages = pdf.numPages;
-                const promises = [];
-                for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
-                    const canvas = document.createElement('canvas');
-                    pdfContainer.appendChild(canvas);
-                    promises.push(
-                        pdf.getPage(pageNum).then(function(page) {{
-                            const scale = 1.5;
-                            const viewport = page.getViewport({{scale: scale}});
-                            const context = canvas.getContext('2d');
-                            canvas.height = viewport.height;
-                            canvas.width = viewport.width;
-                            return page.render({{
-                                canvasContext: context,
-                                viewport: viewport
-                            }}).promise;
-                        }})
-                    );
-                }}
-                return Promise.all(promises);
-            }}).catch(function(error) {{
-                loading.style.display = 'block';
-                loading.style.color = '#ff6b6b';
-                loading.textContent = 'Error: ' + error.message;
-            }});
-        </script>
-    </body>
-    </html>
-    '''
-    return pdf_viewer_html
-def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, timeout=60):
-    """Render HTML to PDF by running the ``puppeteer_pdf.js`` Node script.
-
-    The page-break stylesheet is injected first, the result is written to
-    ``<temp_dir>/input.html`` and ``node puppeteer_pdf.js <html> <ratio>`` is
-    executed from the script's own directory. The script is expected to
-    write ``input.pdf`` next to the HTML file. Progress is reported through
-    ``st.write``.
-
-    Args:
-        html_content: HTML source to convert.
-        aspect_ratio: ``"16:9"``, ``"1:1"`` or ``"9:16"``; passed to both the
-            CSS injection and the Node script.
-        temp_dir: Existing directory used for the intermediate files.
-        timeout: Seconds to wait for the Node process (default 60).
-
-    Returns:
-        ``(pdf_bytes, None)`` on success or ``(None, error_message)`` on any
-        failure; exceptions are converted to messages, never propagated.
-    """
-    try:
-        # Step 1: Inject page break CSS
-        st.write("🔧 Injecting page break CSS...")
-        html_content = inject_page_breaks(html_content, aspect_ratio)
-        # Save HTML to temp file
-        html_file = os.path.join(temp_dir, "input.html")
-        with open(html_file, 'w', encoding='utf-8') as f:
-            f.write(html_content)
-        st.write(f"📝 Saved HTML: {os.path.getsize(html_file):,} bytes")
-        # Find puppeteer script: parent of this file's directory, the directory
-        # itself, then the current working directory. First existing path wins.
-        script_dir = os.path.dirname(os.path.abspath(__file__))
-        possible_paths = [
-            os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js'),
-            os.path.join(script_dir, 'puppeteer_pdf.js'),
-            os.path.join(script_dir, '..', 'puppeteer_pdf.js'),
-            'puppeteer_pdf.js'
-        ]
-        puppeteer_script = next(
-            (path for path in possible_paths if os.path.exists(path)), None
-        )
-        if not puppeteer_script:
-            return None, "Error: puppeteer_pdf.js not found"
-        st.write(f"🔧 Using Puppeteer: {puppeteer_script}")
-        # Run conversion
-        result = subprocess.run(
-            ['node', puppeteer_script, html_file, aspect_ratio],
-            capture_output=True,
-            text=True,
-            timeout=timeout,
-            cwd=os.path.dirname(os.path.abspath(puppeteer_script))
-        )
-        if result.returncode != 0:
-            # Some failures only print to stdout; do not show an empty message.
-            return None, f"PDF conversion failed: {result.stderr or result.stdout}"
-        # Read PDF. Swap only the file extension: str.replace('.html', '.pdf')
-        # would also rewrite a ".html" occurring in the directory part.
-        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
-        if not os.path.exists(pdf_file):
-            return None, "PDF file was not generated"
-        with open(pdf_file, 'rb') as f:
-            pdf_bytes = f.read()
-        st.write(f"✅ PDF generated: {len(pdf_bytes):,} bytes")
-        return pdf_bytes, None
-    except subprocess.TimeoutExpired:
-        return None, f"Error: PDF conversion timed out ({timeout} seconds)"
-    except Exception as e:
-        # UI boundary: any other failure is reported as a message so the
-        # caller can show it instead of crashing the app.
-        return None, f"Error: {str(e)}"
-# Main UI: page title, a short description, then the two input tabs.
-st.title("📄 HTML to PDF Converter")
-st.markdown("""
-Convert HTML to PDF with **proper page breaks** and **embedded base64 images**!
-✨ Each page in your HTML will be preserved as a separate PDF page.
-""")
-# Create tabs: tab1 takes an uploaded HTML file, tab2 takes pasted HTML.
-tab1, tab2 = st.tabs(["📤 Upload HTML File", "📝 Paste HTML Code"])
-# Tab 1: Upload HTML File
-# Flow: upload HTML (+ optional images) -> choose aspect ratio -> on click,
-# embed the images, convert with Puppeteer, then offer download and preview.
-with tab1:
-    uploaded_file = st.file_uploader(
-        "Choose an HTML file",
-        type=['html', 'htm'],
-        key="file_uploader",
-        help="Upload an HTML file"
-    )
-    uploaded_images = st.file_uploader(
-        "📷 Upload Images",
-        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
-        key="image_uploader",
-        help="Upload images - they will be embedded as base64 in the HTML",
-        accept_multiple_files=True
-    )
-    if uploaded_images:
-        st.success(f"✅ {len(uploaded_images)} image(s) uploaded")
-        with st.expander("View uploaded images"):
-            # At most four thumbnails per row; further images wrap around.
-            cols = st.columns(min(len(uploaded_images), 4))
-            for idx, img in enumerate(uploaded_images):
-                with cols[idx % 4]:
-                    st.image(img, caption=img.name, use_container_width=True)
-    if uploaded_file:
-        st.success(f"✅ File: {uploaded_file.name}")
-        uploaded_file.seek(0)
-        # Prefer UTF-8; latin-1 maps every byte, so the fallback cannot fail.
-        try:
-            html_content = uploaded_file.getvalue().decode('utf-8')
-        except UnicodeDecodeError:
-            uploaded_file.seek(0)
-            html_content = uploaded_file.getvalue().decode('latin-1')
-        detected_ratio = detect_aspect_ratio(html_content)
-        ratio_options = ["16:9", "1:1", "9:16"]
-        col1, col2 = st.columns([1, 1])
-        with col1:
-            st.subheader("⚙️ Settings")
-            auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_file")
-            if auto_detect:
-                aspect_ratio = detected_ratio
-                st.info(f"🔍 Detected: **{detected_ratio}**")
-            else:
-                # Pre-select the detected ratio so a manual override starts there.
-                aspect_ratio = st.radio(
-                    "Aspect Ratio",
-                    options=ratio_options,
-                    index=ratio_options.index(detected_ratio),
-                    key="aspect_file"
-                )
-            convert_btn = st.button("🔄 Convert to PDF", key="conv_file", type="primary", use_container_width=True)
-        with col2:
-            st.subheader("👁️ Preview")
-            with st.expander("Show HTML"):
-                st.components.v1.html(render_html_preview(html_content), height=400, scrolling=True)
-        if convert_btn:
-            temp_dir = None
-            try:
-                with st.spinner("Converting..."):
-                    temp_dir = tempfile.mkdtemp()
-                    # Embed images as base64
-                    processed_html = html_content
-                    if uploaded_images:
-                        with st.expander("🖼️ Image Processing", expanded=True):
-                            processed_html, replacements = embed_images_as_base64(html_content, uploaded_images)
-                            if not replacements:
-                                st.warning("⚠️ Images uploaded but no matches found in HTML!")
-                                st.write("**Tip:** Make sure image filenames in HTML match uploaded files exactly")
-                    # Convert to PDF
-                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir)
-                    if error:
-                        st.error(f"❌ {error}")
-                    else:
-                        st.success("✅ PDF generated with proper page breaks!")
-                        # Swap only the final extension for ".pdf". Chained
-                        # str.replace() rewrote every ".html"/".htm" inside the
-                        # name (e.g. "my.html5.guide.html") and left upper-case
-                        # extensions in place ("REPORT.HTML.pdf").
-                        output_name = os.path.splitext(uploaded_file.name)[0] + '.pdf'
-                        col_a, col_b = st.columns(2)
-                        with col_a:
-                            st.download_button(
-                                "⬇️ Download PDF",
-                                data=pdf_bytes,
-                                file_name=output_name,
-                                mime="application/pdf",
-                                use_container_width=True
-                            )
-                        with col_b:
-                            st.info(f"Size: {len(pdf_bytes):,} bytes")
-                        st.subheader("📄 PDF Preview")
-                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
-            except Exception as e:
-                st.error(f"❌ Error: {str(e)}")
-            finally:
-                # Always remove the scratch directory, even after a failure.
-                if temp_dir and os.path.exists(temp_dir):
-                    shutil.rmtree(temp_dir, ignore_errors=True)
-# Tab 2: Paste HTML
-# Same flow as the upload tab, but the HTML comes from a text area that is
-# pre-filled with a three-page sample document.
-with tab2:
-    html_code = st.text_area(
-        "HTML Content",
-        value="""<!DOCTYPE html>
-<html>
-<head>
-    <style>
-        body {
-            font-family: Arial;
-            margin: 0;
-            padding: 0;
-        }
-        .page {
-            width: 100%;
-            height: 100vh;
-            display: flex;
-            align-items: center;
-            justify-content: center;
-            box-sizing: border-box;
-            padding: 40px;
-        }
-        .page:nth-child(1) {
-            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-            color: white;
-        }
-        .page:nth-child(2) {
-            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
-            color: white;
-        }
-        .page:nth-child(3) {
-            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
-            color: white;
-        }
-        h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
-    </style>
-</head>
-<body>
-    <div class="page">
-        <h1>Page 1: Hello PDF! 🌍</h1>
-    </div>
-    <div class="page">
-        <h1>Page 2: Separate Page! 📄</h1>
-    </div>
-    <div class="page">
-        <h1>Page 3: Final Page! ✨</h1>
-    </div>
-</body>
-</html>""",
-        height=400,
-        key="html_code"
-    )
-    # Separate uploader (distinct widget key) for images used by pasted HTML.
-    uploaded_images_text = st.file_uploader(
-        "📷 Upload Images",
-        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
-        key="image_text",
-        help="Upload images to embed in your HTML",
-        accept_multiple_files=True
-    )
-    if uploaded_images_text:
-        st.success(f"✅ {len(uploaded_images_text)} image(s) uploaded")
-        with st.expander("View images"):
-            # At most four thumbnails per row; further images wrap around.
-            cols = st.columns(min(len(uploaded_images_text), 4))
-            for idx, img in enumerate(uploaded_images_text):
-                with cols[idx % 4]:
-                    st.image(img, caption=img.name, use_container_width=True)
-    # Only show the settings and the convert button when there is HTML.
-    if html_code.strip():
-        detected_ratio_text = detect_aspect_ratio(html_code)
-        auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_text")
-        if auto_detect_text:
-            aspect_ratio_text = detected_ratio_text
-            st.info(f"🔍 Detected: **{detected_ratio_text}**")
-        else:
-            # Pre-select the detected ratio so a manual override starts there.
-            aspect_ratio_text = st.radio(
-                "Aspect Ratio",
-                options=["16:9", "1:1", "9:16"],
-                index=["16:9", "1:1", "9:16"].index(detected_ratio_text),
-                key="aspect_text"
-            )
-        convert_text_btn = st.button("🔄 Convert", key="conv_text", type="primary", use_container_width=True)
-        if convert_text_btn:
-            temp_dir = None
-            try:
-                with st.spinner("Converting..."):
-                    temp_dir = tempfile.mkdtemp()
-                    # Start from the pasted HTML; replaced below if images match.
-                    processed_html = html_code
-                    if uploaded_images_text:
-                        with st.expander("🖼️ Image Processing", expanded=True):
-                            processed_html, replacements = embed_images_as_base64(html_code, uploaded_images_text)
-                            if not replacements:
-                                st.warning("⚠️ Images uploaded but no matches found!")
-                    # convert_html_to_pdf returns (bytes, None) or (None, message).
-                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)
-                    if error:
-                        st.error(f"❌ {error}")
-                    else:
-                        st.success("✅ PDF generated with proper page breaks!")
-                        col_a, col_b = st.columns(2)
-                        with col_a:
-                            # Pasted HTML has no source file name; use a fixed one.
-                            st.download_button(
-                                "⬇️ Download PDF",
-                                data=pdf_bytes,
-                                file_name="converted.pdf",
-                                mime="application/pdf",
-                                use_container_width=True
-                            )
-                        with col_b:
-                            st.info(f"Size: {len(pdf_bytes):,} bytes")
-                        st.subheader("📄 PDF Preview")
-                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
-            except Exception as e:
-                st.error(f"❌ Error: {str(e)}")
-            finally:
-                # Always remove the scratch directory, even after a failure.
-                if temp_dir and os.path.exists(temp_dir):
-                    shutil.rmtree(temp_dir, ignore_errors=True)
-# Footer: a horizontal rule followed by static Markdown help text that
-# explains the page-break classes and the image-embedding behaviour.
-st.markdown("---")
-st.markdown("""
-### 💡 How Page Breaks Work:
-**Automatic Page Detection:**
-- Elements with class `page`, `slide`, or `section.page` are treated as separate pages
-- Each page automatically gets `page-break-after: always` CSS
-- Last page won't have a trailing break
-**HTML Structure for Multiple Pages:**
-```html
-<div class="page">Page 1 content</div>
-<div class="page">Page 2 content</div>
-<div class="page">Page 3 content</div>
-```
-**Manual Page Breaks:**
-- Add class `page-break` to force a break after an element
-- Add class `page-break-before` to force a break before an element
-- Add class `no-page-break` to prevent breaks inside an element
-**Image Embedding:**
-- Images are converted to base64 and embedded directly in HTML
-- Ensures images always appear in the PDF
-- Filename in HTML must match uploaded file exactly
-### 📝 Example HTML:
-```html
-<!DOCTYPE html>
-<html>
-<body>
-    <div class="page">
-        <h1>First Page</h1>
-        <img src="logo.png" alt="Logo">
-    </div>
-    <div class="page">
-        <h1>Second Page</h1>
-        <p>Content here...</p>
-    </div>
-</body>
-</html>
-```
-Then upload a file named: `logo.png`
-""")