Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

File size: 25,474 Bytes

9f4d1a1

"""
Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
Save this file as: src/streamlit_app.py
"""
import streamlit as st
import subprocess
import os
import tempfile
import shutil
from pathlib import Path
import base64
import re
import mimetypes

# Page-level settings: browser tab title, tab icon, and full-width layout.
st.set_page_config(page_title="HTML to PDF Converter", page_icon="📄", layout="wide")

def detect_aspect_ratio(html_content):
    """Guess the page aspect ratio from HTML text.

    Checks, in order:
      1. a viewport ``<meta>`` tag whose content contains
         ``orientation=portrait`` or ``orientation=landscape``;
      2. the first CSS ``aspect-ratio: W / H`` declaration with integer
         operands;
      3. presentation keywords anywhere in the document.

    Args:
        html_content: HTML source as a string.

    Returns:
        One of "16:9", "1:1" or "9:16". "9:16" is the fallback when nothing
        matches.
    """
    viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"

    aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
    if aspect_match:
        width = int(aspect_match.group(1))
        height = int(aspect_match.group(2))
        # "aspect-ratio: 16 / 0" is meaningless; skip it instead of dividing
        # by zero and fall through to the keyword heuristic.
        if height > 0:
            ratio = width / height
            if ratio > 1.5:
                return "16:9"
            if ratio < 0.7:
                return "9:16"
            return "1:1"

    # Lower-case once rather than once per keyword.
    lowered = html_content.lower()
    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
        return "16:9"

    return "9:16"

def image_to_base64(image_file):
    """Build a ``data:<mime>;base64,...`` URL from an uploaded image.

    The MIME type is guessed from the file name; if that fails, a small
    extension table is consulted, with ``image/png`` as the last resort.

    Args:
        image_file: Object exposing ``getvalue()`` (raw bytes) and ``name``.

    Returns:
        The data URL string, or None if anything went wrong (the problem is
        reported through ``st.error``).
    """
    # Consulted only when mimetypes cannot identify the file name.
    extension_types = {
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.png': 'image/png',
        '.gif': 'image/gif',
        '.svg': 'image/svg+xml',
        '.webp': 'image/webp',
        '.bmp': 'image/bmp'
    }
    try:
        raw_bytes = image_file.getvalue()
        mime_type = mimetypes.guess_type(image_file.name)[0]
        if not mime_type:
            suffix = os.path.splitext(image_file.name)[1].lower()
            mime_type = extension_types.get(suffix, 'image/png')

        encoded = base64.b64encode(raw_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{encoded}"
    except Exception as exc:
        st.error(f"Error converting {image_file.name} to base64: {str(exc)}")
        return None

def embed_images_as_base64(html_content, uploaded_images):
    """Inline uploaded images into the HTML as base64 data URLs.

    Each uploaded file is converted with ``image_to_base64`` and every
    reference to its file name (optionally preceded by a path ending in "/")
    is replaced in three places, matched case-insensitively:

      * ``<img ... src="...">`` attributes,
      * ``background-image: url(...)`` declarations,
      * any remaining CSS ``url(...)``.

    The original quoting style of each reference is preserved, so a
    single-quoted ``url('x.png')`` inside a double-quoted ``style="..."``
    attribute stays valid after the substitution.

    Progress and diagnostics are written to the Streamlit page.

    Args:
        html_content: HTML source text.
        uploaded_images: Uploaded image files (each with ``name``); may be
            empty or None.

    Returns:
        Tuple ``(html, replacements)``: the rewritten HTML and a dict mapping
        ``"<filename> (<kind>)"`` to the number of references replaced.
        ``replacements`` is empty when nothing matched.
    """
    if not uploaded_images:
        return html_content, {}

    image_data_urls = {}
    for img in uploaded_images:
        data_url = image_to_base64(img)
        if data_url:
            image_data_urls[img.name] = data_url
            st.write(f"✓ Converted {img.name} to base64 ({len(data_url)} chars)")

    if not image_data_urls:
        return html_content, {}

    replacements = {}

    for filename, data_url in image_data_urls.items():
        escaped_name = re.escape(filename)

        def substitute(match, data_url=data_url):
            # A callable replacement inserts the data URL literally; it is
            # never parsed as a regex replacement template.
            # group 1: text before the URL; group 2: the original quote
            # character (may be empty); group 3 (CSS patterns only): ")".
            quote = match.group(2)
            closing = match.group(3) if match.re.groups >= 3 else ""
            return f"{match.group(1)}{quote}{data_url}{quote}{closing}"

        # (label, pattern, flags), applied in order. The generic url() rule
        # comes last so it only picks up what the specific rules left behind.
        rules = (
            (
                "img src",
                rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2',
                re.IGNORECASE | re.DOTALL,
            ),
            (
                "bg-image",
                rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
            (
                "url",
                rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))',
                re.IGNORECASE,
            ),
        )
        for label, pattern, flags in rules:
            # subn replaces and counts in a single pass over the document.
            html_content, count = re.subn(pattern, substitute, html_content, flags=flags)
            if count:
                replacements[f"{filename} ({label})"] = count

    if replacements:
        st.success("✅ Image Replacements:")
        for key, count in replacements.items():
            st.write(f"  • {key}: {count} replacement(s)")
    else:
        st.warning("⚠️ No image references found in HTML matching uploaded files!")
        st.write("Uploaded files:", [img.name for img in uploaded_images])

        # Rendered inline rather than in an st.expander: the callers in this
        # file already wrap this function in an expander, and Streamlit has
        # documented nested expanders as unsupported.
        st.markdown("**🔍 Debug: HTML image references**")
        img_lines = [line for line in html_content.split('\n')
                     if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
        if img_lines:
            # Show at most ten candidate lines to keep the page short.
            for line in img_lines[:10]:
                st.code(line.strip(), language='html')
        else:
            st.write("No image-related lines found in HTML")

    return html_content, replacements

def inject_page_breaks(html_content: str, aspect_ratio: str) -> str:
    """Insert a ``<style>`` block with page sizing and page-break rules.

    The stylesheet is inserted exactly once: immediately before the first
    ``</head>`` (any letter case); otherwise immediately before the first
    ``<body`` tag; otherwise at the very start of the document.

    Args:
        html_content: HTML source text.
        aspect_ratio: "16:9" selects A4 landscape, "1:1" a 210mm square;
            any other value selects A4 portrait.

    Returns:
        The HTML with the stylesheet added.
    """
    # Map the aspect ratio to a CSS @page size; anything else is portrait.
    page_sizes = {"16:9": "A4 landscape", "1:1": "210mm 210mm"}
    page_size = page_sizes.get(aspect_ratio, "A4 portrait")

    # Comprehensive page break CSS
    page_css = f"""
    <style id="auto-page-breaks">
        /* Define page size */
        @page {{
            size: {page_size};
            margin: 0;
        }}
        
        /* Reset body */
        html, body {{
            margin: 0 !important;
            padding: 0 !important;
            width: 100% !important;
            height: 100% !important;
        }}
        
        /* Page containers - each should be one page */
        .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
            width: 100% !important;
            min-height: 100vh !important;
            height: 100vh !important;
            page-break-after: always !important;
            break-after: page !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
            position: relative !important;
            box-sizing: border-box !important;
            overflow: hidden !important;
        }}
        
        /* Last page shouldn't force a break */
        .page:last-child, .slide:last-child,
        section.page:last-child, article.page:last-child {{
            page-break-after: auto !important;
            break-after: auto !important;
        }}
        
        /* If no explicit page class, treat direct body children as pages */
        body > section:not(.no-page-break),
        body > article:not(.no-page-break),
        body > div:not(.no-page-break) {{
            page-break-after: always !important;
            break-after: page !important;
            min-height: 100vh;
        }}
        
        body > section:last-child,
        body > article:last-child,
        body > div:last-child {{
            page-break-after: auto !important;
            break-after: auto !important;
        }}
        
        /* Utility classes for manual control */
        .page-break, .page-break-after {{
            page-break-after: always !important;
            break-after: page !important;
        }}
        
        .page-break-before {{
            page-break-before: always !important;
            break-before: page !important;
        }}
        
        .no-page-break, .keep-together {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        
        /* Prevent awkward breaks in content */
        h1, h2, h3, h4, h5, h6 {{
            page-break-after: avoid !important;
            break-after: avoid !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        
        img, figure, table, pre, blockquote {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}
        
        /* Preserve colors and backgrounds */
        * {{
            -webkit-print-color-adjust: exact !important;
            print-color-adjust: exact !important;
            color-adjust: exact !important;
        }}
    </style>
    """

    # Find a single insertion point. HTML tag names are case-insensitive, and
    # only the first match is used so a literal "</head>" later in the
    # document (e.g. inside a script string) does not duplicate the CSS.
    head_close = re.search(r'</head\s*>', html_content, re.IGNORECASE)
    if head_close:
        insert_at = head_close.start()
    else:
        body_open = re.search(r'<body\b', html_content, re.IGNORECASE)
        insert_at = body_open.start() if body_open else 0

    return html_content[:insert_at] + page_css + html_content[insert_at:]

def render_html_preview(html_content):
    """Return an ``<iframe>`` tag that displays *html_content*.

    The document is carried in the iframe's ``src`` as a base64
    ``data:text/html`` URL.
    """
    encoded_page = base64.b64encode(html_content.encode()).decode()
    return (
        f'<iframe src="data:text/html;base64,{encoded_page}" '
        'width="100%" height="600" '
        'style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
    )

def render_pdf_preview(pdf_bytes):
    """Return a standalone HTML page that renders a PDF with PDF.js.

    The PDF is embedded in the page as a base64 string, decoded in the
    browser, and every page is drawn onto its own ``<canvas>`` at 1.5x scale.
    PDF.js 3.11.174 and its worker are loaded from cdnjs, so the viewer needs
    network access in the browser.

    Args:
        pdf_bytes: Raw PDF file content.

    Returns:
        HTML source of the viewer page as a string.
    """
    b64 = base64.b64encode(pdf_bytes).decode()

    # Notes on the embedded script:
    #  * Each canvas is appended synchronously inside the loop, before its
    #    page is requested, so the canvases stay in page order no matter
    #    which page promise resolves first.
    #  * Errors are shown via textContent (no HTML interpretation of the
    #    message), and the status element is made visible again in case the
    #    failure happens after "Loading PDF..." was hidden.
    pdf_viewer_html = f'''
    <!DOCTYPE html>
    <html>
    <head>
        <style>
            body {{
                margin: 0;
                padding: 0;
                overflow: hidden;
                background: #525659;
            }}
            #pdf-container {{
                width: 100%;
                height: 100vh;
                overflow: auto;
                display: flex;
                flex-direction: column;
                align-items: center;
                padding: 20px;
                box-sizing: border-box;
            }}
            canvas {{
                box-shadow: 0 2px 8px rgba(0,0,0,0.3);
                margin-bottom: 10px;
                background: white;
            }}
            #loading {{
                color: white;
                font-family: Arial, sans-serif;
                font-size: 18px;
                padding: 20px;
            }}
        </style>
    </head>
    <body>
        <div id="pdf-container">
            <div id="loading">Loading PDF...</div>
        </div>
        <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
        <script>
            pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
            const pdfData = atob('{b64}');
            const pdfContainer = document.getElementById('pdf-container');
            const loading = document.getElementById('loading');
            const uint8Array = new Uint8Array(pdfData.length);
            for (let i = 0; i < pdfData.length; i++) {{
                uint8Array[i] = pdfData.charCodeAt(i);
            }}
            pdfjsLib.getDocument({{data: uint8Array}}).promise.then(function(pdf) {{
                loading.style.display = 'none';
                const numPages = pdf.numPages;
                const promises = [];
                for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
                    const canvas = document.createElement('canvas');
                    pdfContainer.appendChild(canvas);
                    promises.push(
                        pdf.getPage(pageNum).then(function(page) {{
                            const scale = 1.5;
                            const viewport = page.getViewport({{scale: scale}});
                            const context = canvas.getContext('2d');
                            canvas.height = viewport.height;
                            canvas.width = viewport.width;
                            return page.render({{
                                canvasContext: context,
                                viewport: viewport
                            }}).promise;
                        }})
                    );
                }}
                return Promise.all(promises);
            }}).catch(function(error) {{
                loading.style.display = 'block';
                loading.style.color = '#ff6b6b';
                loading.textContent = 'Error: ' + error.message;
            }});
        </script>
    </body>
    </html>
    '''
    return pdf_viewer_html

def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
    """Convert HTML to PDF by running the ``puppeteer_pdf.js`` Node script.

    Steps: inject the page-break stylesheet, write the result to
    ``<temp_dir>/input.html``, locate ``puppeteer_pdf.js``, run
    ``node puppeteer_pdf.js <html_file> <aspect_ratio>`` with a 60 second
    timeout, then read the PDF back. Progress is written to the Streamlit
    page.

    Args:
        html_content: HTML source text.
        aspect_ratio: Ratio string forwarded to the stylesheet injection and
            to the Node script.
        temp_dir: Existing directory for the intermediate files. The caller
            owns it and is responsible for cleanup.

    Returns:
        ``(pdf_bytes, None)`` on success, ``(None, error_message)`` on
        failure. No exception is propagated to the caller.
    """
    try:
        # Step 1: Inject page break CSS
        st.write("🔧 Injecting page break CSS...")
        html_content = inject_page_breaks(html_content, aspect_ratio)

        # Save HTML to temp file. The path is made absolute because the
        # subprocess below runs with a different working directory, which
        # would otherwise break a relative temp_dir.
        html_file = os.path.abspath(os.path.join(temp_dir, "input.html"))
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        st.write(f"📝 Saved HTML: {os.path.getsize(html_file):,} bytes")

        # Find puppeteer script: parent of this file's directory, this file's
        # directory, the same parent via "..", then the current working
        # directory.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        possible_paths = [
            os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js'),
            os.path.join(script_dir, 'puppeteer_pdf.js'),
            os.path.join(script_dir, '..', 'puppeteer_pdf.js'),
            'puppeteer_pdf.js'
        ]
        puppeteer_script = next(
            (os.path.abspath(path) for path in possible_paths if os.path.exists(path)),
            None,
        )

        if not puppeteer_script:
            return None, "Error: puppeteer_pdf.js not found"

        st.write(f"🔧 Using Puppeteer: {puppeteer_script}")

        # Run conversion from the script's own directory.
        # NOTE(review): presumably so Node resolves its local node_modules —
        # confirm against the deployment layout.
        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=os.path.dirname(puppeteer_script)
        )

        if result.returncode != 0:
            # Fall back to stdout so the message is not empty when the script
            # reported its failure there.
            return None, f"PDF conversion failed: {result.stderr or result.stdout}"

        # Read PDF. Only the final extension is swapped, so a ".html"
        # elsewhere in the path is left alone.
        # NOTE(review): assumes the script writes <input stem>.pdf next to the
        # input file — confirm against puppeteer_pdf.js.
        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
        if not os.path.exists(pdf_file):
            return None, "PDF file was not generated"

        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()

        st.write(f"✅ PDF generated: {len(pdf_bytes):,} bytes")
        return pdf_bytes, None

    except subprocess.TimeoutExpired:
        return None, "Error: PDF conversion timed out (60 seconds)"
    except Exception as e:
        # Deliberate catch-all: every failure is reported to the UI as a
        # message instead of raising.
        return None, f"Error: {str(e)}"

# Main UI: page heading and a short introduction
st.title("📄 HTML to PDF Converter")
st.markdown(
    "\n"
    "Convert HTML to PDF with **proper page breaks** and **embedded base64 images**!  \n"
    "✨ Each page in your HTML will be preserved as a separate PDF page.\n"
)

# One tab per input method: file upload or pasted source
tab1, tab2 = st.tabs(
    [
        "📤 Upload HTML File",
        "📝 Paste HTML Code",
    ]
)

# Tab 1: Upload HTML File
# Flow: upload an HTML file (plus optional images), choose or auto-detect the
# aspect ratio, then convert and offer the PDF for download and preview.
with tab1:
    uploaded_file = st.file_uploader(
        "Choose an HTML file",
        type=['html', 'htm'],
        key="file_uploader",
        help="Upload an HTML file"
    )

    uploaded_images = st.file_uploader(
        "📷 Upload Images",
        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
        key="image_uploader",
        help="Upload images - they will be embedded as base64 in the HTML",
        accept_multiple_files=True
    )

    if uploaded_images:
        st.success(f"✅ {len(uploaded_images)} image(s) uploaded")
        with st.expander("View uploaded images"):
            # Thumbnails in at most four columns; extra images wrap around.
            cols = st.columns(min(len(uploaded_images), 4))
            for idx, img in enumerate(uploaded_images):
                with cols[idx % 4]:
                    st.image(img, caption=img.name, use_container_width=True)

    if uploaded_file:
        st.success(f"✅ File: {uploaded_file.name}")

        # getvalue() returns the whole buffer regardless of the stream
        # position, so no seek(0) is needed before reading.
        raw_html = uploaded_file.getvalue()
        try:
            html_content = raw_html.decode('utf-8')
        except UnicodeDecodeError:
            # latin-1 maps every byte value, so this fallback cannot fail.
            html_content = raw_html.decode('latin-1')

        detected_ratio = detect_aspect_ratio(html_content)

        col1, col2 = st.columns([1, 1])

        with col1:
            st.subheader("⚙️ Settings")
            auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_file")

            if auto_detect:
                aspect_ratio = detected_ratio
                st.info(f"🔍 Detected: **{detected_ratio}**")
            else:
                ratio_options = ["16:9", "1:1", "9:16"]
                aspect_ratio = st.radio(
                    "Aspect Ratio",
                    options=ratio_options,
                    # Preselect the detected ratio as a sensible default.
                    index=ratio_options.index(detected_ratio),
                    key="aspect_file"
                )

            convert_btn = st.button("🔄 Convert to PDF", key="conv_file", type="primary", use_container_width=True)

        with col2:
            st.subheader("👁️ Preview")
            with st.expander("Show HTML"):
                st.components.v1.html(render_html_preview(html_content), height=400, scrolling=True)

        if convert_btn:
            temp_dir = None
            try:
                with st.spinner("Converting..."):
                    temp_dir = tempfile.mkdtemp()

                    # Embed images as base64
                    processed_html = html_content
                    if uploaded_images:
                        with st.expander("🖼️ Image Processing", expanded=True):
                            processed_html, replacements = embed_images_as_base64(html_content, uploaded_images)

                            if not replacements:
                                st.warning("⚠️ Images uploaded but no matches found in HTML!")
                                st.write("**Tip:** Make sure image filenames in HTML match uploaded files exactly")

                    # Convert to PDF
                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir)

                    if error:
                        st.error(f"❌ {error}")
                    else:
                        st.success("✅ PDF generated with proper page breaks!")

                        # Swap only the final extension. Chained str.replace
                        # mangled names such as "a.html.b.html" and produced
                        # "REPORT.HTML.pdf" for upper-case extensions.
                        output_name = os.path.splitext(uploaded_file.name)[0] + '.pdf'

                        col_a, col_b = st.columns(2)
                        with col_a:
                            st.download_button(
                                "⬇️ Download PDF",
                                data=pdf_bytes,
                                file_name=output_name,
                                mime="application/pdf",
                                use_container_width=True
                            )
                        with col_b:
                            st.info(f"Size: {len(pdf_bytes):,} bytes")

                        st.subheader("📄 PDF Preview")
                        st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
            except Exception as e:
                # UI boundary: show any unexpected failure on the page.
                st.error(f"❌ Error: {str(e)}")
            finally:
                # Always remove the scratch directory, even after an error.
                if temp_dir and os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir, ignore_errors=True)

# Tab 2: Paste HTML
# Flow: edit HTML in a text area (optionally upload images), choose or
# auto-detect the aspect ratio, then convert and offer the PDF.
with tab2:
    # Starter document shown in the editor: three full-height ".page" blocks.
    starter_html = """<!DOCTYPE html>
<html>
<head>
    <style>
        body {
            font-family: Arial;
            margin: 0;
            padding: 0;
        }
        .page {
            width: 100%;
            height: 100vh;
            display: flex;
            align-items: center;
            justify-content: center;
            box-sizing: border-box;
            padding: 40px;
        }
        .page:nth-child(1) {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
        }
        .page:nth-child(2) {
            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
            color: white;
        }
        .page:nth-child(3) {
            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
            color: white;
        }
        h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
    </style>
</head>
<body>
    <div class="page">
        <h1>Page 1: Hello PDF! 🌍</h1>
    </div>
    
    <div class="page">
        <h1>Page 2: Separate Page! 📄</h1>
    </div>
    
    <div class="page">
        <h1>Page 3: Final Page! ✨</h1>
    </div>
</body>
</html>"""
    html_code = st.text_area("HTML Content", value=starter_html, height=400, key="html_code")

    uploaded_images_text = st.file_uploader(
        "📷 Upload Images",
        type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
        key="image_text",
        help="Upload images to embed in your HTML",
        accept_multiple_files=True,
    )

    if uploaded_images_text:
        image_count = len(uploaded_images_text)
        st.success(f"✅ {image_count} image(s) uploaded")
        with st.expander("View images"):
            # Up to four thumbnail columns; further images wrap around.
            thumb_columns = st.columns(min(image_count, 4))
            for position, picture in enumerate(uploaded_images_text):
                with thumb_columns[position % 4]:
                    st.image(picture, caption=picture.name, use_container_width=True)

    if html_code.strip():
        detected_ratio_text = detect_aspect_ratio(html_code)
        auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_text")

        if not auto_detect_text:
            ratio_choices = ["16:9", "1:1", "9:16"]
            aspect_ratio_text = st.radio(
                "Aspect Ratio",
                options=ratio_choices,
                index=ratio_choices.index(detected_ratio_text),
                key="aspect_text",
            )
        else:
            aspect_ratio_text = detected_ratio_text
            st.info(f"🔍 Detected: **{detected_ratio_text}**")

        convert_text_btn = st.button("🔄 Convert", key="conv_text", type="primary", use_container_width=True)

        if convert_text_btn:
            temp_dir = None
            try:
                with st.spinner("Converting..."):
                    temp_dir = tempfile.mkdtemp()

                    # Inline any uploaded images before conversion.
                    processed_html = html_code
                    if uploaded_images_text:
                        with st.expander("🖼️ Image Processing", expanded=True):
                            processed_html, replacements = embed_images_as_base64(html_code, uploaded_images_text)

                            if not replacements:
                                st.warning("⚠️ Images uploaded but no matches found!")

                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)

                    if error:
                        st.error(f"❌ {error}")
                    else:
                        st.success("✅ PDF generated with proper page breaks!")

                        download_col, size_col = st.columns(2)
                        with download_col:
                            st.download_button(
                                "⬇️ Download PDF",
                                data=pdf_bytes,
                                file_name="converted.pdf",
                                mime="application/pdf",
                                use_container_width=True,
                            )
                        with size_col:
                            st.info(f"Size: {len(pdf_bytes):,} bytes")

                        st.subheader("📄 PDF Preview")
                        viewer_page = render_pdf_preview(pdf_bytes)
                        st.components.v1.html(viewer_page, height=600, scrolling=True)
            except Exception as exc:
                # UI boundary: show any unexpected failure on the page.
                st.error(f"❌ Error: {str(exc)}")
            finally:
                # Remove the scratch directory whether or not conversion worked.
                if temp_dir and os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir, ignore_errors=True)

# Footer: a horizontal rule followed by static usage notes
footer_help = """
### 💡 How Page Breaks Work:

**Automatic Page Detection:**
- Elements with class `page`, `slide`, or `section.page` are treated as separate pages
- Each page automatically gets `page-break-after: always` CSS
- Last page won't have a trailing break

**HTML Structure for Multiple Pages:**
```html
<div class="page">Page 1 content</div>
<div class="page">Page 2 content</div>
<div class="page">Page 3 content</div>
```

**Manual Page Breaks:**
- Add class `page-break` to force a break after an element
- Add class `page-break-before` to force a break before an element
- Add class `no-page-break` to prevent breaks inside an element

**Image Embedding:**
- Images are converted to base64 and embedded directly in HTML
- Ensures images always appear in the PDF
- Filename in HTML must match uploaded file exactly

### 📝 Example HTML:
```html
<!DOCTYPE html>
<html>
<body>
    <div class="page">
        <h1>First Page</h1>
        <img src="logo.png" alt="Logo">
    </div>
    
    <div class="page">
        <h1>Second Page</h1>
        <p>Content here...</p>
    </div>
</body>
</html>
```
Then upload a file named: `logo.png`
"""
st.markdown("---")
st.markdown(footer_help)