Spaces:

ABDALLALSWAITI
/

htmlpdf

Sleeping

App Files Files Community

ABDALLALSWAITI committed on Oct 16, 2025

Commit

8d2aae9

verified ·

1 Parent(s): 70df375

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +197 -71

src/streamlit_app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Streamlit HTML to PDF Converter with Image Support - REVISED
 Save this file as: src/streamlit_app.py
 """
 import streamlit as st
@@ -48,13 +48,9 @@ def detect_aspect_ratio(html_content):
 def image_to_base64(image_file):
     """Convert uploaded image to base64 data URL"""
     try:
-        # Read image bytes
         image_bytes = image_file.getvalue()
-        # Get MIME type
         mime_type, _ = mimetypes.guess_type(image_file.name)
         if not mime_type:
-            # Fallback based on extension
             ext = os.path.splitext(image_file.name)[1].lower()
             mime_map = {
                 '.jpg': 'image/jpeg',
@@ -67,24 +63,18 @@ def image_to_base64(image_file):
             }
             mime_type = mime_map.get(ext, 'image/png')
-        # Convert to base64
         b64_data = base64.b64encode(image_bytes).decode('utf-8')
         data_url = f"data:{mime_type};base64,{b64_data}"
         return data_url
     except Exception as e:
         st.error(f"Error converting {image_file.name} to base64: {str(e)}")
         return None
 def embed_images_as_base64(html_content, uploaded_images):
-    """
-    Embed all images directly as base64 data URLs in the HTML
-    This ensures images are always included in the PDF
-    """
     if not uploaded_images:
         return html_content, {}
-    # Create mapping of filename to base64 data URL
     image_data_urls = {}
     for img in uploaded_images:
         data_url = image_to_base64(img)
@@ -95,16 +85,12 @@ def embed_images_as_base64(html_content, uploaded_images):
     if not image_data_urls:
         return html_content, {}
-    # Track replacements
     replacements = {}
-    original_html = html_content
     for filename, data_url in image_data_urls.items():
-        # Escape filename for regex
         escaped_name = re.escape(filename)
-        # Pattern 1: src attribute - match any path variation
-        # Examples: src="image.jpg", src="./image.jpg", src="images/image.jpg"
         pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
         matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
         count1 = len(matches1)
@@ -112,7 +98,7 @@ def embed_images_as_base64(html_content, uploaded_images):
             html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
             replacements[f"{filename} (img src)"] = count1
-        # Pattern 2: background-image in style attributes
         pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
         count2 = len(matches2)
@@ -120,7 +106,7 @@ def embed_images_as_base64(html_content, uploaded_images):
             html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
             replacements[f"{filename} (bg-image)"] = count2
-        # Pattern 3: CSS url() without background-image
         pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
         count3 = len(matches3)
@@ -128,7 +114,6 @@ def embed_images_as_base64(html_content, uploaded_images):
             html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
             replacements[f"{filename} (url)"] = count3
-    # Show replacement summary
     if replacements:
         st.success("✅ Image Replacements:")
         for key, count in replacements.items():
@@ -137,7 +122,6 @@ def embed_images_as_base64(html_content, uploaded_images):
         st.warning("⚠️ No image references found in HTML matching uploaded files!")
         st.write("Uploaded files:", [img.name for img in uploaded_images])
-        # Show sample HTML for debugging
         with st.expander("🔍 Debug: Show HTML image references"):
             img_lines = [line for line in html_content.split('\n')
                         if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
@@ -149,6 +133,121 @@ def embed_images_as_base64(html_content, uploaded_images):
     return html_content, replacements
 def render_html_preview(html_content):
     """Render HTML preview in an iframe"""
     b64 = base64.b64encode(html_content.encode()).decode()
@@ -239,30 +338,11 @@ def render_pdf_preview(pdf_bytes):
     return pdf_viewer_html
 def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
-    """Convert HTML content to PDF using Puppeteer"""
     try:
-        # Inject CSS to preserve styles
-        style_injection = """
-        <style>
-            @page { margin: 0; }
-            * {
-                -webkit-print-color-adjust: exact !important;
-                print-color-adjust: exact !important;
-                color-adjust: exact !important;
-            }
-            body {
-                -webkit-print-color-adjust: exact !important;
-                print-color-adjust: exact !important;
-            }
-        </style>
-        """
-        if '</head>' in html_content:
-            html_content = html_content.replace('</head>', style_injection + '</head>')
-        elif '<body' in html_content:
-            html_content = html_content.replace('<body', style_injection + '<body', 1)
-        else:
-            html_content = style_injection + html_content
         # Save HTML to temp file
         html_file = os.path.join(temp_dir, "input.html")
@@ -322,8 +402,8 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
 # Main UI
 st.title("📄 HTML to PDF Converter")
 st.markdown("""
-Convert HTML to PDF with **embedded base64 images** for guaranteed display!
-✨ Images are converted to base64 and embedded directly in the HTML.
 """)
 # Create tabs
@@ -412,7 +492,7 @@ with tab1:
                     if error:
                         st.error(f"❌ {error}")
                     else:
-                        st.success("✅ PDF generated!")
                         output_name = uploaded_file.name.replace('.html', '.pdf').replace('.htm', '.pdf')
                         if not output_name.endswith('.pdf'):
@@ -448,23 +528,44 @@ with tab2:
     <style>
         body {
             font-family: Arial;
-            margin: 40px;
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             color: white;
         }
-        h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
-        .box {
-            background: rgba(255,255,255,0.1);
-            padding: 20px;
-            border-radius: 10px;
-            margin: 20px 0;
         }
     </style>
 </head>
 <body>
-    <h1>Hello PDF! 🌍</h1>
-    <div class="box">
-        <p>Styles and gradients preserved!</p>
     </div>
 </body>
 </html>""",
@@ -524,7 +625,7 @@ with tab2:
                     if error:
                         st.error(f"❌ {error}")
                     else:
-                        st.success("✅ PDF generated!")
                         col_a, col_b = st.columns(2)
                         with col_a:
@@ -549,21 +650,46 @@ with tab2:
 # Footer
 st.markdown("---")
 st.markdown("""
-### 💡 How It Works:
-- **Base64 Embedding**: Images are converted to base64 data URLs and embedded directly in HTML
-- **No File Paths**: No need for file:// URLs or temp directories
-- **Guaranteed Display**: Images are part of the HTML, so they always appear in the PDF
-- **Filename Matching**: Your HTML must reference images by exact filename (e.g., `<img src="photo.jpg">`)
-### ✅ Supported:
-- `<img src="photo.jpg">`
-- `<img src="./images/logo.png">`
-- `background-image: url('banner.jpg')`
-- `style="background: url(bg.png)"`
-### 📝 Example:
 ```html
-<img src="logo.png" alt="Logo">
 ```
-Then upload a file named exactly: `logo.png`
 """)

 """
+Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
 Save this file as: src/streamlit_app.py
 """
 import streamlit as st
 def image_to_base64(image_file):
     """Convert uploaded image to base64 data URL"""
     try:
         image_bytes = image_file.getvalue()
         mime_type, _ = mimetypes.guess_type(image_file.name)
         if not mime_type:
             ext = os.path.splitext(image_file.name)[1].lower()
             mime_map = {
                 '.jpg': 'image/jpeg',
             }
             mime_type = mime_map.get(ext, 'image/png')
         b64_data = base64.b64encode(image_bytes).decode('utf-8')
         data_url = f"data:{mime_type};base64,{b64_data}"
         return data_url
     except Exception as e:
         st.error(f"Error converting {image_file.name} to base64: {str(e)}")
         return None
 def embed_images_as_base64(html_content, uploaded_images):
+    """Embed all images directly as base64 data URLs in the HTML"""
     if not uploaded_images:
         return html_content, {}
     image_data_urls = {}
     for img in uploaded_images:
         data_url = image_to_base64(img)
     if not image_data_urls:
         return html_content, {}
     replacements = {}
     for filename, data_url in image_data_urls.items():
         escaped_name = re.escape(filename)
+        # Pattern 1: img src attribute
         pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
         matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
         count1 = len(matches1)
             html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
             replacements[f"{filename} (img src)"] = count1
+        # Pattern 2: background-image
         pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
         count2 = len(matches2)
             html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
             replacements[f"{filename} (bg-image)"] = count2
+        # Pattern 3: CSS url()
         pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
         matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
         count3 = len(matches3)
             html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
             replacements[f"{filename} (url)"] = count3
     if replacements:
         st.success("✅ Image Replacements:")
         for key, count in replacements.items():
         st.warning("⚠️ No image references found in HTML matching uploaded files!")
         st.write("Uploaded files:", [img.name for img in uploaded_images])
         with st.expander("🔍 Debug: Show HTML image references"):
             img_lines = [line for line in html_content.split('\n')
                         if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
     return html_content, replacements
+def inject_page_breaks(html_content: str, aspect_ratio: str):
+    """Automatically inject page breaks and page sizing CSS.
+    Builds a ``<style id="auto-page-breaks">`` block and inserts it into
+    ``html_content``.
+    Args:
+        html_content: HTML source text to modify.
+        aspect_ratio: "16:9" selects an "A4 landscape" page, "1:1" selects a
+            "210mm 210mm" page, and any other value selects "A4 portrait".
+    Returns:
+        The HTML text with the style block inserted before ``</head>`` when
+        that tag is present, otherwise before the first ``<body``, otherwise
+        prepended to the document.
+    """
+    # Determine page orientation
+    # NOTE(review): `orientation` is assigned in every branch below but is
+    # never read inside this function; only `page_size` reaches the CSS.
+    if aspect_ratio == "16:9":
+        page_size = "A4 landscape"
+        orientation = "landscape"
+    elif aspect_ratio == "1:1":
+        page_size = "210mm 210mm"
+        orientation = "portrait"
+    else:  # 9:16
+        page_size = "A4 portrait"
+        orientation = "portrait"
+    # Comprehensive page break CSS
+    # The doubled braces in the template are f-string escapes that emit
+    # literal CSS braces; {page_size} is the only interpolated value.
+    page_css = f"""
+    <style id="auto-page-breaks">
+        /* Define page size */
+        @page {{
+            size: {page_size};
+            margin: 0;
+        }}
+        /* Reset body */
+        html, body {{
+            margin: 0 !important;
+            padding: 0 !important;
+            width: 100% !important;
+            height: 100% !important;
+        }}
+        /* Page containers - each should be one page */
+        .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
+            width: 100% !important;
+            min-height: 100vh !important;
+            height: 100vh !important;
+            page-break-after: always !important;
+            break-after: page !important;
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+            position: relative !important;
+            box-sizing: border-box !important;
+            overflow: hidden !important;
+        }}
+        /* Last page shouldn't force a break */
+        .page:last-child, .slide:last-child,
+        section.page:last-child, article.page:last-child {{
+            page-break-after: auto !important;
+            break-after: auto !important;
+        }}
+        /* If no explicit page class, treat direct body children as pages */
+        body > section:not(.no-page-break),
+        body > article:not(.no-page-break),
+        body > div:not(.no-page-break) {{
+            page-break-after: always !important;
+            break-after: page !important;
+            min-height: 100vh;
+        }}
+        body > section:last-child,
+        body > article:last-child,
+        body > div:last-child {{
+            page-break-after: auto !important;
+        }}
+        /* Utility classes for manual control */
+        .page-break, .page-break-after {{
+            page-break-after: always !important;
+            break-after: page !important;
+        }}
+        .page-break-before {{
+            page-break-before: always !important;
+            break-before: page !important;
+        }}
+        .no-page-break, .keep-together {{
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        /* Prevent awkward breaks in content */
+        h1, h2, h3, h4, h5, h6 {{
+            page-break-after: avoid !important;
+            break-after: avoid !important;
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        img, figure, table, pre, blockquote {{
+            page-break-inside: avoid !important;
+            break-inside: avoid !important;
+        }}
+        /* Preserve colors and backgrounds */
+        * {{
+            -webkit-print-color-adjust: exact !important;
+            print-color-adjust: exact !important;
+            color-adjust: exact !important;
+        }}
+    </style>
+    """
+    # Inject CSS into HTML
+    # The tag checks are case-sensitive substring matches, so an uppercase
+    # `</HEAD>` or `<BODY` falls through to the next branch.
+    # NOTE(review): the `</head>` replace has no count, so every occurrence
+    # receives the style block, whereas the `<body` branch passes count 1.
+    if '</head>' in html_content:
+        html_content = html_content.replace('</head>', page_css + '</head>')
+    elif '<body' in html_content:
+        html_content = html_content.replace('<body', page_css + '<body', 1)
+    else:
+        # No head or body tag found: prepend the style block to the document.
+        html_content = page_css + html_content
+    return html_content
 def render_html_preview(html_content):
     """Render HTML preview in an iframe"""
     b64 = base64.b64encode(html_content.encode()).decode()
     return pdf_viewer_html
 def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
+    """Convert HTML content to PDF using Puppeteer with proper page breaks"""
     try:
+        # Step 1: Inject page break CSS
+        st.write("🔧 Injecting page break CSS...")
+        html_content = inject_page_breaks(html_content, aspect_ratio)
         # Save HTML to temp file
         html_file = os.path.join(temp_dir, "input.html")
 # Main UI
 st.title("📄 HTML to PDF Converter")
 st.markdown("""
+Convert HTML to PDF with **proper page breaks** and **embedded base64 images**!
+✨ Each page in your HTML will be preserved as a separate PDF page.
 """)
 # Create tabs
                     if error:
                         st.error(f"❌ {error}")
                     else:
+                        st.success("✅ PDF generated with proper page breaks!")
                         output_name = uploaded_file.name.replace('.html', '.pdf').replace('.htm', '.pdf')
                         if not output_name.endswith('.pdf'):
     <style>
         body {
             font-family: Arial;
+            margin: 0;
+            padding: 0;
+        }
+        .page {
+            width: 100%;
+            height: 100vh;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            box-sizing: border-box;
+            padding: 40px;
+        }
+        .page:nth-child(1) {
             background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
             color: white;
         }
+        .page:nth-child(2) {
+            background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+            color: white;
         }
+        .page:nth-child(3) {
+            background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
+            color: white;
+        }
+        h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
     </style>
 </head>
 <body>
+    <div class="page">
+        <h1>Page 1: Hello PDF! 🌍</h1>
+    </div>
+    <div class="page">
+        <h1>Page 2: Separate Page! 📄</h1>
+    </div>
+    <div class="page">
+        <h1>Page 3: Final Page! ✨</h1>
     </div>
 </body>
 </html>""",
                     if error:
                         st.error(f"❌ {error}")
                     else:
+                        st.success("✅ PDF generated with proper page breaks!")
                         col_a, col_b = st.columns(2)
                         with col_a:
 # Footer
 st.markdown("---")
 st.markdown("""
+### 💡 How Page Breaks Work:
+**Automatic Page Detection:**
+- Elements with class `page`, `slide`, or `section.page` are treated as separate pages
+- Each page automatically gets `page-break-after: always` CSS
+- Last page won't have a trailing break
+**HTML Structure for Multiple Pages:**
+```html
+<div class="page">Page 1 content</div>
+<div class="page">Page 2 content</div>
+<div class="page">Page 3 content</div>
+```
+**Manual Page Breaks:**
+- Add class `page-break` to force a break after an element
+- Add class `page-break-before` to force a break before an element
+- Add class `no-page-break` to prevent breaks inside an element
+**Image Embedding:**
+- Images are converted to base64 and embedded directly in HTML
+- Ensures images always appear in the PDF
+- Filename in HTML must match uploaded file exactly
+### 📝 Example HTML:
 ```html
+<!DOCTYPE html>
+<html>
+<body>
+    <div class="page">
+        <h1>First Page</h1>
+        <img src="logo.png" alt="Logo">
+    </div>
+    <div class="page">
+        <h1>Second Page</h1>
+        <p>Content here...</p>
+    </div>
+</body>
+</html>
 ```
+Then upload a file named: `logo.png`
 """)