ABDALLALSWAITI committed on
Commit
8d2aae9
·
verified ·
1 Parent(s): 70df375

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +197 -71
src/streamlit_app.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Streamlit HTML to PDF Converter with Image Support - REVISED
3
  Save this file as: src/streamlit_app.py
4
  """
5
  import streamlit as st
@@ -48,13 +48,9 @@ def detect_aspect_ratio(html_content):
48
  def image_to_base64(image_file):
49
  """Convert uploaded image to base64 data URL"""
50
  try:
51
- # Read image bytes
52
  image_bytes = image_file.getvalue()
53
-
54
- # Get MIME type
55
  mime_type, _ = mimetypes.guess_type(image_file.name)
56
  if not mime_type:
57
- # Fallback based on extension
58
  ext = os.path.splitext(image_file.name)[1].lower()
59
  mime_map = {
60
  '.jpg': 'image/jpeg',
@@ -67,24 +63,18 @@ def image_to_base64(image_file):
67
  }
68
  mime_type = mime_map.get(ext, 'image/png')
69
 
70
- # Convert to base64
71
  b64_data = base64.b64encode(image_bytes).decode('utf-8')
72
  data_url = f"data:{mime_type};base64,{b64_data}"
73
-
74
  return data_url
75
  except Exception as e:
76
  st.error(f"Error converting {image_file.name} to base64: {str(e)}")
77
  return None
78
 
79
  def embed_images_as_base64(html_content, uploaded_images):
80
- """
81
- Embed all images directly as base64 data URLs in the HTML
82
- This ensures images are always included in the PDF
83
- """
84
  if not uploaded_images:
85
  return html_content, {}
86
 
87
- # Create mapping of filename to base64 data URL
88
  image_data_urls = {}
89
  for img in uploaded_images:
90
  data_url = image_to_base64(img)
@@ -95,16 +85,12 @@ def embed_images_as_base64(html_content, uploaded_images):
95
  if not image_data_urls:
96
  return html_content, {}
97
 
98
- # Track replacements
99
  replacements = {}
100
- original_html = html_content
101
 
102
  for filename, data_url in image_data_urls.items():
103
- # Escape filename for regex
104
  escaped_name = re.escape(filename)
105
 
106
- # Pattern 1: src attribute - match any path variation
107
- # Examples: src="image.jpg", src="./image.jpg", src="images/image.jpg"
108
  pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
109
  matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
110
  count1 = len(matches1)
@@ -112,7 +98,7 @@ def embed_images_as_base64(html_content, uploaded_images):
112
  html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
113
  replacements[f"{filename} (img src)"] = count1
114
 
115
- # Pattern 2: background-image in style attributes
116
  pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
117
  matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
118
  count2 = len(matches2)
@@ -120,7 +106,7 @@ def embed_images_as_base64(html_content, uploaded_images):
120
  html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
121
  replacements[f"{filename} (bg-image)"] = count2
122
 
123
- # Pattern 3: CSS url() without background-image
124
  pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
125
  matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
126
  count3 = len(matches3)
@@ -128,7 +114,6 @@ def embed_images_as_base64(html_content, uploaded_images):
128
  html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
129
  replacements[f"{filename} (url)"] = count3
130
 
131
- # Show replacement summary
132
  if replacements:
133
  st.success("βœ… Image Replacements:")
134
  for key, count in replacements.items():
@@ -137,7 +122,6 @@ def embed_images_as_base64(html_content, uploaded_images):
137
  st.warning("⚠️ No image references found in HTML matching uploaded files!")
138
  st.write("Uploaded files:", [img.name for img in uploaded_images])
139
 
140
- # Show sample HTML for debugging
141
  with st.expander("πŸ” Debug: Show HTML image references"):
142
  img_lines = [line for line in html_content.split('\n')
143
  if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
@@ -149,6 +133,121 @@ def embed_images_as_base64(html_content, uploaded_images):
149
 
150
  return html_content, replacements
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def render_html_preview(html_content):
153
  """Render HTML preview in an iframe"""
154
  b64 = base64.b64encode(html_content.encode()).decode()
@@ -239,30 +338,11 @@ def render_pdf_preview(pdf_bytes):
239
  return pdf_viewer_html
240
 
241
  def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
242
- """Convert HTML content to PDF using Puppeteer"""
243
  try:
244
- # Inject CSS to preserve styles
245
- style_injection = """
246
- <style>
247
- @page { margin: 0; }
248
- * {
249
- -webkit-print-color-adjust: exact !important;
250
- print-color-adjust: exact !important;
251
- color-adjust: exact !important;
252
- }
253
- body {
254
- -webkit-print-color-adjust: exact !important;
255
- print-color-adjust: exact !important;
256
- }
257
- </style>
258
- """
259
-
260
- if '</head>' in html_content:
261
- html_content = html_content.replace('</head>', style_injection + '</head>')
262
- elif '<body' in html_content:
263
- html_content = html_content.replace('<body', style_injection + '<body', 1)
264
- else:
265
- html_content = style_injection + html_content
266
 
267
  # Save HTML to temp file
268
  html_file = os.path.join(temp_dir, "input.html")
@@ -322,8 +402,8 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
322
  # Main UI
323
  st.title("πŸ“„ HTML to PDF Converter")
324
  st.markdown("""
325
- Convert HTML to PDF with **embedded base64 images** for guaranteed display!
326
- ✨ Images are converted to base64 and embedded directly in the HTML.
327
  """)
328
 
329
  # Create tabs
@@ -412,7 +492,7 @@ with tab1:
412
  if error:
413
  st.error(f"❌ {error}")
414
  else:
415
- st.success("βœ… PDF generated!")
416
 
417
  output_name = uploaded_file.name.replace('.html', '.pdf').replace('.htm', '.pdf')
418
  if not output_name.endswith('.pdf'):
@@ -448,23 +528,44 @@ with tab2:
448
  <style>
449
  body {
450
  font-family: Arial;
451
- margin: 40px;
 
 
 
 
 
 
 
 
 
 
 
 
452
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
453
  color: white;
454
  }
455
- h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
456
- .box {
457
- background: rgba(255,255,255,0.1);
458
- padding: 20px;
459
- border-radius: 10px;
460
- margin: 20px 0;
461
  }
 
 
 
 
 
462
  </style>
463
  </head>
464
  <body>
465
- <h1>Hello PDF! 🌍</h1>
466
- <div class="box">
467
- <p>Styles and gradients preserved!</p>
 
 
 
 
 
 
 
468
  </div>
469
  </body>
470
  </html>""",
@@ -524,7 +625,7 @@ with tab2:
524
  if error:
525
  st.error(f"❌ {error}")
526
  else:
527
- st.success("βœ… PDF generated!")
528
 
529
  col_a, col_b = st.columns(2)
530
  with col_a:
@@ -549,21 +650,46 @@ with tab2:
549
  # Footer
550
  st.markdown("---")
551
  st.markdown("""
552
- ### πŸ’‘ How It Works:
553
- - **Base64 Embedding**: Images are converted to base64 data URLs and embedded directly in HTML
554
- - **No File Paths**: No need for file:// URLs or temp directories
555
- - **Guaranteed Display**: Images are part of the HTML, so they always appear in the PDF
556
- - **Filename Matching**: Your HTML must reference images by exact filename (e.g., `<img src="photo.jpg">`)
 
 
 
 
 
 
 
 
 
 
 
 
 
557
 
558
- ### βœ… Supported:
559
- - `<img src="photo.jpg">`
560
- - `<img src="./images/logo.png">`
561
- - `background-image: url('banner.jpg')`
562
- - `style="background: url(bg.png)"`
563
 
564
- ### πŸ“ Example:
565
  ```html
566
- <img src="logo.png" alt="Logo">
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  ```
568
- Then upload a file named exactly: `logo.png`
569
  """)
 
1
  """
2
+ Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
3
  Save this file as: src/streamlit_app.py
4
  """
5
  import streamlit as st
 
48
  def image_to_base64(image_file):
49
  """Convert uploaded image to base64 data URL"""
50
  try:
 
51
  image_bytes = image_file.getvalue()
 
 
52
  mime_type, _ = mimetypes.guess_type(image_file.name)
53
  if not mime_type:
 
54
  ext = os.path.splitext(image_file.name)[1].lower()
55
  mime_map = {
56
  '.jpg': 'image/jpeg',
 
63
  }
64
  mime_type = mime_map.get(ext, 'image/png')
65
 
 
66
  b64_data = base64.b64encode(image_bytes).decode('utf-8')
67
  data_url = f"data:{mime_type};base64,{b64_data}"
 
68
  return data_url
69
  except Exception as e:
70
  st.error(f"Error converting {image_file.name} to base64: {str(e)}")
71
  return None
72
 
73
  def embed_images_as_base64(html_content, uploaded_images):
74
+ """Embed all images directly as base64 data URLs in the HTML"""
 
 
 
75
  if not uploaded_images:
76
  return html_content, {}
77
 
 
78
  image_data_urls = {}
79
  for img in uploaded_images:
80
  data_url = image_to_base64(img)
 
85
  if not image_data_urls:
86
  return html_content, {}
87
 
 
88
  replacements = {}
 
89
 
90
  for filename, data_url in image_data_urls.items():
 
91
  escaped_name = re.escape(filename)
92
 
93
+ # Pattern 1: img src attribute
 
94
  pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
95
  matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
96
  count1 = len(matches1)
 
98
  html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
99
  replacements[f"{filename} (img src)"] = count1
100
 
101
+ # Pattern 2: background-image
102
  pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
103
  matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
104
  count2 = len(matches2)
 
106
  html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
107
  replacements[f"{filename} (bg-image)"] = count2
108
 
109
+ # Pattern 3: CSS url()
110
  pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
111
  matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
112
  count3 = len(matches3)
 
114
  html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
115
  replacements[f"{filename} (url)"] = count3
116
 
 
117
  if replacements:
118
  st.success("βœ… Image Replacements:")
119
  for key, count in replacements.items():
 
122
  st.warning("⚠️ No image references found in HTML matching uploaded files!")
123
  st.write("Uploaded files:", [img.name for img in uploaded_images])
124
 
 
125
  with st.expander("πŸ” Debug: Show HTML image references"):
126
  img_lines = [line for line in html_content.split('\n')
127
  if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
 
133
 
134
  return html_content, replacements
135
 
136
def inject_page_breaks(html_content: str, aspect_ratio: str) -> str:
    """Inject page-size and page-break CSS into an HTML document.

    A single ``<style id="auto-page-breaks">`` element is inserted exactly
    once, at the first of these locations that exists (matched
    case-insensitively):

    1. immediately before the first ``</head>``,
    2. immediately before the first ``<body``,
    3. at the very start of the document.

    Args:
        html_content: The HTML markup to augment.
        aspect_ratio: ``"16:9"`` selects landscape A4, ``"1:1"`` selects a
            210mm square page, and any other value (e.g. ``"9:16"``) selects
            portrait A4.

    Returns:
        The HTML with the stylesheet inserted; the rest of the markup is
        left untouched.
    """
    # Map the aspect ratio onto a CSS ``@page`` size; unknown values fall
    # back to portrait A4, same as the "9:16" case.
    if aspect_ratio == "16:9":
        page_size = "A4 landscape"
    elif aspect_ratio == "1:1":
        page_size = "210mm 210mm"
    else:  # 9:16
        page_size = "A4 portrait"

    # NOTE(review): ``div[class*="page"]`` is a substring match, so it also
    # selects divs using the utility classes below (``page-break``,
    # ``page-break-before``, ``no-page-break``) and applies the full-page
    # sizing/break rules to them - confirm this is intended.
    page_css = f"""
    <style id="auto-page-breaks">
        /* Define page size */
        @page {{
            size: {page_size};
            margin: 0;
        }}

        /* Reset body */
        html, body {{
            margin: 0 !important;
            padding: 0 !important;
            width: 100% !important;
            height: 100% !important;
        }}

        /* Page containers - each should be one page */
        .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
            width: 100% !important;
            min-height: 100vh !important;
            height: 100vh !important;
            page-break-after: always !important;
            break-after: page !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
            position: relative !important;
            box-sizing: border-box !important;
            overflow: hidden !important;
        }}

        /* Last page shouldn't force a break */
        .page:last-child, .slide:last-child,
        section.page:last-child, article.page:last-child {{
            page-break-after: auto !important;
            break-after: auto !important;
        }}

        /* If no explicit page class, treat direct body children as pages */
        body > section:not(.no-page-break),
        body > article:not(.no-page-break),
        body > div:not(.no-page-break) {{
            page-break-after: always !important;
            break-after: page !important;
            min-height: 100vh;
        }}

        body > section:last-child,
        body > article:last-child,
        body > div:last-child {{
            page-break-after: auto !important;
        }}

        /* Utility classes for manual control */
        .page-break, .page-break-after {{
            page-break-after: always !important;
            break-after: page !important;
        }}

        .page-break-before {{
            page-break-before: always !important;
            break-before: page !important;
        }}

        .no-page-break, .keep-together {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}

        /* Prevent awkward breaks in content */
        h1, h2, h3, h4, h5, h6 {{
            page-break-after: avoid !important;
            break-after: avoid !important;
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}

        img, figure, table, pre, blockquote {{
            page-break-inside: avoid !important;
            break-inside: avoid !important;
        }}

        /* Preserve colors and backgrounds */
        * {{
            -webkit-print-color-adjust: exact !important;
            print-color-adjust: exact !important;
            color-adjust: exact !important;
        }}
    </style>
    """

    # Prefer the end of <head>, then the start of <body>. Matching is
    # case-insensitive so upper-case markup is handled, and only the first
    # hit is used so a stray "</head>" inside a script string or comment
    # cannot cause the stylesheet to be injected more than once.
    anchor = re.search(r"</head\s*>", html_content, flags=re.IGNORECASE)
    if anchor is None:
        anchor = re.search(r"<body", html_content, flags=re.IGNORECASE)
    if anchor is None:
        # No recognisable anchor: put the stylesheet in front of everything.
        return page_css + html_content

    # Splice by slicing rather than re.sub so the markup is never
    # interpreted as a replacement template.
    cut = anchor.start()
    return html_content[:cut] + page_css + html_content[cut:]
250
+
251
  def render_html_preview(html_content):
252
  """Render HTML preview in an iframe"""
253
  b64 = base64.b64encode(html_content.encode()).decode()
 
338
  return pdf_viewer_html
339
 
340
  def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
341
+ """Convert HTML content to PDF using Puppeteer with proper page breaks"""
342
  try:
343
+ # Step 1: Inject page break CSS
344
+ st.write("🔧 Injecting page break CSS...")
345
+ html_content = inject_page_breaks(html_content, aspect_ratio)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
 
347
  # Save HTML to temp file
348
  html_file = os.path.join(temp_dir, "input.html")
 
402
  # Main UI
403
  st.title("📄 HTML to PDF Converter")
404
  st.markdown("""
405
+ Convert HTML to PDF with **proper page breaks** and **embedded base64 images**!
406
+ ✨ Each page in your HTML will be preserved as a separate PDF page.
407
  """)
408
 
409
  # Create tabs
 
492
  if error:
493
  st.error(f"❌ {error}")
494
  else:
495
+ st.success("✅ PDF generated with proper page breaks!")
496
 
497
  output_name = uploaded_file.name.replace('.html', '.pdf').replace('.htm', '.pdf')
498
  if not output_name.endswith('.pdf'):
 
528
  <style>
529
  body {
530
  font-family: Arial;
531
+ margin: 0;
532
+ padding: 0;
533
+ }
534
+ .page {
535
+ width: 100%;
536
+ height: 100vh;
537
+ display: flex;
538
+ align-items: center;
539
+ justify-content: center;
540
+ box-sizing: border-box;
541
+ padding: 40px;
542
+ }
543
+ .page:nth-child(1) {
544
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
545
  color: white;
546
  }
547
+ .page:nth-child(2) {
548
+ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
549
+ color: white;
 
 
 
550
  }
551
+ .page:nth-child(3) {
552
+ background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
553
+ color: white;
554
+ }
555
+ h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
556
  </style>
557
  </head>
558
  <body>
559
+ <div class="page">
560
+ <h1>Page 1: Hello PDF! 🌍</h1>
561
+ </div>
562
+
563
+ <div class="page">
564
+ <h1>Page 2: Separate Page! πŸ“„</h1>
565
+ </div>
566
+
567
+ <div class="page">
568
+ <h1>Page 3: Final Page! ✨</h1>
569
  </div>
570
  </body>
571
  </html>""",
 
625
  if error:
626
  st.error(f"❌ {error}")
627
  else:
628
+ st.success("✅ PDF generated with proper page breaks!")
629
 
630
  col_a, col_b = st.columns(2)
631
  with col_a:
 
650
  # Footer
651
  st.markdown("---")
652
  st.markdown("""
653
+ ### 💡 How Page Breaks Work:
654
+
655
+ **Automatic Page Detection:**
656
+ - Elements with class `page`, `slide`, or `section.page` are treated as separate pages
657
+ - Each page automatically gets `page-break-after: always` CSS
658
+ - Last page won't have a trailing break
659
+
660
+ **HTML Structure for Multiple Pages:**
661
+ ```html
662
+ <div class="page">Page 1 content</div>
663
+ <div class="page">Page 2 content</div>
664
+ <div class="page">Page 3 content</div>
665
+ ```
666
+
667
+ **Manual Page Breaks:**
668
+ - Add class `page-break` to force a break after an element
669
+ - Add class `page-break-before` to force a break before an element
670
+ - Add class `no-page-break` to prevent breaks inside an element
671
 
672
+ **Image Embedding:**
673
+ - Images are converted to base64 and embedded directly in HTML
674
+ - Ensures images always appear in the PDF
675
+ - Filename in HTML must match uploaded file exactly
 
676
 
677
+ ### πŸ“ Example HTML:
678
  ```html
679
+ <!DOCTYPE html>
680
+ <html>
681
+ <body>
682
+ <div class="page">
683
+ <h1>First Page</h1>
684
+ <img src="logo.png" alt="Logo">
685
+ </div>
686
+
687
+ <div class="page">
688
+ <h1>Second Page</h1>
689
+ <p>Content here...</p>
690
+ </div>
691
+ </body>
692
+ </html>
693
  ```
694
+ Then upload a file named: `logo.png`
695
  """)