ABDALLALSWAITI committed on
Commit
e126d9e
·
verified ·
1 Parent(s): 5e552f0

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +226 -515
src/streamlit_app.py CHANGED
@@ -1,22 +1,28 @@
1
  """
2
- Streamlit HTML to PDF Converter with Image Support and Proper Page Breaks
3
- Save this file as: src/streamlit_app.py
 
4
  """
5
  import streamlit as st
 
 
 
6
  import subprocess
7
  import os
8
  import tempfile
9
  import shutil
10
- from pathlib import Path
11
  import base64
12
  import re
13
  import mimetypes
 
 
 
14
 
15
- st.set_page_config(
16
- page_title="HTML to PDF Converter",
17
- page_icon="πŸ“„",
18
- layout="wide"
19
- )
20
 
21
  def detect_aspect_ratio(html_content):
22
  """Detect aspect ratio from HTML content"""
@@ -48,10 +54,17 @@ def detect_aspect_ratio(html_content):
48
  def image_to_base64(image_file):
49
  """Convert uploaded image to base64 data URL"""
50
  try:
51
- image_bytes = image_file.getvalue()
52
- mime_type, _ = mimetypes.guess_type(image_file.name)
 
 
 
 
 
 
 
53
  if not mime_type:
54
- ext = os.path.splitext(image_file.name)[1].lower()
55
  mime_map = {
56
  '.jpg': 'image/jpeg',
57
  '.jpeg': 'image/jpeg',
@@ -67,7 +80,7 @@ def image_to_base64(image_file):
67
  data_url = f"data:{mime_type};base64,{b64_data}"
68
  return data_url
69
  except Exception as e:
70
- st.error(f"Error converting {image_file.name} to base64: {str(e)}")
71
  return None
72
 
73
  def embed_images_as_base64(html_content, uploaded_images):
@@ -79,8 +92,8 @@ def embed_images_as_base64(html_content, uploaded_images):
79
  for img in uploaded_images:
80
  data_url = image_to_base64(img)
81
  if data_url:
82
- image_data_urls[img.name] = data_url
83
- st.write(f"βœ“ Converted {img.name} to base64 ({len(data_url)} chars)")
84
 
85
  if not image_data_urls:
86
  return html_content, {}
@@ -90,155 +103,42 @@ def embed_images_as_base64(html_content, uploaded_images):
90
  for filename, data_url in image_data_urls.items():
91
  escaped_name = re.escape(filename)
92
 
93
- # Pattern 1: img src attribute
94
  pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
95
  matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
96
- count1 = len(matches1)
97
  if matches1:
98
  html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
99
- replacements[f"{filename} (img src)"] = count1
100
 
101
- # Pattern 2: background-image
102
  pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
103
  matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
104
- count2 = len(matches2)
105
  if matches2:
106
  html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
107
- replacements[f"{filename} (bg-image)"] = count2
108
 
109
- # Pattern 3: CSS url()
110
  pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
111
  matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
112
- count3 = len(matches3)
113
  if matches3:
114
  html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
115
- replacements[f"{filename} (url)"] = count3
116
-
117
- if replacements:
118
- st.success("βœ… Image Replacements:")
119
- for key, count in replacements.items():
120
- st.write(f" β€’ {key}: {count} replacement(s)")
121
- else:
122
- st.warning("⚠️ No image references found in HTML matching uploaded files!")
123
- st.write("Uploaded files:", [img.name for img in uploaded_images])
124
-
125
- with st.expander("πŸ” Debug: Show HTML image references"):
126
- img_lines = [line for line in html_content.split('\n')
127
- if any(k in line.lower() for k in ['<img', 'src=', 'url(', 'background'])]
128
- if img_lines:
129
- for line in img_lines[:10]:
130
- st.code(line.strip(), language='html')
131
- else:
132
- st.write("No image-related lines found in HTML")
133
 
134
  return html_content, replacements
135
 
136
  def inject_page_breaks(html_content: str, aspect_ratio: str):
137
- """Automatically inject page breaks and page sizing CSS"""
 
138
 
139
- # Determine page orientation
140
- if aspect_ratio == "16:9":
141
- page_size = "A4 landscape"
142
- orientation = "landscape"
143
- elif aspect_ratio == "1:1":
144
- page_size = "210mm 210mm"
145
- orientation = "portrait"
146
- else: # 9:16
147
- page_size = "A4 portrait"
148
- orientation = "portrait"
149
-
150
- # Comprehensive page break CSS
151
  page_css = f"""
152
  <style id="auto-page-breaks">
153
- /* Define page size */
154
- @page {{
155
- size: {page_size};
156
- margin: 0;
157
- }}
158
-
159
- /* Reset body */
160
- html, body {{
161
- margin: 0 !important;
162
- padding: 0 !important;
163
- width: 100% !important;
164
- height: 100% !important;
165
- }}
166
-
167
- /* Page containers - each should be one page */
168
- .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
169
- width: 100% !important;
170
- min-height: 100vh !important;
171
- height: 100vh !important;
172
- page-break-after: always !important;
173
- break-after: page !important;
174
- page-break-inside: avoid !important;
175
- break-inside: avoid !important;
176
- position: relative !important;
177
- box-sizing: border-box !important;
178
- overflow: hidden !important;
179
- }}
180
-
181
- /* Last page shouldn't force a break */
182
- .page:last-child, .slide:last-child,
183
- section.page:last-child, article.page:last-child {{
184
- page-break-after: auto !important;
185
- break-after: auto !important;
186
- }}
187
-
188
- /* If no explicit page class, treat direct body children as pages */
189
- body > section:not(.no-page-break),
190
- body > article:not(.no-page-break),
191
- body > div:not(.no-page-break) {{
192
- page-break-after: always !important;
193
- break-after: page !important;
194
- min-height: 100vh;
195
- }}
196
-
197
- body > section:last-child,
198
- body > article:last-child,
199
- body > div:last-child {{
200
- page-break-after: auto !important;
201
- }}
202
-
203
- /* Utility classes for manual control */
204
- .page-break, .page-break-after {{
205
- page-break-after: always !important;
206
- break-after: page !important;
207
- }}
208
-
209
- .page-break-before {{
210
- page-break-before: always !important;
211
- break-before: page !important;
212
- }}
213
-
214
- .no-page-break, .keep-together {{
215
- page-break-inside: avoid !important;
216
- break-inside: avoid !important;
217
- }}
218
-
219
- /* Prevent awkward breaks in content */
220
- h1, h2, h3, h4, h5, h6 {{
221
- page-break-after: avoid !important;
222
- break-after: avoid !important;
223
- page-break-inside: avoid !important;
224
- break-inside: avoid !important;
225
- }}
226
-
227
- img, figure, table, pre, blockquote {{
228
- page-break-inside: avoid !important;
229
- break-inside: avoid !important;
230
- }}
231
-
232
- /* Preserve colors and backgrounds */
233
- * {{
234
- -webkit-print-color-adjust: exact !important;
235
- print-color-adjust: exact !important;
236
- color-adjust: exact !important;
237
- }}
238
  </style>
239
  """
240
 
241
- # Inject CSS into HTML
242
  if '</head>' in html_content:
243
  html_content = html_content.replace('</head>', page_css + '</head>')
244
  elif '<body' in html_content:
@@ -248,142 +148,32 @@ def inject_page_breaks(html_content: str, aspect_ratio: str):
248
 
249
  return html_content
250
 
251
- def render_html_preview(html_content):
252
- """Render HTML preview in an iframe"""
253
- b64 = base64.b64encode(html_content.encode()).decode()
254
- iframe_html = f'<iframe src="data:text/html;base64,{b64}" width="100%" height="600" style="border: 2px solid #ddd; border-radius: 5px;"></iframe>'
255
- return iframe_html
256
-
257
- def render_pdf_preview(pdf_bytes):
258
- """Render PDF preview using embedded PDF.js"""
259
- b64 = base64.b64encode(pdf_bytes).decode()
260
-
261
- pdf_viewer_html = f'''
262
- <!DOCTYPE html>
263
- <html>
264
- <head>
265
- <style>
266
- body {{
267
- margin: 0;
268
- padding: 0;
269
- overflow: hidden;
270
- background: #525659;
271
- }}
272
- #pdf-container {{
273
- width: 100%;
274
- height: 100vh;
275
- overflow: auto;
276
- display: flex;
277
- flex-direction: column;
278
- align-items: center;
279
- padding: 20px;
280
- box-sizing: border-box;
281
- }}
282
- canvas {{
283
- box-shadow: 0 2px 8px rgba(0,0,0,0.3);
284
- margin-bottom: 10px;
285
- background: white;
286
- }}
287
- #loading {{
288
- color: white;
289
- font-family: Arial, sans-serif;
290
- font-size: 18px;
291
- padding: 20px;
292
- }}
293
- </style>
294
- </head>
295
- <body>
296
- <div id="pdf-container">
297
- <div id="loading">Loading PDF...</div>
298
- </div>
299
- <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.min.js"></script>
300
- <script>
301
- pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.11.174/pdf.worker.min.js';
302
- const pdfData = atob('{b64}');
303
- const pdfContainer = document.getElementById('pdf-container');
304
- const loading = document.getElementById('loading');
305
- const uint8Array = new Uint8Array(pdfData.length);
306
- for (let i = 0; i < pdfData.length; i++) {{
307
- uint8Array[i] = pdfData.charCodeAt(i);
308
- }}
309
- pdfjsLib.getDocument({{data: uint8Array}}).promise.then(function(pdf) {{
310
- loading.style.display = 'none';
311
- const numPages = pdf.numPages;
312
- const promises = [];
313
- for (let pageNum = 1; pageNum <= numPages; pageNum++) {{
314
- promises.push(
315
- pdf.getPage(pageNum).then(function(page) {{
316
- const scale = 1.5;
317
- const viewport = page.getViewport({{scale: scale}});
318
- const canvas = document.createElement('canvas');
319
- const context = canvas.getContext('2d');
320
- canvas.height = viewport.height;
321
- canvas.width = viewport.width;
322
- pdfContainer.appendChild(canvas);
323
- return page.render({{
324
- canvasContext: context,
325
- viewport: viewport
326
- }}).promise;
327
- }})
328
- );
329
- }}
330
- return Promise.all(promises);
331
- }}).catch(function(error) {{
332
- loading.innerHTML = '<div style="color:#ff6b6b;">Error: ' + error.message + '</div>';
333
- }});
334
- </script>
335
- </body>
336
- </html>
337
- '''
338
- return pdf_viewer_html
339
-
340
  def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
341
- """Convert HTML content to PDF using Puppeteer with proper page breaks"""
342
  try:
343
- # Step 1: Inject page break CSS
344
- st.write("πŸ”§ Injecting page break CSS...")
345
  html_content = inject_page_breaks(html_content, aspect_ratio)
346
 
347
- # Save HTML to temp file
348
  html_file = os.path.join(temp_dir, "input.html")
349
  with open(html_file, 'w', encoding='utf-8') as f:
350
  f.write(html_content)
351
 
352
- st.write(f"πŸ“ Saved HTML: {os.path.getsize(html_file):,} bytes")
353
-
354
- # Find puppeteer script
355
  script_dir = os.path.dirname(os.path.abspath(__file__))
356
- possible_paths = [
357
- os.path.join(os.path.dirname(script_dir), 'puppeteer_pdf.js'),
358
- os.path.join(script_dir, 'puppeteer_pdf.js'),
359
- os.path.join(script_dir, '..', 'puppeteer_pdf.js'),
360
- 'puppeteer_pdf.js'
361
- ]
362
 
363
- puppeteer_script = None
364
- for path in possible_paths:
365
- if os.path.exists(path):
366
- puppeteer_script = path
367
- break
368
 
369
- if not puppeteer_script:
370
- return None, "Error: puppeteer_pdf.js not found"
371
-
372
- st.write(f"πŸ”§ Using Puppeteer: {puppeteer_script}")
373
-
374
- # Run conversion
375
  result = subprocess.run(
376
  ['node', puppeteer_script, html_file, aspect_ratio],
377
  capture_output=True,
378
  text=True,
379
  timeout=60,
380
- cwd=os.path.dirname(os.path.abspath(puppeteer_script))
381
  )
382
 
383
  if result.returncode != 0:
384
  return None, f"PDF conversion failed: {result.stderr}"
385
 
386
- # Read PDF
387
  pdf_file = html_file.replace('.html', '.pdf')
388
  if not os.path.exists(pdf_file):
389
  return None, "PDF file was not generated"
@@ -391,305 +181,226 @@ def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
391
  with open(pdf_file, 'rb') as f:
392
  pdf_bytes = f.read()
393
 
394
- st.write(f"βœ… PDF generated: {len(pdf_bytes):,} bytes")
395
  return pdf_bytes, None
396
 
397
  except subprocess.TimeoutExpired:
398
- return None, "Error: PDF conversion timed out (60 seconds)"
399
  except Exception as e:
400
  return None, f"Error: {str(e)}"
401
 
402
- # Main UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  st.title("πŸ“„ HTML to PDF Converter")
404
- st.markdown("""
405
- Convert HTML to PDF with **proper page breaks** and **embedded base64 images**!
406
- ✨ Each page in your HTML will be preserved as a separate PDF page.
 
 
 
 
407
  """)
408
 
409
- # Create tabs
410
- tab1, tab2 = st.tabs(["πŸ“€ Upload HTML File", "πŸ“ Paste HTML Code"])
411
 
412
- # Tab 1: Upload HTML File
413
  with tab1:
414
- uploaded_file = st.file_uploader(
415
- "Choose an HTML file",
416
- type=['html', 'htm'],
417
- key="file_uploader",
418
- help="Upload an HTML file"
419
- )
420
-
421
- uploaded_images = st.file_uploader(
422
- "πŸ“· Upload Images",
423
- type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
424
- key="image_uploader",
425
- help="Upload images - they will be embedded as base64 in the HTML",
426
- accept_multiple_files=True
427
- )
428
-
429
- if uploaded_images:
430
- st.success(f"βœ… {len(uploaded_images)} image(s) uploaded")
431
- with st.expander("View uploaded images"):
432
- cols = st.columns(min(len(uploaded_images), 4))
433
- for idx, img in enumerate(uploaded_images):
434
- with cols[idx % 4]:
435
- st.image(img, caption=img.name, use_container_width=True)
436
 
437
  if uploaded_file:
438
- st.success(f"βœ… File: {uploaded_file.name}")
439
-
440
- uploaded_file.seek(0)
441
- try:
442
- html_content = uploaded_file.getvalue().decode('utf-8')
443
- except UnicodeDecodeError:
444
- uploaded_file.seek(0)
445
- html_content = uploaded_file.getvalue().decode('latin-1')
446
-
447
  detected_ratio = detect_aspect_ratio(html_content)
448
 
449
- col1, col2 = st.columns([1, 1])
450
-
451
- with col1:
452
- st.subheader("βš™οΈ Settings")
453
- auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_file")
454
-
455
- if auto_detect:
456
- aspect_ratio = detected_ratio
457
- st.info(f"πŸ” Detected: **{detected_ratio}**")
458
- else:
459
- aspect_ratio = st.radio(
460
- "Aspect Ratio",
461
- options=["16:9", "1:1", "9:16"],
462
- index=["16:9", "1:1", "9:16"].index(detected_ratio),
463
- key="aspect_file"
464
- )
465
-
466
- convert_btn = st.button("πŸ”„ Convert to PDF", key="conv_file", type="primary", use_container_width=True)
467
-
468
- with col2:
469
- st.subheader("πŸ‘οΈ Preview")
470
- with st.expander("Show HTML"):
471
- st.components.v1.html(render_html_preview(html_content), height=400, scrolling=True)
472
 
473
- if convert_btn:
474
- temp_dir = None
475
  try:
476
  with st.spinner("Converting..."):
477
- temp_dir = tempfile.mkdtemp()
478
-
479
- # Embed images as base64
480
  processed_html = html_content
481
  if uploaded_images:
482
- with st.expander("πŸ–ΌοΈ Image Processing", expanded=True):
483
- processed_html, replacements = embed_images_as_base64(html_content, uploaded_images)
484
-
485
- if not replacements:
486
- st.warning("⚠️ Images uploaded but no matches found in HTML!")
487
- st.write("**Tip:** Make sure image filenames in HTML match uploaded files exactly")
488
 
489
- # Convert to PDF
490
  pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir)
491
 
492
  if error:
493
  st.error(f"❌ {error}")
494
  else:
495
- st.success("βœ… PDF generated with proper page breaks!")
496
-
497
- output_name = uploaded_file.name.replace('.html', '.pdf').replace('.htm', '.pdf')
498
- if not output_name.endswith('.pdf'):
499
- output_name += '.pdf'
500
-
501
- col_a, col_b = st.columns(2)
502
- with col_a:
503
- st.download_button(
504
- "⬇️ Download PDF",
505
- data=pdf_bytes,
506
- file_name=output_name,
507
- mime="application/pdf",
508
- use_container_width=True
509
- )
510
- with col_b:
511
- st.info(f"Size: {len(pdf_bytes):,} bytes")
512
-
513
- st.subheader("πŸ“„ PDF Preview")
514
- st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
515
- except Exception as e:
516
- st.error(f"❌ Error: {str(e)}")
517
  finally:
518
- if temp_dir and os.path.exists(temp_dir):
519
  shutil.rmtree(temp_dir, ignore_errors=True)
520
 
521
- # Tab 2: Paste HTML
522
  with tab2:
523
- html_code = st.text_area(
524
- "HTML Content",
525
- value="""<!DOCTYPE html>
526
  <html>
527
- <head>
528
- <style>
529
- body {
530
- font-family: Arial;
531
- margin: 0;
532
- padding: 0;
533
- }
534
- .page {
535
- width: 100%;
536
- height: 100vh;
537
- display: flex;
538
- align-items: center;
539
- justify-content: center;
540
- box-sizing: border-box;
541
- padding: 40px;
542
- }
543
- .page:nth-child(1) {
544
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
545
- color: white;
546
- }
547
- .page:nth-child(2) {
548
- background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
549
- color: white;
550
- }
551
- .page:nth-child(3) {
552
- background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
553
- color: white;
554
- }
555
- h1 { font-size: 48px; text-shadow: 2px 2px 4px rgba(0,0,0,0.3); }
556
- </style>
557
- </head>
558
  <body>
559
- <div class="page">
560
- <h1>Page 1: Hello PDF! 🌍</h1>
561
- </div>
562
-
563
- <div class="page">
564
- <h1>Page 2: Separate Page! πŸ“„</h1>
565
  </div>
566
-
567
- <div class="page">
568
- <h1>Page 3: Final Page! ✨</h1>
569
  </div>
570
  </body>
571
- </html>""",
572
- height=400,
573
- key="html_code"
574
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
 
576
- uploaded_images_text = st.file_uploader(
577
- "πŸ“· Upload Images",
578
- type=['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp', 'bmp'],
579
- key="image_text",
580
- help="Upload images to embed in your HTML",
581
- accept_multiple_files=True
582
  )
583
 
584
- if uploaded_images_text:
585
- st.success(f"βœ… {len(uploaded_images_text)} image(s) uploaded")
586
- with st.expander("View images"):
587
- cols = st.columns(min(len(uploaded_images_text), 4))
588
- for idx, img in enumerate(uploaded_images_text):
589
- with cols[idx % 4]:
590
- st.image(img, caption=img.name, use_container_width=True)
591
 
592
- if html_code.strip():
593
- detected_ratio_text = detect_aspect_ratio(html_code)
594
- auto_detect_text = st.checkbox("Auto-detect aspect ratio", value=True, key="auto_text")
595
-
596
- if auto_detect_text:
597
- aspect_ratio_text = detected_ratio_text
598
- st.info(f"πŸ” Detected: **{detected_ratio_text}**")
599
- else:
600
- aspect_ratio_text = st.radio(
601
- "Aspect Ratio",
602
- options=["16:9", "1:1", "9:16"],
603
- index=["16:9", "1:1", "9:16"].index(detected_ratio_text),
604
- key="aspect_text"
605
- )
606
-
607
- convert_text_btn = st.button("πŸ”„ Convert", key="conv_text", type="primary", use_container_width=True)
608
-
609
- if convert_text_btn:
610
- temp_dir = None
611
- try:
612
- with st.spinner("Converting..."):
613
- temp_dir = tempfile.mkdtemp()
614
-
615
- processed_html = html_code
616
- if uploaded_images_text:
617
- with st.expander("πŸ–ΌοΈ Image Processing", expanded=True):
618
- processed_html, replacements = embed_images_as_base64(html_code, uploaded_images_text)
619
-
620
- if not replacements:
621
- st.warning("⚠️ Images uploaded but no matches found!")
622
-
623
- pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio_text, temp_dir)
624
-
625
- if error:
626
- st.error(f"❌ {error}")
627
- else:
628
- st.success("βœ… PDF generated with proper page breaks!")
629
-
630
- col_a, col_b = st.columns(2)
631
- with col_a:
632
- st.download_button(
633
- "⬇️ Download PDF",
634
- data=pdf_bytes,
635
- file_name="converted.pdf",
636
- mime="application/pdf",
637
- use_container_width=True
638
- )
639
- with col_b:
640
- st.info(f"Size: {len(pdf_bytes):,} bytes")
641
-
642
- st.subheader("πŸ“„ PDF Preview")
643
- st.components.v1.html(render_pdf_preview(pdf_bytes), height=600, scrolling=True)
644
- except Exception as e:
645
- st.error(f"❌ Error: {str(e)}")
646
- finally:
647
- if temp_dir and os.path.exists(temp_dir):
648
- shutil.rmtree(temp_dir, ignore_errors=True)
649
 
650
- # Footer
651
  st.markdown("---")
652
- st.markdown("""
653
- ### πŸ’‘ How Page Breaks Work:
654
-
655
- **Automatic Page Detection:**
656
- - Elements with class `page`, `slide`, or `section.page` are treated as separate pages
657
- - Each page automatically gets `page-break-after: always` CSS
658
- - Last page won't have a trailing break
659
-
660
- **HTML Structure for Multiple Pages:**
661
- ```html
662
- <div class="page">Page 1 content</div>
663
- <div class="page">Page 2 content</div>
664
- <div class="page">Page 3 content</div>
665
- ```
666
-
667
- **Manual Page Breaks:**
668
- - Add class `page-break` to force a break after an element
669
- - Add class `page-break-before` to force a break before an element
670
- - Add class `no-page-break` to prevent breaks inside an element
671
-
672
- **Image Embedding:**
673
- - Images are converted to base64 and embedded directly in HTML
674
- - Ensures images always appear in the PDF
675
- - Filename in HTML must match uploaded file exactly
676
-
677
- ### πŸ“ Example HTML:
678
- ```html
679
- <!DOCTYPE html>
680
- <html>
681
- <body>
682
- <div class="page">
683
- <h1>First Page</h1>
684
- <img src="logo.png" alt="Logo">
685
- </div>
686
-
687
- <div class="page">
688
- <h1>Second Page</h1>
689
- <p>Content here...</p>
690
- </div>
691
- </body>
692
- </html>
693
- ```
694
- Then upload a file named: `logo.png`
695
- """)
 
1
  """
2
+ Combined Streamlit UI + FastAPI REST API
3
+ Single port solution for Hugging Face Spaces
4
+ Save as: app.py
5
  """
6
  import streamlit as st
7
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException
8
+ from fastapi.responses import Response
9
+ from fastapi.middleware.cors import CORSMiddleware
10
  import subprocess
11
  import os
12
  import tempfile
13
  import shutil
 
14
  import base64
15
  import re
16
  import mimetypes
17
+ from typing import List, Optional
18
+ import uvicorn
19
+ import threading
20
 
21
+ # Import your existing conversion functions
22
+ import sys
23
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
24
+
25
+ # ============= SHARED CONVERSION FUNCTIONS =============
26
 
27
  def detect_aspect_ratio(html_content):
28
  """Detect aspect ratio from HTML content"""
 
54
  def image_to_base64(image_file):
55
  """Convert uploaded image to base64 data URL"""
56
  try:
57
+ if hasattr(image_file, 'getvalue'):
58
+ image_bytes = image_file.getvalue()
59
+ filename = image_file.name
60
+ else:
61
+ image_file.file.seek(0)
62
+ image_bytes = image_file.file.read()
63
+ filename = image_file.filename
64
+
65
+ mime_type, _ = mimetypes.guess_type(filename)
66
  if not mime_type:
67
+ ext = os.path.splitext(filename)[1].lower()
68
  mime_map = {
69
  '.jpg': 'image/jpeg',
70
  '.jpeg': 'image/jpeg',
 
80
  data_url = f"data:{mime_type};base64,{b64_data}"
81
  return data_url
82
  except Exception as e:
83
+ print(f"Error converting to base64: {str(e)}")
84
  return None
85
 
86
  def embed_images_as_base64(html_content, uploaded_images):
 
92
  for img in uploaded_images:
93
  data_url = image_to_base64(img)
94
  if data_url:
95
+ filename = img.name if hasattr(img, 'name') else img.filename
96
+ image_data_urls[filename] = data_url
97
 
98
  if not image_data_urls:
99
  return html_content, {}
 
103
  for filename, data_url in image_data_urls.items():
104
  escaped_name = re.escape(filename)
105
 
 
106
  pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
107
  matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
 
108
  if matches1:
109
  html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
110
+ replacements[f"{filename} (img)"] = len(matches1)
111
 
 
112
  pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
113
  matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
 
114
  if matches2:
115
  html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
116
+ replacements[f"{filename} (bg)"] = len(matches2)
117
 
 
118
  pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
119
  matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
 
120
  if matches3:
121
  html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
122
+ replacements[f"{filename} (url)"] = len(matches3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
124
  return html_content, replacements
125
 
126
  def inject_page_breaks(html_content: str, aspect_ratio: str):
127
+ """Inject page break CSS"""
128
+ page_size = "A4 landscape" if aspect_ratio == "16:9" else ("210mm 210mm" if aspect_ratio == "1:1" else "A4 portrait")
129
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  page_css = f"""
131
  <style id="auto-page-breaks">
132
+ @page {{ size: {page_size}; margin: 0; }}
133
+ html, body {{ margin: 0 !important; padding: 0 !important; }}
134
+ .page, .slide {{ width: 100% !important; min-height: 100vh !important; height: 100vh !important;
135
+ page-break-after: always !important; break-after: page !important;
136
+ page-break-inside: avoid !important; break-inside: avoid !important; }}
137
+ .page:last-child, .slide:last-child {{ page-break-after: auto !important; }}
138
+ * {{ -webkit-print-color-adjust: exact !important; print-color-adjust: exact !important; }}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  </style>
140
  """
141
 
 
142
  if '</head>' in html_content:
143
  html_content = html_content.replace('</head>', page_css + '</head>')
144
  elif '<body' in html_content:
 
148
 
149
  return html_content
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
152
+ """Convert HTML to PDF using Puppeteer"""
153
  try:
 
 
154
  html_content = inject_page_breaks(html_content, aspect_ratio)
155
 
 
156
  html_file = os.path.join(temp_dir, "input.html")
157
  with open(html_file, 'w', encoding='utf-8') as f:
158
  f.write(html_content)
159
 
 
 
 
160
  script_dir = os.path.dirname(os.path.abspath(__file__))
161
+ puppeteer_script = os.path.join(script_dir, 'puppeteer_pdf.js')
 
 
 
 
 
162
 
163
+ if not os.path.exists(puppeteer_script):
164
+ return None, f"Error: puppeteer_pdf.js not found"
 
 
 
165
 
 
 
 
 
 
 
166
  result = subprocess.run(
167
  ['node', puppeteer_script, html_file, aspect_ratio],
168
  capture_output=True,
169
  text=True,
170
  timeout=60,
171
+ cwd=script_dir
172
  )
173
 
174
  if result.returncode != 0:
175
  return None, f"PDF conversion failed: {result.stderr}"
176
 
 
177
  pdf_file = html_file.replace('.html', '.pdf')
178
  if not os.path.exists(pdf_file):
179
  return None, "PDF file was not generated"
 
181
  with open(pdf_file, 'rb') as f:
182
  pdf_bytes = f.read()
183
 
 
184
  return pdf_bytes, None
185
 
186
  except subprocess.TimeoutExpired:
187
+ return None, "Error: PDF conversion timed out"
188
  except Exception as e:
189
  return None, f"Error: {str(e)}"
190
 
191
+ # ============= FASTAPI APP =============
192
+
193
+ api_app = FastAPI(title="HTML to PDF API", version="3.0")
194
+
195
+ api_app.add_middleware(
196
+ CORSMiddleware,
197
+ allow_origins=["*"],
198
+ allow_credentials=True,
199
+ allow_methods=["*"],
200
+ allow_headers=["*"],
201
+ )
202
+
203
@api_app.get("/api/health")
async def health():
    """Liveness probe: report that the API is up and which version it serves."""
    payload = dict(status="healthy", version="3.0")
    return payload
206
+
207
def _pdf_download_name(source_name):
    """Derive a header-safe ``*.pdf`` download name from a client-supplied file name."""
    # The upload name is client-controlled (and may be None): drop any
    # directory part and whatever extension it carries (.html, .htm, ...).
    stem = os.path.splitext(os.path.basename(source_name or ""))[0]
    # Keep a conservative ASCII subset so the value can neither break out of
    # the quoted Content-Disposition parameter nor fail header encoding.
    stem = re.sub(r'[^A-Za-z0-9._-]+', '_', stem).strip('._')
    return f"{stem}.pdf" if stem else "converted.pdf"


@api_app.post("/api/convert")
async def api_convert(
    html_file: Optional[UploadFile] = File(None),
    html_content: Optional[str] = Form(None),
    aspect_ratio: Optional[str] = Form(None),
    auto_detect: bool = Form(True),
    images: Optional[List[UploadFile]] = File(None)
):
    """Convert HTML to PDF via API.

    Args:
        html_file: Uploaded HTML document; takes precedence over ``html_content``.
        html_content: Raw HTML markup, used when no file is uploaded.
        aspect_ratio: Requested page ratio. When supplied it is always used;
            when omitted the ratio is detected from the HTML.
        auto_detect: Kept for backward compatibility. Because it defaults to
            True, letting it override an explicit ``aspect_ratio`` made that
            field unusable on its own, so it no longer does.
        images: Optional image uploads to embed into the HTML as base64 data URLs.

    Returns:
        A ``Response`` carrying the PDF bytes as an ``application/pdf`` attachment.

    Raises:
        HTTPException: 400 when neither ``html_file`` nor ``html_content`` is
            given; 500 when the conversion reports an error or fails unexpectedly.
    """
    # Local import: only this handler needs it, and it ships with fastapi,
    # which the module already depends on.
    from fastapi.concurrency import run_in_threadpool

    if not html_file and not html_content:
        raise HTTPException(status_code=400, detail="html_file or html_content required")

    temp_dir = None
    try:
        if html_file:
            raw = await html_file.read()
            html = raw.decode('utf-8', errors='replace')
            source_name = html_file.filename
        else:
            html = html_content
            source_name = "converted.pdf"

        if images:
            html, _ = embed_images_as_base64(html, images)

        # An explicitly requested ratio wins; detection is only the fallback.
        if not aspect_ratio:
            aspect_ratio = detect_aspect_ratio(html)

        temp_dir = tempfile.mkdtemp()

        # The conversion shells out to node and can block for up to its
        # timeout; run it in a worker thread so the event loop stays free
        # to answer other requests (e.g. /api/health) meanwhile.
        pdf_bytes, error = await run_in_threadpool(
            convert_html_to_pdf, html, aspect_ratio, temp_dir
        )

        if error:
            raise HTTPException(status_code=500, detail=error)

        output_filename = _pdf_download_name(source_name)

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={"Content-Disposition": f'attachment; filename="{output_filename}"'}
        )

    except HTTPException:
        # Already carries the intended status code; pass it through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        if temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
260
+
261
+ # ============= START FASTAPI IN BACKGROUND =============
262
+
263
def run_fastapi():
    """Serve ``api_app`` with uvicorn on 0.0.0.0:8000; blocks until the server exits."""
    uvicorn.run(api_app, host="0.0.0.0", port=8000, log_level="error")


# Streamlit re-executes this script from the top on every user interaction.
# Starting a thread unconditionally would therefore launch one more uvicorn
# instance per rerun, none of which can bind port 8000 while the first one is
# still serving. Reuse the live server thread (found by name) and only start
# a new one when none is alive.
_API_THREAD_NAME = "fastapi-server"
api_thread = next(
    (t for t in threading.enumerate() if t.name == _API_THREAD_NAME),
    None,
)
if api_thread is None:
    api_thread = threading.Thread(target=run_fastapi, name=_API_THREAD_NAME, daemon=True)
    api_thread.start()
270
+
271
+ # ============= STREAMLIT UI =============
272
+
273
+ st.set_page_config(page_title="HTML to PDF Converter", page_icon="πŸ“„", layout="wide")
274
+
275
  st.title("πŸ“„ HTML to PDF Converter")
276
+
277
+ # Add API info banner
278
+ st.info("""
279
+ πŸš€ **API Available!** This space includes a REST API:
280
+ - Health: `GET /api/health`
281
+ - Convert: `POST /api/convert`
282
+ - Example: `curl -X POST https://abdallalswaiti-htmlpdf.hf.space/api/convert -F 'html_content=<html>...</html>' --output out.pdf`
283
  """)
284
 
285
+ tab1, tab2, tab3 = st.tabs(["πŸ“€ Upload HTML", "πŸ“ Paste HTML", "πŸ“š API Docs"])
 
286
 
 
287
# Upload tab: pick an HTML file (plus optional images), choose the aspect
# ratio, convert, and offer the resulting PDF for download.
with tab1:
    uploaded_file = st.file_uploader("Choose HTML file", type=['html', 'htm'])
    uploaded_images = st.file_uploader("πŸ“· Upload Images", type=['jpg', 'jpeg', 'png', 'gif', 'svg'], accept_multiple_files=True)

    if uploaded_file:
        html_content = uploaded_file.getvalue().decode('utf-8', errors='replace')
        detected_ratio = detect_aspect_ratio(html_content)

        auto_detect = st.checkbox("Auto-detect aspect ratio", value=True, key="auto1")
        if auto_detect:
            aspect_ratio = detected_ratio
            st.info(f"πŸ” Detected: **{detected_ratio}**")
        else:
            aspect_ratio = st.radio("Aspect Ratio", ["16:9", "1:1", "9:16"], key="ratio1")

        if st.button("πŸ”„ Convert to PDF", key="conv1", type="primary"):
            temp_dir = tempfile.mkdtemp()
            try:
                with st.spinner("Converting..."):
                    processed_html = html_content
                    if uploaded_images:
                        processed_html, replacements = embed_images_as_base64(html_content, uploaded_images)
                        if replacements:
                            st.success(f"βœ… Embedded {len(replacements)} image reference(s)")

                    pdf_bytes, error = convert_html_to_pdf(processed_html, aspect_ratio, temp_dir)

                    if error:
                        st.error(f"❌ {error}")
                    else:
                        st.success("βœ… PDF generated!")
                        # Swap the extension rather than replacing the literal
                        # '.html': the uploader also accepts '.htm', and a
                        # differently-cased '.HTML' would otherwise keep its
                        # original name on a PDF download.
                        pdf_name = os.path.splitext(uploaded_file.name)[0] + '.pdf'
                        st.download_button(
                            "⬇️ Download PDF",
                            data=pdf_bytes,
                            file_name=pdf_name,
                            mime="application/pdf"
                        )
            finally:
                # Always remove the scratch directory, even when conversion fails.
                if os.path.exists(temp_dir):
                    shutil.rmtree(temp_dir, ignore_errors=True)
327
 
 
328
  with tab2:
329
+ html_code = st.text_area("HTML Content", height=300, value="""<!DOCTYPE html>
 
 
330
  <html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  <body>
332
+ <div class="page" style="height:100vh; display:flex; align-items:center; justify-content:center; background:#667eea; color:white;">
333
+ <h1>Page 1</h1>
 
 
 
 
334
  </div>
335
+ <div class="page" style="height:100vh; display:flex; align-items:center; justify-content:center; background:#f093fb; color:white;">
336
+ <h1>Page 2</h1>
 
337
  </div>
338
  </body>
339
+ </html>""")
340
+
341
+ if st.button("πŸ”„ Convert", key="conv2", type="primary"):
342
+ temp_dir = tempfile.mkdtemp()
343
+ try:
344
+ with st.spinner("Converting..."):
345
+ aspect_ratio = detect_aspect_ratio(html_code)
346
+ pdf_bytes, error = convert_html_to_pdf(html_code, aspect_ratio, temp_dir)
347
+
348
+ if error:
349
+ st.error(f"❌ {error}")
350
+ else:
351
+ st.success("βœ… PDF generated!")
352
+ st.download_button("⬇️ Download PDF", data=pdf_bytes, file_name="converted.pdf", mime="application/pdf")
353
+ finally:
354
+ if os.path.exists(temp_dir):
355
+ shutil.rmtree(temp_dir, ignore_errors=True)
356
+
357
+ with tab3:
358
+ st.markdown("""
359
+ ## πŸ“‘ REST API Documentation
360
+
361
+ ### Endpoints
362
+
363
+ **Health Check**
364
+ ```bash
365
+ curl https://abdallalswaiti-htmlpdf.hf.space/api/health
366
+ ```
367
+
368
+ **Convert HTML to PDF**
369
+ ```bash
370
+ curl -X POST https://abdallalswaiti-htmlpdf.hf.space/api/convert \\
371
+ -F 'html_content=<html><body><div class="page">Hello</div></body></html>' \\
372
+ --output output.pdf
373
+ ```
374
+
375
+ **With Images**
376
+ ```bash
377
+ curl -X POST https://abdallalswaiti-htmlpdf.hf.space/api/convert \\
378
+ -F "html_file=@document.html" \\
379
+ -F "images=@logo.png" \\
380
+ -F "aspect_ratio=16:9" \\
381
+ --output output.pdf
382
+ ```
383
+
384
+ ### Python Example
385
+ ```python
386
+ import requests
387
 
388
+ response = requests.post(
389
+ 'https://abdallalswaiti-htmlpdf.hf.space/api/convert',
390
+ data={'html_content': '<html><body><div class="page">Test</div></body></html>'}
 
 
 
391
  )
392
 
393
+ with open('output.pdf', 'wb') as f:
394
+ f.write(response.content)
395
+ ```
 
 
 
 
396
 
397
+ ### Parameters
398
+ - `html_file` (file): HTML file upload
399
+ - `html_content` (string): Raw HTML content
400
+ - `aspect_ratio` (string): "16:9", "1:1", or "9:16"
401
+ - `auto_detect` (boolean): Auto-detect ratio
402
+ - `images` (files): Images to embed
403
+ """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
 
405
  st.markdown("---")
406
+ st.markdown("πŸ’‘ **Tip:** Use `.page` or `.slide` classes for automatic page breaks")