ABDALLALSWAITI committed on
Commit
6830c81
·
verified ·
1 Parent(s): e58da64

Upload api.py

Browse files
Files changed (1) hide show
  1. api.py +408 -0
api.py ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI Backend for HTML to PDF Conversion
3
+ Runs alongside Streamlit on port 7860
4
+ """
5
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
6
+ from fastapi.responses import Response, JSONResponse
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ import subprocess
9
+ import os
10
+ import tempfile
11
+ import shutil
12
+ import base64
13
+ import re
14
+ import mimetypes
15
+ from typing import List, Optional
16
+ from pathlib import Path
17
+
18
# ASGI application object; the route decorators in this file register on it.
app = FastAPI(
    title="HTML to PDF API",
    description="Convert HTML to PDF with image support and page breaks",
    version="1.0.0",
)

# CORS policy.
# - Any origin may call the API.
# - Credentials are disabled: a wildcard origin must not be combined with
#   credentialed requests, and no handler in this file reads cookies or
#   auth headers.
# - The custom response headers set by POST /convert are exposed so that
#   cross-origin browser clients can actually read them.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
    expose_headers=[
        "Content-Disposition",
        "X-Aspect-Ratio",
        "X-Image-Replacements",
        "X-PDF-Size",
    ],
)
32
+
33
def detect_aspect_ratio(html_content):
    """Guess the page aspect ratio from hints found in the HTML.

    Hints are checked in this order; the first one that yields an answer wins:

    1. A viewport ``<meta>`` tag whose ``content`` mentions
       ``orientation=portrait`` or ``orientation=landscape``.
    2. A CSS ``aspect-ratio: W / H`` declaration with integer operands.
       Ratios above 1.5 map to landscape, below 0.7 to portrait, anything
       in between to square.
    3. Presentation-related keywords anywhere in the document.

    Args:
        html_content: The HTML document as a string.

    Returns:
        One of ``"16:9"``, ``"1:1"`` or ``"9:16"``. ``"9:16"`` is the
        fallback when no hint is found.
    """
    viewport_match = re.search(
        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
        html_content,
        re.IGNORECASE,
    )
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"

    aspect_match = re.search(
        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)',
        html_content,
        re.IGNORECASE,
    )
    if aspect_match:
        width = int(aspect_match.group(1))
        height = int(aspect_match.group(2))
        # A zero height (e.g. "aspect-ratio: 16 / 0") carries no usable
        # information; skip it instead of dividing by zero and fall through
        # to the remaining heuristics.
        if height:
            ratio = width / height
            if ratio > 1.5:
                return "16:9"
            if ratio < 0.7:
                return "9:16"
            return "1:1"

    # Lower-case the document once rather than once per keyword.
    lowered = html_content.lower()
    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
        return "16:9"

    return "9:16"
59
+
60
def image_to_base64(image_bytes, filename):
    """Encode raw file bytes as a ``data:`` URL.

    The MIME type is taken from ``mimetypes.guess_type``; when that yields
    nothing, a small extension table is consulted, and ``image/png`` is the
    final fallback.

    Args:
        image_bytes: Raw file content (bytes).
        filename: Name used only to infer the MIME type. A missing or empty
            name is tolerated and results in the ``image/png`` fallback.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        HTTPException: Status 400 if the conversion fails for any reason.
    """
    try:
        # Guard against a missing name so a nameless upload is still encoded
        # instead of failing inside mimetypes.
        name = filename or ""
        mime_type, _ = mimetypes.guess_type(name)
        if not mime_type:
            ext = os.path.splitext(name)[1].lower()
            mime_map = {
                '.jpg': 'image/jpeg',
                '.jpeg': 'image/jpeg',
                '.png': 'image/png',
                '.gif': 'image/gif',
                '.svg': 'image/svg+xml',
                '.webp': 'image/webp',
                '.bmp': 'image/bmp',
            }
            mime_type = mime_map.get(ext, 'image/png')

        b64_data = base64.b64encode(image_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{b64_data}"
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Error converting {filename} to base64: {str(e)}",
        ) from e
82
+
83
def _data_url_replacer(data_url):
    """Build a ``re.sub`` callback that swaps the matched reference for *data_url*.

    The callback keeps the quote character (possibly none) that surrounded
    the original reference. Using a callback rather than a replacement
    template means *data_url* is inserted literally, never interpreted as a
    regex replacement escape.
    """
    def _replace(match):
        quote = match.group("quote")
        return f'{match.group("head")}{quote}{data_url}{quote}{match.group("tail")}'

    return _replace


def embed_images_as_base64(html_content, images_dict):
    """Replace references to uploaded files in the HTML with data URLs.

    For every filename three kinds of reference are rewritten, in order:
    ``<img src=...>`` attributes, ``background-image: url(...)`` declarations
    and any remaining CSS ``url(...)``. A reference matches when it is the
    bare filename or a path ending in ``/<filename>`` (case-insensitive).

    The quoting of the original reference is preserved. In particular a
    single-quoted or unquoted ``url()`` inside a double-quoted ``style``
    attribute is not rewritten with double quotes, which would terminate the
    attribute early.

    Args:
        html_content: HTML document as a string.
        images_dict: Mapping of filename to data URL. May be empty or None.

    Returns:
        A tuple ``(html, replacements)``. ``replacements`` maps
        ``"<filename> (img src)"``, ``"<filename> (bg-image)"`` and
        ``"<filename> (url)"`` to the number of substitutions made; labels
        with zero substitutions are omitted.
    """
    if not images_dict:
        return html_content, {}

    replacements = {}

    for filename, data_url in images_dict.items():
        escaped_name = re.escape(filename)
        replacer = _data_url_replacer(data_url)

        # (label, pattern, flags) in the order the rewrites must run. The
        # generic url() pattern comes last so it only picks up what the more
        # specific background-image pattern did not already handle.
        targets = (
            (
                "img src",
                rf'(?P<head><img[^>]*\s+src\s*=\s*)(?P<quote>["\'])(?:[^"\']*?/)?{escaped_name}(?P=quote)(?P<tail>)',
                re.IGNORECASE | re.DOTALL,
            ),
            (
                "bg-image",
                rf'(?P<head>background-image\s*:\s*url\s*\()(?P<quote>["\']?)(?:[^)"\']*/)?{escaped_name}(?P=quote)(?P<tail>\))',
                re.IGNORECASE,
            ),
            (
                "url",
                rf'(?P<head>url\s*\()(?P<quote>["\']?)(?:[^)"\']*/)?{escaped_name}(?P=quote)(?P<tail>\))',
                re.IGNORECASE,
            ),
        )

        for label, pattern, flags in targets:
            # subn rewrites and counts in a single pass.
            html_content, count = re.subn(pattern, replacer, html_content, flags=flags)
            if count:
                replacements[f"{filename} ({label})"] = count

    return html_content, replacements
118
+
119
def inject_page_breaks(html_content: str, aspect_ratio: str):
    """Insert a ``<style>`` block with page sizing and page-break rules.

    The page size follows the aspect ratio: ``"16:9"`` gives A4 landscape,
    ``"1:1"`` a 210mm square, anything else A4 portrait.

    The style block is inserted exactly once: before the first ``</head>``
    if present, otherwise before the first ``<body`` tag, otherwise at the
    very start of the document. Tag matching is case-insensitive.

    Args:
        html_content: HTML document as a string.
        aspect_ratio: ``"16:9"``, ``"1:1"`` or any other value (treated as
            portrait).

    Returns:
        The HTML with the style block inserted.
    """
    page_sizes = {"16:9": "A4 landscape", "1:1": "210mm 210mm"}
    page_size = page_sizes.get(aspect_ratio, "A4 portrait")

    page_css = f"""
<style id="auto-page-breaks">
    @page {{
        size: {page_size};
        margin: 0;
    }}

    html, body {{
        margin: 0 !important;
        padding: 0 !important;
        width: 100% !important;
        height: 100% !important;
    }}

    .page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
        width: 100% !important;
        min-height: 100vh !important;
        height: 100vh !important;
        page-break-after: always !important;
        break-after: page !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
        position: relative !important;
        box-sizing: border-box !important;
        overflow: hidden !important;
    }}

    .page:last-child, .slide:last-child,
    section.page:last-child, article.page:last-child {{
        page-break-after: auto !important;
        break-after: auto !important;
    }}

    body > section:not(.no-page-break),
    body > article:not(.no-page-break),
    body > div:not(.no-page-break) {{
        page-break-after: always !important;
        break-after: page !important;
        min-height: 100vh;
    }}

    body > section:last-child,
    body > article:last-child,
    body > div:last-child {{
        page-break-after: auto !important;
        break-after: auto !important;
    }}

    .page-break, .page-break-after {{
        page-break-after: always !important;
        break-after: page !important;
    }}

    .page-break-before {{
        page-break-before: always !important;
        break-before: page !important;
    }}

    .no-page-break, .keep-together {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}

    h1, h2, h3, h4, h5, h6 {{
        page-break-after: avoid !important;
        break-after: avoid !important;
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}

    img, figure, table, pre, blockquote {{
        page-break-inside: avoid !important;
        break-inside: avoid !important;
    }}

    * {{
        -webkit-print-color-adjust: exact !important;
        print-color-adjust: exact !important;
        color-adjust: exact !important;
    }}
</style>
"""

    # Insert at the first matching anchor only. A later literal "</head>"
    # (for example inside a script string) must not receive a second copy of
    # the style block with a duplicate id.
    anchor = re.search(r'</head\s*>', html_content, re.IGNORECASE)
    if anchor is None:
        anchor = re.search(r'<body\b', html_content, re.IGNORECASE)
    if anchor is None:
        return page_css + html_content

    position = anchor.start()
    return html_content[:position] + page_css + html_content[position:]
219
+
220
def convert_html_to_pdf(html_content, aspect_ratio, temp_dir):
    """Render HTML to PDF by running the ``puppeteer_pdf.js`` Node script.

    Steps: inject page-break CSS, write the HTML to ``<temp_dir>/input.html``,
    run ``node puppeteer_pdf.js <html_file> <aspect_ratio>`` with a 60 second
    timeout, then read back ``<temp_dir>/input.pdf``.

    Args:
        html_content: HTML document as a string.
        aspect_ratio: Aspect ratio string passed through to the Node script.
        temp_dir: Existing directory used for the intermediate files. The
            caller owns it and is responsible for removing it.

    Returns:
        The generated PDF as bytes.

    Raises:
        Exception: If the script cannot be found, exits non-zero, times out,
            or produces no PDF. The message describes the cause and the
            original exception is chained.
    """
    try:
        html_content = inject_page_breaks(html_content, aspect_ratio)

        html_file = os.path.join(temp_dir, "input.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Candidate script locations, tried in order.
        possible_paths = [
            'puppeteer_pdf.js',
            '/app/puppeteer_pdf.js',
            os.path.join(os.path.dirname(__file__), 'puppeteer_pdf.js'),
        ]
        puppeteer_script = next(
            (path for path in possible_paths if os.path.exists(path)),
            None,
        )
        if not puppeteer_script:
            raise Exception("puppeteer_pdf.js not found")

        # Run from the script's own directory.
        # NOTE(review): presumably so Node resolves its local node_modules; confirm.
        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=os.path.dirname(os.path.abspath(puppeteer_script)),
        )

        if result.returncode != 0:
            raise Exception(f"PDF conversion failed: {result.stderr}")

        # Swap only the file extension. str.replace would also rewrite any
        # ".html" occurring earlier in the path, e.g. in the temp directory.
        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
        if not os.path.exists(pdf_file):
            raise Exception("PDF file was not generated")

        with open(pdf_file, 'rb') as f:
            return f.read()

    except subprocess.TimeoutExpired as e:
        raise Exception("PDF conversion timed out (60 seconds)") from e
    except Exception as e:
        raise Exception(f"Error: {str(e)}") from e
269
+
270
@app.get("/")
async def root():
    """API root endpoint: name, version and a summary of the available routes."""
    return {
        "message": "HTML to PDF Converter API",
        "version": "1.0.0",
        "endpoints": {
            "POST /convert": "Convert HTML to PDF",
            # This route is defined in this file but was missing from the listing.
            "POST /convert-base64": "Convert an HTML string to a base64-encoded PDF",
            "GET /health": "Health check",
            "GET /docs": "API documentation",
        },
    }
282
+
283
@app.get("/health")
async def health():
    """Health check endpoint; always answers with a fixed status payload."""
    status_payload = {"status": "healthy"}
    return status_payload
287
+
288
@app.post("/convert")
async def convert_to_pdf(
    html_file: UploadFile = File(..., description="HTML file to convert"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
    images: Optional[List[UploadFile]] = File(None, description="Images to embed in HTML")
):
    """
    Convert HTML to PDF with optional image embedding

    - **html_file**: HTML file to convert (required)
    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
    - **auto_detect**: Auto-detect aspect ratio from HTML content; when true,
      the detected ratio replaces any supplied aspect_ratio
    - **images**: Image files to embed as base64 in HTML

    Returns the PDF as an attachment. Responds with 400 for an invalid
    aspect ratio or an unreadable image, and 500 if the conversion fails.
    """
    # Function-scope import: fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    temp_dir = None
    try:
        # Read HTML content; fall back to latin-1, which accepts any byte
        # sequence, when the upload is not valid UTF-8.
        raw_html = await html_file.read()
        try:
            html_content = raw_html.decode('utf-8')
        except UnicodeDecodeError:
            html_content = raw_html.decode('latin-1')

        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ("16:9", "1:1", "9:16"):
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Process images if provided
        image_replacements = {}
        if images:
            images_dict = {}
            for img in images:
                # References in the HTML are matched by filename, so a
                # nameless upload cannot be matched to anything. An empty
                # name would also produce a degenerate match pattern.
                if not img.filename:
                    continue
                img_bytes = await img.read()
                images_dict[img.filename] = image_to_base64(img_bytes, img.filename)

            html_content, image_replacements = embed_images_as_base64(html_content, images_dict)

        # The conversion blocks on a subprocess for up to 60 seconds; run it
        # in a worker thread so the event loop keeps serving other requests.
        temp_dir = tempfile.mkdtemp()
        pdf_bytes = await run_in_threadpool(
            convert_html_to_pdf, html_content, aspect_ratio, temp_dir
        )

        # Return PDF
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=converted.pdf",
                "X-Aspect-Ratio": aspect_ratio,
                # Number of distinct (file, reference kind) entries rewritten.
                "X-Image-Replacements": str(len(image_replacements)),
                "X-PDF-Size": str(len(pdf_bytes)),
            },
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Best-effort cleanup of the intermediate files.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
357
+
358
@app.post("/convert-base64")
async def convert_to_pdf_base64(
    html_content: str = Form(..., description="HTML content as string"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
):
    """
    Convert HTML string to PDF and return as base64

    - **html_content**: HTML content as string (required)
    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
    - **auto_detect**: Auto-detect aspect ratio from HTML content; when true,
      the detected ratio replaces any supplied aspect_ratio

    Returns JSON with `success`, `pdf_base64`, `aspect_ratio` and
    `size_bytes`. Responds with 400 for an invalid aspect ratio and 500 if
    the conversion fails.
    """
    # Function-scope import: fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    temp_dir = None
    try:
        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ("16:9", "1:1", "9:16"):
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # The conversion blocks on a subprocess for up to 60 seconds; run it
        # in a worker thread so the event loop keeps serving other requests.
        temp_dir = tempfile.mkdtemp()
        pdf_bytes = await run_in_threadpool(
            convert_html_to_pdf, html_content, aspect_ratio, temp_dir
        )

        # Convert to base64
        pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')

        return JSONResponse({
            "success": True,
            "pdf_base64": pdf_base64,
            "aspect_ratio": aspect_ratio,
            "size_bytes": len(pdf_bytes),
        })

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Best-effort cleanup of the intermediate files.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
405
+
406
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn when run directly.
    import uvicorn

    bind_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **bind_options)