ABDALLALSWAITI committed on
Commit
e58da64
·
verified ·
1 Parent(s): 06295a6

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -425
app.py DELETED
@@ -1,425 +0,0 @@
1
- """
2
- FastAPI Backend for HTML to PDF Conversion
3
- Runs alongside Streamlit on port 7860
4
- """
5
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException
6
- from fastapi.responses import Response, JSONResponse
7
- from fastapi.middleware.cors import CORSMiddleware
8
- import subprocess
9
- import os
10
- import tempfile
11
- import shutil
12
- import re
13
- import mimetypes
14
- from typing import List, Optional
15
- from pathlib import Path
16
-
17
# Application object; the keyword arguments are passed to the FastAPI constructor.
_api_metadata = {
    "title": "HTML to PDF API",
    "description": "Convert HTML to PDF with image support and page breaks",
    "version": "1.0.0",
}
app = FastAPI(**_api_metadata)

# Add CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation - confirm this is intended.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
31
-
32
def detect_aspect_ratio(html_content):
    """Guess a page aspect ratio from hints found in the HTML text.

    The checks run in this order and the first one that decides wins:

    1. A ``<meta ... viewport ... content="...">`` tag whose content contains
       ``orientation=portrait`` ("9:16") or ``orientation=landscape`` ("16:9").
    2. The first CSS ``aspect-ratio: W / H`` declaration with integer parts:
       ratio > 1.5 gives "16:9", ratio < 0.7 gives "9:16", otherwise "1:1".
    3. Any of the substrings ``reveal.js``, ``impress.js``, ``slide`` or
       ``presentation`` anywhere in the document gives "16:9".

    Args:
        html_content: The HTML document as a string.

    Returns:
        One of "16:9", "1:1" or "9:16"; "9:16" is the fallback.
    """
    viewport_match = re.search(
        r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']',
        html_content,
        re.IGNORECASE,
    )
    if viewport_match:
        viewport = viewport_match.group(1).lower()
        if 'orientation=portrait' in viewport:
            return "9:16"
        if 'orientation=landscape' in viewport:
            return "16:9"

    aspect_match = re.search(
        r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE
    )
    if aspect_match:
        width = int(aspect_match.group(1))
        height = int(aspect_match.group(2))
        # A zero height (e.g. "aspect-ratio: 16/0") is degenerate and would
        # raise ZeroDivisionError; ignore it and fall through to later checks.
        if height > 0:
            ratio = width / height
            if ratio > 1.5:
                return "16:9"
            if ratio < 0.7:
                return "9:16"
            return "1:1"

    # Lower-case once instead of once per keyword.
    lowered = html_content.lower()
    if any(keyword in lowered for keyword in ('reveal.js', 'impress.js', 'slide', 'presentation')):
        return "16:9"

    return "9:16"
58
-
59
def normalize_image_paths(html_content):
    """Rewrite image references so they point at bare filenames.

    Three kinds of reference are handled, in this order: ``<img src=...>``
    attributes, ``background-image: url(...)`` declarations, and any other
    CSS ``url(...)``. Only references ending in one of the extensions
    jpg, jpeg, png, gif, svg, webp or bmp are matched (case-insensitively).

    Args:
        html_content: The HTML document as a string.

    Returns:
        A ``(html, replacements)`` tuple. ``replacements`` maps a labelled
        copy of each matched text (``"img src: ..."``, ``"bg-image: ..."``,
        ``"url: ..."``) to the filename that was kept.
    """
    replacements = {}

    def _rewrite(match, label, has_suffix):
        # Groups: 1 = leading text, 2 = quote char (may be empty),
        # 4 = filename, 7 = closing parenthesis (url() patterns only).
        kept_name = match.group(4)
        opening, quote_char = match.group(1), match.group(2)
        closing = match.group(7) if has_suffix else ''
        replacements[f"{label}: {match.group(0)}"] = kept_name
        return f'{opening}{quote_char}{kept_name}{quote_char}{closing}'

    # Pattern 1: img src with paths - extract filename only
    img_src_re = re.compile(
        r'(<img[^>]*\s+src\s*=\s*)(["\'])([^"\']*?/)?([^/"\'>]+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)',
        re.IGNORECASE,
    )
    html_content = img_src_re.sub(lambda m: _rewrite(m, "img src", False), html_content)

    # Pattern 2: background-image with paths
    bg_image_re = re.compile(
        r'(background-image\s*:\s*url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))',
        re.IGNORECASE,
    )
    html_content = bg_image_re.sub(lambda m: _rewrite(m, "bg-image", True), html_content)

    # Pattern 3: CSS url() with paths
    css_url_re = re.compile(
        r'(url\s*\()(["\']?)([^)"\']*/)?([^/")\']+\.(jpg|jpeg|png|gif|svg|webp|bmp))(\2)(\))',
        re.IGNORECASE,
    )
    scanned_text = html_content  # the text the match offsets refer to

    def _rewrite_plain_url(match):
        # Skip if already processed by background-image pattern: look at the
        # (up to) 50 characters that precede the match.
        window_start = max(0, match.start() - 50)
        if 'background-image' in scanned_text[window_start:match.start()]:
            return match.group(0)
        return _rewrite(match, "url", True)

    html_content = css_url_re.sub(_rewrite_plain_url, scanned_text)

    return html_content, replacements
106
-
107
def inject_page_breaks(html_content: str, aspect_ratio: str):
    """Insert a print stylesheet that sets the page size and page breaks.

    The stylesheet is inserted exactly once: before the first ``</head>``
    if present, otherwise before the first ``<body``, otherwise it is
    prepended to the document.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: "16:9" selects ``A4 landscape``, "1:1" selects
            ``210mm 210mm``; any other value selects ``A4 portrait``.

    Returns:
        The HTML with the ``<style id="auto-page-breaks">`` block added.
    """
    page_size = {
        "16:9": "A4 landscape",
        "1:1": "210mm 210mm",
    }.get(aspect_ratio, "A4 portrait")

    page_css = f"""
<style id="auto-page-breaks">
@page {{
size: {page_size};
margin: 0;
}}

html, body {{
margin: 0 !important;
padding: 0 !important;
width: 100% !important;
height: 100% !important;
}}

.page, .slide, section.page, article.page, div[class*="page"], div[class*="slide"] {{
width: 100% !important;
min-height: 100vh !important;
height: 100vh !important;
page-break-after: always !important;
break-after: page !important;
page-break-inside: avoid !important;
break-inside: avoid !important;
position: relative !important;
box-sizing: border-box !important;
overflow: hidden !important;
}}

.page:last-child, .slide:last-child,
section.page:last-child, article.page:last-child {{
page-break-after: auto !important;
break-after: auto !important;
}}

body > section:not(.no-page-break),
body > article:not(.no-page-break),
body > div:not(.no-page-break) {{
page-break-after: always !important;
break-after: page !important;
min-height: 100vh;
}}

body > section:last-child,
body > article:last-child,
body > div:last-child {{
page-break-after: auto !important;
}}

.page-break, .page-break-after {{
page-break-after: always !important;
break-after: page !important;
}}

.page-break-before {{
page-break-before: always !important;
break-before: page !important;
}}

.no-page-break, .keep-together {{
page-break-inside: avoid !important;
break-inside: avoid !important;
}}

h1, h2, h3, h4, h5, h6 {{
page-break-after: avoid !important;
break-after: avoid !important;
page-break-inside: avoid !important;
break-inside: avoid !important;
}}

img, figure, table, pre, blockquote {{
page-break-inside: avoid !important;
break-inside: avoid !important;
}}

* {{
-webkit-print-color-adjust: exact !important;
print-color-adjust: exact !important;
color-adjust: exact !important;
}}
</style>
"""

    if '</head>' in html_content:
        # count=1: a literal "</head>" may also occur later (for example in
        # a script string); the stylesheet must only be injected once.
        return html_content.replace('</head>', page_css + '</head>', 1)
    if '<body' in html_content:
        return html_content.replace('<body', page_css + '<body', 1)
    return page_css + html_content
207
-
208
def convert_html_to_pdf(html_content, aspect_ratio, temp_dir, images=None):
    """Convert HTML content to PDF by running the ``puppeteer_pdf.js`` script.

    The HTML is normalised (image paths reduced to filenames, page-break CSS
    injected), written to ``input.html`` inside ``temp_dir`` together with
    any supplied images, and then ``node puppeteer_pdf.js input.html
    <aspect_ratio>`` is run with ``temp_dir`` as the working directory.

    Args:
        html_content: The HTML document as a string.
        aspect_ratio: Ratio string forwarded to the Node script.
        temp_dir: Existing directory used as the working area.
        images: Optional iterable of objects exposing ``filename`` and a
            readable binary ``file`` attribute.

    Returns:
        A ``(pdf_bytes, path_replacements)`` tuple.

    Raises:
        Exception: If the script is missing, the conversion fails or times
            out, or no PDF is produced.
    """
    try:
        # Normalize image paths in HTML
        html_content, path_replacements = normalize_image_paths(html_content)

        # Inject page breaks
        html_content = inject_page_breaks(html_content, aspect_ratio)

        # Save HTML file
        html_file = os.path.join(temp_dir, "input.html")
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Save image files to the same directory
        for img in images or ():
            # The filename is client-controlled: keep only its last path
            # component so "../x" or an absolute path cannot escape temp_dir.
            safe_name = os.path.basename((img.filename or "").replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                continue
            with open(os.path.join(temp_dir, safe_name), 'wb') as f:
                f.write(img.file.read())

        # Find puppeteer script
        possible_paths = [
            'puppeteer_pdf.js',
            '/app/puppeteer_pdf.js',
            os.path.join(os.path.dirname(__file__), 'puppeteer_pdf.js'),
        ]
        puppeteer_script = next(
            (path for path in possible_paths if os.path.exists(path)), None
        )
        if not puppeteer_script:
            raise Exception("puppeteer_pdf.js not found")
        # node runs with cwd=temp_dir, so a path found relative to the
        # current working directory must be made absolute first.
        puppeteer_script = os.path.abspath(puppeteer_script)

        result = subprocess.run(
            ['node', puppeteer_script, html_file, aspect_ratio],
            capture_output=True,
            text=True,
            timeout=60,
            cwd=temp_dir  # Run in temp directory so images are accessible
        )

        if result.returncode != 0:
            raise Exception(f"PDF conversion failed: {result.stderr}")

        # Swap only the extension; str.replace would also rewrite any
        # ".html" that happens to appear inside the directory part.
        pdf_file = os.path.splitext(html_file)[0] + '.pdf'
        if not os.path.exists(pdf_file):
            raise Exception("PDF file was not generated")

        with open(pdf_file, 'rb') as f:
            pdf_bytes = f.read()

        return pdf_bytes, path_replacements

    except subprocess.TimeoutExpired as exc:
        raise Exception("PDF conversion timed out (60 seconds)") from exc
    except Exception as e:
        raise Exception(f"Error: {str(e)}") from e
269
-
270
@app.get("/")
async def root():
    """Return the service name, version and a summary of its endpoints."""
    endpoint_summary = {
        "POST /convert": "Convert HTML to PDF",
        "GET /health": "Health check",
        "GET /docs": "API documentation",
    }
    return dict(
        message="HTML to PDF Converter API",
        version="1.0.0",
        endpoints=endpoint_summary,
    )
282
-
283
@app.get("/health")
async def health():
    """Report that the service is up; always answers with status "healthy"."""
    status_report = dict(status="healthy")
    return status_report
287
-
288
@app.post("/convert")
async def convert_to_pdf(
    html_file: UploadFile = File(..., description="HTML file to convert"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML"),
    images: Optional[List[UploadFile]] = File(None, description="Images referenced in HTML")
):
    """
    Convert HTML to PDF with image files in same directory

    - **html_file**: HTML file to convert (required)
    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
    - **auto_detect**: Auto-detect aspect ratio from HTML content
    - **images**: Image files referenced in HTML (saved to temp directory)
    """
    # Function-scope imports keep this block self-contained.
    import asyncio
    import io
    from types import SimpleNamespace

    temp_dir = None
    try:
        # Read HTML content; fall back to latin-1 when it is not valid UTF-8.
        raw_html = await html_file.read()
        try:
            html_content = raw_html.decode('utf-8')
        except UnicodeDecodeError:
            html_content = raw_html.decode('latin-1')

        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Create temp directory and convert
        temp_dir = tempfile.mkdtemp()

        # Read images into memory before conversion, because the converter
        # runs in a worker thread and reads them synchronously via ``.file``.
        images_list = []
        for img in images or ():
            if not img.filename:
                # An upload part without a filename cannot be saved to disk.
                continue
            img_bytes = await img.read()
            images_list.append(
                SimpleNamespace(
                    filename=img.filename,
                    content=img_bytes,
                    file=io.BytesIO(img_bytes),
                )
            )

        # The conversion blocks on a subprocess for up to 60 seconds; run it
        # in the default executor so the event loop keeps serving requests.
        loop = asyncio.get_running_loop()
        pdf_bytes, path_replacements = await loop.run_in_executor(
            None,
            convert_html_to_pdf,
            html_content,
            aspect_ratio,
            temp_dir,
            images_list if images_list else None,
        )

        # Return PDF
        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=converted.pdf",
                "X-Aspect-Ratio": aspect_ratio,
                "X-Path-Replacements": str(len(path_replacements)),
                "X-PDF-Size": str(len(pdf_bytes))
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
373
-
374
@app.post("/convert-base64")
async def convert_to_pdf_base64(
    html_content: str = Form(..., description="HTML content as string"),
    aspect_ratio: Optional[str] = Form(None, description="Aspect ratio: 16:9, 1:1, or 9:16"),
    auto_detect: bool = Form(True, description="Auto-detect aspect ratio from HTML")
):
    """
    Convert HTML string to PDF (for HTML without external images)

    - **html_content**: HTML content as string (required)
    - **aspect_ratio**: Page aspect ratio (optional if auto_detect=true)
    - **auto_detect**: Auto-detect aspect ratio from HTML content
    """
    # Function-scope import keeps this block self-contained.
    import asyncio

    temp_dir = None
    try:
        # Detect or use provided aspect ratio
        if auto_detect:
            aspect_ratio = detect_aspect_ratio(html_content)
        elif not aspect_ratio:
            aspect_ratio = "9:16"

        # Validate aspect ratio
        if aspect_ratio not in ["16:9", "1:1", "9:16"]:
            raise HTTPException(status_code=400, detail="Invalid aspect ratio. Must be 16:9, 1:1, or 9:16")

        # Create temp directory and convert
        temp_dir = tempfile.mkdtemp()

        # The conversion blocks on a subprocess for up to 60 seconds; run it
        # in the default executor so the event loop keeps serving requests.
        loop = asyncio.get_running_loop()
        pdf_bytes, path_replacements = await loop.run_in_executor(
            None, convert_html_to_pdf, html_content, aspect_ratio, temp_dir
        )

        return Response(
            content=pdf_bytes,
            media_type="application/pdf",
            headers={
                "Content-Disposition": "attachment; filename=converted.pdf",
                "X-Aspect-Ratio": aspect_ratio,
                "X-Path-Replacements": str(len(path_replacements)),
                "X-PDF-Size": str(len(pdf_bytes))
            }
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)
422
-
423
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on 0.0.0.0:7860.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)