ABDALLALSWAITI committed on
Commit
5b2c21f
·
verified ·
1 Parent(s): 45054fd

Update api.py

Browse files
Files changed (1) hide show
  1. api.py +132 -134
api.py CHANGED
@@ -1,17 +1,17 @@
1
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
2
- from fastapi.responses import Response, JSONResponse
3
  from fastapi.middleware.cors import CORSMiddleware
4
  import subprocess
5
  import os
6
  import tempfile
7
  import shutil
8
- from pathlib import Path
9
  import re
 
10
  from typing import List, Optional
11
 
12
  app = FastAPI(title="HTML to PDF Converter API")
13
 
14
- # Enable CORS
15
  app.add_middleware(
16
  CORSMiddleware,
17
  allow_origins=["*"],
@@ -21,21 +21,15 @@ app.add_middleware(
21
  )
22
 
23
  def detect_aspect_ratio(html_content):
24
- """
25
- Detect aspect ratio from HTML content
26
- Returns: "16:9", "1:1", or "9:16"
27
- """
28
- # Check for viewport meta tag
29
  viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
30
  if viewport_match:
31
  viewport = viewport_match.group(1).lower()
32
- if 'width=device-width' in viewport or 'width=100%' in viewport:
33
- if 'orientation=portrait' in viewport:
34
- return "9:16"
35
- elif 'orientation=landscape' in viewport:
36
- return "16:9"
37
 
38
- # Check for CSS aspect-ratio property
39
  aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
40
  if aspect_match:
41
  width = int(aspect_match.group(1))
@@ -48,110 +42,106 @@ def detect_aspect_ratio(html_content):
48
  else:
49
  return "1:1"
50
 
51
- # Check for common presentation frameworks
52
  if any(keyword in html_content.lower() for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']):
53
  return "16:9"
54
 
55
- # Default to A4 portrait
56
  return "9:16"
57
 
58
- def save_uploaded_images(images: List[UploadFile], temp_dir: str):
59
- """Save uploaded images and return mapping"""
60
- image_mapping = {}
61
- images_dir = os.path.join(temp_dir, "images")
62
- os.makedirs(images_dir, exist_ok=True)
63
-
64
- for image in images:
65
- # Save image
66
- image_path = os.path.join(images_dir, image.filename)
67
- with open(image_path, 'wb') as f:
68
- content = image.file.read()
69
- f.write(content)
 
 
 
 
70
 
71
- # Create mapping
72
- image_mapping[image.filename] = f"images/{image.filename}"
73
- print(f"API: Saved image: {image.filename} -> {image_path}")
74
-
75
- return image_mapping
 
 
76
 
77
- def process_html_with_images(html_content: str, temp_dir: str, image_mapping: dict):
78
- """Process HTML to handle image references with absolute file paths"""
79
- replacements_made = []
 
 
 
 
80
 
81
- for original_name, relative_path in image_mapping.items():
82
- # Get absolute path for the image
83
- absolute_path = os.path.abspath(os.path.join(temp_dir, relative_path))
84
- file_url = f"file://{absolute_path}"
85
-
86
- # Escape the filename for regex
87
- escaped_name = re.escape(original_name)
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
- # Pattern 1: src with any path prefix
90
- pattern1 = rf'src=(["\'])(?:[^"\']*?/)?{escaped_name}\1'
91
- matches1 = re.findall(pattern1, html_content, flags=re.IGNORECASE)
92
- html_content = re.sub(
93
- pattern1,
94
- f'src=\\1{file_url}\\1',
95
- html_content,
96
- flags=re.IGNORECASE
97
- )
98
  if matches1:
99
- replacements_made.append(f"Pattern 1 (src): Found {len(matches1)} matches for {original_name}")
 
100
 
101
- # Pattern 2: url() with any path prefix
102
- pattern2 = rf'url\((["\']?)(?:[^)"\']*/)?{escaped_name}\1\)'
103
- matches2 = re.findall(pattern2, html_content, flags=re.IGNORECASE)
104
- html_content = re.sub(
105
- pattern2,
106
- f'url("{file_url}")',
107
- html_content,
108
- flags=re.IGNORECASE
109
- )
110
  if matches2:
111
- replacements_made.append(f"Pattern 2 (url): Found {len(matches2)} matches for {original_name}")
 
112
 
113
- # Pattern 3: href with any path prefix
114
- pattern3 = rf'href=(["\'])(?:[^"\']*?/)?{escaped_name}\1'
115
- matches3 = re.findall(pattern3, html_content, flags=re.IGNORECASE)
116
- html_content = re.sub(
117
- pattern3,
118
- f'href=\\1{file_url}\\1',
119
- html_content,
120
- flags=re.IGNORECASE
121
- )
122
  if matches3:
123
- replacements_made.append(f"Pattern 3 (href): Found {len(matches3)} matches for {original_name}")
 
124
 
125
- # Print debug info
126
- if replacements_made:
127
- print("=== API Image Replacements Made ===")
128
- for msg in replacements_made:
129
- print(f" ✓ {msg}")
130
  else:
131
- print("=== API WARNING: No image replacements made ===")
132
- print(f"Looking for images: {list(image_mapping.keys())}")
133
 
134
- return html_content
135
 
136
  def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
137
- """
138
- Convert HTML content to PDF using Puppeteer
139
-
140
- Args:
141
- html_content: String containing HTML content
142
- aspect_ratio: One of "16:9", "1:1", or "9:16"
143
- temp_dir: Temporary directory for processing
144
-
145
- Returns:
146
- Tuple of (pdf_bytes, error_message)
147
- """
148
  try:
149
- # Inject CSS to preserve styles better
150
  style_injection = """
151
  <style>
152
- @page {
153
- margin: 0;
154
- }
155
  * {
156
  -webkit-print-color-adjust: exact !important;
157
  print-color-adjust: exact !important;
@@ -164,7 +154,6 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
164
  </style>
165
  """
166
 
167
- # Insert style injection
168
  if '</head>' in html_content:
169
  html_content = html_content.replace('</head>', style_injection + '</head>')
170
  elif '<body' in html_content:
@@ -172,57 +161,57 @@ def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
172
  else:
173
  html_content = style_injection + html_content
174
 
175
- # Save HTML content to temporary file
176
  html_file = os.path.join(temp_dir, "input.html")
177
  with open(html_file, 'w', encoding='utf-8') as f:
178
  f.write(html_content)
179
 
180
- # Get the path to puppeteer_pdf.js
 
 
181
  puppeteer_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'puppeteer_pdf.js')
182
 
183
- print(f"API: Running Puppeteer conversion with aspect ratio: {aspect_ratio}")
184
- print(f"API: HTML file: {html_file}")
185
- print(f"API: Puppeteer script: {puppeteer_script}")
186
 
187
- # Run Node.js script to convert HTML to PDF
 
 
188
  result = subprocess.run(
189
  ['node', puppeteer_script, html_file, aspect_ratio],
190
  capture_output=True,
191
  text=True,
192
  timeout=60,
193
- cwd=os.path.dirname(os.path.abspath(__file__))
194
  )
195
 
196
  if result.returncode != 0:
197
- print(f"API: Puppeteer error: {result.stderr}")
198
  return None, f"PDF conversion failed: {result.stderr}"
199
 
200
- # Get the generated PDF path
201
  pdf_file = html_file.replace('.html', '.pdf')
202
-
203
  if not os.path.exists(pdf_file):
204
  return None, "PDF file was not generated"
205
 
206
- # Read PDF file into memory
207
  with open(pdf_file, 'rb') as f:
208
  pdf_bytes = f.read()
209
 
210
- print(f"API: PDF generated successfully, size: {len(pdf_bytes)} bytes")
211
  return pdf_bytes, None
212
 
213
  except subprocess.TimeoutExpired:
214
  return None, "Error: PDF conversion timed out (60 seconds)"
215
  except Exception as e:
216
- print(f"API: Conversion error: {str(e)}")
217
  return None, f"Error: {str(e)}"
218
 
219
  @app.get("/")
220
  async def root():
221
  return {
222
- "message": "HTML to PDF Converter API",
223
- "version": "2.0",
224
  "endpoints": {
225
- "/convert": "POST - Convert HTML to PDF (supports file upload or raw HTML)",
226
  "/health": "GET - Health check"
227
  }
228
  }
@@ -240,17 +229,17 @@ async def convert_to_pdf(
240
  images: Optional[List[UploadFile]] = File(None)
241
  ):
242
  """
243
- Convert HTML to PDF
244
 
245
  Parameters:
246
  - html_file: HTML file upload (optional)
247
  - html_content: Raw HTML content (optional, used if html_file not provided)
248
  - aspect_ratio: "16:9", "1:1", or "9:16" (optional if auto_detect is True)
249
  - auto_detect: Auto-detect aspect ratio from HTML (default: True)
250
- - images: List of image files referenced in the HTML (optional)
251
 
252
  Returns:
253
- - PDF file as bytes
254
  """
255
  temp_dir = None
256
 
@@ -271,35 +260,38 @@ async def convert_to_pdf(
271
  html = html_content
272
  filename = "converted.pdf"
273
 
 
 
 
274
  # Create temp directory
275
  temp_dir = tempfile.mkdtemp()
276
- print(f"API: Created temp directory: {temp_dir}")
277
 
278
- # Process images if provided
279
  if images:
280
- print(f"API: Processing {len(images)} uploaded images")
281
- image_mapping = save_uploaded_images(images, temp_dir)
282
- html = process_html_with_images(html, temp_dir, image_mapping)
283
- print(f"API: Image processing complete")
 
 
 
 
 
284
 
285
  # Determine aspect ratio
286
  if auto_detect or not aspect_ratio:
287
  detected_ratio = detect_aspect_ratio(html)
288
  aspect_ratio = detected_ratio
289
- print(f"API: Auto-detected aspect ratio: {aspect_ratio}")
290
  else:
291
- # Validate aspect ratio
292
  if aspect_ratio not in ["16:9", "1:1", "9:16"]:
293
  raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
294
- print(f"API: Using specified aspect ratio: {aspect_ratio}")
295
 
296
  # Convert to PDF
297
  pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
298
 
299
- # Cleanup
300
- if temp_dir:
301
- shutil.rmtree(temp_dir, ignore_errors=True)
302
-
303
  if error:
304
  raise HTTPException(status_code=500, detail=error)
305
 
@@ -308,23 +300,29 @@ async def convert_to_pdf(
308
  if not output_filename.endswith('.pdf'):
309
  output_filename = 'converted.pdf'
310
 
311
- # Return PDF as response
 
 
 
312
  return Response(
313
  content=pdf_bytes,
314
  media_type="application/pdf",
315
  headers={
316
  "Content-Disposition": f"attachment; filename={output_filename}",
317
- "X-Aspect-Ratio": aspect_ratio
 
318
  }
319
  )
320
 
321
  except HTTPException:
322
  raise
323
  except Exception as e:
324
- if temp_dir:
325
- shutil.rmtree(temp_dir, ignore_errors=True)
326
- print(f"API: Error in convert endpoint: {str(e)}")
327
  raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
328
 
329
  if __name__ == "__main__":
330
  import uvicorn
 
1
  from fastapi import FastAPI, File, UploadFile, Form, HTTPException
2
+ from fastapi.responses import Response
3
  from fastapi.middleware.cors import CORSMiddleware
4
  import subprocess
5
  import os
6
  import tempfile
7
  import shutil
8
+ import base64
9
  import re
10
+ import mimetypes
11
  from typing import List, Optional
12
 
13
  app = FastAPI(title="HTML to PDF Converter API")
14
 
 
15
  app.add_middleware(
16
  CORSMiddleware,
17
  allow_origins=["*"],
 
21
  )
22
 
23
  def detect_aspect_ratio(html_content):
24
+ """Detect aspect ratio from HTML content"""
 
 
 
 
25
  viewport_match = re.search(r'<meta[^>]*viewport[^>]*content=["\']([^"\']*)["\']', html_content, re.IGNORECASE)
26
  if viewport_match:
27
  viewport = viewport_match.group(1).lower()
28
+ if 'orientation=portrait' in viewport:
29
+ return "9:16"
30
+ elif 'orientation=landscape' in viewport:
31
+ return "16:9"
 
32
 
 
33
  aspect_match = re.search(r'aspect-ratio\s*:\s*(\d+)\s*/\s*(\d+)', html_content, re.IGNORECASE)
34
  if aspect_match:
35
  width = int(aspect_match.group(1))
 
42
  else:
43
  return "1:1"
44
 
 
45
  if any(keyword in html_content.lower() for keyword in ['reveal.js', 'impress.js', 'slide', 'presentation']):
46
  return "16:9"
47
 
 
48
  return "9:16"
49
 
50
+ def image_to_base64(image_bytes, filename):
51
+ """Convert image bytes to base64 data URL"""
52
+ try:
53
+ mime_type, _ = mimetypes.guess_type(filename)
54
+ if not mime_type:
55
+ ext = os.path.splitext(filename)[1].lower()
56
+ mime_map = {
57
+ '.jpg': 'image/jpeg',
58
+ '.jpeg': 'image/jpeg',
59
+ '.png': 'image/png',
60
+ '.gif': 'image/gif',
61
+ '.svg': 'image/svg+xml',
62
+ '.webp': 'image/webp',
63
+ '.bmp': 'image/bmp'
64
+ }
65
+ mime_type = mime_map.get(ext, 'image/png')
66
 
67
+ b64_data = base64.b64encode(image_bytes).decode('utf-8')
68
+ data_url = f"data:{mime_type};base64,{b64_data}"
69
+
70
+ return data_url
71
+ except Exception as e:
72
+ print(f"Error converting {filename} to base64: {str(e)}")
73
+ return None
74
 
75
+ def embed_images_as_base64(html_content, images: List[UploadFile]):
76
+ """
77
+ Embed all images directly as base64 data URLs in the HTML
78
+ This ensures images are always included in the PDF
79
+ """
80
+ if not images:
81
+ return html_content, {}
82
 
83
+ # Create mapping of filename to base64 data URL
84
+ image_data_urls = {}
85
+ for img in images:
86
+ img.file.seek(0)
87
+ image_bytes = img.file.read()
88
+ data_url = image_to_base64(image_bytes, img.filename)
89
+ if data_url:
90
+ image_data_urls[img.filename] = data_url
91
+ print(f"✓ Converted {img.filename} to base64 ({len(data_url)} chars)")
92
+
93
+ if not image_data_urls:
94
+ return html_content, {}
95
+
96
+ # Track replacements
97
+ replacements = {}
98
+
99
+ for filename, data_url in image_data_urls.items():
100
+ # Escape filename for regex
101
+ escaped_name = re.escape(filename)
102
 
103
+ # Pattern 1: img src attribute
104
+ pattern1 = rf'(<img[^>]*\s+src\s*=\s*)(["\'])(?:[^"\']*?/)?{escaped_name}\2'
105
+ matches1 = list(re.finditer(pattern1, html_content, flags=re.IGNORECASE | re.DOTALL))
106
+ count1 = len(matches1)
 
 
 
 
 
107
  if matches1:
108
+ html_content = re.sub(pattern1, rf'\1\2{data_url}\2', html_content, flags=re.IGNORECASE | re.DOTALL)
109
+ replacements[f"{filename} (img src)"] = count1
110
 
111
+ # Pattern 2: background-image
112
+ pattern2 = rf'(background-image\s*:\s*url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
113
+ matches2 = list(re.finditer(pattern2, html_content, flags=re.IGNORECASE))
114
+ count2 = len(matches2)
 
 
 
 
 
115
  if matches2:
116
+ html_content = re.sub(pattern2, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
117
+ replacements[f"{filename} (bg-image)"] = count2
118
 
119
+ # Pattern 3: CSS url()
120
+ pattern3 = rf'(url\s*\()(["\']?)(?:[^)"\']*/)?{escaped_name}\2(\))'
121
+ matches3 = list(re.finditer(pattern3, html_content, flags=re.IGNORECASE))
122
+ count3 = len(matches3)
 
 
 
 
 
123
  if matches3:
124
+ html_content = re.sub(pattern3, rf'\1"{data_url}"\3', html_content, flags=re.IGNORECASE)
125
+ replacements[f"{filename} (url)"] = count3
126
 
127
+ # Log results
128
+ if replacements:
129
+ print("=== Image Replacements ===")
130
+ for key, count in replacements.items():
131
+ print(f" ✓ {key}: {count} replacement(s)")
132
  else:
133
+ print("=== WARNING: No image replacements made ===")
134
+ print(f"Looking for: {list(image_data_urls.keys())}")
135
 
136
+ return html_content, replacements
137
 
138
  def convert_html_to_pdf(html_content: str, aspect_ratio: str, temp_dir: str):
139
+ """Convert HTML content to PDF using Puppeteer"""
 
 
 
 
 
 
 
 
 
 
140
  try:
141
+ # Inject CSS to preserve styles
142
  style_injection = """
143
  <style>
144
+ @page { margin: 0; }
 
 
145
  * {
146
  -webkit-print-color-adjust: exact !important;
147
  print-color-adjust: exact !important;
 
154
  </style>
155
  """
156
 
 
157
  if '</head>' in html_content:
158
  html_content = html_content.replace('</head>', style_injection + '</head>')
159
  elif '<body' in html_content:
 
161
  else:
162
  html_content = style_injection + html_content
163
 
164
+ # Save HTML to temp file
165
  html_file = os.path.join(temp_dir, "input.html")
166
  with open(html_file, 'w', encoding='utf-8') as f:
167
  f.write(html_content)
168
 
169
+ print(f"Saved HTML: {os.path.getsize(html_file):,} bytes")
170
+
171
+ # Find puppeteer script
172
  puppeteer_script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'puppeteer_pdf.js')
173
 
174
+ if not os.path.exists(puppeteer_script):
175
+ return None, f"Error: puppeteer_pdf.js not found at {puppeteer_script}"
 
176
 
177
+ print(f"Using Puppeteer: {puppeteer_script}")
178
+
179
+ # Run conversion
180
  result = subprocess.run(
181
  ['node', puppeteer_script, html_file, aspect_ratio],
182
  capture_output=True,
183
  text=True,
184
  timeout=60,
185
+ cwd=os.path.dirname(os.path.abspath(puppeteer_script))
186
  )
187
 
188
  if result.returncode != 0:
 
189
  return None, f"PDF conversion failed: {result.stderr}"
190
 
191
+ # Read PDF
192
  pdf_file = html_file.replace('.html', '.pdf')
 
193
  if not os.path.exists(pdf_file):
194
  return None, "PDF file was not generated"
195
 
 
196
  with open(pdf_file, 'rb') as f:
197
  pdf_bytes = f.read()
198
 
199
+ print(f"PDF generated: {len(pdf_bytes):,} bytes")
200
  return pdf_bytes, None
201
 
202
  except subprocess.TimeoutExpired:
203
  return None, "Error: PDF conversion timed out (60 seconds)"
204
  except Exception as e:
205
+ print(f"Conversion error: {str(e)}")
206
  return None, f"Error: {str(e)}"
207
 
208
@app.get("/")
async def root():
    """Return the service banner: name, version and available endpoints."""
    endpoints = {
        "/convert": "POST - Convert HTML to PDF (images embedded as base64)",
        "/health": "GET - Health check",
    }
    return {
        "message": "HTML to PDF Converter API with Base64 Image Embedding",
        "version": "2.1",
        "endpoints": endpoints,
    }
 
229
  images: Optional[List[UploadFile]] = File(None)
230
  ):
231
  """
232
+ Convert HTML to PDF with embedded base64 images
233
 
234
  Parameters:
235
  - html_file: HTML file upload (optional)
236
  - html_content: Raw HTML content (optional, used if html_file not provided)
237
  - aspect_ratio: "16:9", "1:1", or "9:16" (optional if auto_detect is True)
238
  - auto_detect: Auto-detect aspect ratio from HTML (default: True)
239
+ - images: List of image files - will be embedded as base64 in HTML (optional)
240
 
241
  Returns:
242
+ - PDF file as bytes with images embedded
243
  """
244
  temp_dir = None
245
 
 
260
  html = html_content
261
  filename = "converted.pdf"
262
 
263
+ print(f"\n{'='*60}")
264
+ print(f"Processing HTML: {len(html)} characters")
265
+
266
  # Create temp directory
267
  temp_dir = tempfile.mkdtemp()
268
+ print(f"Temp directory: {temp_dir}")
269
 
270
+ # Embed images as base64 if provided
271
  if images:
272
+ print(f"Processing {len(images)} uploaded images...")
273
+ html, replacements = embed_images_as_base64(html, images)
274
+
275
+ if replacements:
276
+ print(f"Successfully embedded {len(replacements)} image reference(s)")
277
+ else:
278
+ print("WARNING: Images uploaded but no matches found in HTML")
279
+ else:
280
+ print("No images provided")
281
 
282
  # Determine aspect ratio
283
  if auto_detect or not aspect_ratio:
284
  detected_ratio = detect_aspect_ratio(html)
285
  aspect_ratio = detected_ratio
286
+ print(f"Auto-detected aspect ratio: {aspect_ratio}")
287
  else:
 
288
  if aspect_ratio not in ["16:9", "1:1", "9:16"]:
289
  raise HTTPException(status_code=400, detail="Invalid aspect_ratio. Must be '16:9', '1:1', or '9:16'")
290
+ print(f"Using specified aspect ratio: {aspect_ratio}")
291
 
292
  # Convert to PDF
293
  pdf_bytes, error = convert_html_to_pdf(html, aspect_ratio, temp_dir)
294
 
 
 
 
 
295
  if error:
296
  raise HTTPException(status_code=500, detail=error)
297
 
 
300
  if not output_filename.endswith('.pdf'):
301
  output_filename = 'converted.pdf'
302
 
303
+ print(f"Success! Generated {output_filename}")
304
+ print(f"{'='*60}\n")
305
+
306
+ # Return PDF
307
  return Response(
308
  content=pdf_bytes,
309
  media_type="application/pdf",
310
  headers={
311
  "Content-Disposition": f"attachment; filename={output_filename}",
312
+ "X-Aspect-Ratio": aspect_ratio,
313
+ "X-Images-Embedded": str(len(images)) if images else "0"
314
  }
315
  )
316
 
317
  except HTTPException:
318
  raise
319
  except Exception as e:
320
+ print(f"Error in convert endpoint: {str(e)}")
 
 
321
  raise HTTPException(status_code=500, detail=str(e))
322
+ finally:
323
+ # Cleanup
324
+ if temp_dir and os.path.exists(temp_dir):
325
+ shutil.rmtree(temp_dir, ignore_errors=True)
326
 
327
  if __name__ == "__main__":
328
  import uvicorn