um41r committed on
Commit
558ba3c
·
verified ·
1 Parent(s): 5c83a71

Create routers/pdf_converter.py

Browse files
Files changed (1) hide show
  1. routers/pdf_converter.py +266 -0
routers/pdf_converter.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, File, UploadFile, HTTPException
2
+ from fastapi.responses import FileResponse
3
+ import os
4
+ import tempfile
5
+ from pdf2docx import Converter
6
+ import pdfplumber
7
+ import pandas as pd
8
+ from PyPDF2 import PdfReader
9
+ from pptx import Presentation
10
+ from pptx.util import Inches, Pt
11
+ from pdf2image import convert_from_path
12
+ import io
13
+
14
+ router = APIRouter()  # Router on which the PDF conversion endpoints below are registered.
15
+
16
+ TEMP_DIR = "/tmp/conversions"  # Directory passed as dir= to tempfile.NamedTemporaryFile for uploads and converted outputs.
17
+
18
@router.post("/to-word")
async def convert_pdf_to_word(file: UploadFile = File(...)):
    """Convert an uploaded PDF to a Word (DOCX) document.

    The upload is staged in ``TEMP_DIR``, converted with ``pdf2docx`` and
    returned as a download named after the uploaded file.

    Args:
        file: The uploaded file; its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated DOCX. The temporary DOCX
        is removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not a ``.pdf`` file, 500 if the
            conversion fails.
    """
    # Function-scope import: BackgroundTask ships with Starlette (the
    # framework FastAPI's FileResponse comes from) and is only needed here
    # to delete the output file after it has been sent.
    from starlette.background import BackgroundTask

    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    def _discard(path):
        """Remove a temporary file; a failure here must not mask the real result."""
        try:
            os.unlink(path)
        except OSError:
            pass

    # NamedTemporaryFile(dir=...) fails if the staging directory is missing.
    os.makedirs(TEMP_DIR, exist_ok=True)
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=TEMP_DIR)
    temp_docx = tempfile.NamedTemporaryFile(delete=False, suffix=".docx", dir=TEMP_DIR)
    # Only the path is needed; pdf2docx opens and writes the file itself.
    temp_docx.close()

    succeeded = False
    try:
        temp_pdf.write(await file.read())
        temp_pdf.close()

        cv = Converter(temp_pdf.name)
        try:
            cv.convert(temp_docx.name)
        finally:
            # Release the converter's handle on the PDF even when convert() raises.
            cv.close()

        # basename() keeps path components out of the download name.
        original_name = os.path.splitext(os.path.basename(filename))[0]
        response = FileResponse(
            temp_docx.name,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            filename=f"{original_name}.docx",
            background=BackgroundTask(_discard, temp_docx.name),
        )
        succeeded = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}") from e

    finally:
        # close() is idempotent, so this is safe on both success and failure.
        temp_pdf.close()
        _discard(temp_pdf.name)
        if not succeeded:
            # On success the background task owns the output file instead.
            _discard(temp_docx.name)
58
+
59
@router.post("/to-powerpoint")
async def convert_pdf_to_powerpoint(file: UploadFile = File(...)):
    """Convert an uploaded PDF to PowerPoint (PPTX), one page image per slide.

    Each page is rasterised at 150 dpi and placed on a blank 10 x 7.5 inch
    slide, scaled to fit inside a 0.5 inch margin without changing its
    aspect ratio, and centred.

    Args:
        file: The uploaded file; its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated PPTX. The temporary PPTX
        is removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not a ``.pdf`` file, 500 if the
            conversion fails.
    """
    # Function-scope import: BackgroundTask ships with Starlette (the
    # framework FastAPI's FileResponse comes from) and is only needed here
    # to delete the output file after it has been sent.
    from starlette.background import BackgroundTask

    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    def _discard(path):
        """Remove a temporary file; a failure here must not mask the real result."""
        try:
            os.unlink(path)
        except OSError:
            pass

    # NamedTemporaryFile(dir=...) fails if the staging directory is missing.
    os.makedirs(TEMP_DIR, exist_ok=True)
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=TEMP_DIR)
    temp_pptx = tempfile.NamedTemporaryFile(delete=False, suffix=".pptx", dir=TEMP_DIR)
    # Only the path is needed; python-pptx opens and writes the file itself.
    temp_pptx.close()

    succeeded = False
    try:
        temp_pdf.write(await file.read())
        temp_pdf.close()

        # Convert PDF pages to images
        images = convert_from_path(temp_pdf.name, dpi=150)

        # Create PowerPoint presentation
        prs = Presentation()
        prs.slide_width = Inches(10)
        prs.slide_height = Inches(7.5)
        blank_slide_layout = prs.slide_layouts[6]  # Blank layout

        margin = Inches(0.5)
        max_width = prs.slide_width - 2 * margin
        max_height = prs.slide_height - 2 * margin

        for image in images:
            slide = prs.slides.add_slide(blank_slide_layout)

            # Save image to bytes
            img_buffer = io.BytesIO()
            image.save(img_buffer, format="PNG")
            img_buffer.seek(0)

            # Scale uniformly so the page fits the printable area; forcing
            # both width and height to fixed values would stretch pages whose
            # aspect ratio differs from the slide's.
            scale = min(max_width / image.width, max_height / image.height)
            width = int(image.width * scale)
            height = int(image.height * scale)
            left = (prs.slide_width - width) // 2
            top = (prs.slide_height - height) // 2

            slide.shapes.add_picture(img_buffer, left, top, width=width, height=height)

        prs.save(temp_pptx.name)

        # basename() keeps path components out of the download name.
        original_name = os.path.splitext(os.path.basename(filename))[0]
        response = FileResponse(
            temp_pptx.name,
            media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
            filename=f"{original_name}.pptx",
            background=BackgroundTask(_discard, temp_pptx.name),
        )
        succeeded = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}") from e

    finally:
        # close() is idempotent, so this is safe on both success and failure.
        temp_pdf.close()
        _discard(temp_pdf.name)
        if not succeeded:
            # On success the background task owns the output file instead.
            _discard(temp_pptx.name)
123
+
124
@router.post("/to-excel")
async def convert_pdf_to_excel(file: UploadFile = File(...)):
    """Convert the tables found in an uploaded PDF to an Excel (XLSX) workbook.

    Tables are extracted page by page with ``pdfplumber``; each table is
    written to its own sheet (``Table_1``, ``Table_2``, ...) with its first
    row used as the column header.

    Args:
        file: The uploaded file; its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated XLSX. The temporary XLSX
        is removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not a ``.pdf`` file or contains
            no tables, 500 if the conversion fails.
    """
    # Function-scope import: BackgroundTask ships with Starlette (the
    # framework FastAPI's FileResponse comes from) and is only needed here
    # to delete the output file after it has been sent.
    from starlette.background import BackgroundTask

    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    def _discard(path):
        """Remove a temporary file; a failure here must not mask the real result."""
        try:
            os.unlink(path)
        except OSError:
            pass

    # NamedTemporaryFile(dir=...) fails if the staging directory is missing.
    os.makedirs(TEMP_DIR, exist_ok=True)
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=TEMP_DIR)
    temp_xlsx = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx", dir=TEMP_DIR)
    # Only the path is needed; pandas opens and writes the file itself.
    temp_xlsx.close()

    succeeded = False
    try:
        temp_pdf.write(await file.read())
        temp_pdf.close()

        # Extract tables from PDF
        all_tables = []
        with pdfplumber.open(temp_pdf.name) as pdf:
            for page in pdf.pages:
                all_tables.extend(page.extract_tables() or [])

        if not all_tables:
            raise HTTPException(status_code=400, detail="No tables found in PDF")

        # Write to Excel
        with pd.ExcelWriter(temp_xlsx.name, engine="openpyxl") as writer:
            for idx, table in enumerate(all_tables, start=1):
                # First row is the header; an empty table yields an empty sheet.
                df = pd.DataFrame(table[1:], columns=table[0] if table else None)
                df.to_excel(writer, sheet_name=f"Table_{idx}", index=False)

        # basename() keeps path components out of the download name.
        original_name = os.path.splitext(os.path.basename(filename))[0]
        response = FileResponse(
            temp_xlsx.name,
            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            filename=f"{original_name}.xlsx",
            background=BackgroundTask(_discard, temp_xlsx.name),
        )
        succeeded = True
        return response

    except HTTPException:
        # Deliberate client errors (e.g. "No tables found") pass through as-is.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}") from e

    finally:
        # close() is idempotent, so this is safe on both success and failure.
        temp_pdf.close()
        _discard(temp_pdf.name)
        if not succeeded:
            # On success the background task owns the output file instead.
            _discard(temp_xlsx.name)
176
+
177
@router.post("/to-html")
async def convert_pdf_to_html(file: UploadFile = File(...)):
    """Convert an uploaded PDF to a simple HTML document.

    The text of every page is extracted with ``pdfplumber`` and emitted as
    one ``<div class='page'>`` per page, with the text inside a ``<pre>``.

    Args:
        file: The uploaded file; its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated HTML. The temporary HTML
        file is removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not a ``.pdf`` file, 500 if the
            conversion fails.
    """
    # Function-scope imports: html.escape is needed to neutralise markup in
    # the extracted text; BackgroundTask (shipped with Starlette, the
    # framework FastAPI's FileResponse comes from) deletes the output file
    # after it has been sent.
    import html
    from starlette.background import BackgroundTask

    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    def _discard(path):
        """Remove a temporary file; a failure here must not mask the real result."""
        try:
            os.unlink(path)
        except OSError:
            pass

    # NamedTemporaryFile(dir=...) fails if the staging directory is missing.
    os.makedirs(TEMP_DIR, exist_ok=True)
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=TEMP_DIR)
    # The page declares charset UTF-8, so the file must be written as UTF-8
    # rather than in the platform's locale encoding.
    temp_html = tempfile.NamedTemporaryFile(
        delete=False, suffix=".html", dir=TEMP_DIR, mode="w", encoding="utf-8"
    )

    succeeded = False
    try:
        temp_pdf.write(await file.read())
        temp_pdf.close()

        parts = [
            "<html><head><meta charset='UTF-8'><title>PDF Content</title>",
            "<style>body{font-family:Arial,sans-serif;margin:40px;} .page{margin-bottom:40px;page-break-after:always;}</style></head><body>",
        ]

        # Extract text from PDF
        with pdfplumber.open(temp_pdf.name) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # extract_text() may yield None for pages without text.
                text = page.extract_text() or ""
                # Escape so '<', '>' and '&' from the PDF cannot inject markup.
                parts.append(
                    f"<div class='page'><h2>Page {page_number}</h2>"
                    f"<pre>{html.escape(text)}</pre></div>"
                )

        parts.append("</body></html>")

        temp_html.write("".join(parts))
        temp_html.close()

        # basename() keeps path components out of the download name.
        original_name = os.path.splitext(os.path.basename(filename))[0]
        response = FileResponse(
            temp_html.name,
            media_type="text/html",
            filename=f"{original_name}.html",
            background=BackgroundTask(_discard, temp_html.name),
        )
        succeeded = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}") from e

    finally:
        # close() is idempotent, so this is safe on both success and failure.
        temp_pdf.close()
        _discard(temp_pdf.name)
        temp_html.close()
        if not succeeded:
            # On success the background task owns the output file instead.
            _discard(temp_html.name)
223
+
224
@router.post("/to-text")
async def convert_pdf_to_text(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF into a plain-text file.

    Pages are read with ``PyPDF2`` and written in order, each preceded by a
    ``--- Page N ---`` separator line.

    Args:
        file: The uploaded file; its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated text file. The temporary
        file is removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not a ``.pdf`` file, 500 if the
            extraction fails.
    """
    # Function-scope import: BackgroundTask ships with Starlette (the
    # framework FastAPI's FileResponse comes from) and is only needed here
    # to delete the output file after it has been sent.
    from starlette.background import BackgroundTask

    filename = file.filename or ""
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    def _discard(path):
        """Remove a temporary file; a failure here must not mask the real result."""
        try:
            os.unlink(path)
        except OSError:
            pass

    # NamedTemporaryFile(dir=...) fails if the staging directory is missing.
    os.makedirs(TEMP_DIR, exist_ok=True)
    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf", dir=TEMP_DIR)
    # Write UTF-8 explicitly: PDF text routinely contains characters the
    # platform's locale encoding cannot represent.
    temp_txt = tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", dir=TEMP_DIR, mode="w", encoding="utf-8"
    )

    succeeded = False
    try:
        temp_pdf.write(await file.read())
        temp_pdf.close()

        reader = PdfReader(temp_pdf.name)
        parts = []
        for page_number, page in enumerate(reader.pages, start=1):
            # Guard against extract_text() returning None for empty pages.
            page_text = page.extract_text() or ""
            parts.append(f"--- Page {page_number} ---\n\n{page_text}\n\n")

        temp_txt.write("".join(parts))
        temp_txt.close()

        # basename() keeps path components out of the download name.
        original_name = os.path.splitext(os.path.basename(filename))[0]
        response = FileResponse(
            temp_txt.name,
            media_type="text/plain",
            filename=f"{original_name}.txt",
            background=BackgroundTask(_discard, temp_txt.name),
        )
        succeeded = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}") from e

    finally:
        # close() is idempotent, so this is safe on both success and failure.
        temp_pdf.close()
        _discard(temp_pdf.name)
        temp_txt.close()
        if not succeeded:
            # On success the background task owns the output file instead.
            _discard(temp_txt.name)