PD-Tools / routers /pdf_converter.py
um41r's picture
Create routers/pdf_converter.py
558ba3c verified
import html
import io
import os
import tempfile

import pandas as pd
import pdfplumber
from fastapi import APIRouter, BackgroundTasks, File, HTTPException, UploadFile
from fastapi.responses import FileResponse
from pdf2docx import Converter
from pdf2image import convert_from_path
from pptx import Presentation
from pptx.util import Inches, Pt
from PyPDF2 import PdfReader
# Router on which the PDF conversion endpoints of this module are registered.
router = APIRouter()
# Directory in which the endpoints create their temporary input/output files.
TEMP_DIR = "/tmp/conversions"
@router.post("/to-word")
async def convert_pdf_to_word(file: UploadFile = File(...)):
    """Convert an uploaded PDF to a Word (DOCX) document.

    Args:
        file: The uploaded PDF. Its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` for the generated DOCX, named after the upload.
        The temporary DOCX is removed by a background task attached to the
        response.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            conversion fails for any reason.
    """
    # A multipart part may carry no filename at all; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    docx_path = None
    response = None
    try:
        # Create the working directory on demand so a missing directory does
        # not fail the request.
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as pdf_tmp:
            pdf_path = pdf_tmp.name
            pdf_tmp.write(await file.read())

        # Only the path is needed: close the handle immediately so the file
        # descriptor is not leaked while the converter writes to the path.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".docx", dir=TEMP_DIR
        ) as docx_tmp:
            docx_path = docx_tmp.name

        cv = Converter(pdf_path)
        try:
            cv.convert(docx_path)
        finally:
            # Release the converter even when the conversion raises.
            cv.close()

        # Delete the generated file once the response has been handled.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.unlink, docx_path)
        response = FileResponse(
            docx_path,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            filename=f"{os.path.splitext(upload_name)[0]}.docx",
            background=cleanup,
        )
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") from e
    finally:
        # The uploaded copy is never needed again; the output has to survive
        # only when a response that will stream it was actually built.
        leftovers = [pdf_path] if response is not None else [pdf_path, docx_path]
        for path in leftovers:
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup
@router.post("/to-powerpoint")
async def convert_pdf_to_powerpoint(file: UploadFile = File(...)):
    """Convert an uploaded PDF to PowerPoint (PPTX), one page image per slide.

    Each page is rendered at 150 dpi and placed, centered, inside a
    9" x 6.5" area of a 10" x 7.5" slide. The image is scaled to fit that
    area while keeping its aspect ratio.

    Args:
        file: The uploaded PDF. Its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` for the generated PPTX, named after the upload.
        The temporary PPTX is removed by a background task attached to the
        response.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            conversion fails for any reason.
    """
    # A multipart part may carry no filename at all; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    pptx_path = None
    response = None
    try:
        # Create the working directory on demand so a missing directory does
        # not fail the request.
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as pdf_tmp:
            pdf_path = pdf_tmp.name
            pdf_tmp.write(await file.read())

        # Only the path is needed: close the handle immediately so the file
        # descriptor is not leaked.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pptx", dir=TEMP_DIR
        ) as pptx_tmp:
            pptx_path = pptx_tmp.name

        # Convert PDF pages to images
        page_images = convert_from_path(pdf_path, dpi=150)

        # Create PowerPoint presentation
        prs = Presentation()
        prs.slide_width = Inches(10)
        prs.slide_height = Inches(7.5)
        blank_slide_layout = prs.slide_layouts[6]  # Blank layout
        # Largest picture size: leaves a 0.5" margin on every side.
        max_width = Inches(9)
        max_height = Inches(6.5)

        for page_image in page_images:
            slide = prs.slides.add_slide(blank_slide_layout)

            # Save image to bytes
            img_buffer = io.BytesIO()
            page_image.save(img_buffer, format="PNG")
            img_buffer.seek(0)

            # Scale uniformly so the page keeps its proportions instead of
            # being stretched to the full box, then center it on the slide.
            scale = min(
                max_width / page_image.width,
                max_height / page_image.height,
            )
            pic_width = int(page_image.width * scale)
            pic_height = int(page_image.height * scale)
            slide.shapes.add_picture(
                img_buffer,
                (prs.slide_width - pic_width) // 2,
                (prs.slide_height - pic_height) // 2,
                width=pic_width,
                height=pic_height,
            )

        prs.save(pptx_path)

        # Delete the generated file once the response has been handled.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.unlink, pptx_path)
        response = FileResponse(
            pptx_path,
            media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
            filename=f"{os.path.splitext(upload_name)[0]}.pptx",
            background=cleanup,
        )
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") from e
    finally:
        # The uploaded copy is never needed again; the output has to survive
        # only when a response that will stream it was actually built.
        leftovers = [pdf_path] if response is not None else [pdf_path, pptx_path]
        for path in leftovers:
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup
@router.post("/to-excel")
async def convert_pdf_to_excel(file: UploadFile = File(...)):
    """Convert the tables found in an uploaded PDF to an Excel (XLSX) workbook.

    Every extracted table becomes its own sheet (``Table_1``, ``Table_2``,
    ...). The first row of a table is used as the column header.

    Args:
        file: The uploaded PDF. Its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` for the generated XLSX, named after the upload.
        The temporary XLSX is removed by a background task attached to the
        response.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf`` or contains
            no tables; 500 if the conversion fails for any other reason.
    """
    # A multipart part may carry no filename at all; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    xlsx_path = None
    response = None
    try:
        # Create the working directory on demand so a missing directory does
        # not fail the request.
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as pdf_tmp:
            pdf_path = pdf_tmp.name
            pdf_tmp.write(await file.read())

        # Extract tables from PDF
        all_tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                all_tables.extend(page.extract_tables() or [])

        if not all_tables:
            raise HTTPException(status_code=400, detail="No tables found in PDF")

        # Reserve the output path only once there is something to write, and
        # close the handle right away so the descriptor is not leaked.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".xlsx", dir=TEMP_DIR
        ) as xlsx_tmp:
            xlsx_path = xlsx_tmp.name

        # Write to Excel
        with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
            for idx, table in enumerate(all_tables, start=1):
                df = pd.DataFrame(table[1:], columns=table[0] if table else None)
                df.to_excel(writer, sheet_name=f"Table_{idx}", index=False)

        # Delete the generated file once the response has been handled.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.unlink, xlsx_path)
        response = FileResponse(
            xlsx_path,
            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            filename=f"{os.path.splitext(upload_name)[0]}.xlsx",
            background=cleanup,
        )
        return response
    except HTTPException:
        # Deliberate client errors (e.g. "No tables found") pass through as-is.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") from e
    finally:
        # The uploaded copy is never needed again; the output has to survive
        # only when a response that will stream it was actually built.
        leftovers = [pdf_path] if response is not None else [pdf_path, xlsx_path]
        for path in leftovers:
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup
@router.post("/to-html")
async def convert_pdf_to_html(file: UploadFile = File(...)):
    """Convert the text of an uploaded PDF to a simple HTML document.

    Each page becomes a ``<div class='page'>`` holding a "Page N" heading and
    the page text inside ``<pre>``. The text is HTML-escaped, so characters
    such as ``<`` or ``&`` in the PDF cannot alter the generated markup.

    Args:
        file: The uploaded PDF. Its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` for the generated UTF-8 HTML file, named after the
        upload. The temporary HTML file is removed by a background task
        attached to the response.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            conversion fails for any reason.
    """
    # A multipart part may carry no filename at all; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    html_path = None
    response = None
    try:
        # Create the working directory on demand so a missing directory does
        # not fail the request.
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as pdf_tmp:
            pdf_path = pdf_tmp.name
            pdf_tmp.write(await file.read())

        # Extract text from PDF
        parts = [
            "<html><head><meta charset='UTF-8'><title>PDF Content</title>",
            "<style>body{font-family:Arial,sans-serif;margin:40px;} .page{margin-bottom:40px;page-break-after:always;}</style></head><body>",
        ]
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # Guard against a missing text result and escape the text,
                # which comes from the untrusted upload.
                page_text = html.escape(page.extract_text() or "")
                parts.append(
                    f"<div class='page'><h2>Page {page_number}</h2><pre>{page_text}</pre></div>"
                )
        parts.append("</body></html>")

        # Write as UTF-8 explicitly to match the charset declared in <meta>.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".html", dir=TEMP_DIR, mode="w", encoding="utf-8"
        ) as html_tmp:
            html_path = html_tmp.name
            html_tmp.write("".join(parts))

        # Delete the generated file once the response has been handled.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.unlink, html_path)
        response = FileResponse(
            html_path,
            media_type="text/html",
            filename=f"{os.path.splitext(upload_name)[0]}.html",
            background=cleanup,
        )
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") from e
    finally:
        # The uploaded copy is never needed again; the output has to survive
        # only when a response that will stream it was actually built.
        leftovers = [pdf_path] if response is not None else [pdf_path, html_path]
        for path in leftovers:
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup
@router.post("/to-text")
async def convert_pdf_to_text(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF into a plain-text file.

    The output contains, for every page, a ``--- Page N ---`` header, a blank
    line, the page text, and a blank line.

    Args:
        file: The uploaded PDF. Its name must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` for the generated UTF-8 text file, named after the
        upload. The temporary text file is removed by a background task
        attached to the response.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            extraction fails for any reason.
    """
    # A multipart part may carry no filename at all; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    txt_path = None
    response = None
    try:
        # Create the working directory on demand so a missing directory does
        # not fail the request.
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as pdf_tmp:
            pdf_path = pdf_tmp.name
            pdf_tmp.write(await file.read())

        reader = PdfReader(pdf_path)
        # ``or ""`` keeps a page without extractable text from breaking the
        # concatenation.
        text_content = "".join(
            f"--- Page {page_number} ---\n\n{page.extract_text() or ''}\n\n"
            for page_number, page in enumerate(reader.pages, start=1)
        )

        # Write as UTF-8 explicitly instead of relying on the locale encoding.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".txt", dir=TEMP_DIR, mode="w", encoding="utf-8"
        ) as txt_tmp:
            txt_path = txt_tmp.name
            txt_tmp.write(text_content)

        # Delete the generated file once the response has been handled.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.unlink, txt_path)
        response = FileResponse(
            txt_path,
            media_type="text/plain",
            filename=f"{os.path.splitext(upload_name)[0]}.txt",
            background=cleanup,
        )
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Conversion failed: {e}") from e
    finally:
        # The uploaded copy is never needed again; the output has to survive
        # only when a response that will stream it was actually built.
        leftovers = [pdf_path] if response is not None else [pdf_path, txt_path]
        for path in leftovers:
            if path is None:
                continue
            try:
                os.unlink(path)
            except OSError:
                pass  # best-effort cleanup