| from fastapi import APIRouter, File, UploadFile, HTTPException |
| from fastapi.responses import FileResponse |
| import os |
| import tempfile |
| from pdf2docx import Converter |
| import pdfplumber |
| import pandas as pd |
| from PyPDF2 import PdfReader |
| from pptx import Presentation |
| from pptx.util import Inches, Pt |
| from pdf2image import convert_from_path |
| import io |
|
|
# Router on which the PDF conversion endpoints in this module are registered.
router = APIRouter()


# Directory passed as ``dir=`` to tempfile.NamedTemporaryFile by the endpoints
# in this module; uploaded PDFs and converted outputs are written here.
TEMP_DIR = "/tmp/conversions"
|
|
@router.post("/to-word")
async def convert_pdf_to_word(file: UploadFile = File(...)):
    """Convert an uploaded PDF to a Word (DOCX) document.

    The upload is spooled to a temporary file in ``TEMP_DIR``, converted with
    pdf2docx, and returned as a download named after the original file. The
    conversion runs synchronously inside this coroutine.

    Args:
        file: The uploaded PDF. Its filename must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated DOCX. The generated file is
        removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            conversion fails for any reason.
    """
    # Imported locally so this handler does not rely on the module's import
    # block being changed.
    from fastapi import BackgroundTasks

    def _discard(path):
        """Delete *path* if set; cleanup is best-effort and must not raise."""
        if path is None:
            return
        try:
            os.unlink(path)
        except OSError:
            # Already gone or not removable; nothing useful to do here.
            pass

    # ``filename`` may be missing on the upload; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    docx_path = None
    delivered = False
    try:
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as temp_pdf:
            pdf_path = temp_pdf.name
            temp_pdf.write(await file.read())

        # Only the path is needed; close the handle straight away so the
        # converter can write to it and no descriptor is leaked.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".docx", dir=TEMP_DIR
        ) as temp_docx:
            docx_path = temp_docx.name

        cv = Converter(pdf_path)
        try:
            cv.convert(docx_path)
        finally:
            cv.close()

        original_name = os.path.splitext(upload_name)[0]
        output_filename = f"{original_name}.docx"

        # The response streams the file after this function returns, so the
        # output can only be deleted afterwards, from a background task.
        cleanup = BackgroundTasks()
        cleanup.add_task(_discard, docx_path)

        response = FileResponse(
            docx_path,
            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            filename=output_filename,
            background=cleanup,
        )
        delivered = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Conversion failed: {str(e)}"
        ) from e

    finally:
        _discard(pdf_path)
        if not delivered:
            _discard(docx_path)
|
|
@router.post("/to-powerpoint")
async def convert_pdf_to_powerpoint(file: UploadFile = File(...)):
    """Convert an uploaded PDF to PowerPoint (PPTX), one page image per slide.

    Each PDF page is rasterised at 150 DPI and placed on its own blank
    10 x 7.5 inch slide inside a fixed 9 x 6.5 inch box with a 0.5 inch
    offset from the top-left corner (the image is scaled to that box). The
    conversion runs synchronously inside this coroutine.

    Args:
        file: The uploaded PDF. Its filename must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated PPTX. The generated file is
        removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            conversion fails for any reason.
    """
    # Imported locally so this handler does not rely on the module's import
    # block being changed.
    from fastapi import BackgroundTasks

    def _discard(path):
        """Delete *path* if set; cleanup is best-effort and must not raise."""
        if path is None:
            return
        try:
            os.unlink(path)
        except OSError:
            # Already gone or not removable; nothing useful to do here.
            pass

    # ``filename`` may be missing on the upload; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    pptx_path = None
    delivered = False
    try:
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as temp_pdf:
            pdf_path = temp_pdf.name
            temp_pdf.write(await file.read())

        # Only the path is needed; close the handle straight away so the
        # presentation can be saved to it and no descriptor is leaked.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pptx", dir=TEMP_DIR
        ) as temp_pptx:
            pptx_path = temp_pptx.name

        page_images = convert_from_path(pdf_path, dpi=150)

        prs = Presentation()
        prs.slide_width = Inches(10)
        prs.slide_height = Inches(7.5)

        # Loop-invariant slide layout and picture geometry.
        blank_slide_layout = prs.slide_layouts[6]
        left = Inches(0.5)
        top = Inches(0.5)
        width = Inches(9)
        height = Inches(6.5)

        for page_image in page_images:
            slide = prs.slides.add_slide(blank_slide_layout)

            # add_picture accepts a file-like object, so encode the page as
            # PNG in memory instead of writing another temp file.
            img_buffer = io.BytesIO()
            page_image.save(img_buffer, format='PNG')
            img_buffer.seek(0)

            slide.shapes.add_picture(
                img_buffer, left, top, width=width, height=height
            )

        prs.save(pptx_path)

        original_name = os.path.splitext(upload_name)[0]
        output_filename = f"{original_name}.pptx"

        # The response streams the file after this function returns, so the
        # output can only be deleted afterwards, from a background task.
        cleanup = BackgroundTasks()
        cleanup.add_task(_discard, pptx_path)

        response = FileResponse(
            pptx_path,
            media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
            filename=output_filename,
            background=cleanup,
        )
        delivered = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Conversion failed: {str(e)}"
        ) from e

    finally:
        _discard(pdf_path)
        if not delivered:
            _discard(pptx_path)
|
|
@router.post("/to-excel")
async def convert_pdf_to_excel(file: UploadFile = File(...)):
    """Extract the tables of an uploaded PDF into an Excel (XLSX) workbook.

    Tables are collected from every page with pdfplumber. Each table becomes
    its own sheet (``Table_1``, ``Table_2``, ...), with the table's first row
    used as the column headers. The conversion runs synchronously inside this
    coroutine.

    Args:
        file: The uploaded PDF. Its filename must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated XLSX. The generated file is
        removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf`` or contains no
            tables; 500 if the conversion fails for any other reason.
    """
    # Imported locally so this handler does not rely on the module's import
    # block being changed.
    from fastapi import BackgroundTasks

    def _discard(path):
        """Delete *path* if set; cleanup is best-effort and must not raise."""
        if path is None:
            return
        try:
            os.unlink(path)
        except OSError:
            # Already gone or not removable; nothing useful to do here.
            pass

    # ``filename`` may be missing on the upload; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    xlsx_path = None
    delivered = False
    try:
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as temp_pdf:
            pdf_path = temp_pdf.name
            temp_pdf.write(await file.read())

        all_tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                tables = page.extract_tables()
                if tables:
                    all_tables.extend(tables)

        if not all_tables:
            raise HTTPException(status_code=400, detail="No tables found in PDF")

        # Reserve the output path only once there is something to write, and
        # close the handle so the Excel writer can open the file itself.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".xlsx", dir=TEMP_DIR
        ) as temp_xlsx:
            xlsx_path = temp_xlsx.name

        with pd.ExcelWriter(xlsx_path, engine='openpyxl') as writer:
            for idx, table in enumerate(all_tables, start=1):
                # First row is the header; the remaining rows are the data.
                df = pd.DataFrame(table[1:], columns=table[0] if table else None)
                df.to_excel(writer, sheet_name=f'Table_{idx}', index=False)

        original_name = os.path.splitext(upload_name)[0]
        output_filename = f"{original_name}.xlsx"

        # The response streams the file after this function returns, so the
        # output can only be deleted afterwards, from a background task.
        cleanup = BackgroundTasks()
        cleanup.add_task(_discard, xlsx_path)

        response = FileResponse(
            xlsx_path,
            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            filename=output_filename,
            background=cleanup,
        )
        delivered = True
        return response

    except HTTPException:
        # Preserve deliberate client errors such as the 400 above.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Conversion failed: {str(e)}"
        ) from e

    finally:
        _discard(pdf_path)
        if not delivered:
            _discard(xlsx_path)
|
|
@router.post("/to-html")
async def convert_pdf_to_html(file: UploadFile = File(...)):
    """Convert an uploaded PDF to a simple HTML document.

    The text of every page is extracted with pdfplumber and emitted as one
    ``<div class='page'>`` per page containing a heading and a ``<pre>``
    block. Extracted text is HTML-escaped, and pages without extractable text
    produce an empty ``<pre>``. The conversion runs synchronously inside this
    coroutine.

    Args:
        file: The uploaded PDF. Its filename must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated HTML. The generated file is
        removed by a background task once the response has been sent.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            conversion fails for any reason.
    """
    # Imported locally so this handler does not rely on the module's import
    # block being changed.
    import html

    from fastapi import BackgroundTasks

    def _discard(path):
        """Delete *path* if set; cleanup is best-effort and must not raise."""
        if path is None:
            return
        try:
            os.unlink(path)
        except OSError:
            # Already gone or not removable; nothing useful to do here.
            pass

    # ``filename`` may be missing on the upload; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    html_path = None
    delivered = False
    try:
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as temp_pdf:
            pdf_path = temp_pdf.name
            temp_pdf.write(await file.read())

        parts = [
            "<html><head><meta charset='UTF-8'><title>PDF Content</title>",
            "<style>body{font-family:Arial,sans-serif;margin:40px;} .page{margin-bottom:40px;page-break-after:always;}</style></head><body>",
        ]
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # extract_text() can yield nothing for image-only pages, and
                # PDF text is untrusted, so it must not be emitted as markup.
                text = html.escape(page.extract_text() or "")
                parts.append(
                    f"<div class='page'><h2>Page {page_number}</h2><pre>{text}</pre></div>"
                )
        parts.append("</body></html>")

        # The document declares UTF-8, so write it as UTF-8 rather than in
        # the platform's default encoding.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".html", dir=TEMP_DIR, mode="w", encoding="utf-8"
        ) as temp_html:
            html_path = temp_html.name
            temp_html.write("".join(parts))

        original_name = os.path.splitext(upload_name)[0]
        output_filename = f"{original_name}.html"

        # The response streams the file after this function returns, so the
        # output can only be deleted afterwards, from a background task.
        cleanup = BackgroundTasks()
        cleanup.add_task(_discard, html_path)

        response = FileResponse(
            html_path,
            media_type="text/html",
            filename=output_filename,
            background=cleanup,
        )
        delivered = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Conversion failed: {str(e)}"
        ) from e

    finally:
        _discard(pdf_path)
        if not delivered:
            _discard(html_path)
|
|
@router.post("/to-text")
async def convert_pdf_to_text(file: UploadFile = File(...)):
    """Extract the text of an uploaded PDF into a plain-text file.

    Pages are read with PyPDF2's ``PdfReader``; each page's text is preceded
    by a ``--- Page N ---`` marker and followed by a blank line. The
    extraction runs synchronously inside this coroutine.

    Args:
        file: The uploaded PDF. Its filename must end in ``.pdf``
            (case-insensitive).

    Returns:
        A ``FileResponse`` streaming the generated UTF-8 text file. The
        generated file is removed by a background task once the response has
        been sent.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
            extraction fails for any reason.
    """
    # Imported locally so this handler does not rely on the module's import
    # block being changed.
    from fastapi import BackgroundTasks

    def _discard(path):
        """Delete *path* if set; cleanup is best-effort and must not raise."""
        if path is None:
            return
        try:
            os.unlink(path)
        except OSError:
            # Already gone or not removable; nothing useful to do here.
            pass

    # ``filename`` may be missing on the upload; treat that as "not a PDF".
    upload_name = file.filename or ""
    if not upload_name.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

    pdf_path = None
    txt_path = None
    delivered = False
    try:
        os.makedirs(TEMP_DIR, exist_ok=True)

        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".pdf", dir=TEMP_DIR
        ) as temp_pdf:
            pdf_path = temp_pdf.name
            temp_pdf.write(await file.read())

        reader = PdfReader(pdf_path)
        chunks = []
        for page_number, page in enumerate(reader.pages, start=1):
            chunks.append(f"--- Page {page_number} ---\n\n")
            # Guard against a page yielding no text object at all.
            chunks.append(page.extract_text() or "")
            chunks.append("\n\n")

        # Written as UTF-8 explicitly: text/* responses are served with a
        # UTF-8 charset, and the platform default encoding may not be able to
        # represent everything found in the PDF.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".txt", dir=TEMP_DIR, mode="w", encoding="utf-8"
        ) as temp_txt:
            txt_path = temp_txt.name
            temp_txt.write("".join(chunks))

        original_name = os.path.splitext(upload_name)[0]
        output_filename = f"{original_name}.txt"

        # The response streams the file after this function returns, so the
        # output can only be deleted afterwards, from a background task.
        cleanup = BackgroundTasks()
        cleanup.add_task(_discard, txt_path)

        response = FileResponse(
            txt_path,
            media_type="text/plain",
            filename=output_filename,
            background=cleanup,
        )
        delivered = True
        return response

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Conversion failed: {str(e)}"
        ) from e

    finally:
        _discard(pdf_path)
        if not delivered:
            _discard(txt_path)