Spaces:

um41r
/

PD-Tools

Running

App Files Files Community

um41r committed about 1 month ago

Commit

558ba3c

verified ·

1 Parent(s): 5c83a71

Create routers/pdf_converter.py

Browse files

Files changed (1) hide show

routers/pdf_converter.py +266 -0

routers/pdf_converter.py ADDED Viewed

	@@ -0,0 +1,266 @@

+from fastapi import APIRouter, File, UploadFile, HTTPException
+from fastapi.responses import FileResponse
+import os
+import tempfile
+from pdf2docx import Converter
+import pdfplumber
+import pandas as pd
+from PyPDF2 import PdfReader
+from pptx import Presentation
+from pptx.util import Inches, Pt
+from pdf2image import convert_from_path
+import io
+router = APIRouter()
+TEMP_DIR = "/tmp/conversions"
+@router.post("/to-word")
+async def convert_pdf_to_word(file: UploadFile = File(...)):
+    """Convert PDF to Word (DOCX)"""
+    if not file.filename.endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf', dir=TEMP_DIR)
+    temp_docx = tempfile.NamedTemporaryFile(delete=False, suffix='.docx', dir=TEMP_DIR)
+    try:
+        content = await file.read()
+        temp_pdf.write(content)
+        temp_pdf.close()
+        cv = Converter(temp_pdf.name)
+        cv.convert(temp_docx.name)
+        cv.close()
+        original_name = os.path.splitext(file.filename)[0]
+        output_filename = f"{original_name}.docx"
+        return FileResponse(
+            temp_docx.name,
+            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            filename=output_filename
+        )
+    except Exception as e:
+        if os.path.exists(temp_pdf.name):
+            os.unlink(temp_pdf.name)
+        if os.path.exists(temp_docx.name):
+            os.unlink(temp_docx.name)
+        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
+    finally:
+        if os.path.exists(temp_pdf.name):
+            try:
+                os.unlink(temp_pdf.name)
+            except:
+                pass
+@router.post("/to-powerpoint")
+async def convert_pdf_to_powerpoint(file: UploadFile = File(...)):
+    """Convert PDF to PowerPoint (PPTX) - each page becomes a slide with image"""
+    if not file.filename.endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf', dir=TEMP_DIR)
+    temp_pptx = tempfile.NamedTemporaryFile(delete=False, suffix='.pptx', dir=TEMP_DIR)
+    try:
+        content = await file.read()
+        temp_pdf.write(content)
+        temp_pdf.close()
+        # Convert PDF pages to images
+        images = convert_from_path(temp_pdf.name, dpi=150)
+        # Create PowerPoint presentation
+        prs = Presentation()
+        prs.slide_width = Inches(10)
+        prs.slide_height = Inches(7.5)
+        for i, image in enumerate(images):
+            # Add blank slide
+            blank_slide_layout = prs.slide_layouts[6]  # Blank layout
+            slide = prs.slides.add_slide(blank_slide_layout)
+            # Save image to bytes
+            img_buffer = io.BytesIO()
+            image.save(img_buffer, format='PNG')
+            img_buffer.seek(0)
+            # Add image to slide (centered and fit to slide)
+            left = Inches(0.5)
+            top = Inches(0.5)
+            width = Inches(9)
+            height = Inches(6.5)
+            pic = slide.shapes.add_picture(img_buffer, left, top, width=width, height=height)
+        prs.save(temp_pptx.name)
+        original_name = os.path.splitext(file.filename)[0]
+        output_filename = f"{original_name}.pptx"
+        return FileResponse(
+            temp_pptx.name,
+            media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
+            filename=output_filename
+        )
+    except Exception as e:
+        if os.path.exists(temp_pdf.name):
+            os.unlink(temp_pdf.name)
+        if os.path.exists(temp_pptx.name):
+            os.unlink(temp_pptx.name)
+        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
+    finally:
+        if os.path.exists(temp_pdf.name):
+            try:
+                os.unlink(temp_pdf.name)
+            except:
+                pass
+@router.post("/to-excel")
+async def convert_pdf_to_excel(file: UploadFile = File(...)):
+    """Convert PDF tables to Excel (XLSX)"""
+    if not file.filename.endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf', dir=TEMP_DIR)
+    temp_xlsx = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx', dir=TEMP_DIR)
+    try:
+        content = await file.read()
+        temp_pdf.write(content)
+        temp_pdf.close()
+        # Extract tables from PDF
+        with pdfplumber.open(temp_pdf.name) as pdf:
+            all_tables = []
+            for page in pdf.pages:
+                tables = page.extract_tables()
+                if tables:
+                    all_tables.extend(tables)
+            if not all_tables:
+                raise HTTPException(status_code=400, detail="No tables found in PDF")
+            # Write to Excel
+            with pd.ExcelWriter(temp_xlsx.name, engine='openpyxl') as writer:
+                for idx, table in enumerate(all_tables):
+                    df = pd.DataFrame(table[1:], columns=table[0] if table else None)
+                    sheet_name = f'Table_{idx+1}'
+                    df.to_excel(writer, sheet_name=sheet_name, index=False)
+        original_name = os.path.splitext(file.filename)[0]
+        output_filename = f"{original_name}.xlsx"
+        return FileResponse(
+            temp_xlsx.name,
+            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+            filename=output_filename
+        )
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
+    finally:
+        if os.path.exists(temp_pdf.name):
+            try:
+                os.unlink(temp_pdf.name)
+            except:
+                pass
+@router.post("/to-html")
+async def convert_pdf_to_html(file: UploadFile = File(...)):
+    """Convert PDF to HTML"""
+    if not file.filename.endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf', dir=TEMP_DIR)
+    temp_html = tempfile.NamedTemporaryFile(delete=False, suffix='.html', dir=TEMP_DIR, mode='w')
+    try:
+        content = await file.read()
+        temp_pdf.write(content)
+        temp_pdf.close()
+        # Extract text from PDF
+        with pdfplumber.open(temp_pdf.name) as pdf:
+            html_content = "<html><head><meta charset='UTF-8'><title>PDF Content</title>"
+            html_content += "<style>body{font-family:Arial,sans-serif;margin:40px;} .page{margin-bottom:40px;page-break-after:always;}</style></head><body>"
+            for i, page in enumerate(pdf.pages):
+                text = page.extract_text()
+                html_content += f"<div class='page'><h2>Page {i+1}</h2><pre>{text}</pre></div>"
+            html_content += "</body></html>"
+        temp_html.write(html_content)
+        temp_html.close()
+        original_name = os.path.splitext(file.filename)[0]
+        output_filename = f"{original_name}.html"
+        return FileResponse(
+            temp_html.name,
+            media_type="text/html",
+            filename=output_filename
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
+    finally:
+        if os.path.exists(temp_pdf.name):
+            try:
+                os.unlink(temp_pdf.name)
+            except:
+                pass
+@router.post("/to-text")
+async def convert_pdf_to_text(file: UploadFile = File(...)):
+    """Extract text from PDF"""
+    if not file.filename.endswith('.pdf'):
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+    temp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf', dir=TEMP_DIR)
+    temp_txt = tempfile.NamedTemporaryFile(delete=False, suffix='.txt', dir=TEMP_DIR, mode='w')
+    try:
+        content = await file.read()
+        temp_pdf.write(content)
+        temp_pdf.close()
+        reader = PdfReader(temp_pdf.name)
+        text_content = ""
+        for i, page in enumerate(reader.pages):
+            text_content += f"--- Page {i+1} ---\n\n"
+            text_content += page.extract_text()
+            text_content += "\n\n"
+        temp_txt.write(text_content)
+        temp_txt.close()
+        original_name = os.path.splitext(file.filename)[0]
+        output_filename = f"{original_name}.txt"
+        return FileResponse(
+            temp_txt.name,
+            media_type="text/plain",
+            filename=output_filename
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Conversion failed: {str(e)}")
+    finally:
+        if os.path.exists(temp_pdf.name):
+            try:
+                os.unlink(temp_pdf.name)
+            except:
+                pass