Spaces:

um41r
/

PD-Tools

Running

App Files Files Community

PD-Tools / routers /pdf_converter.py

um41r

Create routers/pdf_converter.py

558ba3c verified 3 months ago

raw

history blame contribute delete

9.31 kB

	from fastapi import APIRouter, File, UploadFile, HTTPException
	from fastapi.responses import FileResponse
	import os
	import tempfile
	from pdf2docx import Converter
	import pdfplumber
	import pandas as pd
	from PyPDF2 import PdfReader
	from pptx import Presentation
	from pptx.util import Inches, Pt
	from pdf2image import convert_from_path
	import io

	# Router on which the PDF conversion endpoints in this module are registered.
	router = APIRouter()

	# Directory in which the endpoints create their temporary input/output files.
	# NOTE(review): nothing visible in this module creates this directory, and
	# tempfile.NamedTemporaryFile(dir=...) fails when it does not exist --
	# confirm it is created at application startup.
	TEMP_DIR = "/tmp/conversions"

	@router.post("/to-word")
	async def convert_pdf_to_word(file: UploadFile = File(...)):
	    """Convert an uploaded PDF to Word (DOCX).

	    Args:
	        file: Uploaded file; its name must end in ``.pdf`` (case-insensitive).

	    Returns:
	        A FileResponse serving the generated DOCX, named after the upload.
	        The generated file is deleted after the response has been sent.

	    Raises:
	        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
	            conversion fails for any reason.
	    """
	    # Imported locally so this handler stays self-contained with respect to
	    # the module's import block.
	    from fastapi import BackgroundTasks

	    def _discard(path):
	        """Remove a temporary file, ignoring files that are already gone."""
	        if path:
	            try:
	                os.unlink(path)
	            except OSError:
	                # Best-effort cleanup: a failed delete must not mask the
	                # real outcome of the request.
	                pass

	    # file.filename may be None for uploads sent without a filename.
	    if not (file.filename or "").lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

	    pdf_path = None
	    docx_path = None
	    try:
	        os.makedirs(TEMP_DIR, exist_ok=True)

	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".pdf", dir=TEMP_DIR
	        ) as temp_pdf:
	            pdf_path = temp_pdf.name
	            temp_pdf.write(await file.read())

	        # Only the path is needed; close the handle right away so the
	        # converter is the sole writer and no descriptor is leaked.
	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".docx", dir=TEMP_DIR
	        ) as temp_docx:
	            docx_path = temp_docx.name

	        # NOTE(review): the conversion is synchronous and runs inside this
	        # coroutine, so it blocks the event loop while it works.
	        cv = Converter(pdf_path)
	        try:
	            cv.convert(docx_path)
	        finally:
	            cv.close()

	        original_name = os.path.splitext(os.path.basename(file.filename))[0]

	        # Delete the generated file only after it has been streamed.
	        cleanup = BackgroundTasks()
	        cleanup.add_task(_discard, docx_path)

	        return FileResponse(
	            docx_path,
	            media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	            filename=f"{original_name}.docx",
	            background=cleanup,
	        )
	    except HTTPException:
	        _discard(docx_path)
	        raise
	    except Exception as e:
	        _discard(docx_path)
	        raise HTTPException(
	            status_code=500, detail=f"Conversion failed: {str(e)}"
	        ) from e
	    finally:
	        # The uploaded copy is never needed once conversion has finished.
	        _discard(pdf_path)

	@router.post("/to-powerpoint")
	async def convert_pdf_to_powerpoint(file: UploadFile = File(...)):
	    """Convert an uploaded PDF to PowerPoint (PPTX), one page image per slide.

	    Each page is rendered to a PNG at 150 dpi and placed on a blank
	    10 x 7.5 in slide, scaled to fit a 9 x 6.5 in area without distorting
	    its aspect ratio, and centered.

	    Args:
	        file: Uploaded file; its name must end in ``.pdf`` (case-insensitive).

	    Returns:
	        A FileResponse serving the generated PPTX, named after the upload.
	        The generated file is deleted after the response has been sent.

	    Raises:
	        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
	            conversion fails for any reason.
	    """
	    # Imported locally so this handler stays self-contained with respect to
	    # the module's import block.
	    from fastapi import BackgroundTasks

	    def _discard(path):
	        """Remove a temporary file, ignoring files that are already gone."""
	        if path:
	            try:
	                os.unlink(path)
	            except OSError:
	                # Best-effort cleanup: a failed delete must not mask the
	                # real outcome of the request.
	                pass

	    # file.filename may be None for uploads sent without a filename.
	    if not (file.filename or "").lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

	    pdf_path = None
	    pptx_path = None
	    try:
	        os.makedirs(TEMP_DIR, exist_ok=True)

	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".pdf", dir=TEMP_DIR
	        ) as temp_pdf:
	            pdf_path = temp_pdf.name
	            temp_pdf.write(await file.read())

	        # Only the path is needed; close the handle so it is not leaked.
	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".pptx", dir=TEMP_DIR
	        ) as temp_pptx:
	            pptx_path = temp_pptx.name

	        # Convert PDF pages to images
	        images = convert_from_path(pdf_path, dpi=150)

	        # Create PowerPoint presentation
	        prs = Presentation()
	        prs.slide_width = Inches(10)
	        prs.slide_height = Inches(7.5)
	        blank_slide_layout = prs.slide_layouts[6]  # Blank layout

	        # Area available to the picture: the slide minus a 0.5 in margin.
	        max_width = Inches(9)
	        max_height = Inches(6.5)

	        for image in images:
	            slide = prs.slides.add_slide(blank_slide_layout)

	            # Save image to bytes
	            img_buffer = io.BytesIO()
	            image.save(img_buffer, format="PNG")
	            img_buffer.seek(0)

	            # Scale uniformly so the page keeps its proportions instead of
	            # being stretched to the full box, then center it.
	            scale = min(max_width / image.width, max_height / image.height)
	            width = int(image.width * scale)
	            height = int(image.height * scale)
	            left = (prs.slide_width - width) // 2
	            top = (prs.slide_height - height) // 2

	            slide.shapes.add_picture(
	                img_buffer, left, top, width=width, height=height
	            )

	        prs.save(pptx_path)

	        original_name = os.path.splitext(os.path.basename(file.filename))[0]

	        # Delete the generated file only after it has been streamed.
	        cleanup = BackgroundTasks()
	        cleanup.add_task(_discard, pptx_path)

	        return FileResponse(
	            pptx_path,
	            media_type="application/vnd.openxmlformats-officedocument.presentationml.presentation",
	            filename=f"{original_name}.pptx",
	            background=cleanup,
	        )
	    except HTTPException:
	        _discard(pptx_path)
	        raise
	    except Exception as e:
	        _discard(pptx_path)
	        raise HTTPException(
	            status_code=500, detail=f"Conversion failed: {str(e)}"
	        ) from e
	    finally:
	        # The uploaded copy is never needed once conversion has finished.
	        _discard(pdf_path)

	@router.post("/to-excel")
	async def convert_pdf_to_excel(file: UploadFile = File(...)):
	    """Convert the tables found in an uploaded PDF to Excel (XLSX).

	    Every table extracted by pdfplumber becomes its own sheet
	    (``Table_1``, ``Table_2``, ...); the first row of each table is used as
	    the column header.

	    Args:
	        file: Uploaded file; its name must end in ``.pdf`` (case-insensitive).

	    Returns:
	        A FileResponse serving the generated XLSX, named after the upload.
	        The generated file is deleted after the response has been sent.

	    Raises:
	        HTTPException: 400 if the upload is not named ``*.pdf`` or contains
	            no tables; 500 if the conversion fails for any other reason.
	    """
	    # Imported locally so this handler stays self-contained with respect to
	    # the module's import block.
	    from fastapi import BackgroundTasks

	    def _discard(path):
	        """Remove a temporary file, ignoring files that are already gone."""
	        if path:
	            try:
	                os.unlink(path)
	            except OSError:
	                # Best-effort cleanup: a failed delete must not mask the
	                # real outcome of the request.
	                pass

	    # file.filename may be None for uploads sent without a filename.
	    if not (file.filename or "").lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

	    pdf_path = None
	    xlsx_path = None
	    try:
	        os.makedirs(TEMP_DIR, exist_ok=True)

	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".pdf", dir=TEMP_DIR
	        ) as temp_pdf:
	            pdf_path = temp_pdf.name
	            temp_pdf.write(await file.read())

	        # Extract tables from PDF
	        all_tables = []
	        with pdfplumber.open(pdf_path) as pdf:
	            for page in pdf.pages:
	                all_tables.extend(page.extract_tables() or [])

	        if not all_tables:
	            raise HTTPException(status_code=400, detail="No tables found in PDF")

	        # The output file is created only once there is something to write,
	        # so the "no tables" path leaves nothing behind. The handle is
	        # closed immediately; ExcelWriter reopens the path itself.
	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".xlsx", dir=TEMP_DIR
	        ) as temp_xlsx:
	            xlsx_path = temp_xlsx.name

	        # Write to Excel
	        with pd.ExcelWriter(xlsx_path, engine="openpyxl") as writer:
	            for idx, table in enumerate(all_tables, start=1):
	                df = pd.DataFrame(table[1:], columns=table[0] if table else None)
	                df.to_excel(writer, sheet_name=f"Table_{idx}", index=False)

	        original_name = os.path.splitext(os.path.basename(file.filename))[0]

	        # Delete the generated file only after it has been streamed.
	        cleanup = BackgroundTasks()
	        cleanup.add_task(_discard, xlsx_path)

	        return FileResponse(
	            xlsx_path,
	            media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	            filename=f"{original_name}.xlsx",
	            background=cleanup,
	        )
	    except HTTPException:
	        # Deliberate client errors (e.g. no tables) pass through unchanged.
	        _discard(xlsx_path)
	        raise
	    except Exception as e:
	        _discard(xlsx_path)
	        raise HTTPException(
	            status_code=500, detail=f"Conversion failed: {str(e)}"
	        ) from e
	    finally:
	        # The uploaded copy is never needed once conversion has finished.
	        _discard(pdf_path)

	@router.post("/to-html")
	async def convert_pdf_to_html(file: UploadFile = File(...)):
	    """Convert an uploaded PDF to a simple HTML document.

	    The text of each page is extracted with pdfplumber and emitted as a
	    ``<div class='page'>`` containing a heading and a ``<pre>`` block. The
	    extracted text is HTML-escaped, so markup-like content in the PDF is
	    shown literally rather than interpreted by the browser.

	    Args:
	        file: Uploaded file; its name must end in ``.pdf`` (case-insensitive).

	    Returns:
	        A FileResponse serving the generated UTF-8 HTML file, named after
	        the upload. The generated file is deleted after the response has
	        been sent.

	    Raises:
	        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
	            conversion fails for any reason.
	    """
	    # Imported locally so this handler stays self-contained with respect to
	    # the module's import block.
	    from html import escape

	    from fastapi import BackgroundTasks

	    def _discard(path):
	        """Remove a temporary file, ignoring files that are already gone."""
	        if path:
	            try:
	                os.unlink(path)
	            except OSError:
	                # Best-effort cleanup: a failed delete must not mask the
	                # real outcome of the request.
	                pass

	    # file.filename may be None for uploads sent without a filename.
	    if not (file.filename or "").lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

	    pdf_path = None
	    html_path = None
	    try:
	        os.makedirs(TEMP_DIR, exist_ok=True)

	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".pdf", dir=TEMP_DIR
	        ) as temp_pdf:
	            pdf_path = temp_pdf.name
	            temp_pdf.write(await file.read())

	        parts = [
	            "<html><head><meta charset='UTF-8'><title>PDF Content</title>",
	            "<style>body{font-family:Arial,sans-serif;margin:40px;} .page{margin-bottom:40px;page-break-after:always;}</style></head><body>",
	        ]

	        # Extract text from PDF
	        with pdfplumber.open(pdf_path) as pdf:
	            for number, page in enumerate(pdf.pages, start=1):
	                # extract_text() may yield None for pages without text;
	                # render those as empty instead of the literal "None".
	                text = escape(page.extract_text() or "")
	                parts.append(
	                    f"<div class='page'><h2>Page {number}</h2><pre>{text}</pre></div>"
	                )

	        parts.append("</body></html>")

	        # Written as UTF-8 explicitly to match the charset declared in the
	        # document's <meta> tag, independent of the server's locale.
	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".html", dir=TEMP_DIR, mode="w", encoding="utf-8"
	        ) as temp_html:
	            html_path = temp_html.name
	            temp_html.write("".join(parts))

	        original_name = os.path.splitext(os.path.basename(file.filename))[0]

	        # Delete the generated file only after it has been streamed.
	        cleanup = BackgroundTasks()
	        cleanup.add_task(_discard, html_path)

	        return FileResponse(
	            html_path,
	            media_type="text/html",
	            filename=f"{original_name}.html",
	            background=cleanup,
	        )
	    except HTTPException:
	        _discard(html_path)
	        raise
	    except Exception as e:
	        _discard(html_path)
	        raise HTTPException(
	            status_code=500, detail=f"Conversion failed: {str(e)}"
	        ) from e
	    finally:
	        # The uploaded copy is never needed once conversion has finished.
	        _discard(pdf_path)

	@router.post("/to-text")
	async def convert_pdf_to_text(file: UploadFile = File(...)):
	    """Extract the text of an uploaded PDF into a plain-text file.

	    Each page's text is preceded by a ``--- Page N ---`` header and followed
	    by a blank line.

	    Args:
	        file: Uploaded file; its name must end in ``.pdf`` (case-insensitive).

	    Returns:
	        A FileResponse serving the generated UTF-8 text file, named after
	        the upload. The generated file is deleted after the response has
	        been sent.

	    Raises:
	        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if the
	            extraction fails for any reason.
	    """
	    # Imported locally so this handler stays self-contained with respect to
	    # the module's import block.
	    from fastapi import BackgroundTasks

	    def _discard(path):
	        """Remove a temporary file, ignoring files that are already gone."""
	        if path:
	            try:
	                os.unlink(path)
	            except OSError:
	                # Best-effort cleanup: a failed delete must not mask the
	                # real outcome of the request.
	                pass

	    # file.filename may be None for uploads sent without a filename.
	    if not (file.filename or "").lower().endswith(".pdf"):
	        raise HTTPException(status_code=400, detail="Only PDF files are allowed")

	    pdf_path = None
	    txt_path = None
	    try:
	        os.makedirs(TEMP_DIR, exist_ok=True)

	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".pdf", dir=TEMP_DIR
	        ) as temp_pdf:
	            pdf_path = temp_pdf.name
	            temp_pdf.write(await file.read())

	        reader = PdfReader(pdf_path)
	        # Guard against a page yielding no text object so the join below
	        # never sees None.
	        text_content = "".join(
	            f"--- Page {number} ---\n\n{page.extract_text() or ''}\n\n"
	            for number, page in enumerate(reader.pages, start=1)
	        )

	        # Written as UTF-8 explicitly so non-ASCII text cannot fail on a
	        # server whose locale encoding is narrower.
	        with tempfile.NamedTemporaryFile(
	            delete=False, suffix=".txt", dir=TEMP_DIR, mode="w", encoding="utf-8"
	        ) as temp_txt:
	            txt_path = temp_txt.name
	            temp_txt.write(text_content)

	        original_name = os.path.splitext(os.path.basename(file.filename))[0]

	        # Delete the generated file only after it has been streamed.
	        cleanup = BackgroundTasks()
	        cleanup.add_task(_discard, txt_path)

	        return FileResponse(
	            txt_path,
	            media_type="text/plain",
	            filename=f"{original_name}.txt",
	            background=cleanup,
	        )
	    except HTTPException:
	        _discard(txt_path)
	        raise
	    except Exception as e:
	        _discard(txt_path)
	        raise HTTPException(
	            status_code=500, detail=f"Conversion failed: {str(e)}"
	        ) from e
	    finally:
	        # The uploaded copy is never needed once extraction has finished.
	        _discard(pdf_path)