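"""FastAPI service that converts an uploaded PDF into HTML (with LaTeX OCR for
embedded formula images and a parallel .docx build) and POSTs the result to a
client-supplied callback URL."""
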
from fastapi import FastAPI, HTTPException, BackgroundTasks, UploadFile, File, Form
from fastapi.responses import JSONResponse
from pathlib import Path
import tempfile
import uuid
import os
import aiohttp
import pdfplumber
from PIL import Image
import subprocess
from bs4 import BeautifulSoup
import io
import docx
from docx.shared import Inches
import logging
import base64
import time
import asyncio
from html import escape
# Setup
API_KEY = os.getenv("PDF_API_KEY")
ZAI_SECRET_API_KEY = os.getenv("ZAI_SECRET_API_KEY")
app = FastAPI()
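# Static client -> token map; add entries here to support additional API consumers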
api_tokens = {"client-1": API_KEY}
MAX_PDF_SIZE_MB = 40
# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@app.get("/health")
def health():
return {"status": "ok"}
@app.post("/convert-pdf")
async def convert_pdf_endpoint(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    client_id: str = Form(...),
    token: str = Form(...),
    callback_url: str = Form(...),
    book_id: int = Form(...),
):
    # Auth
    if client_id not in api_tokens or api_tokens[client_id] != token:
        raise HTTPException(status_code=401, detail="Invalid API credentials")
    # Check file type
    if not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")
    # Read file
    try:
        pdf_bytes = await file.read()
    except Exception as e:
        logger.error(f"Failed to read uploaded file: {e}")
        raise HTTPException(status_code=400, detail="Unable to read file")
    # Enforce size limit
    if len(pdf_bytes) > MAX_PDF_SIZE_MB * 1024 * 1024:
        raise HTTPException(status_code=413, detail=f"PDF too large (> {MAX_PDF_SIZE_MB} MB)")
    # Write to temp file
    try:
        tmp_dir = Path(tempfile.mkdtemp())
        tmp_pdf_path = tmp_dir / f"{uuid.uuid4()}.pdf"
        tmp_pdf_path.write_bytes(pdf_bytes)
    except Exception as e:
        logger.error(f"Failed to write PDF file: {e}")
        raise HTTPException(status_code=500, detail="Internal error writing PDF")
    # Background processing
    task_id = str(uuid.uuid4())
    background_tasks.add_task(handle_pdf_processing, str(tmp_pdf_path), callback_url, task_id, book_id)
    logger.info(f"Started task {task_id} for client {client_id}")
    return JSONResponse({"status": "processing", "task_id": task_id})

async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str, book_id: int):
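    """Extract the uploaded PDF to HTML, then POST the result to callback_url.

    The callback payload is {"task_id", "content", "book_id"}, authenticated via
    the "x-api-key" header carrying ZAI_SECRET_API_KEY.
    """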
    try:
        # The pdfplumber/OCR pipeline is blocking; run it in a worker thread so
        # this async background task does not stall the event loop.
        def _extract() -> str:
            with open(pdf_path, "rb") as f:
                extracted, _ = extract_pdf_to_html(f)
            return extracted
        html = await asyncio.to_thread(_extract)
    except Exception as e:
        logger.error(f"PDF processing failed: {e}")
        html = f"<p>Error extracting PDF: {e}</p>"
    finally:
        try:
            tmp_path = Path(pdf_path)
            tmp_path.unlink(missing_ok=True)
            tmp_path.parent.rmdir()  # also remove the per-request temp directory
        except Exception as e:
            logger.warning(f"Temp file cleanup failed: {e}")
    try:
        headers = {"x-api-key": ZAI_SECRET_API_KEY}
        async with aiohttp.ClientSession() as session:
            await session.post(callback_url, json={
                "task_id": task_id,
                "content": html,
                "book_id": book_id,
            }, headers=headers)
        logger.info(f"Generated {len(html)} characters of HTML for task {task_id}")
        logger.info(f"Callback sent for task {task_id}")
    except Exception as e:
        logger.error(f"Callback failed for task {task_id}: {e}")

def extract_text_from_image(image: Image.Image) -> str:
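    """OCR a (presumed formula) image with the external `latexocr` CLI; return display-math HTML, or "" on failure."""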
    # NamedTemporaryFile avoids the filename race of the deprecated tempfile.mktemp()
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        temp_img_path = Path(tmp.name)
    image.save(temp_img_path)
    try:
        result = subprocess.run(
            ["latexocr", str(temp_img_path)],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            return f"<pre>\\[{result.stdout.strip()}\\]</pre>"
        return ""
    except Exception as e:
        return f"<i>LaTeX-OCR error: {e}</i>"
    finally:
        try:
            temp_img_path.unlink(missing_ok=True)
        except Exception:
            pass

def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
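    """Convert a PDF file object into (HTML string, docx.Document).

    Emits a linked table of contents, per-page text, LaTeX OCR of embedded
    images, and the images themselves inlined as base64 PNGs.
    """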
html_output = ""
docx_output = docx.Document()
toc = []
with pdfplumber.open(file) as pdf:
for page_num, page in enumerate(pdf.pages):
start = time.time()
page_title = f"Page {page_num + 1}"
toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
docx_output.add_heading(page_title, level=2)
text = page.extract_text()
if text:
for line in text.split("\n"):
html_output += f"<p>{line}</p>\n"
docx_output.add_paragraph(line)
else:
html_output += "<p><i>No text detected on this page.</i></p>"
for img_obj in page.images:
try:
x0, top, x1, bottom = img_obj["x0"], img_obj["top"], img_obj["x1"], img_obj["bottom"]
cropped = page.crop((x0, top, x1, bottom)).to_image(resolution=300).original
except Exception:
continue
math_html = extract_text_from_image(cropped)
if math_html.strip():
html_output += f"<div>{math_html}</div>\n"
docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
buffer = io.BytesIO()
cropped.save(buffer, format="PNG")
buffer.seek(0)
b64_img = base64.b64encode(buffer.read()).decode("utf-8")
html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
buffer.seek(0)
try:
docx_output.add_picture(buffer, width=Inches(5))
except Exception:
pass
logger.info(f"Processed page {page_num + 1} in {time.time() - start:.2f}s")
full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
return full_html, docx_output
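
if __name__ == "__main__":
    # Local dev entry point; host/port are assumptions (7860 is the usual
    # Hugging Face Spaces port) and may need adjusting for your deployment.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request (a sketch; the file name, callback URL, and book_id are placeholders):
#   curl -X POST http://localhost:7860/convert-pdf \
#     -F "file=@sample.pdf" \
#     -F "client_id=client-1" \
#     -F "token=$PDF_API_KEY" \
#     -F "callback_url=https://example.com/pdf-callback" \
#     -F "book_id=1"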