Spaces:

madankn79
/

pdf2htmlv51

Sleeping

App Files Community

madankn79 committed on Jun 10, 2025

Commit

debf870

1 Parent(s): 8133402

Initial Commit 5.1.0

Browse files

Files changed (1) hide show

app.py +30 -34

app.py CHANGED Viewed

@@ -1,8 +1,6 @@
-from fastapi import FastAPI, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse
-from pydantic import BaseModel
 from pathlib import Path
-import base64
 import tempfile
 import uuid
 import os
@@ -17,38 +15,43 @@ from docx.shared import Inches
 import logging
 # Setup
-API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
 app = FastAPI()
 api_tokens = {"client-1": API_KEY}
 MAX_PDF_SIZE_MB = 40
-# Logging config
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Schema
-class PDFRequest(BaseModel):
-    file_b64: str
-    callback_url: str
-    client_id: str
-    token: str
 @app.get("/health")
 def health():
     return {"status": "ok"}
 @app.post("/convert-pdf")
-async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
     # Auth
-    if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
         raise HTTPException(status_code=401, detail="Invalid API credentials")
-    # Decode base64
     try:
-        pdf_bytes = base64.b64decode(payload.file_b64, validate=True)
     except Exception as e:
-        logger.error(f"Base64 decode failed: {e}")
-        raise HTTPException(status_code=400, detail="Invalid base64-encoded PDF")
     # Enforce size limit
     if len(pdf_bytes) > MAX_PDF_SIZE_MB * 1024 * 1024:
@@ -63,15 +66,15 @@ async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: Background
         logger.error(f"Failed to write PDF file: {e}")
         raise HTTPException(status_code=500, detail="Internal error writing PDF")
-    # Process asynchronously
     task_id = str(uuid.uuid4())
-    background_tasks.add_task(handle_pdf_processing, str(tmp_pdf_path), payload.callback_url, task_id)
-    logger.info(f"Started task {task_id} for client {payload.client_id}")
     return JSONResponse({"status": "processing", "task_id": task_id})
-async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str):
     try:
         with open(pdf_path, "rb") as f:
             html, _ = extract_pdf_to_html(f)
@@ -84,20 +87,20 @@ async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str):
         except Exception as e:
             logger.warning(f"Temp file cleanup failed: {e}")
-    # Callback
     try:
         async with aiohttp.ClientSession() as session:
             await session.post(callback_url, json={
                 "task_id": task_id,
-                "html": html,
-            })
         logger.info(f"Callback sent for task {task_id}")
     except Exception as e:
         logger.error(f"Callback failed for task {task_id}: {e}")
 def extract_text_from_image(image: Image.Image) -> str:
-    """Extract LaTeX text using latexocr from a PIL image"""
     temp_img_path = Path(tempfile.mktemp(suffix=".png"))
     image.save(temp_img_path)
     try:
@@ -120,10 +123,6 @@ def extract_text_from_image(image: Image.Image) -> str:
 def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
-    """Main function to extract PDF content into HTML + DOCX"""
-    if not file:
-        return "<p>No file provided.</p>", docx.Document()
     html_output = ""
     docx_output = docx.Document()
     toc = []
@@ -150,20 +149,17 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
                 except Exception:
                     continue
-                # OCR LaTeX from image
                 math_html = extract_text_from_image(cropped)
                 if math_html.strip():
                     html_output += f"<div>{math_html}</div>\n"
                     docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
-                # Embed image
                 buffer = io.BytesIO()
                 cropped.save(buffer, format="PNG")
                 buffer.seek(0)
                 b64_img = base64.b64encode(buffer.read()).decode("utf-8")
                 html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
-                # Add to DOCX
                 buffer.seek(0)
                 try:
                     docx_output.add_picture(buffer, width=Inches(5))
@@ -171,4 +167,4 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
                     pass
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
-    return full_html, docx_output

+from fastapi import FastAPI, HTTPException, BackgroundTasks, UploadFile, File, Form
 from fastapi.responses import JSONResponse
 from pathlib import Path
 import tempfile
 import uuid
 import os
 import logging
 # Setup
+API_KEY = os.getenv("PDF_API_KEY")
+ZAI_SECRET_API_KEY = os.getenv("ZAI_SECRET_API_KEY")
 app = FastAPI()
 api_tokens = {"client-1": API_KEY}
 MAX_PDF_SIZE_MB = 40
+# Logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 @app.get("/health")
 def health():
     return {"status": "ok"}
 @app.post("/convert-pdf")
+async def convert_pdf_endpoint(
+        background_tasks: BackgroundTasks,
+        file: UploadFile = File(...),
+        client_id: str = Form(...),
+        token: str = Form(...),
+        callback_url: str = Form(...),
+        book_id : int = Form(...)
+):
     # Auth
+    if client_id not in api_tokens or api_tokens[client_id] != token:
         raise HTTPException(status_code=401, detail="Invalid API credentials")
+    # Check file type
+    if not file.filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are supported")
+    # Read file
     try:
+        pdf_bytes = await file.read()
     except Exception as e:
+        logger.error(f"Failed to read uploaded file: {e}")
+        raise HTTPException(status_code=400, detail="Unable to read file")
     # Enforce size limit
     if len(pdf_bytes) > MAX_PDF_SIZE_MB * 1024 * 1024:
         logger.error(f"Failed to write PDF file: {e}")
         raise HTTPException(status_code=500, detail="Internal error writing PDF")
+    # Background processing
     task_id = str(uuid.uuid4())
+    background_tasks.add_task(handle_pdf_processing, str(tmp_pdf_path), callback_url, task_id, book_id)
+    logger.info(f"Started task {task_id} for client {client_id}")
     return JSONResponse({"status": "processing", "task_id": task_id})
+async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str, book_id:int):
     try:
         with open(pdf_path, "rb") as f:
             html, _ = extract_pdf_to_html(f)
         except Exception as e:
             logger.warning(f"Temp file cleanup failed: {e}")
     try:
+        headers = { "x-api-key": ZAI_SECRET_API_KEY }
         async with aiohttp.ClientSession() as session:
             await session.post(callback_url, json={
                 "task_id": task_id,
+                "content": html,
+                "book_id": book_id
+            }, headers=headers )
         logger.info(f"Callback sent for task {task_id}")
     except Exception as e:
         logger.error(f"Callback failed for task {task_id}: {e}")
 def extract_text_from_image(image: Image.Image) -> str:
     temp_img_path = Path(tempfile.mktemp(suffix=".png"))
     image.save(temp_img_path)
     try:
 def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
     html_output = ""
     docx_output = docx.Document()
     toc = []
                 except Exception:
                     continue
                 math_html = extract_text_from_image(cropped)
                 if math_html.strip():
                     html_output += f"<div>{math_html}</div>\n"
                     docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
                 buffer = io.BytesIO()
                 cropped.save(buffer, format="PNG")
                 buffer.seek(0)
                 b64_img = base64.b64encode(buffer.read()).decode("utf-8")
                 html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
                 buffer.seek(0)
                 try:
                     docx_output.add_picture(buffer, width=Inches(5))
                     pass
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
+    return full_html, docx_output