Spaces:

madankn79
/

pdf2htmlv51

Sleeping

App Files Community

madankn79 committed on Jun 7, 2025

Commit

8133402

1 Parent(s): 3136b3a

Initial Commit 5.1.0

Browse files

Files changed (1) hide show

app.py +65 -30

app.py CHANGED Viewed

@@ -1,77 +1,108 @@
-from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 import base64
 import tempfile
 import uuid
 import os
 import aiohttp
 import pdfplumber
-import pytesseract
-from pdf2image import convert_from_bytes
 from PIL import Image
 import subprocess
 from bs4 import BeautifulSoup
 import io
 import docx
 from docx.shared import Inches
 API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
 app = FastAPI()
-# In-memory token check (replace with KV or DB in production)
 api_tokens = {"client-1": API_KEY}
 class PDFRequest(BaseModel):
     file_b64: str
     callback_url: str
     client_id: str
     token: str
 @app.post("/convert-pdf")
 async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
     if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
         raise HTTPException(status_code=401, detail="Invalid API credentials")
     try:
-        pdf_bytes = base64.b64decode(payload.file_b64)
-        tmp_pdf_path = tempfile.mktemp(suffix=".pdf")
-        with open(tmp_pdf_path, "wb") as f:
-            f.write(pdf_bytes)
     except Exception as e:
-        raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}")
     task_id = str(uuid.uuid4())
-    background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id)
     return JSONResponse({"status": "processing", "task_id": task_id})
-async def handle_pdf_processing(pdf_path, callback_url, task_id):
     try:
         with open(pdf_path, "rb") as f:
             html, _ = extract_pdf_to_html(f)
     finally:
-        if os.path.exists(pdf_path):
-            os.remove(pdf_path)
     try:
         async with aiohttp.ClientSession() as session:
             await session.post(callback_url, json={
                 "task_id": task_id,
                 "html": html,
             })
     except Exception as e:
-        print(f"Callback failed: {e}")
 def extract_text_from_image(image: Image.Image) -> str:
-    temp_img_path = tempfile.mktemp(suffix=".png")
     image.save(temp_img_path)
     try:
         result = subprocess.run(
-            ["latexocr", temp_img_path],
             capture_output=True,
             text=True
         )
@@ -82,13 +113,16 @@ def extract_text_from_image(image: Image.Image) -> str:
     except Exception as e:
         return f"<i>LaTeX-OCR error: {str(e)}</i>"
     finally:
-        if os.path.exists(temp_img_path):
-            os.remove(temp_img_path)
-def extract_pdf_to_html(file):
-    if file is None:
-        return "<p>No file uploaded.</p>", ""
     html_output = ""
     docx_output = docx.Document()
@@ -116,24 +150,25 @@ def extract_pdf_to_html(file):
                 except Exception:
                     continue
                 math_html = extract_text_from_image(cropped)
                 if math_html.strip():
                     html_output += f"<div>{math_html}</div>\n"
                     docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
                 buffer = io.BytesIO()
                 cropped.save(buffer, format="PNG")
                 buffer.seek(0)
                 b64_img = base64.b64encode(buffer.read()).decode("utf-8")
                 html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
                 buffer.seek(0)
-                docx_output.add_picture(buffer, width=Inches(5))
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
-    return full_html, docx_output
-@app.get("/health")
-def health():
-    return {"status": "ok"}

+from fastapi import FastAPI, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
+from pathlib import Path
 import base64
 import tempfile
 import uuid
 import os
 import aiohttp
 import pdfplumber
 from PIL import Image
 import subprocess
 from bs4 import BeautifulSoup
 import io
 import docx
 from docx.shared import Inches
+import logging
+# Setup
 API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
 app = FastAPI()
 api_tokens = {"client-1": API_KEY}
+MAX_PDF_SIZE_MB = 40
+# Logging config
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Schema
 class PDFRequest(BaseModel):
     file_b64: str
     callback_url: str
     client_id: str
     token: str
+@app.get("/health")
+def health():
+    return {"status": "ok"}
 @app.post("/convert-pdf")
 async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
+    # Auth
     if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
         raise HTTPException(status_code=401, detail="Invalid API credentials")
+    # Decode base64
+    try:
+        pdf_bytes = base64.b64decode(payload.file_b64, validate=True)
+    except Exception as e:
+        logger.error(f"Base64 decode failed: {e}")
+        raise HTTPException(status_code=400, detail="Invalid base64-encoded PDF")
+    # Enforce size limit
+    if len(pdf_bytes) > MAX_PDF_SIZE_MB * 1024 * 1024:
+        raise HTTPException(status_code=413, detail=f"PDF too large (> {MAX_PDF_SIZE_MB} MB)")
+    # Write to temp file
     try:
+        tmp_dir = Path(tempfile.mkdtemp())
+        tmp_pdf_path = tmp_dir / f"{uuid.uuid4()}.pdf"
+        tmp_pdf_path.write_bytes(pdf_bytes)
     except Exception as e:
+        logger.error(f"Failed to write PDF file: {e}")
+        raise HTTPException(status_code=500, detail="Internal error writing PDF")
+    # Process asynchronously
     task_id = str(uuid.uuid4())
+    background_tasks.add_task(handle_pdf_processing, str(tmp_pdf_path), payload.callback_url, task_id)
+    logger.info(f"Started task {task_id} for client {payload.client_id}")
     return JSONResponse({"status": "processing", "task_id": task_id})
+async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str):
     try:
         with open(pdf_path, "rb") as f:
             html, _ = extract_pdf_to_html(f)
+    except Exception as e:
+        logger.error(f"PDF processing failed: {e}")
+        html = f"<p>Error extracting PDF: {e}</p>"
     finally:
+        try:
+            Path(pdf_path).unlink(missing_ok=True)
+        except Exception as e:
+            logger.warning(f"Temp file cleanup failed: {e}")
+    # Callback
     try:
         async with aiohttp.ClientSession() as session:
             await session.post(callback_url, json={
                 "task_id": task_id,
                 "html": html,
             })
+        logger.info(f"Callback sent for task {task_id}")
     except Exception as e:
+        logger.error(f"Callback failed for task {task_id}: {e}")
 def extract_text_from_image(image: Image.Image) -> str:
+    """Extract LaTeX text using latexocr from a PIL image"""
+    temp_img_path = Path(tempfile.mktemp(suffix=".png"))
     image.save(temp_img_path)
     try:
         result = subprocess.run(
+            ["latexocr", str(temp_img_path)],
             capture_output=True,
             text=True
         )
     except Exception as e:
         return f"<i>LaTeX-OCR error: {str(e)}</i>"
     finally:
+        try:
+            temp_img_path.unlink(missing_ok=True)
+        except Exception:
+            pass
+def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
+    """Main function to extract PDF content into HTML + DOCX"""
+    if not file:
+        return "<p>No file provided.</p>", docx.Document()
     html_output = ""
     docx_output = docx.Document()
                 except Exception:
                     continue
+                # OCR LaTeX from image
                 math_html = extract_text_from_image(cropped)
                 if math_html.strip():
                     html_output += f"<div>{math_html}</div>\n"
                     docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
+                # Embed image
                 buffer = io.BytesIO()
                 cropped.save(buffer, format="PNG")
                 buffer.seek(0)
                 b64_img = base64.b64encode(buffer.read()).decode("utf-8")
                 html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
+                # Add to DOCX
                 buffer.seek(0)
+                try:
+                    docx_output.add_picture(buffer, width=Inches(5))
+                except Exception:
+                    pass
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
+    return full_html, docx_output