Spaces:

madankn79
/

pdf2htmlv51

Sleeping

App Files Files Community

madankn79 committed on Jun 7, 2025

Commit

7a61a27

1 Parent(s): 1edd595

Initial Commit 5.1.0

Browse files

Files changed (3) hide show

app.py +25 -40
apt.txt +2 -0
requirements.txt +5 -1

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# Hugging Face-style async API using FastAPI
 from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
@@ -6,24 +5,21 @@ import base64
 import tempfile
 import uuid
 import os
-import secrets
 import aiohttp
-import shutil
 import pdfplumber
 import pytesseract
 from pdf2image import convert_from_bytes
-import io
 from PIL import Image
 import subprocess
 from bs4 import BeautifulSoup
 API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
 app = FastAPI()
-# In-memory storage for tokens or you can use Redis/DB
 api_tokens = {"client-1": API_KEY}
 class PDFRequest(BaseModel):
@@ -34,11 +30,9 @@ class PDFRequest(BaseModel):
 @app.post("/convert-pdf")
 async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
-    # Auth check
     if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
         raise HTTPException(status_code=401, detail="Invalid API credentials")
-    # Save base64 PDF to a temp file
     try:
         pdf_bytes = base64.b64decode(payload.file_b64)
         tmp_pdf_path = tempfile.mktemp(suffix=".pdf")
@@ -48,20 +42,19 @@ async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: Background
         raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}")
     task_id = str(uuid.uuid4())
-    # Run conversion in background
     background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id)
     return JSONResponse({"status": "processing", "task_id": task_id})
 async def handle_pdf_processing(pdf_path, callback_url, task_id):
     try:
-        html, _ = extract_pdf_to_html(open(pdf_path, "rb"))
     finally:
         if os.path.exists(pdf_path):
             os.remove(pdf_path)
-    # Post result to callback URL
     try:
         async with aiohttp.ClientSession() as session:
             await session.post(callback_url, json={
@@ -71,20 +64,27 @@ async def handle_pdf_processing(pdf_path, callback_url, task_id):
     except Exception as e:
         print(f"Callback failed: {e}")
 def extract_text_from_image(image: Image.Image) -> str:
-    # Save image temporarily
-    temp_img_path = "temp_math.png"
     image.save(temp_img_path)
     try:
-        # Run LaTeX-OCR CLI (assumes it's installed)
         result = subprocess.run(
             ["latexocr", temp_img_path],
             capture_output=True,
             text=True
         )
-        return f"<pre>\\[{result.stdout.strip()}\\]</pre>" if result.returncode == 0 else ""
     except Exception as e:
-        return f"<i>Error: {str(e)}</i>"
 def extract_pdf_to_html(file):
     if file is None:
@@ -94,14 +94,13 @@ def extract_pdf_to_html(file):
     docx_output = docx.Document()
     toc = []
-    with pdfplumber.open(file.name) as pdf:
         for page_num, page in enumerate(pdf.pages):
             page_title = f"Page {page_num + 1}"
             toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
             html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
             docx_output.add_heading(page_title, level=2)
-            # Extract text and add paragraphs
             text = page.extract_text()
             if text:
                 for line in text.split("\n"):
@@ -110,45 +109,31 @@ def extract_pdf_to_html(file):
             else:
                 html_output += "<p><i>No text detected on this page.</i></p>"
-            # Process images embedded in page (diagrams, formulas, etc)
             for img_obj in page.images:
-                x0, top, x1, bottom = img_obj["x0"], img_obj["top"], img_obj["x1"], img_obj["bottom"]
-                page_x0, page_top, page_x1, page_bottom = page.bbox
-                # Clip bbox to page boundaries
-                crop_x0 = max(x0, page_x0)
-                crop_top = max(top, page_top)
-                crop_x1 = min(x1, page_x1)
-                crop_bottom = min(bottom, page_bottom)
                 try:
-                    cropped = page.crop((crop_x0, crop_top, crop_x1, crop_bottom)).to_image(resolution=300).original
-                except Exception as e:
-                    # Skip if cropping fails
                     continue
-                # Run LaTeX OCR on image for math formulas
                 math_html = extract_text_from_image(cropped)
                 if math_html.strip():
                     html_output += f"<div>{math_html}</div>\n"
-                    # Also add LaTeX text to docx as paragraph
                     docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
-                # Convert cropped image to base64 and embed in HTML
                 buffer = io.BytesIO()
                 cropped.save(buffer, format="PNG")
                 buffer.seek(0)
                 b64_img = base64.b64encode(buffer.read()).decode("utf-8")
                 html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
-                # Add image to docx
                 buffer.seek(0)
                 docx_output.add_picture(buffer, width=Inches(5))
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
-    return full_html
-# Secure with a basic API token system
 @app.get("/health")
 def health():
     return {"status": "ok"}

 from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 import tempfile
 import uuid
 import os
 import aiohttp
 import pdfplumber
 import pytesseract
 from pdf2image import convert_from_bytes
 from PIL import Image
 import subprocess
 from bs4 import BeautifulSoup
+import io
+import docx
+from docx.shared import Inches
 API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
 app = FastAPI()
+# In-memory token check (replace with KV or DB in production)
 api_tokens = {"client-1": API_KEY}
 class PDFRequest(BaseModel):
 @app.post("/convert-pdf")
 async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
     if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
         raise HTTPException(status_code=401, detail="Invalid API credentials")
     try:
         pdf_bytes = base64.b64decode(payload.file_b64)
         tmp_pdf_path = tempfile.mktemp(suffix=".pdf")
         raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}")
     task_id = str(uuid.uuid4())
     background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id)
     return JSONResponse({"status": "processing", "task_id": task_id})
 async def handle_pdf_processing(pdf_path, callback_url, task_id):
     try:
+        with open(pdf_path, "rb") as f:
+            html, _ = extract_pdf_to_html(f)
     finally:
         if os.path.exists(pdf_path):
             os.remove(pdf_path)
     try:
         async with aiohttp.ClientSession() as session:
             await session.post(callback_url, json={
     except Exception as e:
         print(f"Callback failed: {e}")
 def extract_text_from_image(image: Image.Image) -> str:
+    temp_img_path = tempfile.mktemp(suffix=".png")
     image.save(temp_img_path)
     try:
         result = subprocess.run(
             ["latexocr", temp_img_path],
             capture_output=True,
             text=True
         )
+        if result.returncode == 0:
+            return f"<pre>\\[{result.stdout.strip()}\\]</pre>"
+        else:
+            return ""
     except Exception as e:
+        return f"<i>LaTeX-OCR error: {str(e)}</i>"
+    finally:
+        if os.path.exists(temp_img_path):
+            os.remove(temp_img_path)
 def extract_pdf_to_html(file):
     if file is None:
     docx_output = docx.Document()
     toc = []
+    with pdfplumber.open(file) as pdf:
         for page_num, page in enumerate(pdf.pages):
             page_title = f"Page {page_num + 1}"
             toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
             html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
             docx_output.add_heading(page_title, level=2)
             text = page.extract_text()
             if text:
                 for line in text.split("\n"):
             else:
                 html_output += "<p><i>No text detected on this page.</i></p>"
             for img_obj in page.images:
                 try:
+                    x0, top, x1, bottom = img_obj["x0"], img_obj["top"], img_obj["x1"], img_obj["bottom"]
+                    cropped = page.crop((x0, top, x1, bottom)).to_image(resolution=300).original
+                except Exception:
                     continue
                 math_html = extract_text_from_image(cropped)
                 if math_html.strip():
                     html_output += f"<div>{math_html}</div>\n"
                     docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
                 buffer = io.BytesIO()
                 cropped.save(buffer, format="PNG")
                 buffer.seek(0)
                 b64_img = base64.b64encode(buffer.read()).decode("utf-8")
                 html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
                 buffer.seek(0)
                 docx_output.add_picture(buffer, width=Inches(5))
     full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
+    return full_html, docx_output
 @app.get("/health")
 def health():
     return {"status": "ok"}

apt.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ poppler-utils
2	+ tesseract-ocr

requirements.txt CHANGED Viewed

@@ -1,7 +1,11 @@
 gradio
 pdfplumber
 pytesseract
 pdf2image
 Pillow
 beautifulsoup4
-python-docx

 gradio
+fastapi
+uvicorn
+aiohttp
 pdfplumber
 pytesseract
 pdf2image
 Pillow
 beautifulsoup4
+python-docx
+git+https://github.com/lukas-blecher/LaTeX-OCR.git