Spaces:

madankn79
/

pdf2htmlv51

Sleeping

App Files Files Community

madankn79 committed on Jun 7, 2025

Commit

4a3cccb

1 Parent(s): e924673

Initial Commit 5.1.0

Browse files

Files changed (3) hide show

.idea/.gitignore +10 -0
app.py +157 -0
requirements.txt +7 -0

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,10 @@

+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Environment-dependent path to Maven home directory
+/mavenHomeManager.xml
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml

app.py ADDED Viewed

	@@ -0,0 +1,157 @@

+# Hugging Face-style async API using FastAPI
+from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+import base64
+import tempfile
+import uuid
+import os
+import secrets
+import aiohttp
+import shutil
+import pdfplumber
+import pytesseract
+from pdf2image import convert_from_bytes
+import io
+import os
+from PIL import Image
+import subprocess
+from bs4 import BeautifulSoup
+from pdf_processing import extract_pdf_to_html  # Assume your function is modularized here
+API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
+app = FastAPI()
+# In-memory storage for tokens or you can use Redis/DB
+api_tokens = {"client-1": API_KEY}
+class PDFRequest(BaseModel):
+    """Request body accepted by the ``/convert-pdf`` endpoint."""
+    # Base64-encoded bytes of the PDF to convert.
+    file_b64: str
+    # URL that receives the conversion result via HTTP POST.
+    # NOTE(review): no validation is visible here; confirm the target is trusted.
+    callback_url: str
+    # Key looked up in ``api_tokens`` during the auth check.
+    client_id: str
+    # Token that must equal the ``api_tokens`` entry for ``client_id``.
+    token: str
+@app.post("/convert-pdf")
+async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
+    # Auth check
+    if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
+        raise HTTPException(status_code=401, detail="Invalid API credentials")
+    # Save base64 PDF to a temp file
+    try:
+        pdf_bytes = base64.b64decode(payload.file_b64)
+        tmp_pdf_path = tempfile.mktemp(suffix=".pdf")
+        with open(tmp_pdf_path, "wb") as f:
+            f.write(pdf_bytes)
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}")
+    task_id = str(uuid.uuid4())
+    # Run conversion in background
+    background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id)
+    return JSONResponse({"status": "processing", "task_id": task_id})
+async def handle_pdf_processing(pdf_path, callback_url, task_id):
+    try:
+        html, _ = extract_pdf_to_html(open(pdf_path, "rb"))
+    finally:
+        if os.path.exists(pdf_path):
+            os.remove(pdf_path)
+    # Post result to callback URL
+    try:
+        async with aiohttp.ClientSession() as session:
+            await session.post(callback_url, json={
+                "task_id": task_id,
+                "html": html,
+            })
+    except Exception as e:
+        print(f"Callback failed: {e}")
+def extract_text_from_image(image: Image.Image) -> str:
+    # Save image temporarily
+    temp_img_path = "temp_math.png"
+    image.save(temp_img_path)
+    try:
+        # Run LaTeX-OCR CLI (assumes it's installed)
+        result = subprocess.run(
+            ["latexocr", temp_img_path],
+            capture_output=True,
+            text=True
+        )
+        return f"<pre>\\[{result.stdout.strip()}\\]</pre>" if result.returncode == 0 else ""
+    except Exception as e:
+        return f"<i>Error: {str(e)}</i>"
+def extract_pdf_to_html(file):
+    if file is None:
+        return "<p>No file uploaded.</p>", ""
+    html_output = ""
+    docx_output = docx.Document()
+    toc = []
+    with pdfplumber.open(file.name) as pdf:
+        for page_num, page in enumerate(pdf.pages):
+            page_title = f"Page {page_num + 1}"
+            toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
+            html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
+            docx_output.add_heading(page_title, level=2)
+            # Extract text and add paragraphs
+            text = page.extract_text()
+            if text:
+                for line in text.split("\n"):
+                    html_output += f"<p>{line}</p>\n"
+                    docx_output.add_paragraph(line)
+            else:
+                html_output += "<p><i>No text detected on this page.</i></p>"
+            # Process images embedded in page (diagrams, formulas, etc)
+            for img_obj in page.images:
+                x0, top, x1, bottom = img_obj["x0"], img_obj["top"], img_obj["x1"], img_obj["bottom"]
+                page_x0, page_top, page_x1, page_bottom = page.bbox
+                # Clip bbox to page boundaries
+                crop_x0 = max(x0, page_x0)
+                crop_top = max(top, page_top)
+                crop_x1 = min(x1, page_x1)
+                crop_bottom = min(bottom, page_bottom)
+                try:
+                    cropped = page.crop((crop_x0, crop_top, crop_x1, crop_bottom)).to_image(resolution=300).original
+                except Exception as e:
+                    # Skip if cropping fails
+                    continue
+                # Run LaTeX OCR on image for math formulas
+                math_html = extract_text_from_image(cropped)
+                if math_html.strip():
+                    html_output += f"<div>{math_html}</div>\n"
+                    # Also add LaTeX text to docx as paragraph
+                    docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
+                # Convert cropped image to base64 and embed in HTML
+                buffer = io.BytesIO()
+                cropped.save(buffer, format="PNG")
+                buffer.seek(0)
+                b64_img = base64.b64encode(buffer.read()).decode("utf-8")
+                html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
+                # Add image to docx
+                buffer.seek(0)
+                docx_output.add_picture(buffer, width=Inches(5))
+    full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
+    docx_path = "output.docx"
+    docx_output.save(docx_path)
+    return full_html, docx_path
+# Secure with a basic API token system
+@app.get("/health")
+def health():
+    return {"status": "ok"}

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+gradio
+pdfplumber
+pytesseract
+pdf2image
+Pillow
+beautifulsoup4
+python-docx