File size: 6,064 Bytes
debf870
4a3cccb
8133402
4a3cccb
 
 
 
 
 
 
 
7a61a27
 
 
8133402
dbf74d1
a085c86
4a3cccb
8133402
debf870
 
4a3cccb
 
8133402
 
debf870
8133402
 
4a3cccb
8133402
 
 
 
4a3cccb
debf870
 
 
 
 
 
 
 
8133402
debf870
4a3cccb
 
debf870
 
 
 
 
8133402
debf870
8133402
debf870
 
8133402
 
 
 
 
 
4a3cccb
8133402
 
 
4a3cccb
8133402
 
4a3cccb
debf870
4a3cccb
debf870
4a3cccb
debf870
4a3cccb
 
7a61a27
debf870
4a3cccb
7a61a27
 
8133402
 
 
4a3cccb
8133402
 
 
 
4a3cccb
 
debf870
4a3cccb
 
 
debf870
 
 
fe9658d
8133402
4a3cccb
8133402
4a3cccb
7a61a27
4a3cccb
8133402
4a3cccb
 
 
8133402
4a3cccb
 
 
7a61a27
 
 
 
4a3cccb
7a61a27
 
8133402
 
 
 
7a61a27
4a3cccb
8133402
4a3cccb
 
 
 
7a61a27
4a3cccb
a085c86
4a3cccb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a61a27
 
 
4a3cccb
 
 
 
 
 
 
 
 
 
 
 
 
 
8133402
 
 
 
a085c86
4a3cccb
debf870
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import base64
import io
import logging
import os
import secrets
import subprocess
import tempfile
import time
import uuid
from pathlib import Path

import aiohttp
import docx
import pdfplumber
from bs4 import BeautifulSoup
from docx.shared import Inches
from fastapi import FastAPI, HTTPException, BackgroundTasks, UploadFile, File, Form
from fastapi.responses import JSONResponse
from PIL import Image

# --- Application setup -------------------------------------------------------
# Shared secret that clients must present on /convert-pdf (None if env unset).
API_KEY = os.getenv("PDF_API_KEY")
# Key forwarded to the callback receiver in the "x-api-key" header.
ZAI_SECRET_API_KEY = os.getenv("ZAI_SECRET_API_KEY")
app = FastAPI()
# Static client-id -> token table; a single client is provisioned here.
api_tokens = {"client-1": API_KEY}
# Upload ceiling enforced in convert_pdf_endpoint (HTTP 413 beyond this).
MAX_PDF_SIZE_MB = 40

# Logging — module-level logger used by every handler below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@app.get("/health")
def health():
    """Liveness probe: always reports the service as up."""
    return dict(status="ok")

@app.post("/convert-pdf")
async def convert_pdf_endpoint(
        background_tasks: BackgroundTasks,
        file: UploadFile = File(...),
        client_id: str = Form(...),
        token: str = Form(...),
        callback_url: str = Form(...),
        book_id : int = Form(...)
):
    """Accept a PDF upload, validate it, and queue background conversion.

    Responds immediately with a task_id; the extracted HTML is later POSTed
    to ``callback_url`` by ``handle_pdf_processing``.

    Raises:
        HTTPException 401: unknown client_id or wrong token.
        HTTPException 400: non-PDF filename or unreadable upload.
        HTTPException 413: payload exceeds MAX_PDF_SIZE_MB.
        HTTPException 500: temp-file write failure.
    """
    # Auth — compare_digest is constant-time, so the token cannot be probed
    # character-by-character via response-timing differences.
    expected = api_tokens.get(client_id)
    if expected is None or not secrets.compare_digest(str(expected), token):
        raise HTTPException(status_code=401, detail="Invalid API credentials")

    # Check file type (by extension only; content is not sniffed here).
    if not file.filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # Read the whole upload into memory (bounded by the size check below).
    try:
        pdf_bytes = await file.read()
    except Exception as e:
        logger.error("Failed to read uploaded file: %s", e)
        raise HTTPException(status_code=400, detail="Unable to read file")

    # Enforce size limit
    if len(pdf_bytes) > MAX_PDF_SIZE_MB * 1024 * 1024:
        raise HTTPException(status_code=413, detail=f"PDF too large (> {MAX_PDF_SIZE_MB} MB)")

    # Persist to a named temp file the background task can reopen.
    # NamedTemporaryFile(delete=False) replaces the old mkdtemp() approach,
    # whose directory was never removed (the task unlinked only the file).
    try:
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp.write(pdf_bytes)
            tmp_pdf_path = tmp.name
    except Exception as e:
        logger.error("Failed to write PDF file: %s", e)
        raise HTTPException(status_code=500, detail="Internal error writing PDF")

    # Background processing — runs after the response is sent.
    task_id = str(uuid.uuid4())
    background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, callback_url, task_id, book_id)

    logger.info("Started task %s for client %s", task_id, client_id)
    return JSONResponse({"status": "processing", "task_id": task_id})


async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str, book_id: int):
    """Background worker: extract HTML from the PDF and POST it to callback_url.

    Never raises — extraction failures are reported to the callback as an
    error placeholder, and the temp PDF is always deleted (best effort).
    """
    try:
        with open(pdf_path, "rb") as f:
            html, _ = extract_pdf_to_html(f)
    except Exception as e:
        logger.error("PDF processing failed: %s", e)
        html = f"<p>Error extracting PDF: {e}</p>"
    finally:
        # Best-effort cleanup of the uploaded temp file.
        try:
            Path(pdf_path).unlink(missing_ok=True)
        except Exception as e:
            logger.warning("Temp file cleanup failed: %s", e)

    try:
        headers = { "x-api-key": ZAI_SECRET_API_KEY }
        async with aiohttp.ClientSession() as session:
            # Enter the response as a context manager so the connection is
            # released, and surface non-success callback statuses in the log
            # (previously the response was neither closed nor checked).
            async with session.post(callback_url, json={
                "task_id": task_id,
                "content": html,
                "book_id": book_id
            }, headers=headers) as resp:
                if resp.status >= 400:
                    logger.error("Callback for task %s returned HTTP %s", task_id, resp.status)
        logger.info("Content Generated \n %s ", html)
        logger.info("Callback sent for task %s", task_id)
    except Exception as e:
        logger.error("Callback failed for task %s: %s", task_id, e)


def extract_text_from_image(image: Image.Image) -> str:
    """Run the ``latexocr`` CLI on *image* and return the formula as HTML.

    Returns a <pre>-wrapped display-math block on success, "" when the tool
    exits non-zero (treated as "no recognizable math"), or an <i> error note
    when the tool cannot be invoked at all.
    """
    # NamedTemporaryFile replaces the deprecated, race-prone tempfile.mktemp():
    # the file is created atomically; delete=False keeps it for the subprocess.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        temp_img_path = Path(tmp.name)
    image.save(temp_img_path)
    try:
        result = subprocess.run(
            ["latexocr", str(temp_img_path)],
            capture_output=True,
            text=True
        )
        if result.returncode == 0:
            return f"<pre>\\[{result.stdout.strip()}\\]</pre>"
        else:
            # Non-zero exit is expected for non-math images — skip silently.
            return ""
    except Exception as e:
        return f"<i>LaTeX-OCR error: {str(e)}</i>"
    finally:
        try:
            temp_img_path.unlink(missing_ok=True)
        except Exception:
            pass


def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
    """Convert an open PDF file object into (html_string, docx.Document).

    Per page: a heading anchor, each extracted text line as a <p>, then every
    embedded image — first run through LaTeX-OCR for math, then inlined as a
    base64 <img>. A table of contents linking the page anchors is prepended.
    """
    # Accumulate fragments in lists and join once at the end; the previous
    # `html_output += ...` per line/image was quadratic over large PDFs.
    body_parts: list[str] = []
    toc: list[str] = []
    docx_output = docx.Document()

    with pdfplumber.open(file) as pdf:
        for page_num, page in enumerate(pdf.pages):
            start = time.time()
            page_title = f"Page {page_num + 1}"
            toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
            body_parts.append(f"<h2 id='page{page_num+1}'>{page_title}</h2>\n")
            docx_output.add_heading(page_title, level=2)

            text = page.extract_text()
            if text:
                for line in text.split("\n"):
                    body_parts.append(f"<p>{line}</p>\n")
                    docx_output.add_paragraph(line)
            else:
                body_parts.append("<p><i>No text detected on this page.</i></p>")

            for img_obj in page.images:
                # Crop the image region off the page; skip anything that
                # fails to render (corrupt or out-of-bounds coordinates).
                try:
                    x0, top, x1, bottom = img_obj["x0"], img_obj["top"], img_obj["x1"], img_obj["bottom"]
                    cropped = page.crop((x0, top, x1, bottom)).to_image(resolution=300).original
                except Exception:
                    continue

                # Math OCR first: emit any recognized formula as its own div.
                math_html = extract_text_from_image(cropped)
                if math_html.strip():
                    body_parts.append(f"<div>{math_html}</div>\n")
                    docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)

                # Inline the raw image as base64 so the HTML is self-contained.
                buffer = io.BytesIO()
                cropped.save(buffer, format="PNG")
                b64_img = base64.b64encode(buffer.getvalue()).decode("utf-8")
                body_parts.append(
                    f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
                )

                buffer.seek(0)
                # add_picture is best-effort; unsupported formats are skipped.
                try:
                    docx_output.add_picture(buffer, width=Inches(5))
                except Exception:
                    pass
            logger.info("Processed page %d in %.2fs", page_num + 1, time.time() - start)
    full_html = f"<ul>{''.join(toc)}</ul>\n" + "".join(body_parts)
    return full_html, docx_output