madankn79 committed on
Commit
debf870
·
1 Parent(s): 8133402

Initial Commit 5.1.0

Browse files
Files changed (1) hide show
  1. app.py +30 -34
app.py CHANGED
@@ -1,8 +1,6 @@
1
- from fastapi import FastAPI, HTTPException, BackgroundTasks
2
  from fastapi.responses import JSONResponse
3
- from pydantic import BaseModel
4
  from pathlib import Path
5
- import base64
6
  import tempfile
7
  import uuid
8
  import os
@@ -17,38 +15,43 @@ from docx.shared import Inches
17
  import logging
18
 
19
  # Setup
20
- API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
 
21
  app = FastAPI()
22
  api_tokens = {"client-1": API_KEY}
23
  MAX_PDF_SIZE_MB = 40
24
 
25
- # Logging config
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
- # Schema
30
- class PDFRequest(BaseModel):
31
- file_b64: str
32
- callback_url: str
33
- client_id: str
34
- token: str
35
-
36
@app.get("/health")
def health():
    """Liveness probe: always reports the service as up."""
    return {"status": "ok"}
39
 
40
  @app.post("/convert-pdf")
41
- async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
 
 
 
 
 
 
 
42
  # Auth
43
- if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
44
  raise HTTPException(status_code=401, detail="Invalid API credentials")
45
 
46
- # Decode base64
 
 
 
 
47
  try:
48
- pdf_bytes = base64.b64decode(payload.file_b64, validate=True)
49
  except Exception as e:
50
- logger.error(f"Base64 decode failed: {e}")
51
- raise HTTPException(status_code=400, detail="Invalid base64-encoded PDF")
52
 
53
  # Enforce size limit
54
  if len(pdf_bytes) > MAX_PDF_SIZE_MB * 1024 * 1024:
@@ -63,15 +66,15 @@ async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: Background
63
  logger.error(f"Failed to write PDF file: {e}")
64
  raise HTTPException(status_code=500, detail="Internal error writing PDF")
65
 
66
- # Process asynchronously
67
  task_id = str(uuid.uuid4())
68
- background_tasks.add_task(handle_pdf_processing, str(tmp_pdf_path), payload.callback_url, task_id)
69
 
70
- logger.info(f"Started task {task_id} for client {payload.client_id}")
71
  return JSONResponse({"status": "processing", "task_id": task_id})
72
 
73
 
74
- async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str):
75
  try:
76
  with open(pdf_path, "rb") as f:
77
  html, _ = extract_pdf_to_html(f)
@@ -84,20 +87,20 @@ async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str):
84
  except Exception as e:
85
  logger.warning(f"Temp file cleanup failed: {e}")
86
 
87
- # Callback
88
  try:
 
89
  async with aiohttp.ClientSession() as session:
90
  await session.post(callback_url, json={
91
  "task_id": task_id,
92
- "html": html,
93
- })
 
94
  logger.info(f"Callback sent for task {task_id}")
95
  except Exception as e:
96
  logger.error(f"Callback failed for task {task_id}: {e}")
97
 
98
 
99
  def extract_text_from_image(image: Image.Image) -> str:
100
- """Extract LaTeX text using latexocr from a PIL image"""
101
  temp_img_path = Path(tempfile.mktemp(suffix=".png"))
102
  image.save(temp_img_path)
103
  try:
@@ -120,10 +123,6 @@ def extract_text_from_image(image: Image.Image) -> str:
120
 
121
 
122
  def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
123
- """Main function to extract PDF content into HTML + DOCX"""
124
- if not file:
125
- return "<p>No file provided.</p>", docx.Document()
126
-
127
  html_output = ""
128
  docx_output = docx.Document()
129
  toc = []
@@ -150,20 +149,17 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
150
  except Exception:
151
  continue
152
 
153
- # OCR LaTeX from image
154
  math_html = extract_text_from_image(cropped)
155
  if math_html.strip():
156
  html_output += f"<div>{math_html}</div>\n"
157
  docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
158
 
159
- # Embed image
160
  buffer = io.BytesIO()
161
  cropped.save(buffer, format="PNG")
162
  buffer.seek(0)
163
  b64_img = base64.b64encode(buffer.read()).decode("utf-8")
164
  html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
165
 
166
- # Add to DOCX
167
  buffer.seek(0)
168
  try:
169
  docx_output.add_picture(buffer, width=Inches(5))
@@ -171,4 +167,4 @@ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
171
  pass
172
 
173
  full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
174
- return full_html, docx_output
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks, UploadFile, File, Form
2
  from fastapi.responses import JSONResponse
 
3
  from pathlib import Path
 
4
  import tempfile
5
  import uuid
6
  import os
 
15
  import logging
16
 
17
  # Setup
18
+ API_KEY = os.getenv("PDF_API_KEY")
19
+ ZAI_SECRET_API_KEY = os.getenv("ZAI_SECRET_API_KEY")
20
  app = FastAPI()
21
  api_tokens = {"client-1": API_KEY}
22
  MAX_PDF_SIZE_MB = 40
23
 
24
+ # Logging
25
  logging.basicConfig(level=logging.INFO)
26
  logger = logging.getLogger(__name__)
27
 
 
 
 
 
 
 
 
28
@app.get("/health")
def health():
    """Liveness probe: always reports the service as up."""
    return {"status": "ok"}
31
 
32
  @app.post("/convert-pdf")
33
+ async def convert_pdf_endpoint(
34
+ background_tasks: BackgroundTasks,
35
+ file: UploadFile = File(...),
36
+ client_id: str = Form(...),
37
+ token: str = Form(...),
38
+ callback_url: str = Form(...),
39
+ book_id : int = Form(...)
40
+ ):
41
  # Auth
42
+ if client_id not in api_tokens or api_tokens[client_id] != token:
43
  raise HTTPException(status_code=401, detail="Invalid API credentials")
44
 
45
+ # Check file type
46
+ if not file.filename.lower().endswith(".pdf"):
47
+ raise HTTPException(status_code=400, detail="Only PDF files are supported")
48
+
49
+ # Read file
50
  try:
51
+ pdf_bytes = await file.read()
52
  except Exception as e:
53
+ logger.error(f"Failed to read uploaded file: {e}")
54
+ raise HTTPException(status_code=400, detail="Unable to read file")
55
 
56
  # Enforce size limit
57
  if len(pdf_bytes) > MAX_PDF_SIZE_MB * 1024 * 1024:
 
66
  logger.error(f"Failed to write PDF file: {e}")
67
  raise HTTPException(status_code=500, detail="Internal error writing PDF")
68
 
69
+ # Background processing
70
  task_id = str(uuid.uuid4())
71
+ background_tasks.add_task(handle_pdf_processing, str(tmp_pdf_path), callback_url, task_id, book_id)
72
 
73
+ logger.info(f"Started task {task_id} for client {client_id}")
74
  return JSONResponse({"status": "processing", "task_id": task_id})
75
 
76
 
77
+ async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str, book_id:int):
78
  try:
79
  with open(pdf_path, "rb") as f:
80
  html, _ = extract_pdf_to_html(f)
 
87
  except Exception as e:
88
  logger.warning(f"Temp file cleanup failed: {e}")
89
 
 
90
  try:
91
+ headers = { "x-api-key": ZAI_SECRET_API_KEY }
92
  async with aiohttp.ClientSession() as session:
93
  await session.post(callback_url, json={
94
  "task_id": task_id,
95
+ "content": html,
96
+ "book_id": book_id
97
+ }, headers=headers )
98
  logger.info(f"Callback sent for task {task_id}")
99
  except Exception as e:
100
  logger.error(f"Callback failed for task {task_id}: {e}")
101
 
102
 
103
  def extract_text_from_image(image: Image.Image) -> str:
 
104
  temp_img_path = Path(tempfile.mktemp(suffix=".png"))
105
  image.save(temp_img_path)
106
  try:
 
123
 
124
 
125
  def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
 
 
 
 
126
  html_output = ""
127
  docx_output = docx.Document()
128
  toc = []
 
149
  except Exception:
150
  continue
151
 
 
152
  math_html = extract_text_from_image(cropped)
153
  if math_html.strip():
154
  html_output += f"<div>{math_html}</div>\n"
155
  docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
156
 
 
157
  buffer = io.BytesIO()
158
  cropped.save(buffer, format="PNG")
159
  buffer.seek(0)
160
  b64_img = base64.b64encode(buffer.read()).decode("utf-8")
161
  html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
162
 
 
163
  buffer.seek(0)
164
  try:
165
  docx_output.add_picture(buffer, width=Inches(5))
 
167
  pass
168
 
169
  full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
170
+ return full_html, docx_output