madankn79 committed on
Commit
8133402
·
1 Parent(s): 3136b3a

Initial Commit 5.1.0

Browse files
Files changed (1) hide show
  1. app.py +65 -30
app.py CHANGED
@@ -1,77 +1,108 @@
1
- from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
 
4
  import base64
5
  import tempfile
6
  import uuid
7
  import os
8
  import aiohttp
9
  import pdfplumber
10
- import pytesseract
11
- from pdf2image import convert_from_bytes
12
  from PIL import Image
13
  import subprocess
14
  from bs4 import BeautifulSoup
15
  import io
16
  import docx
17
  from docx.shared import Inches
 
18
 
 
19
  API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
20
  app = FastAPI()
21
-
22
- # In-memory token check (replace with KV or DB in production)
23
  api_tokens = {"client-1": API_KEY}
 
 
 
 
 
24
 
 
25
  class PDFRequest(BaseModel):
26
  file_b64: str
27
  callback_url: str
28
  client_id: str
29
  token: str
30
 
 
 
 
 
31
  @app.post("/convert-pdf")
32
  async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
 
33
  if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
34
  raise HTTPException(status_code=401, detail="Invalid API credentials")
35
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  try:
37
- pdf_bytes = base64.b64decode(payload.file_b64)
38
- tmp_pdf_path = tempfile.mktemp(suffix=".pdf")
39
- with open(tmp_pdf_path, "wb") as f:
40
- f.write(pdf_bytes)
41
  except Exception as e:
42
- raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}")
 
43
 
 
44
  task_id = str(uuid.uuid4())
45
- background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id)
46
 
 
47
  return JSONResponse({"status": "processing", "task_id": task_id})
48
 
49
 
50
- async def handle_pdf_processing(pdf_path, callback_url, task_id):
51
  try:
52
  with open(pdf_path, "rb") as f:
53
  html, _ = extract_pdf_to_html(f)
 
 
 
54
  finally:
55
- if os.path.exists(pdf_path):
56
- os.remove(pdf_path)
 
 
57
 
 
58
  try:
59
  async with aiohttp.ClientSession() as session:
60
  await session.post(callback_url, json={
61
  "task_id": task_id,
62
  "html": html,
63
  })
 
64
  except Exception as e:
65
- print(f"Callback failed: {e}")
66
 
67
 
68
  def extract_text_from_image(image: Image.Image) -> str:
69
- temp_img_path = tempfile.mktemp(suffix=".png")
 
70
  image.save(temp_img_path)
71
-
72
  try:
73
  result = subprocess.run(
74
- ["latexocr", temp_img_path],
75
  capture_output=True,
76
  text=True
77
  )
@@ -82,13 +113,16 @@ def extract_text_from_image(image: Image.Image) -> str:
82
  except Exception as e:
83
  return f"<i>LaTeX-OCR error: {str(e)}</i>"
84
  finally:
85
- if os.path.exists(temp_img_path):
86
- os.remove(temp_img_path)
 
 
87
 
88
 
89
- def extract_pdf_to_html(file):
90
- if file is None:
91
- return "<p>No file uploaded.</p>", ""
 
92
 
93
  html_output = ""
94
  docx_output = docx.Document()
@@ -116,24 +150,25 @@ def extract_pdf_to_html(file):
116
  except Exception:
117
  continue
118
 
 
119
  math_html = extract_text_from_image(cropped)
120
  if math_html.strip():
121
  html_output += f"<div>{math_html}</div>\n"
122
  docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
123
 
 
124
  buffer = io.BytesIO()
125
  cropped.save(buffer, format="PNG")
126
  buffer.seek(0)
127
  b64_img = base64.b64encode(buffer.read()).decode("utf-8")
128
  html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
129
 
 
130
  buffer.seek(0)
131
- docx_output.add_picture(buffer, width=Inches(5))
 
 
 
132
 
133
  full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
134
- return full_html, docx_output
135
-
136
-
137
- @app.get("/health")
138
- def health():
139
- return {"status": "ok"}
 
1
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
4
+ from pathlib import Path
5
  import base64
6
  import tempfile
7
  import uuid
8
  import os
9
  import aiohttp
10
  import pdfplumber
 
 
11
  from PIL import Image
12
  import subprocess
13
  from bs4 import BeautifulSoup
14
  import io
15
  import docx
16
  from docx.shared import Inches
17
+ import logging
18
 
# Setup
# Logging is configured first so the startup warning below is actually emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Shared secret expected from clients; read from the environment when present.
API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
if "PDF_API_KEY" not in os.environ:
    # The fallback value is hard-coded in the source, so it must not be relied
    # on outside local development. Warn loudly instead of failing so existing
    # deployments keep starting.
    logger.warning(
        "PDF_API_KEY is not set; falling back to the built-in default API key, "
        "which is not safe for production use"
    )

app = FastAPI()

# Maps client id -> token that client must present.
api_tokens = {"client-1": API_KEY}

# Upper bound on the decoded PDF size accepted by /convert-pdf, in megabytes.
MAX_PDF_SIZE_MB = 40
28
 
29
+ # Schema
class PDFRequest(BaseModel):
    """Request body accepted by the ``/convert-pdf`` endpoint."""

    # Base64-encoded bytes of the PDF to convert.
    file_b64: str

    # URL that receives the conversion result via HTTP POST.
    callback_url: str

    # Client identifier; looked up in ``api_tokens`` during authentication.
    client_id: str

    # Token that must match the one registered for ``client_id``.
    token: str
35
 
@app.get("/health")
def health():
    """Liveness probe: report that the service is up."""
    status_payload = {"status": "ok"}
    return status_payload
39
+
@app.post("/convert-pdf")
async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
    """Validate a conversion request, stage the PDF on disk, and queue processing.

    Args:
        payload: Request body carrying the base64 PDF, the callback URL and
            the client credentials.
        background_tasks: FastAPI task queue; the conversion is scheduled on
            it so the HTTP response is returned before processing starts.

    Returns:
        A JSON response ``{"status": "processing", "task_id": ...}``.

    Raises:
        HTTPException: 401 for unknown client / wrong token, 400 for invalid
            base64, 413 when the PDF exceeds ``MAX_PDF_SIZE_MB``, 500 when
            the temporary file cannot be written.
    """
    # Function-scope import: ``hmac`` is not among the module's top-level imports.
    import hmac

    # Auth. Compare as bytes in constant time so the response time does not
    # reveal how much of the token matched; ``compare_digest`` only accepts
    # ASCII ``str``, hence the explicit encoding.
    expected_token = api_tokens.get(payload.client_id)
    if expected_token is None or not hmac.compare_digest(
        expected_token.encode("utf-8", "surrogatepass"),
        payload.token.encode("utf-8", "surrogatepass"),
    ):
        raise HTTPException(status_code=401, detail="Invalid API credentials")

    max_bytes = MAX_PDF_SIZE_MB * 1024 * 1024

    # Cheap pre-check before allocating the decoded buffer: strictly valid
    # base64 of length L decodes to at least 3L/4 - 2 bytes, so anything above
    # this bound is certain to exceed the limit.
    if len(payload.file_b64) * 3 // 4 - 2 > max_bytes:
        raise HTTPException(status_code=413, detail=f"PDF too large (> {MAX_PDF_SIZE_MB} MB)")

    # Decode base64
    try:
        pdf_bytes = base64.b64decode(payload.file_b64, validate=True)
    except ValueError as e:  # binascii.Error is a ValueError subclass
        logger.error("Base64 decode failed: %s", e)
        raise HTTPException(status_code=400, detail="Invalid base64-encoded PDF") from e

    # Enforce size limit (authoritative check on the decoded size)
    if len(pdf_bytes) > max_bytes:
        raise HTTPException(status_code=413, detail=f"PDF too large (> {MAX_PDF_SIZE_MB} MB)")

    # Write to temp file. ``mkstemp`` creates the file itself (securely, with
    # a unique name), so no per-request directory is left behind once the
    # background task unlinks the file.
    tmp_pdf_path = None
    try:
        fd, tmp_pdf_path = tempfile.mkstemp(suffix=".pdf")
        with os.fdopen(fd, "wb") as pdf_file:
            pdf_file.write(pdf_bytes)
    except OSError as e:
        logger.error("Failed to write PDF file: %s", e)
        if tmp_pdf_path is not None:
            # Do not leave a partially written file behind.
            try:
                os.unlink(tmp_pdf_path)
            except OSError as cleanup_error:
                logger.warning("Temp file cleanup failed: %s", cleanup_error)
        raise HTTPException(status_code=500, detail="Internal error writing PDF") from e

    # Process asynchronously; the worker is responsible for deleting the file.
    task_id = str(uuid.uuid4())
    background_tasks.add_task(handle_pdf_processing, str(tmp_pdf_path), payload.callback_url, task_id)

    logger.info("Started task %s for client %s", task_id, payload.client_id)
    return JSONResponse({"status": "processing", "task_id": task_id})
72
 
73
 
async def handle_pdf_processing(pdf_path: str, callback_url: str, task_id: str):
    """Convert a staged PDF to HTML and POST the result to the callback URL.

    The temporary PDF at ``pdf_path`` is always deleted once extraction has
    finished, whether it succeeded or not. Failures are logged rather than
    raised, because this runs as a background task with no caller to handle
    them.

    Args:
        pdf_path: Path of the temporary PDF file to convert and then remove.
        callback_url: URL that receives ``{"task_id": ..., "html": ...}``.
        task_id: Identifier returned to the client when the task was queued.
    """
    # Function-scope imports: neither name is among the module's top-level imports.
    import asyncio
    from html import escape

    def _extract() -> str:
        """Open the staged PDF and return the extracted HTML (blocking)."""
        with open(pdf_path, "rb") as f:
            extracted_html, _ = extract_pdf_to_html(f)
        return extracted_html

    try:
        # Extraction is synchronous (PDF parsing plus an OCR subprocess); run
        # it in a worker thread so it does not stall the event loop.
        html = await asyncio.to_thread(_extract)
    except Exception as e:
        logger.exception("PDF processing failed: %s", e)
        # Escape the message: it is embedded in HTML sent to a third party.
        html = f"<p>Error extracting PDF: {escape(str(e))}</p>"
    finally:
        try:
            Path(pdf_path).unlink(missing_ok=True)
        except OSError as e:
            logger.warning("Temp file cleanup failed: %s", e)

    # Callback
    try:
        async with aiohttp.ClientSession() as session:
            # ``async with`` releases the connection; ``raise_for_status``
            # turns 4xx/5xx replies into a logged failure instead of a
            # misleading "Callback sent".
            async with session.post(callback_url, json={
                "task_id": task_id,
                "html": html,
            }) as response:
                response.raise_for_status()
        logger.info("Callback sent for task %s", task_id)
    except Exception as e:
        logger.error("Callback failed for task %s: %s", task_id, e)
97
 
98
 
99
  def extract_text_from_image(image: Image.Image) -> str:
100
+ """Extract LaTeX text using latexocr from a PIL image"""
101
+ temp_img_path = Path(tempfile.mktemp(suffix=".png"))
102
  image.save(temp_img_path)
 
103
  try:
104
  result = subprocess.run(
105
+ ["latexocr", str(temp_img_path)],
106
  capture_output=True,
107
  text=True
108
  )
 
113
  except Exception as e:
114
  return f"<i>LaTeX-OCR error: {str(e)}</i>"
115
  finally:
116
+ try:
117
+ temp_img_path.unlink(missing_ok=True)
118
+ except Exception:
119
+ pass
120
 
121
 
122
+ def extract_pdf_to_html(file) -> tuple[str, docx.Document]:
123
+ """Main function to extract PDF content into HTML + DOCX"""
124
+ if not file:
125
+ return "<p>No file provided.</p>", docx.Document()
126
 
127
  html_output = ""
128
  docx_output = docx.Document()
 
150
  except Exception:
151
  continue
152
 
153
+ # OCR LaTeX from image
154
  math_html = extract_text_from_image(cropped)
155
  if math_html.strip():
156
  html_output += f"<div>{math_html}</div>\n"
157
  docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
158
 
159
+ # Embed image
160
  buffer = io.BytesIO()
161
  cropped.save(buffer, format="PNG")
162
  buffer.seek(0)
163
  b64_img = base64.b64encode(buffer.read()).decode("utf-8")
164
  html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
165
 
166
+ # Add to DOCX
167
  buffer.seek(0)
168
+ try:
169
+ docx_output.add_picture(buffer, width=Inches(5))
170
+ except Exception:
171
+ pass
172
 
173
  full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
174
+ return full_html, docx_output