madankn79 commited on
Commit
7a61a27
·
1 Parent(s): 1edd595

Initial Commit 5.1.0

Browse files
Files changed (3) hide show
  1. app.py +25 -40
  2. apt.txt +2 -0
  3. requirements.txt +5 -1
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # Hugging Face-style async API using FastAPI
2
  from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
3
  from fastapi.responses import JSONResponse
4
  from pydantic import BaseModel
@@ -6,24 +5,21 @@ import base64
6
  import tempfile
7
  import uuid
8
  import os
9
- import secrets
10
  import aiohttp
11
- import shutil
12
  import pdfplumber
13
  import pytesseract
14
  from pdf2image import convert_from_bytes
15
- import io
16
  from PIL import Image
17
  import subprocess
18
  from bs4 import BeautifulSoup
19
-
20
-
 
21
 
22
  API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
23
-
24
  app = FastAPI()
25
 
26
- # In-memory storage for tokens or you can use Redis/DB
27
  api_tokens = {"client-1": API_KEY}
28
 
29
  class PDFRequest(BaseModel):
@@ -34,11 +30,9 @@ class PDFRequest(BaseModel):
34
 
35
  @app.post("/convert-pdf")
36
  async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
37
- # Auth check
38
  if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
39
  raise HTTPException(status_code=401, detail="Invalid API credentials")
40
 
41
- # Save base64 PDF to a temp file
42
  try:
43
  pdf_bytes = base64.b64decode(payload.file_b64)
44
  tmp_pdf_path = tempfile.mktemp(suffix=".pdf")
@@ -48,20 +42,19 @@ async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: Background
48
  raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}")
49
 
50
  task_id = str(uuid.uuid4())
51
-
52
- # Run conversion in background
53
  background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id)
54
 
55
  return JSONResponse({"status": "processing", "task_id": task_id})
56
 
 
57
  async def handle_pdf_processing(pdf_path, callback_url, task_id):
58
  try:
59
- html, _ = extract_pdf_to_html(open(pdf_path, "rb"))
 
60
  finally:
61
  if os.path.exists(pdf_path):
62
  os.remove(pdf_path)
63
 
64
- # Post result to callback URL
65
  try:
66
  async with aiohttp.ClientSession() as session:
67
  await session.post(callback_url, json={
@@ -71,20 +64,27 @@ async def handle_pdf_processing(pdf_path, callback_url, task_id):
71
  except Exception as e:
72
  print(f"Callback failed: {e}")
73
 
 
74
def extract_text_from_image(image: Image.Image) -> str:
    """OCR a cropped (math) image with the LaTeX-OCR CLI and return HTML.

    Returns ``<pre>\\[...\\]</pre>`` wrapping the recognized LaTeX when the
    tool exits 0, an empty string when it exits non-zero, and an ``<i>``
    error note if invoking the tool raises.
    """
    # BUG FIX: the original hard-coded "temp_math.png", which collides when
    # concurrent requests run, and the file was never deleted.  mkstemp()
    # atomically creates a unique, private file; we only need its path.
    fd, temp_img_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        image.save(temp_img_path)
        # Invoke the LaTeX-OCR CLI (args as a list, shell=False by default).
        result = subprocess.run(
            ["latexocr", temp_img_path],
            capture_output=True,
            text=True,
        )
        return f"<pre>\\[{result.stdout.strip()}\\]</pre>" if result.returncode == 0 else ""
    except Exception as e:
        # Best-effort: callers embed this HTML instead of crashing the page loop.
        return f"<i>Error: {str(e)}</i>"
    finally:
        # Always clean up the temp image, even on failure.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)
 
 
 
 
88
 
89
  def extract_pdf_to_html(file):
90
  if file is None:
@@ -94,14 +94,13 @@ def extract_pdf_to_html(file):
94
  docx_output = docx.Document()
95
  toc = []
96
 
97
- with pdfplumber.open(file.name) as pdf:
98
  for page_num, page in enumerate(pdf.pages):
99
  page_title = f"Page {page_num + 1}"
100
  toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
101
  html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
102
  docx_output.add_heading(page_title, level=2)
103
 
104
- # Extract text and add paragraphs
105
  text = page.extract_text()
106
  if text:
107
  for line in text.split("\n"):
@@ -110,45 +109,31 @@ def extract_pdf_to_html(file):
110
  else:
111
  html_output += "<p><i>No text detected on this page.</i></p>"
112
 
113
- # Process images embedded in page (diagrams, formulas, etc)
114
  for img_obj in page.images:
115
- x0, top, x1, bottom = img_obj["x0"], img_obj["top"], img_obj["x1"], img_obj["bottom"]
116
- page_x0, page_top, page_x1, page_bottom = page.bbox
117
-
118
- # Clip bbox to page boundaries
119
- crop_x0 = max(x0, page_x0)
120
- crop_top = max(top, page_top)
121
- crop_x1 = min(x1, page_x1)
122
- crop_bottom = min(bottom, page_bottom)
123
-
124
  try:
125
- cropped = page.crop((crop_x0, crop_top, crop_x1, crop_bottom)).to_image(resolution=300).original
126
- except Exception as e:
127
- # Skip if cropping fails
128
  continue
129
 
130
- # Run LaTeX OCR on image for math formulas
131
  math_html = extract_text_from_image(cropped)
132
  if math_html.strip():
133
  html_output += f"<div>{math_html}</div>\n"
134
- # Also add LaTeX text to docx as paragraph
135
  docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
136
 
137
- # Convert cropped image to base64 and embed in HTML
138
  buffer = io.BytesIO()
139
  cropped.save(buffer, format="PNG")
140
  buffer.seek(0)
141
  b64_img = base64.b64encode(buffer.read()).decode("utf-8")
142
  html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
143
 
144
- # Add image to docx
145
  buffer.seek(0)
146
  docx_output.add_picture(buffer, width=Inches(5))
147
 
148
  full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
149
- return full_html
 
150
 
151
@app.get("/health")
def health():
    """Liveness probe used by deployment health checks."""
    return dict(status="ok")
 
 
1
  from fastapi import FastAPI, Request, HTTPException, BackgroundTasks
2
  from fastapi.responses import JSONResponse
3
  from pydantic import BaseModel
 
5
  import tempfile
6
  import uuid
7
  import os
 
8
  import aiohttp
 
9
  import pdfplumber
10
  import pytesseract
11
  from pdf2image import convert_from_bytes
 
12
  from PIL import Image
13
  import subprocess
14
  from bs4 import BeautifulSoup
15
+ import io
16
+ import docx
17
+ from docx.shared import Inches
18
 
19
  API_KEY = os.getenv("PDF_API_KEY", "secret-api-key")
 
20
  app = FastAPI()
21
 
22
+ # In-memory token check (replace with KV or DB in production)
23
  api_tokens = {"client-1": API_KEY}
24
 
25
  class PDFRequest(BaseModel):
 
30
 
31
  @app.post("/convert-pdf")
32
  async def convert_pdf_endpoint(payload: PDFRequest, background_tasks: BackgroundTasks):
 
33
  if payload.client_id not in api_tokens or api_tokens[payload.client_id] != payload.token:
34
  raise HTTPException(status_code=401, detail="Invalid API credentials")
35
 
 
36
  try:
37
  pdf_bytes = base64.b64decode(payload.file_b64)
38
  tmp_pdf_path = tempfile.mktemp(suffix=".pdf")
 
42
  raise HTTPException(status_code=400, detail=f"Invalid PDF: {str(e)}")
43
 
44
  task_id = str(uuid.uuid4())
 
 
45
  background_tasks.add_task(handle_pdf_processing, tmp_pdf_path, payload.callback_url, task_id)
46
 
47
  return JSONResponse({"status": "processing", "task_id": task_id})
48
 
49
+
50
  async def handle_pdf_processing(pdf_path, callback_url, task_id):
51
  try:
52
+ with open(pdf_path, "rb") as f:
53
+ html, _ = extract_pdf_to_html(f)
54
  finally:
55
  if os.path.exists(pdf_path):
56
  os.remove(pdf_path)
57
 
 
58
  try:
59
  async with aiohttp.ClientSession() as session:
60
  await session.post(callback_url, json={
 
64
  except Exception as e:
65
  print(f"Callback failed: {e}")
66
 
67
+
68
def extract_text_from_image(image: Image.Image) -> str:
    """Run LaTeX-OCR on *image* and return the result as an HTML fragment.

    Returns a ``<pre>`` block containing the recognized LaTeX on success,
    an empty string when the OCR tool reports failure (non-zero exit),
    and an ``<i>`` error note when invoking the tool raises.
    """
    # SECURITY FIX: tempfile.mktemp() is deprecated and race-prone (another
    # process can create/symlink the path first).  mkstemp() creates the
    # file atomically with a private mode; we close the fd and use the path.
    fd, temp_img_path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    try:
        # save() inside try so a write failure is reported (not raised to
        # the caller) and the temp file is still removed by finally.
        image.save(temp_img_path)
        # Shell out to the LaTeX-OCR CLI (list args, no shell).
        result = subprocess.run(
            ["latexocr", temp_img_path],
            capture_output=True,
            text=True,
        )
        if result.returncode == 0:
            return f"<pre>\\[{result.stdout.strip()}\\]</pre>"
        return ""
    except Exception as e:
        return f"<i>LaTeX-OCR error: {str(e)}</i>"
    finally:
        # Guaranteed cleanup of the temp image on every path.
        if os.path.exists(temp_img_path):
            os.remove(temp_img_path)
87
+
88
 
89
  def extract_pdf_to_html(file):
90
  if file is None:
 
94
  docx_output = docx.Document()
95
  toc = []
96
 
97
+ with pdfplumber.open(file) as pdf:
98
  for page_num, page in enumerate(pdf.pages):
99
  page_title = f"Page {page_num + 1}"
100
  toc.append(f"<li><a href='#page{page_num+1}'>{page_title}</a></li>")
101
  html_output += f"<h2 id='page{page_num+1}'>{page_title}</h2>\n"
102
  docx_output.add_heading(page_title, level=2)
103
 
 
104
  text = page.extract_text()
105
  if text:
106
  for line in text.split("\n"):
 
109
  else:
110
  html_output += "<p><i>No text detected on this page.</i></p>"
111
 
 
112
  for img_obj in page.images:
 
 
 
 
 
 
 
 
 
113
  try:
114
+ x0, top, x1, bottom = img_obj["x0"], img_obj["top"], img_obj["x1"], img_obj["bottom"]
115
+ cropped = page.crop((x0, top, x1, bottom)).to_image(resolution=300).original
116
+ except Exception:
117
  continue
118
 
 
119
  math_html = extract_text_from_image(cropped)
120
  if math_html.strip():
121
  html_output += f"<div>{math_html}</div>\n"
 
122
  docx_output.add_paragraph(BeautifulSoup(math_html, "html.parser").text)
123
 
 
124
  buffer = io.BytesIO()
125
  cropped.save(buffer, format="PNG")
126
  buffer.seek(0)
127
  b64_img = base64.b64encode(buffer.read()).decode("utf-8")
128
  html_output += f'<img src="data:image/png;base64,{b64_img}" style="width:100%; margin: 1rem 0;" />\n'
129
 
 
130
  buffer.seek(0)
131
  docx_output.add_picture(buffer, width=Inches(5))
132
 
133
  full_html = f"<ul>{''.join(toc)}</ul>\n" + html_output
134
+ return full_html, docx_output
135
+
136
 
 
137
@app.get("/health")
def health():
    """Health-check endpoint: always reports the service as up."""
    status = "ok"
    return {"status": status}
apt.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ poppler-utils
2
+ tesseract-ocr
requirements.txt CHANGED
@@ -1,7 +1,11 @@
1
  gradio
 
 
 
2
  pdfplumber
3
  pytesseract
4
  pdf2image
5
  Pillow
6
  beautifulsoup4
7
- python-docx
 
 
1
  gradio
2
+ fastapi
3
+ uvicorn
4
+ aiohttp
5
  pdfplumber
6
  pytesseract
7
  pdf2image
8
  Pillow
9
  beautifulsoup4
10
+ python-docx
11
+ git+https://github.com/lukas-blecher/LaTeX-OCR.git