triflix committed on
Commit
76565ed
·
verified ·
1 Parent(s): 283bdfb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +297 -55
app.py CHANGED
@@ -1,14 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
- import uuid
3
  import shutil
4
  import tempfile
5
- from fastapi import FastAPI, UploadFile, File
 
 
 
 
 
6
  from fastapi.responses import JSONResponse
 
 
 
 
 
7
  from paddleocr import PaddleOCR
8
- from pdf2image import convert_from_bytes
 
9
 
10
- # OCR instance
11
- ocr_engine = PaddleOCR(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  lang="mr",
13
  text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
14
  use_doc_orientation_classify=False,
@@ -16,64 +77,245 @@ ocr_engine = PaddleOCR(
16
  use_textline_orientation=False
17
  )
18
 
19
- app = FastAPI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
def run_ocr_on_image(image_path: str):
    """Run the shared OCR engine on one image file.

    Returns a list with one entry per result block yielded by
    ``ocr_engine.predict``; each entry is a list of
    ``{"text": ..., "score": ...}`` dicts built by pairing the block's
    ``rec_texts`` with its ``rec_scores`` (missing keys count as empty).
    """
    return [
        [
            {"text": text, "score": float(score)}
            for text, score in zip(
                block.get("rec_texts", []),
                block.get("rec_scores", []),
            )
        ]
        for block in ocr_engine.predict(input=image_path)
    ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
@app.post("/ocr")
async def ocr_endpoint(files: list[UploadFile] = File(...)):
    """OCR every uploaded file and return the results keyed by filename.

    PDFs are rasterised page by page with ``convert_from_bytes`` and each
    page is OCR'd; any other upload is treated as a single image.

    Response shape:
        {"<name>": {"type": "pdf", "pages": [{"page": i, "ocr": ...}, ...]}}
        {"<name>": {"type": "image", "ocr": ...}}

    All intermediate files live in a per-request temp directory that is
    removed even when processing fails.
    """
    session_dir = tempfile.mkdtemp(prefix="ocr_")
    response_data = {}

    try:
        for file in files:
            # The filename is client-controlled and may be missing.
            original_name = file.filename or ""
            payload = await file.read()

            # ------ PDF ------
            if original_name.lower().endswith(".pdf"):
                # Convert straight from the uploaded bytes; there is no need
                # to write the PDF to disk and read it back.
                pages = convert_from_bytes(payload)
                page_results = []

                for idx, page in enumerate(pages):
                    img_path = os.path.join(session_dir, f"{uuid.uuid4()}_page{idx}.jpg")
                    page.save(img_path, "JPEG")
                    page_results.append({
                        "page": idx,
                        "ocr": run_ocr_on_image(img_path),
                    })

                response_data[original_name] = {
                    "type": "pdf",
                    "pages": page_results,
                }

            # ------ Images ------
            else:
                # Keep only the final path component so a name containing
                # directory separators cannot point outside session_dir.
                safe_name = os.path.basename(original_name.replace("\\", "/"))
                saved_path = os.path.join(session_dir, f"{uuid.uuid4()}_{safe_name}")
                with open(saved_path, "wb") as tmp:
                    tmp.write(payload)

                response_data[original_name] = {
                    "type": "image",
                    "ocr": run_ocr_on_image(saved_path),
                }

    finally:
        shutil.rmtree(session_dir, ignore_errors=True)

    return JSONResponse(response_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ """
3
+ Single-file FastAPI app for HuggingFace Space (CPU) supporting:
4
+ - Batch upload of images and PDFs (in any combination), up to `total_limit` processed pages/images per request.
5
+ - PDF -> images conversion (PyMuPDF) with per-pdf page limit.
6
+ - Parallel image OCR (ThreadPoolExecutor) with safe concurrency defaults.
7
+ - Detailed per-file results, per-page breakdown, and per-item error reporting.
8
+ - Secure defaults: file type & size validation, temp-directory isolation, cleanup, non-root user compatibility.
9
+
10
+ Usage (example):
11
+ POST /ocr?per_pdf_pages=3&total_limit=15
12
+ multipart/form-data files: file field can be repeated
13
+
14
+ Produces JSON:
15
+ {
16
+ "summary": { "processed_files": 3, "total_pages_images": 6 },
17
+ "files": [
18
+ {
19
+ "filename": "CVC.jpg",
20
+ "type": "image",
21
+ "page": null,
22
+ "results": [{"text":"...","confidence":0.99}, ...],
23
+ "error": null
24
+ },
25
+ {
26
+ "filename": "doc.pdf",
27
+ "type": "pdf",
28
+ "page": 1,
29
+ "results": [...],
30
+ "error": null
31
+ }
32
+ ]
33
+ }
34
+ """
35
+ from __future__ import annotations
36
  import os
 
37
  import shutil
38
  import tempfile
39
+ import uuid
40
+ import math
41
+ import logging
42
+ from concurrent.futures import ThreadPoolExecutor, as_completed
43
+ from typing import List, Optional, Dict, Any, Tuple
44
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Query
45
  from fastapi.responses import JSONResponse
46
+ from pydantic import BaseModel, Field
47
+ from pathlib import Path
48
+
49
+ # OCR backend imports (local)
50
+ # PaddleOCR heavy initialization occurs once at startup
51
  from paddleocr import PaddleOCR
52
+ import fitz # PyMuPDF
53
+ from PIL import Image
54
 
55
+ # --- Configuration and secure defaults ---
56
# Upload types accepted by the service, keyed by lower-cased file suffix.
ALLOWED_DOC_EXT = {".pdf"}
ALLOWED_IMAGE_EXT = {".jpg", ".jpeg", ".png", ".tiff", ".bmp", ".webp"}
ALLOWED_EXTENSIONS = ALLOWED_IMAGE_EXT | ALLOWED_DOC_EXT

# Per-request work limits.
DEFAULT_PER_PDF_PAGES = 3
MAX_PER_PDF_PAGES = 10
DEFAULT_TOTAL_LIMIT = 15  # max total pages/images processed per request

# Per-upload size cap: 25 MB.
MAX_FILE_SIZE_BYTES = 25 * 1024 ** 2

# Resolution (dots per inch) used when rendering PDF pages to images.
OCR_DPI = 220

# Conservative OCR concurrency: never more than 4 workers; falls back to 2
# when the CPU count cannot be determined.
MAX_WORKERS = min(4, os.cpu_count() or 2)

# Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("ocr_service")
69
+
70
+ # --- Initialize PaddleOCR once (reuse across requests) ---
71
+ # Language and model consistent with user's request (Marathi / Devanagari mobile recognizer).
72
+ OCR_ENGINE = PaddleOCR(
73
  lang="mr",
74
  text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
75
  use_doc_orientation_classify=False,
 
77
  use_textline_orientation=False
78
  )
79
 
80
+
81
+ # --- Response Schemas ---
82
class OCRText(BaseModel):
    """One recognized text line together with its recognition confidence."""

    # The recognized text of the line.
    text: str = Field(..., description="Recognized text line")
    # Recognition score; validation restricts it to the closed range [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
85
+
86
+
87
class FileResult(BaseModel):
    """OCR outcome for a single image or a single PDF page."""

    # Name of the uploaded file this entry belongs to.
    filename: str
    # Either "image" or "pdf".
    type: str
    # Page number for PDF pages; null for images.
    page: Optional[int] = None
    # Recognized lines; empty by default.
    results: List[OCRText] = Field(default_factory=list)
    # Error message for this entry, or null when none was recorded.
    error: Optional[str] = None
93
+
94
+
95
class OCROutput(BaseModel):
    """Top-level response body of the /ocr endpoint."""

    # Request-level counters and the limits that were applied.
    summary: Dict[str, Any]
    # One entry per processed image or PDF page.
    files: List[FileResult]
98
+
99
+
100
+ # --- Utility functions ---
101
def safe_extension(filename: Optional[str]) -> str:
    """Return the lower-cased final suffix of *filename* (e.g. ``".pdf"``).

    Returns ``""`` when the name has no suffix, is empty, or is ``None``.
    An upload's filename is optional, so a missing name must not raise
    here; callers then reject it as an unsupported extension.
    """
    if not filename:
        return ""
    return Path(filename).suffix.lower()
103
+
104
+
105
def validate_extension(filename: str) -> None:
    """Reject uploads whose suffix is not in ALLOWED_EXTENSIONS.

    Raises:
        HTTPException: status 400 when the suffix is not allowed.
    """
    suffix = safe_extension(filename)
    if suffix in ALLOWED_EXTENSIONS:
        return
    raise HTTPException(status_code=400, detail=f"Unsupported file extension: {suffix}")
109
+
110
+
111
def save_upload_to_temp(upload: UploadFile, dest_dir: str) -> str:
    """
    Stream an upload into a uniquely named file inside dest_dir.

    The data is copied in 64 KiB chunks so the whole upload is never held in
    memory. The file name is a fresh UUID plus the upload's own suffix.

    Returns:
        Full path of the saved file.

    Raises:
        HTTPException: status 413 when the upload exceeds MAX_FILE_SIZE_BYTES;
            the partially written file is removed first.
    """
    chunk_size = 1024 * 64
    suffix = safe_extension(upload.filename)
    target = os.path.join(dest_dir, f"{uuid.uuid4()}{suffix}")

    written = 0
    too_large = False
    with open(target, "wb") as sink:
        while chunk := upload.file.read(chunk_size):
            written += len(chunk)
            if written > MAX_FILE_SIZE_BYTES:
                too_large = True
                break
            sink.write(chunk)

    # The file is closed at this point, so it is safe to delete.
    if too_large:
        os.remove(target)
        raise HTTPException(status_code=413, detail=f"File too large: {upload.filename}")
    return target
133
 
134
 
135
def estimate_pdf_pages(pdf_path: str) -> int:
    """Return the number of pages in the PDF without rendering any of them.

    The document handle is closed even if counting fails. Any exception
    raised while opening or reading the file propagates to the caller.
    """
    doc = fitz.open(pdf_path)
    try:
        return len(doc)
    finally:
        doc.close()
141
 
 
 
 
 
 
142
 
143
def convert_pdf_to_images(pdf_path: str, dest_dir: str, pages_to_convert: int) -> List[Tuple[str, int]]:
    """
    Render the first N pages of a PDF to image files in dest_dir.

    Args:
        pdf_path: Path of the PDF to render.
        dest_dir: Directory that receives the rendered images.
        pages_to_convert: Upper bound on pages rendered; capped at the
            document's page count.

    Returns:
        List of (image_path, page_number) tuples, page numbers 1-based.

    Any exception from opening, rendering, or saving propagates; the
    document handle is closed either way.
    """
    doc = fitz.open(pdf_path)
    try:
        limit = min(len(doc), pages_to_convert)
        images: List[Tuple[str, int]] = []
        for index in range(limit):
            pix = doc.load_page(index).get_pixmap(dpi=OCR_DPI)
            img_path = os.path.join(dest_dir, f"{uuid.uuid4()}.jpg")
            pix.save(img_path)
            images.append((img_path, index + 1))  # page numbers are 1-based
        return images
    finally:
        # Release the document even when a page fails part-way through.
        doc.close()
161
 
162
 
163
def ocr_image_path(image_path: str) -> List[OCRText]:
    """
    Run the shared PaddleOCR engine on one image and normalize its output.

    Each result block is expected to expose 'rec_texts' and 'rec_scores';
    the two are paired positionally and a missing or empty key yields no
    lines for that block.

    Returns:
        One OCRText per recognized line, with confidence clipped to [0, 1].

    Raises:
        RuntimeError: when the engine call fails; the original exception is
            attached as the cause.
    """
    try:
        res = OCR_ENGINE.predict(input=image_path)
    except Exception as e:
        logger.exception("PaddleOCR failed on %s", image_path)
        raise RuntimeError(f"OCR engine failure: {str(e)}") from e

    aggregated: List[OCRText] = []
    for block in res:
        rec_texts = block.get("rec_texts") or []
        rec_scores = block.get("rec_scores") or []
        for t, s in zip(rec_texts, rec_scores):
            try:
                conf = float(s)
            except (TypeError, ValueError):
                conf = 0.0
            # NaN compares false against everything, so the clip below would
            # turn it into 1.0 (full confidence); treat it as no confidence.
            if math.isnan(conf):
                conf = 0.0
            conf = max(0.0, min(1.0, conf))
            aggregated.append(OCRText(text=str(t), confidence=conf))
    return aggregated
190
+
191
+
192
+ # --- FastAPI app and endpoint ---
193
app = FastAPI(title="Batch PaddleOCR API (PDF+Image)", version="1.0")


@app.post("/ocr", response_model=OCROutput)
async def ocr_batch_endpoint(
    files: List[UploadFile] = File(..., description="Upload up to 'total_limit' images/pages across files."),
    per_pdf_pages: int = Query(DEFAULT_PER_PDF_PAGES, ge=1, le=MAX_PER_PDF_PAGES, description="Max pages to convert per PDF"),
    total_limit: int = Query(DEFAULT_TOTAL_LIMIT, ge=1, le=50, description="Maximum total pages/images processed in request"),
):
    """
    OCR a batch of uploaded images and PDFs.

    PDFs are rendered to images (first per_pdf_pages pages each) and every
    image is OCR'd on a thread pool. The request is rejected up front when
    the planned number of pages/images exceeds total_limit.

    Returns one FileResult per image or PDF page, in upload order with PDF
    pages ascending, plus a summary. "processed_files" counts distinct
    uploaded files that produced at least one error-free entry.

    Raises:
        HTTPException: 400 for no files, an unsupported extension, an
            unreadable PDF, or nothing to process; 413 when a file is too
            large or total_limit would be exceeded.
    """
    # NOTE(review): the upload reads and the thread-pool wait below are
    # blocking calls inside an async handler, so the event loop is held for
    # the duration of the request.
    if not files:
        raise HTTPException(status_code=400, detail="No files uploaded")

    # Request-scoped temporary directory; removed in the finally clause.
    request_tmpdir = tempfile.mkdtemp(prefix="ocrreq_")

    try:
        # 1) Validate and save uploads
        saved_files: List[Tuple[str, str]] = []  # (original_filename, saved_path)
        for up in files:
            validate_extension(up.filename)
            saved_path = save_upload_to_temp(up, request_tmpdir)
            saved_files.append((up.filename, saved_path))

        # 2) Plan the work and enforce total_limit before any conversion.
        #    Plan entries: (orig_name, saved_path, pages_to_convert),
        #    with pages_to_convert None for plain images.
        total_pages_images = 0
        plan: List[Tuple[str, str, Optional[int]]] = []
        for orig_name, path in saved_files:
            ext = safe_extension(orig_name)
            if ext in ALLOWED_IMAGE_EXT:
                plan.append((orig_name, path, None))
                total_pages_images += 1
            elif ext == ".pdf":
                try:
                    pages = estimate_pdf_pages(path)
                except Exception as e:
                    raise HTTPException(
                        status_code=400,
                        detail=f"Unable to read PDF {orig_name}: {str(e)}",
                    ) from e
                pages_to_convert = min(pages, per_pdf_pages)
                plan.append((orig_name, path, pages_to_convert))
                total_pages_images += pages_to_convert
            else:
                # Shouldn't reach due to earlier validation
                raise HTTPException(status_code=400, detail=f"Unsupported extension for {orig_name}")

        if total_pages_images == 0:
            raise HTTPException(status_code=400, detail="No valid images/pages to process")

        if total_pages_images > total_limit:
            raise HTTPException(
                status_code=413,
                detail=f"Request would process {total_pages_images} pages/images which exceeds total_limit {total_limit}"
            )

        # 3) Expand the plan into work items, converting PDFs to images.
        #    Items: (file_index, orig_name, kind, page_num, image_path, conversion_error)
        work = []
        for file_index, (orig_name, path, pages_to_convert) in enumerate(plan):
            if pages_to_convert is None:
                work.append((file_index, orig_name, "image", None, path, None))
                continue
            try:
                imgs = convert_pdf_to_images(path, request_tmpdir, pages_to_convert)
            except Exception as e:
                # A failed conversion is reported for that file only; the
                # rest of the batch is still processed.
                logger.exception("PDF conversion failed for %s", orig_name)
                work.append((file_index, orig_name, "pdf", None, None, str(e)))
                continue
            for img_path, page_num in imgs:
                work.append((file_index, orig_name, "pdf", page_num, img_path, None))

        # 4) OCR all images in parallel, then collect in submission order so
        #    the response order is deterministic.
        results_per_file: List[FileResult] = []
        succeeded_files = set()
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            submitted = [
                (item, ex.submit(ocr_image_path, item[4]) if item[5] is None else None)
                for item in work
            ]

            for (file_index, orig_name, kind, page_num, _img_path, conv_error), fut in submitted:
                if fut is None:
                    results_per_file.append(
                        FileResult(filename=orig_name, type=kind, page=page_num, results=[], error=conv_error)
                    )
                    continue
                try:
                    ocr_texts = fut.result()
                except Exception as e:
                    logger.exception("OCR failed for %s (page=%s): %s", orig_name, page_num, str(e))
                    results_per_file.append(
                        FileResult(filename=orig_name, type=kind, page=page_num, results=[], error=str(e))
                    )
                    continue
                results_per_file.append(
                    FileResult(filename=orig_name, type=kind, page=page_num, results=ocr_texts, error=None)
                )
                succeeded_files.add(file_index)

        # 5) Build summary and return
        summary = {
            "requested_files": len(files),
            "processed_files": len(succeeded_files),
            "total_pages_images": total_pages_images,
            "per_pdf_pages": per_pdf_pages,
            "total_limit": total_limit,
        }
        return JSONResponse(OCROutput(summary=summary, files=results_per_file).model_dump())

    finally:
        # Best-effort cleanup: a failure here must not mask the response or
        # the exception already in flight.
        try:
            shutil.rmtree(request_tmpdir)
        except OSError:
            logger.warning("Failed to cleanup tempdir %s", request_tmpdir)