triflix committed on
Commit
c08cd96
·
verified ·
1 Parent(s): 76565ed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -291
app.py CHANGED
@@ -1,321 +1,107 @@
1
- # app.py
2
- """
3
- Single-file FastAPI app for HuggingFace Space (CPU) supporting:
4
- - Batch upload of images and PDFs (combination) up to TOTAL_FILE_LIMIT processed pages/images.
5
- - PDF -> images conversion (PyMuPDF) with per-pdf page limit.
6
- - Parallel image OCR (ThreadPoolExecutor) with safe concurrency defaults.
7
- - Detailed per-file results, per-page breakdown, and per-item error reporting.
8
- - Secure defaults: file type & size validation, temp-directory isolation, cleanup, non-root user compatibility.
9
-
10
- Usage (example):
11
- POST /ocr?per_pdf_pages=3&total_limit=15
12
- multipart/form-data files: file field can be repeated
13
-
14
- Produces JSON:
15
- {
16
- "summary": { "processed_files": 3, "total_pages_images": 6 },
17
- "files": [
18
- {
19
- "filename": "CVC.jpg",
20
- "type": "image",
21
- "page": null,
22
- "results": [{"text":"...","confidence":0.99}, ...],
23
- "error": null
24
- },
25
- {
26
- "filename": "doc.pdf",
27
- "type": "pdf",
28
- "page": 1,
29
- "results": [...],
30
- "error": null
31
- }
32
- ]
33
- }
34
- """
35
- from __future__ import annotations
36
  import os
37
- import shutil
38
- import tempfile
39
  import uuid
40
- import math
41
- import logging
42
- from concurrent.futures import ThreadPoolExecutor, as_completed
43
- from typing import List, Optional, Dict, Any, Tuple
44
- from fastapi import FastAPI, UploadFile, File, HTTPException, Query
45
  from fastapi.responses import JSONResponse
46
- from pydantic import BaseModel, Field
47
- from pathlib import Path
48
-
49
- # OCR backend imports (local)
50
- # PaddleOCR heavy initialization occurs once at startup
 
 
 
 
 
 
 
51
  from paddleocr import PaddleOCR
52
- import fitz # PyMuPDF
53
- from PIL import Image
54
-
55
- # --- Configuration and secure defaults ---
56
- ALLOWED_IMAGE_EXT = {".jpg", ".jpeg", ".png", ".tiff", ".bmp", ".webp"}
57
- ALLOWED_DOC_EXT = {".pdf"}
58
- ALLOWED_EXTENSIONS = ALLOWED_IMAGE_EXT.union(ALLOWED_DOC_EXT)
59
- DEFAULT_PER_PDF_PAGES = 3
60
- DEFAULT_TOTAL_LIMIT = 15 # max total pages/images processed per request
61
- MAX_PER_PDF_PAGES = 10
62
- MAX_FILE_SIZE_BYTES = 25 * 1024 * 1024 # 25 MB per uploaded file
63
- OCR_DPI = 220 # dpi used when converting PDF pages to images
64
- MAX_WORKERS = min(4, (os.cpu_count() or 2)) # conservative concurrency
65
-
66
- # Logging
67
- logging.basicConfig(level=logging.INFO)
68
- logger = logging.getLogger("ocr_service")
69
-
70
- # --- Initialize PaddleOCR once (reuse across requests) ---
71
- # Language and model consistent with user's request (Marathi / Devanagari mobile recognizer).
72
- OCR_ENGINE = PaddleOCR(
73
- lang="mr",
74
- text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
75
- use_doc_orientation_classify=False,
76
- use_doc_unwarping=False,
77
- use_textline_orientation=False
78
- )
79
-
80
-
81
- # --- Response Schemas ---
82
- class OCRText(BaseModel):
83
- text: str = Field(..., description="Recognized text line")
84
- confidence: float = Field(..., ge=0.0, le=1.0)
85
-
86
-
87
- class FileResult(BaseModel):
88
- filename: str
89
- type: str # "image" or "pdf"
90
- page: Optional[int] = None # for pdf pages; null for images
91
- results: List[OCRText] = Field(default_factory=list)
92
- error: Optional[str] = None
93
-
94
-
95
- class OCROutput(BaseModel):
96
- summary: Dict[str, Any]
97
- files: List[FileResult]
98
-
99
 
100
- # --- Utility functions ---
101
- def safe_extension(filename: str) -> str:
102
- return Path(filename).suffix.lower()
 
 
 
103
 
104
-
105
- def validate_extension(filename: str) -> None:
106
- ext = safe_extension(filename)
107
- if ext not in ALLOWED_EXTENSIONS:
108
- raise HTTPException(status_code=400, detail=f"Unsupported file extension: {ext}")
109
-
110
-
111
- def save_upload_to_temp(upload: UploadFile, dest_dir: str) -> str:
112
- """
113
- Save UploadFile to a uniquely named temp file in dest_dir.
114
- Validates max size and uses streaming write to avoid memory spikes.
115
- Returns full path to saved file.
116
- """
117
- ext = safe_extension(upload.filename)
118
- tmp_name = f"{uuid.uuid4()}{ext}"
119
- tmp_path = os.path.join(dest_dir, tmp_name)
120
- total = 0
121
- with open(tmp_path, "wb") as out_f:
122
- while True:
123
- chunk = upload.file.read(1024 * 64)
124
- if not chunk:
125
- break
126
- total += len(chunk)
127
- if total > MAX_FILE_SIZE_BYTES:
128
- out_f.close()
129
- os.remove(tmp_path)
130
- raise HTTPException(status_code=413, detail=f"File too large: {upload.filename}")
131
- out_f.write(chunk)
132
- return tmp_path
133
-
134
-
135
- def estimate_pdf_pages(pdf_path: str) -> int:
136
- """Return number of pages in PDF without conversion."""
137
  doc = fitz.open(pdf_path)
138
- count = len(doc)
139
- doc.close()
140
- return count
141
 
 
 
 
 
 
142
 
143
- def convert_pdf_to_images(pdf_path: str, dest_dir: str, pages_to_convert: int) -> List[Tuple[str, int]]:
144
- """
145
- Convert first N pages of PDF to images.
146
- Returns list of tuples: (image_path, page_index1based)
147
- """
148
- doc = fitz.open(pdf_path)
149
- page_count = len(doc)
150
- limit = min(page_count, pages_to_convert)
151
- images: List[Tuple[str, int]] = []
152
  for i in range(limit):
153
  page = doc.load_page(i)
154
- pix = page.get_pixmap(dpi=OCR_DPI)
155
  img_name = f"{uuid.uuid4()}.jpg"
156
- img_path = os.path.join(dest_dir, img_name)
157
  pix.save(img_path)
158
- images.append((img_path, i + 1)) # page index 1-based
159
- doc.close()
160
- return images
161
 
 
162
 
163
- def ocr_image_path(image_path: str) -> List[OCRText]:
164
- """
165
- Run PaddleOCR on a single image path and return list of OCRText.
166
- This function isolates the OCR call and normalizes the output.
167
- """
168
- # PaddleOCR's predict/ocr returns a nested result structure.
169
- # Use .predict(input=...) as in the user's examples.
170
- try:
171
- res = OCR_ENGINE.predict(input=image_path)
172
- except Exception as e:
173
- logger.exception("PaddleOCR failed on %s", image_path)
174
- raise RuntimeError(f"OCR engine failure: {str(e)}")
175
-
176
- aggregated: List[OCRText] = []
177
- # res expected to be a list of blocks/dicts with keys 'rec_texts' and 'rec_scores'
178
- for block in res:
179
- rec_texts = block.get("rec_texts") or []
180
- rec_scores = block.get("rec_scores") or []
181
- for t, s in zip(rec_texts, rec_scores):
182
- # enforce numeric confidence and clip to [0,1]
183
- try:
184
- conf = float(s)
185
- except Exception:
186
- conf = 0.0
187
- conf = max(0.0, min(1.0, conf))
188
- aggregated.append(OCRText(text=str(t), confidence=conf))
189
- return aggregated
190
-
191
-
192
- # --- FastAPI app and endpoint ---
193
- app = FastAPI(title="Batch PaddleOCR API (PDF+Image)", version="1.0")
194
 
 
 
 
 
 
 
 
 
 
 
195
 
196
- @app.post("/ocr", response_model=OCROutput)
197
- async def ocr_batch_endpoint(
198
- files: List[UploadFile] = File(..., description="Upload up to 'total_limit' images/pages across files."),
199
- per_pdf_pages: int = Query(DEFAULT_PER_PDF_PAGES, ge=1, le=MAX_PER_PDF_PAGES, description="Max pages to convert per PDF"),
200
- total_limit: int = Query(DEFAULT_TOTAL_LIMIT, ge=1, le=50, description="Maximum total pages/images processed in request"),
201
- ):
202
- """
203
- Accepts multiple files (images and PDFs). Converts PDFs -> images (first per_pdf_pages pages)
204
- and runs OCR on each image. Ensures total converted pages/images <= total_limit.
205
- Returns per-file per-page OCR results and summary.
206
- """
207
 
208
- if len(files) == 0:
209
- raise HTTPException(status_code=400, detail="No files uploaded")
 
 
 
 
 
 
 
210
 
211
- # Save uploaded files to request-scoped temporary directory; ensures cleanup
212
- request_tmpdir = tempfile.mkdtemp(prefix="ocrreq_")
213
- saved_files: List[Tuple[str, str]] = [] # (original_filename, saved_path)
214
 
215
- try:
216
- # 1) Validate and save uploads
217
- for up in files:
218
- validate_extension(up.filename)
219
- saved_path = save_upload_to_temp(up, request_tmpdir)
220
- saved_files.append((up.filename, saved_path))
221
 
222
- # 2) Pre-scan PDFs to count required pages and enforce total_limit
223
- total_pages_images = 0
224
- pdfs_to_convert: List[Tuple[str, str, int]] = [] # (orig_name, saved_path, pages_to_convert)
225
- image_files: List[Tuple[str, str]] = [] # (orig_name, saved_path)
226
 
227
- for orig_name, path in saved_files:
228
- ext = safe_extension(orig_name)
229
- if ext in ALLOWED_IMAGE_EXT:
230
- total_pages_images += 1
231
- image_files.append((orig_name, path))
232
- elif ext == ".pdf":
233
- try:
234
- pages = estimate_pdf_pages(path)
235
- except Exception as e:
236
- raise HTTPException(status_code=400, detail=f"Unable to read PDF {orig_name}: {str(e)}")
237
- pages_to_convert = min(pages, per_pdf_pages)
238
- pdfs_to_convert.append((orig_name, path, pages_to_convert))
239
- total_pages_images += pages_to_convert
240
- else:
241
- # Shouldn't reach due to earlier validation
242
- raise HTTPException(status_code=400, detail=f"Unsupported extension for {orig_name}")
243
 
244
- if total_pages_images == 0:
245
- raise HTTPException(status_code=400, detail="No valid images/pages to process")
246
 
247
- if total_pages_images > total_limit:
248
- raise HTTPException(
249
- status_code=413,
250
- detail=f"Request would process {total_pages_images} pages/images which exceeds total_limit {total_limit}"
251
- )
252
 
253
- # 3) Convert PDFs to images (store list of (filename,page,image_path))
254
- converted_images: List[Tuple[str, Optional[int], str]] = [] # (orig_filename, page_or_None, image_path)
255
- for orig_name, pdf_path, pages_to_convert in pdfs_to_convert:
256
- try:
257
- imgs = convert_pdf_to_images(pdf_path, request_tmpdir, pages_to_convert)
258
- except Exception as e:
259
- # if conversion fails for a file, record as zero and continue
260
- logger.exception("PDF conversion failed for %s", orig_name)
261
- converted_images.append((orig_name, None, f"__error__conversion__:{str(e)}"))
262
- continue
263
- for img_path, page_num in imgs:
264
- converted_images.append((orig_name, page_num, img_path))
265
 
266
- # include standalone image files
267
- for orig_name, img_path in image_files:
268
- converted_images.append((orig_name, None, img_path))
269
 
270
- # 4) OCR all images - use ThreadPoolExecutor for parallelism within safe workers
271
- results_per_file: List[FileResult] = []
272
- futures = {}
273
- with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
274
- for orig_name, page_num, img_path in converted_images:
275
- if isinstance(img_path, str) and img_path.startswith("__error__conversion__"):
276
- # embed conversion error immediately
277
- err_msg = img_path.split(":", 1)[1] if ":" in img_path else "Conversion error"
278
- fr = FileResult(filename=orig_name, type="pdf", page=page_num, results=[], error=err_msg)
279
- results_per_file.append(fr)
280
- continue
281
- futures[ex.submit(ocr_image_path, img_path)] = (orig_name, page_num, img_path)
282
 
283
- for fut in as_completed(list(futures.keys())):
284
- orig_name, page_num, img_path = futures[fut]
285
- try:
286
- ocr_texts = fut.result()
287
- fr = FileResult(
288
- filename=orig_name,
289
- type=("pdf" if page_num is not None else "image"),
290
- page=page_num,
291
- results=ocr_texts,
292
- error=None,
293
- )
294
- except Exception as e:
295
- logger.exception("OCR failed for %s (page=%s): %s", orig_name, page_num, str(e))
296
- fr = FileResult(
297
- filename=orig_name,
298
- type=("pdf" if page_num is not None else "image"),
299
- page=page_num,
300
- results=[],
301
- error=str(e),
302
- )
303
- results_per_file.append(fr)
304
 
305
- # 5) Build summary and return
306
- processed_files_count = len([r for r in results_per_file if r.error is None or r.results])
307
- summary = {
308
- "requested_files": len(files),
309
- "processed_files": processed_files_count,
310
- "total_pages_images": total_pages_images,
311
- "per_pdf_pages": per_pdf_pages,
312
- "total_limit": total_limit,
313
- }
314
- return JSONResponse(OCROutput(summary=summary, files=results_per_file).model_dump())
315
 
316
- finally:
317
- # Cleanup temp files and directory
318
- try:
319
- shutil.rmtree(request_tmpdir)
320
- except Exception:
321
- logger.warning("Failed to cleanup tempdir %s", request_tmpdir)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
2
  import uuid
3
+ from fastapi import FastAPI, UploadFile, File, HTTPException
 
 
 
 
4
  from fastapi.responses import JSONResponse
5
+ from typing import List
6
+ import fitz
7
+
8
+ # -------------------------------------------------------------------
9
+ # FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
10
+ # -------------------------------------------------------------------
11
# Redirect the Paddle home and the XDG cache to directories under /app and
# make sure both exist. This must happen before paddleocr is imported.
# NOTE(review): confirm these are the variables the installed PaddleX /
# PaddleOCR release actually reads for its cache location.
_CACHE_LOCATIONS = {
    "PADDLE_HOME": "/app/paddle_home",
    "XDG_CACHE_HOME": "/app/xdg_cache",
}
os.environ.update(_CACHE_LOCATIONS)
for _cache_dir in _CACHE_LOCATIONS.values():
    os.makedirs(_cache_dir, exist_ok=True)
15
+
16
+ # now safe to import paddlex/paddleocr
17
  from paddleocr import PaddleOCR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # -------------------------------------------------------------------
20
+ # PDF IMAGE
21
+ # -------------------------------------------------------------------
22
+ def pdf_to_images(pdf_path: str, max_pages: int | None = 3) -> List[str]:
23
+ if not os.path.exists(pdf_path):
24
+ raise FileNotFoundError(pdf_path)
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  doc = fitz.open(pdf_path)
27
+ page_count = len(doc)
 
 
28
 
29
+ limit = page_count if max_pages is None else min(max_pages, page_count)
30
+ output_paths: List[str] = []
31
+
32
+ out_dir = "/app/pdf_images"
33
+ os.makedirs(out_dir, exist_ok=True)
34
 
 
 
 
 
 
 
 
 
 
35
  for i in range(limit):
36
  page = doc.load_page(i)
37
+ pix = page.get_pixmap(dpi=220)
38
  img_name = f"{uuid.uuid4()}.jpg"
39
+ img_path = os.path.join(out_dir, img_name)
40
  pix.save(img_path)
41
+ output_paths.append(img_path)
 
 
42
 
43
+ return output_paths
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
+ # -------------------------------------------------------------------
47
+ # OCR ENGINE
48
+ # -------------------------------------------------------------------
49
# Single OCR engine shared by every request, built once at import time.
# Options: Marathi language, the Devanagari PP-OCRv5 mobile recognition
# model, and document-orientation classification, document unwarping and
# text-line orientation all switched off.
_OCR_ENGINE_OPTIONS = {
    "lang": "mr",
    "text_recognition_model_name": "devanagari_PP-OCRv5_mobile_rec",
    "use_doc_orientation_classify": False,
    "use_doc_unwarping": False,
    "use_textline_orientation": False,
}
ocr_engine = PaddleOCR(**_OCR_ENGINE_OPTIONS)
56
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
def extract_text(image_path: str):
    """Run OCR on one image and return its recognised lines.

    Args:
        image_path: Path of the image handed to ``ocr_engine.predict``.

    Returns:
        A list of ``{"text": ..., "confidence": float}`` dicts, one per
        recognised line, in the order the engine reports them. The list is
        empty when nothing was recognised.
    """
    result = ocr_engine.predict(input=image_path)
    output = []
    for block in result or []:
        texts = block.get("rec_texts")
        scores = block.get("rec_scores")
        # A block without recognition output contributes no lines instead of
        # failing the whole request. Explicit None checks (rather than
        # truthiness) keep this safe if the engine hands back arrays.
        if texts is None or scores is None:
            continue
        output.extend(
            {"text": t, "confidence": float(s)} for t, s in zip(texts, scores)
        )
    return output
67
 
 
 
 
68
 
69
+ # -------------------------------------------------------------------
70
+ # FASTAPI
71
+ # -------------------------------------------------------------------
72
# Directory where uploads are staged; created at import time so request
# handlers can write into it straight away.
UPLOAD_DIR = "/app/uploads"
os.makedirs(UPLOAD_DIR, exist_ok=True)

# Application object the route decorators attach to.
app = FastAPI()
75
 
 
 
 
 
76
 
77
_MAX_FILES = 15
_PDF_SUFFIX = ".pdf"
_IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png")


def _remove_quietly(path: str) -> None:
    """Delete ``path``, ignoring a file that is already gone or not removable."""
    try:
        os.remove(path)
    except OSError:
        # Best-effort cleanup: it must never mask the real response or error.
        pass


@app.post("/ocr")
async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
    """OCR a batch of uploaded images and PDFs.

    PDFs are rasterised with ``pdf_to_images`` (at most ``max_pages`` pages
    each) and every page image is passed to ``extract_text``; ``.jpg``,
    ``.jpeg`` and ``.png`` uploads are passed to ``extract_text`` directly.

    Args:
        files: The uploaded files; at most 15 per request.
        max_pages: Page limit forwarded to ``pdf_to_images`` for each PDF.

    Returns:
        ``JSONResponse`` with ``{"results": [...]}``: one flat list of
        recognised lines across all files, in upload order.

    Raises:
        HTTPException: 400 when more than 15 files are sent or when any file
            name does not end in a supported extension.
    """
    if len(files) > _MAX_FILES:
        raise HTTPException(status_code=400, detail=f"Maximum {_MAX_FILES} files allowed.")

    # Validate the whole batch before writing anything, so an unsupported
    # file is rejected without staging it (or its predecessors) on disk.
    # A missing file name is treated as unsupported instead of crashing.
    filenames = []
    for upload in files:
        filename = (upload.filename or "").lower()
        if not filename.endswith((_PDF_SUFFIX,) + _IMAGE_SUFFIXES):
            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
        filenames.append(filename)

    final_output = []

    for upload, filename in zip(files, filenames):
        # The name is known to end in an allowed suffix, so the extension is
        # one of pdf/jpg/jpeg/png and cannot carry path separators.
        ext = filename.rsplit(".", 1)[-1]
        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.{ext}")
        scratch_paths = [temp_path]

        try:
            with open(temp_path, "wb") as f:
                f.write(await upload.read())

            if filename.endswith(_PDF_SUFFIX):
                img_paths = pdf_to_images(temp_path, max_pages=max_pages)
                scratch_paths.extend(img_paths)
                for img_path in img_paths:
                    final_output.extend(extract_text(img_path))
            else:
                final_output.extend(extract_text(temp_path))
        finally:
            # The staged upload and any rendered pages are only needed while
            # OCR runs; remove them so requests do not accumulate on disk.
            for path in scratch_paths:
                _remove_quietly(path)

    return JSONResponse({"results": final_output})