triflix committed on
Commit
a09fca3
·
verified ·
1 Parent(s): 7da07f9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +326 -134
app.py CHANGED
@@ -1,186 +1,378 @@
 
 
 
 
1
  import io
2
- import gc
3
- import logging
4
- from typing import List, Dict, Any
5
- from PIL import Image
6
  import numpy as np
7
- from fastapi import FastAPI, File, UploadFile, HTTPException
 
 
8
  from paddleocr import PaddleOCR
9
- from pdf2image import convert_from_bytes
 
 
 
10
 
11
  # Configure logging
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
14
 
15
- # Global OCR instance (loaded once at startup)
16
- ocr_engine = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- def get_ocr_engine():
19
- """Singleton pattern for OCR model"""
20
- global ocr_engine
21
- if ocr_engine is None:
22
- logger.info("Initializing PaddleOCR model...")
23
- ocr_engine = PaddleOCR(
 
24
  lang="mr",
25
  use_doc_orientation_classify=False,
26
  use_doc_unwarping=False,
27
  use_textline_orientation=False,
28
- show_log=False # Reduce clutter
 
29
  )
30
- return ocr_engine
 
 
31
 
32
- app = FastAPI(title="PaddleOCR Marathi API")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
- def resize_image(image: Image.Image, max_pixels: int = 2500) -> Image.Image:
35
- """Resize if any dimension exceeds limit to control memory usage"""
36
- if max(image.size) > max_pixels:
37
- ratio = max_pixels / max(image.size)
38
- new_size = (int(image.width * ratio), int(image.height * ratio))
39
- logger.info(f"Resizing {image.size} -> {new_size}")
40
- return image.resize(new_size, Image.Resampling.LANCZOS)
41
- return image
42
 
43
- def process_image(contents: bytes, filename: str) -> Dict[str, Any]:
44
- """Process single image entirely in memory"""
45
  try:
46
- image = Image.open(io.BytesIO(contents)).convert('RGB')
47
- image = resize_image(image)
 
 
 
 
 
48
  img_array = np.array(image)
49
 
50
- ocr = get_ocr_engine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  result = ocr.ocr(img_array, cls=False)
52
 
53
- texts, scores, bboxes = [], [], []
54
- if result and result[0]:
55
- for line in result[0]:
56
- bbox, (text, score) = line
57
- texts.append(text)
58
- scores.append(float(score))
59
- bboxes.append(bbox)
60
 
61
- # Immediate cleanup
62
- del image, img_array
63
- gc.collect()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  return {
66
- "filename": filename,
67
- "type": "image",
68
- "success": True,
69
- "results": [{"text": t, "confidence": s, "bbox": b}
70
- for t, s, b in zip(texts, scores, bboxes)]
71
  }
 
72
  except Exception as e:
73
- logger.error(f"Image processing failed: {e}")
74
- return {"filename": filename, "type": "image", "success": False, "error": str(e)}
75
 
76
- def process_pdf(contents: bytes, filename: str) -> Dict[str, Any]:
77
- """Process PDF page-by-page with memory cleanup between pages"""
 
78
  try:
79
- # Convert PDF to images (poppler handles memory efficiently)
80
- images = convert_from_bytes(contents, dpi=200, fmt='png')
81
- pages = []
 
 
 
 
 
82
 
83
- for page_num, image in enumerate(images, 1):
84
- image = resize_image(image.convert('RGB'))
85
- img_array = np.array(image)
 
 
86
 
87
- ocr = get_ocr_engine()
88
- result = ocr.ocr(img_array, cls=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- texts, scores, bboxes = [], [], []
91
- if result and result[0]:
92
- for line in result[0]:
93
- bbox, (text, score) = line
94
- texts.append(text)
95
- scores.append(float(score))
96
- bboxes.append(bbox)
97
 
98
- pages.append({
99
- "page_number": page_num,
100
- "results": [{"text": t, "confidence": s, "bbox": b}
101
- for t, s, b in zip(texts, scores, bboxes)]
102
  })
103
 
104
- # Clean up per page
105
- del image, img_array
106
  gc.collect()
107
- # REMOVED: await asyncio.sleep(0.05) # This was causing the error
108
 
109
- # Final cleanup
110
- del images
111
  gc.collect()
112
 
113
  return {
114
- "filename": filename,
115
- "type": "pdf",
116
- "success": True,
117
- "page_count": len(pages),
118
- "pages": pages
119
  }
 
 
 
120
  except Exception as e:
121
- logger.error(f"PDF processing failed: {e}")
122
- return {"filename": filename, "type": "pdf", "success": False, "error": str(e)}
 
 
 
 
123
 
124
- @app.post("/ocr/image")
125
- async def ocr_image(file: UploadFile = File(...)):
126
- """Single image endpoint"""
127
- if not file.content_type.startswith('image/'):
128
- raise HTTPException(400, "Invalid image file")
129
-
130
- try:
131
- contents = await file.read()
132
- return process_image(contents, file.filename)
133
- finally:
134
- await file.close()
135
 
136
- @app.post("/ocr/pdf")
137
- async def ocr_pdf(file: UploadFile = File(...)):
138
- """Single PDF endpoint"""
139
- if not (file.content_type == 'application/pdf' or file.filename.endswith('.pdf')):
140
- raise HTTPException(400, "Invalid PDF file")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
- try:
143
- contents = await file.read()
144
- return process_pdf(contents, file.filename)
145
- finally:
146
- await file.close()
 
 
 
 
 
147
 
148
- @app.post("/ocr/batch")
149
- async def ocr_batch(files: List[UploadFile] = File(...)):
150
- """Batch processing endpoint - max 5 files to prevent OOM"""
151
- if len(files) > 5:
152
- raise HTTPException(400, "Maximum 5 files per batch")
 
 
 
 
 
 
 
 
 
 
 
 
153
 
 
154
  results = []
155
  for file in files:
156
- try:
157
- contents = await file.read()
158
- is_pdf = file.content_type == 'application/pdf' or file.filename.endswith('.pdf')
159
- result = process_pdf(contents, file.filename) if is_pdf else process_image(contents, file.filename)
160
- results.append(result)
161
- except Exception as e:
162
- results.append({"filename": file.filename, "success": False, "error": str(e)})
163
- finally:
164
- await file.close()
165
-
166
- return {"processed": len(results), "files": results}
167
 
168
- @app.get("/health")
169
- async def health():
170
- """Check if model is loaded"""
171
- try:
172
- get_ocr_engine()
173
- return {"status": "ready", "model": "loaded"}
174
- except:
175
- raise HTTPException(503, "Model not loaded")
176
 
177
- @app.on_event("startup")
178
- async def load_model():
179
- logger.info("Preloading OCR model...")
180
- get_ocr_engine()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
- @app.on_event("shutdown")
183
- async def cleanup():
184
- global ocr_engine
185
- ocr_engine = None
186
- gc.collect()
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from typing import List
5
  import io
 
 
 
 
6
  import numpy as np
7
+ from PIL import Image
8
+ import pdf2image
9
+ import cv2
10
  from paddleocr import PaddleOCR
11
+ import gc
12
+ import asyncio
13
+ from concurrent.futures import ThreadPoolExecutor
14
+ import logging
15
 
16
  # Configure logging
17
  logging.basicConfig(level=logging.INFO)
18
  logger = logging.getLogger(__name__)
19
 
20
app = FastAPI(
    title="Marathi OCR API",
    description="OCR API for Marathi text extraction from images and PDFs",
    version="1.0.0",
)

# CORS middleware.
# Every origin is allowed, so credentialed requests are disabled: a wildcard
# origin combined with allow_credentials=True would let any website issue
# cookie/authorization-bearing requests against this API. None of the
# endpoints visible in this module read cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global OCR instance; stays None until get_ocr() creates it.
ocr_instance = None
# Shared worker pool used for OCR calls; two workers cap concurrent processing.
executor = ThreadPoolExecutor(max_workers=2)

# Constants
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB upload limit, in bytes
ALLOWED_IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp"}
ALLOWED_EXTENSIONS = ALLOWED_IMAGE_EXTENSIONS | {".pdf"}
MAX_FILES_PER_REQUEST = 10
PDF_DPI = 200  # Balance between quality and RAM usage
45
 
46
+
47
def get_ocr():
    """Return the shared PaddleOCR instance, building it on the first call.

    The instance is cached in the module-level ``ocr_instance`` so later
    calls reuse it instead of constructing a new model.
    """
    global ocr_instance

    # Fast path: the model has already been constructed.
    if ocr_instance is not None:
        return ocr_instance

    logger.info("Initializing PaddleOCR...")
    # NOTE(review): `use_textline_orientation` and `use_angle_cls` are both
    # passed, together with `show_log` -- confirm the installed PaddleOCR
    # release accepts this combination of keyword arguments.
    ocr_instance = PaddleOCR(
        lang="mr",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        use_angle_cls=False,  # Disable angle classification for speed
        show_log=False,
    )
    logger.info("PaddleOCR initialized successfully")
    return ocr_instance
62
+
63
 
64
def validate_file(file: UploadFile, file_size: int):
    """Validate an uploaded file's size and extension.

    Args:
        file: The upload whose ``filename`` is inspected.
        file_size: Size of the uploaded payload in bytes.

    Returns:
        The lower-cased extension including the leading dot (e.g. ``".pdf"``).

    Raises:
        HTTPException: 413 when ``file_size`` exceeds ``MAX_FILE_SIZE``;
            400 when the filename is missing, has no extension, or the
            extension is not in ``ALLOWED_EXTENSIONS``.
    """
    # Check file size
    if file_size > MAX_FILE_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Maximum size: {MAX_FILE_SIZE / 1024 / 1024}MB",
        )

    # Check extension. rpartition reports whether a dot was present at all,
    # so a bare name such as "pdf" is no longer mistaken for an extension,
    # and a missing filename is rejected instead of raising AttributeError.
    filename = (file.filename or "").lower()
    _, dot, suffix = filename.rpartition(".")
    file_ext = f".{suffix}"
    if not dot or file_ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            # sorted() keeps the message stable; set order varies per process.
            detail=f"Invalid file type. Allowed: {', '.join(sorted(ALLOWED_EXTENSIONS))}",
        )

    return file_ext
82
 
 
 
 
 
 
 
 
 
83
 
84
def process_image_bytes(image_bytes: bytes, max_dimension: int = 4096) -> np.ndarray:
    """Decode encoded image bytes into an RGB numpy array.

    Args:
        image_bytes: Raw bytes of an encoded image file.
        max_dimension: Largest allowed width/height in pixels. Bigger images
            are scaled down proportionally to save RAM.

    Returns:
        The decoded image as an array built from an RGB-mode PIL image.

    Raises:
        HTTPException: 400 when the bytes cannot be decoded or resized.
    """
    try:
        # The context manager releases PIL's resources once the pixel data
        # has been copied into the numpy array.
        with Image.open(io.BytesIO(image_bytes)) as image:
            # Convert to RGB if necessary
            rgb_image = image if image.mode == "RGB" else image.convert("RGB")
            img_array = np.array(rgb_image)

        # Optional: Resize if image is too large to save RAM
        h, w = img_array.shape[:2]
        if max(h, w) > max_dimension:
            scale = max_dimension / max(h, w)
            new_w, new_h = int(w * scale), int(h * scale)
            # INTER_AREA is the interpolation intended for shrinking; it
            # avoids the aliasing the default linear filter introduces.
            img_array = cv2.resize(
                img_array, (new_w, new_h), interpolation=cv2.INTER_AREA
            )
            logger.info(f"Resized image from {w}x{h} to {new_w}x{new_h}")

        return img_array
    except Exception as e:
        logger.error(f"Error processing image: {e}")
        raise HTTPException(
            status_code=400, detail=f"Invalid image format: {str(e)}"
        ) from e
109
+
110
+
111
def pdf_to_images(pdf_bytes: bytes) -> List[np.ndarray]:
    """Render every page of a PDF into a numpy array without touching disk.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        One array per page, in page order, rendered at ``PDF_DPI``.

    Raises:
        HTTPException: 400 when the PDF cannot be converted.
    """
    try:
        # Convert PDF bytes to images in memory.
        # "ppm" is pdf2image's default output format; the previous value
        # 'RGB' is not a recognised format name and only worked through the
        # library's fallback to that default.
        images = pdf2image.convert_from_bytes(
            pdf_bytes,
            dpi=PDF_DPI,
            fmt="ppm",
            thread_count=1,  # Limit threads to control RAM
        )

        # Convert PIL images to numpy arrays, dropping each PIL page as soon
        # as it has been copied so both representations of the whole
        # document are never held in memory at the same time.
        img_arrays = []
        for index in range(len(images)):
            img_arrays.append(np.array(images[index]))
            images[index] = None

        logger.info(f"Converted PDF to {len(img_arrays)} images")
        return img_arrays

    except Exception as e:
        logger.error(f"Error converting PDF: {e}")
        raise HTTPException(
            status_code=400, detail=f"Invalid PDF format: {str(e)}"
        ) from e
134
+
135
+
136
def run_ocr(img_array: np.ndarray) -> dict:
    """Run the shared OCR engine on one image and flatten its output.

    Returns a dict with three parallel collections:
    ``texts`` (recognized strings), ``scores`` (confidences as floats) and
    ``details`` (one dict per entry with text, confidence and integer bbox
    points). All three are empty when the engine reports nothing.

    Raises:
        HTTPException: 500 when the OCR call or result parsing fails.
    """
    try:
        raw = get_ocr().ocr(img_array, cls=False)

        # Nothing detected: the outer result or its first element is empty.
        if not raw or not raw[0]:
            return {"texts": [], "scores": [], "details": []}

        # Each entry is read as entry[0] -> bounding box points,
        # entry[1][0] -> recognized text, entry[1][1] -> confidence score.
        details = [
            {
                "text": entry[1][0],
                "confidence": float(entry[1][1]),
                "bbox": [[int(point[0]), int(point[1])] for point in entry[0]],
            }
            for entry in raw[0]
        ]

        return {
            "texts": [item["text"] for item in details],
            "scores": [item["confidence"] for item in details],
            "details": details,
        }

    except Exception as e:
        logger.error(f"OCR processing error: {e}")
        raise HTTPException(status_code=500, detail=f"OCR failed: {str(e)}")
176
 
177
+
178
async def process_single_file(file: UploadFile) -> dict:
    """Process a single uploaded file (image or PDF) and return OCR results.

    Args:
        file: The uploaded image or PDF.

    Returns:
        On success, a dict with ``filename``, ``file_type``, ``total_pages``,
        ``results`` (one entry per page, each carrying ``page`` plus the
        keys produced by ``run_ocr``) and ``status == "success"``.
        On an unexpected failure, a dict with ``filename``,
        ``status == "error"`` and ``error``.

    Raises:
        HTTPException: Propagated unchanged from validation, decoding or OCR.
    """
    try:
        # Read at most one byte past the limit: enough for validate_file to
        # detect an oversized upload without buffering the whole payload.
        file_bytes = await file.read(MAX_FILE_SIZE + 1)
        file_size = len(file_bytes)

        # Validate
        file_ext = validate_file(file, file_size)

        logger.info(f"Processing file: {file.filename} ({file_size / 1024:.2f}KB)")

        loop = asyncio.get_running_loop()
        results = []

        if file_ext == ".pdf":
            # Rasterising is CPU-heavy, so it runs in the worker pool rather
            # than on the event loop.
            img_arrays = await loop.run_in_executor(executor, pdf_to_images, file_bytes)
            del file_bytes

            total_pages = len(img_arrays)
            for index in range(total_pages):
                page_num = index + 1
                logger.info(f"Processing PDF page {page_num}/{total_pages}")

                # Take the page out of the list so the `del` below really
                # drops the last reference to its pixel data.
                img_array = img_arrays[index]
                img_arrays[index] = None

                # Run OCR in thread pool to avoid blocking
                ocr_result = await loop.run_in_executor(executor, run_ocr, img_array)

                results.append({"page": page_num, **ocr_result})

                # Clean up
                del img_array
                gc.collect()

        else:
            # Decode the single image off the event loop as well.
            img_array = await loop.run_in_executor(
                executor, process_image_bytes, file_bytes
            )
            del file_bytes

            # Run OCR in thread pool
            ocr_result = await loop.run_in_executor(executor, run_ocr, img_array)

            results.append({"page": 1, **ocr_result})

            # Clean up
            del img_array
            gc.collect()

        return {
            "filename": file.filename,
            "file_type": file_ext,
            "total_pages": len(results),
            "results": results,
            "status": "success",
        }

    except HTTPException:
        # Already carries the right status code; let the endpoint handle it.
        raise
    except Exception as e:
        logger.error(f"Error processing file {file.filename}: {e}")
        return {
            "filename": file.filename,
            "status": "error",
            "error": str(e),
        }
251
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
@app.on_event("startup")
async def startup_event():
    """Startup hook: build the OCR model before requests are served."""
    logger.info("Starting OCR API...")
    get_ocr()  # Pre-load the OCR model so the first request does not pay for it
    logger.info("API ready!")
260
+
261
+
262
@app.on_event("shutdown")
async def shutdown_event():
    """Shutdown hook: stop the shared worker pool."""
    logger.info("Shutting down...")
    # wait=True blocks until work already submitted to the pool has finished.
    executor.shutdown(wait=True)
267
+
268
+
269
@app.get("/")
async def root():
    """Service index: report that the API is up and list its endpoints."""
    return {
        "status": "healthy",
        "message": "Marathi OCR API is running",
        "endpoints": {
            "single_file": "/ocr/",
            "multiple_files": "/ocr/batch/",
            # Previously missing from the index although the route exists.
            "text_only": "/ocr/extract-text/",
            "health": "/health",
        },
    }
281
+
282
+
283
@app.get("/health")
async def health():
    """Detailed health check: OCR model state plus the configured limits."""
    return {
        "status": "healthy",
        "ocr_loaded": ocr_instance is not None,
        "max_file_size_mb": MAX_FILE_SIZE / 1024 / 1024,
        "max_files_per_request": MAX_FILES_PER_REQUEST,
        # sorted() gives a stable order; iterating the set directly can
        # yield a different order in every process.
        "supported_formats": sorted(ALLOWED_EXTENSIONS),
    }
293
+
294
+
295
@app.post("/ocr/")
async def ocr_single_file(file: UploadFile = File(...)):
    """
    Run OCR on one uploaded image or PDF.

    - **file**: Image (JPG, PNG, etc.) or PDF file

    Responds with the per-page text, confidence scores, and bounding boxes.
    A processing failure reported by the worker becomes an HTTP 500.
    """
    outcome = await process_single_file(file)

    if outcome["status"] != "error":
        return JSONResponse(content=outcome)

    raise HTTPException(status_code=500, detail=outcome["error"])
310
 
311
+
312
@app.post("/ocr/batch/")
async def ocr_batch_files(files: List[UploadFile] = File(...)):
    """
    OCR for multiple images or PDF files

    - **files**: List of image or PDF files (max 10)

    Returns OCR results for each file. A file that fails validation or
    processing is reported as an error entry in ``results`` instead of
    aborting the whole batch.
    """
    if len(files) > MAX_FILES_PER_REQUEST:
        raise HTTPException(
            status_code=400,
            detail=f"Too many files. Maximum: {MAX_FILES_PER_REQUEST}",
        )

    logger.info(f"Processing batch of {len(files)} files")

    # Process files sequentially to manage RAM
    results = []
    for file in files:
        try:
            result = await process_single_file(file)
        except HTTPException as exc:
            # process_single_file re-raises HTTPException (oversized file,
            # bad extension, undecodable content, OCR failure). Record it
            # for this file only so the remaining files are still processed.
            logger.error(f"Error processing file {file.filename}: {exc.detail}")
            result = {
                "filename": file.filename,
                "status": "error",
                "error": exc.detail,
                "status_code": exc.status_code,
            }
        results.append(result)

        # Force garbage collection between files
        gc.collect()

    return JSONResponse(content={
        "total_files": len(files),
        "results": results,
    })
 
342
 
 
 
 
 
 
 
 
 
343
 
344
@app.post("/ocr/extract-text/")
async def extract_text_only(file: UploadFile = File(...)):
    """
    Extract only text from image/PDF (simplified response)

    - **file**: Image or PDF file

    Returns only extracted text without bounding boxes. For each page the
    response carries the joined text, the number of recognized text entries
    (``line_count``), the number of whitespace-separated words
    (``word_count``) and the mean confidence (0 when nothing was recognized).
    """
    result = await process_single_file(file)

    if result["status"] == "error":
        raise HTTPException(status_code=500, detail=result["error"])

    pages = []
    for page_result in result["results"]:
        texts = page_result["texts"]
        scores = page_result["scores"]
        pages.append({
            "page": page_result["page"],
            "text": " ".join(texts),
            # Each element of `texts` is one recognized entry and may hold
            # several words, so words are counted by splitting on
            # whitespace; the entry count is reported as `line_count`.
            "word_count": sum(len(text.split()) for text in texts),
            "line_count": len(texts),
            "average_confidence": sum(scores) / len(scores) if scores else 0,
        })

    # Simplify response
    simplified = {
        "filename": result["filename"],
        "file_type": result["file_type"],
        "pages": pages,
    }

    return JSONResponse(content=simplified)
374
 
375
+
376
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)