triflix committed on
Commit
bfb796a
·
verified ·
1 Parent(s): f401e05

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -0
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import gc
3
+ import logging
4
+ from typing import List, Dict, Any
5
+ from PIL import Image
6
+ import numpy as np
7
+ from fastapi import FastAPI, File, UploadFile, HTTPException
8
+ from paddleocr import PaddleOCR
9
+ from pdf2image import convert_from_bytes
10
+ import asyncio
11
+
12
# Logging: INFO-level root configuration plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Cached PaddleOCR instance shared by all requests; None until it is built.
ocr_engine = None
18
+
19
def get_ocr_engine():
    """Return the shared PaddleOCR instance, constructing it on first use.

    The instance is cached in the module-level ``ocr_engine`` variable, so
    the model is built once and every later call returns the same object.
    """
    global ocr_engine
    if ocr_engine is not None:
        return ocr_engine

    logger.info("Initializing PaddleOCR model...")
    # NOTE(review): `show_log` and the `use_doc_*` /
    # `text_recognition_model_name` keywords look like they come from
    # different PaddleOCR major versions -- confirm against the pinned release.
    engine_options = dict(
        text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
        lang="mr",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        show_log=False,  # Reduce clutter
    )
    ocr_engine = PaddleOCR(**engine_options)
    return ocr_engine
33
+
34
# FastAPI application object; the route and event decorators in this module
# register their handlers on it.
app = FastAPI(title="PaddleOCR Marathi API")
35
+
36
def resize_image(image: Image.Image, max_pixels: int = 2500) -> Image.Image:
    """Downscale *image* so that its longer side is at most *max_pixels*.

    Despite its name, ``max_pixels`` bounds the longest edge (in pixels), not
    the total pixel count. The aspect ratio is preserved.

    Args:
        image: Source PIL image.
        max_pixels: Maximum allowed width or height.

    Returns:
        ``image`` itself when it already fits, otherwise a LANCZOS-resampled
        copy.
    """
    longest_side = max(image.size)
    if longest_side <= max_pixels:
        return image

    ratio = max_pixels / longest_side
    # int() truncates, so a very thin image (e.g. 10000x1) would otherwise get
    # a zero-length side, which Image.resize rejects. Clamp each side to 1 px.
    new_size = (
        max(1, int(image.width * ratio)),
        max(1, int(image.height * ratio)),
    )
    logger.info(f"Resizing {image.size} -> {new_size}")
    return image.resize(new_size, Image.Resampling.LANCZOS)
44
+
45
def process_image(contents: bytes, filename: str) -> Dict[str, Any]:
    """Run OCR on a single encoded image held in memory.

    Args:
        contents: Raw bytes of the image file.
        filename: Name echoed back in the returned payload.

    Returns:
        A dict with ``filename``, ``type`` (``"image"``) and ``success``.
        On success it also has ``results``: one entry with ``text``,
        ``confidence`` and ``bbox`` per recognised line. On failure it has
        ``error`` with the exception message; the exception is logged and
        not re-raised.
    """
    try:
        decoded = Image.open(io.BytesIO(contents)).convert('RGB')
        rgb_image = resize_image(decoded)
        pixels = np.array(rgb_image)

        engine = get_ocr_engine()
        raw = engine.ocr(pixels, cls=False)

        # Each entry of raw[0] is unpacked as (bbox, (text, score)).
        detections = []
        if raw and raw[0]:
            detections = [
                {"text": text, "confidence": float(score), "bbox": bbox}
                for bbox, (text, score) in raw[0]
            ]

        # Drop the large buffers before building the response.
        del decoded, rgb_image, pixels
        gc.collect()

        return {
            "filename": filename,
            "type": "image",
            "success": True,
            "results": detections,
        }
    except Exception as e:
        logger.error(f"Image processing failed: {e}")
        return {"filename": filename, "type": "image", "success": False, "error": str(e)}
77
+
78
def process_pdf(contents: bytes, filename: str) -> Dict[str, Any]:
    """Run OCR on every page of an in-memory PDF.

    This is a plain synchronous function: the endpoints call it without
    ``await``, so it must not contain ``await`` expressions (doing so is a
    SyntaxError that prevents the whole module from importing).

    Args:
        contents: Raw bytes of the PDF file.
        filename: Name echoed back in the returned payload.

    Returns:
        A dict with ``filename``, ``type`` (``"pdf"``) and ``success``.
        On success it also has ``page_count`` and ``pages``; each page holds
        its 1-based ``page_number`` and a ``results`` list with ``text``,
        ``confidence`` and ``bbox`` per recognised line. On failure it has
        ``error`` with the exception message; the exception is logged and
        not re-raised.
    """
    try:
        # convert_from_bytes returns all rendered pages as one list.
        images = convert_from_bytes(contents, dpi=200, fmt='png')
        pages = []

        for index, page_image in enumerate(images):
            # Release the list's reference to this page so the per-page
            # cleanup below can actually reclaim its bitmap.
            images[index] = None

            image = resize_image(page_image.convert('RGB'))
            del page_image
            img_array = np.array(image)

            ocr = get_ocr_engine()
            result = ocr.ocr(img_array, cls=False)

            # Each entry of result[0] is unpacked as (bbox, (text, score)).
            page_results = []
            if result and result[0]:
                page_results = [
                    {"text": text, "confidence": float(score), "bbox": bbox}
                    for bbox, (text, score) in result[0]
                ]

            pages.append({
                "page_number": index + 1,
                "results": page_results,
            })

            # Clean up per page
            del image, img_array
            gc.collect()

        # Final cleanup
        del images
        gc.collect()

        return {
            "filename": filename,
            "type": "pdf",
            "success": True,
            "page_count": len(pages),
            "pages": pages,
        }
    except Exception as e:
        logger.error(f"PDF processing failed: {e}")
        return {"filename": filename, "type": "pdf", "success": False, "error": str(e)}
125
+
126
@app.post("/ocr/image")
async def ocr_image(file: UploadFile = File(...)):
    """OCR a single uploaded image.

    Args:
        file: The uploaded file; its content type must start with ``image/``.

    Returns:
        The payload produced by ``process_image``.

    Raises:
        HTTPException: 400 when the upload is not declared as an image.
    """
    # content_type can be None (no Content-Type on the uploaded part); treat
    # that as "not an image" rather than failing on None.startswith.
    content_type = file.content_type or ""
    if not content_type.startswith('image/'):
        raise HTTPException(400, "Invalid image file")

    try:
        contents = await file.read()
        return process_image(contents, file.filename)
    finally:
        await file.close()
137
+
138
@app.post("/ocr/pdf")
async def ocr_pdf(file: UploadFile = File(...)):
    """OCR a single uploaded PDF.

    The upload is accepted when its content type is ``application/pdf`` or
    its filename ends in ``.pdf`` (case-insensitive).

    Args:
        file: The uploaded file.

    Returns:
        The payload produced by ``process_pdf``.

    Raises:
        HTTPException: 400 when the upload does not look like a PDF.
    """
    # filename can be None; fall back to "" so the extension check cannot
    # raise AttributeError.
    name = file.filename or ""
    looks_like_pdf = (
        file.content_type == 'application/pdf'
        or name.lower().endswith('.pdf')
    )
    if not looks_like_pdf:
        raise HTTPException(400, "Invalid PDF file")

    try:
        contents = await file.read()
        return process_pdf(contents, file.filename)
    finally:
        await file.close()
149
+
150
@app.post("/ocr/batch")
async def ocr_batch(files: List[UploadFile] = File(...)):
    """OCR up to five uploaded files (images and/or PDFs) in one request.

    Files are processed one after another. A file is treated as a PDF when
    its content type is ``application/pdf`` or its filename ends in ``.pdf``
    (case-insensitive); everything else is handed to the image path.

    Args:
        files: The uploaded files.

    Returns:
        ``{"processed": <count>, "files": [<per-file payload>, ...]}``. A file
        that raises is reported with ``success: False`` and does not abort
        the batch.

    Raises:
        HTTPException: 400 when more than five files are submitted.
    """
    if len(files) > 5:
        raise HTTPException(400, "Maximum 5 files per batch")

    results = []
    for file in files:
        try:
            contents = await file.read()
            # filename can be None; without the fallback a nameless image
            # upload would fail here instead of being processed.
            name = file.filename or ""
            is_pdf = (
                file.content_type == 'application/pdf'
                or name.lower().endswith('.pdf')
            )
            if is_pdf:
                result = process_pdf(contents, file.filename)
            else:
                result = process_image(contents, file.filename)
            results.append(result)
        except Exception as e:
            results.append({"filename": file.filename, "success": False, "error": str(e)})
        finally:
            await file.close()

    return {"processed": len(results), "files": results}
169
+
170
@app.get("/health")
async def health():
    """Report readiness of the OCR engine.

    Calls ``get_ocr_engine()``, so if the engine has not been built yet this
    endpoint builds it rather than merely checking for it.

    Returns:
        ``{"status": "ready", "model": "loaded"}`` when the engine is
        available.

    Raises:
        HTTPException: 503 when the engine cannot be obtained.
    """
    try:
        get_ocr_engine()
    except Exception as exc:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed; keep the cause
        # visible in the logs.
        logger.exception("OCR engine unavailable")
        raise HTTPException(503, "Model not loaded") from exc
    return {"status": "ready", "model": "loaded"}
178
+
179
@app.on_event("startup")
async def load_model():
    """Startup hook: log the preload and build the shared OCR engine."""
    logger.info("Preloading OCR model...")
    get_ocr_engine()
183
+
184
@app.on_event("shutdown")
async def cleanup():
    """Shutdown hook: drop the cached OCR engine and run a garbage collection."""
    global ocr_engine
    ocr_engine = None
    gc.collect()