Soumik Bose committed on
Commit
6536e0d
·
1 Parent(s): 3ddb265
Files changed (1) hide show
  1. main.py +92 -130
main.py CHANGED
@@ -41,13 +41,6 @@ class Config:
41
  MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
42
  ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
43
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
44
-
45
- # RapidOCR Settings
46
- USE_ANGLE_CLS = os.getenv("OCR_USE_ANGLE_CLS", "true").lower() == "true"
47
- USE_TEXT_SCORE = os.getenv("OCR_USE_TEXT_SCORE", "true").lower() == "true"
48
- MIN_HEIGHT = int(os.getenv("OCR_MIN_HEIGHT", "30"))
49
- TEXT_SCORE_THRESHOLD = float(os.getenv("OCR_TEXT_SCORE", "0.5"))
50
- ENABLE_PREPROCESSING = os.getenv("OCR_PREPROCESSING", "true").lower() == "true"
51
 
52
  class RequestIdFilter(logging.Filter):
53
  def filter(self, record):
@@ -127,135 +120,91 @@ class FileValidator:
127
  raise HTTPException(413, "File too large")
128
  return tmp_path
129
 
130
- class RapidOCREngine:
131
- """Singleton RapidOCR engine for efficient reuse"""
132
- _instance = None
133
- _engine = None
134
-
135
- def __new__(cls):
136
- if cls._instance is None:
137
- cls._instance = super().__new__(cls)
138
- cls._instance._initialize_engine()
139
- return cls._instance
140
-
141
- def _initialize_engine(self):
142
- """Initialize RapidOCR with optimized settings"""
143
- try:
144
- self._engine = RapidOCR(
145
- det_use_cuda=False,
146
- cls_use_cuda=False,
147
- rec_use_cuda=False,
148
- use_angle_cls=Config.USE_ANGLE_CLS,
149
- use_text_score=Config.USE_TEXT_SCORE,
150
- print_verbose=False,
151
- min_height=Config.MIN_HEIGHT,
152
- text_score=Config.TEXT_SCORE_THRESHOLD
153
- )
154
- logger.info("RapidOCR engine initialized successfully")
155
- except Exception as e:
156
- logger.error(f"Failed to initialize RapidOCR: {str(e)}")
157
- raise
158
 
159
- def get_engine(self):
160
- return self._engine
 
 
161
 
162
- @staticmethod
163
- def preprocess_image(img_array):
164
- """Enhanced preprocessing for better accuracy"""
165
- if not Config.ENABLE_PREPROCESSING:
166
- return img_array
167
 
168
- try:
169
- # Convert to grayscale if needed
170
- if len(img_array.shape) == 3:
171
- gray = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
172
- else:
173
- gray = img_array
174
-
175
- # Denoise
176
- denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
177
-
178
- # Enhance contrast using CLAHE
179
- clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
180
- contrast = clahe.apply(denoised)
181
 
182
- # Sharpen
183
- kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
184
- sharpened = cv2.filter2D(contrast, -1, kernel)
185
-
186
- # Adaptive threshold
187
- processed = cv2.adaptiveThreshold(
188
- sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
189
- cv2.THRESH_BINARY, 11, 2
190
- )
191
-
192
- return processed
193
- except Exception as e:
194
- logger.warning(f"Preprocessing failed, using original image: {str(e)}")
195
- return img_array
196
-
197
- class OCRProcessor:
198
- def __init__(self):
199
- self.ocr_engine = RapidOCREngine().get_engine()
200
-
201
- def _extract_from_image(self, img_array) -> dict:
202
- """Extract text from a single image using RapidOCR"""
203
  try:
204
- # Preprocess image
205
- processed_img = RapidOCREngine.preprocess_image(img_array)
206
 
207
- # Perform OCR
208
- result, elapse = self.ocr_engine(processed_img)
 
 
 
209
 
210
  if result is None or len(result) == 0:
 
211
  return {
212
- "text": "",
213
- "confidence": 0.0,
214
- "lines_detected": 0
215
  }
216
 
217
  # Parse results
218
  texts = []
219
  confidences = []
220
 
221
- for line in result:
222
  try:
223
  if isinstance(line, (list, tuple)):
224
  if len(line) == 2:
225
- # [box, text] or [text, confidence]
226
  if isinstance(line[0], (list, tuple)):
227
- _, text = line
228
  confidence = 1.0
229
  else:
230
  text, confidence = line
 
231
  elif len(line) == 3:
232
- # [box, text, confidence]
233
- _, text, confidence = line
234
  elif len(line) >= 4:
235
- _, text, confidence = line[0], line[1], line[2]
 
236
  else:
237
  continue
238
-
239
- texts.append(str(text))
240
- confidences.append(float(confidence) if confidence is not None else 1.0)
 
 
 
241
  except Exception as e:
242
- logger.debug(f"Skipping malformed line: {e}")
243
  continue
244
 
245
  if not texts:
246
  return {
247
- "text": "",
248
- "confidence": 0.0,
249
- "lines_detected": 0
250
  }
251
 
252
  combined_text = '\n'.join(texts)
253
  avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
254
 
 
 
255
  return {
256
- "text": combined_text,
257
- "confidence": avg_confidence,
258
- "lines_detected": len(texts)
259
  }
260
 
261
  except Exception as e:
@@ -263,7 +212,16 @@ class OCRProcessor:
263
  raise ValueError(f"OCR extraction error: {str(e)}")
264
 
265
  def process_file(self, file_path: str, content_type: str) -> dict:
266
- """Process PDF or image file and extract text"""
 
 
 
 
 
 
 
 
 
267
  start = time.perf_counter()
268
  pages_content = []
269
  all_confidences = []
@@ -280,32 +238,36 @@ class OCRProcessor:
280
  page_num = idx + 1
281
  logger.info(f"Scanning Page {page_num}/{total}")
282
 
283
- # Convert PIL Image to numpy array for OpenCV
284
- img_array = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
285
-
286
- # Extract text
287
- ocr_result = self._extract_from_image(img_array)
288
-
289
- pages_content.append({
290
- "index": idx,
291
- "page_number": page_num,
292
- "text": ocr_result["text"],
293
- "confidence": ocr_result["confidence"],
294
- "lines_detected": ocr_result["lines_detected"]
295
- })
296
 
297
- if ocr_result["confidence"] > 0:
298
- all_confidences.append(ocr_result["confidence"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  else:
300
  logger.info("Scanning Single Image...")
301
 
302
- # Read image with OpenCV
303
- img_array = cv2.imread(file_path)
304
- if img_array is None:
305
- raise ValueError("Failed to load image file")
306
-
307
- # Extract text
308
- ocr_result = self._extract_from_image(img_array)
309
 
310
  pages_content.append({
311
  "index": 0,
@@ -319,8 +281,9 @@ class OCRProcessor:
319
  all_confidences.append(ocr_result["confidence"])
320
 
321
  avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
 
322
 
323
- logger.info(f"OCR Complete in {(time.perf_counter()-start)*1000:.2f}ms | Avg Confidence: {avg_confidence:.2%}")
324
 
325
  return {
326
  "total_pages": len(pages_content),
@@ -394,13 +357,11 @@ async def root(request: Request):
394
  async def health_check(request: Request):
395
  """Health check endpoint"""
396
  try:
397
- # Verify OCR engine is initialized
398
- engine = RapidOCREngine().get_engine()
399
  return {
400
  "request_id": request.state.request_id,
401
  "status": StatusEnum.SUCCESS,
402
  "message": "Service healthy",
403
- "ocr_engine": "ready"
404
  }
405
  except Exception as e:
406
  return JSONResponse(
@@ -427,7 +388,7 @@ async def extract_data(
427
  FileValidator.validate(file)
428
  tmp_path = FileValidator.check_size_and_save(file)
429
 
430
- # CPU heavy task run in thread pool
431
  # ContextVars are automatically copied to the thread
432
  processor = OCRProcessor()
433
  result = await run_in_threadpool(
@@ -479,10 +440,11 @@ async def extract_data(
479
  @app.on_event("startup")
480
  async def startup_event():
481
  """Initialize OCR engine on startup"""
482
- logger.info("Starting OCR API with RapidOCR engine...")
483
  try:
484
- RapidOCREngine() # Initialize singleton
485
- logger.info("RapidOCR engine ready")
 
486
  except Exception as e:
487
  logger.error(f"Failed to initialize OCR engine: {str(e)}")
488
  raise
 
41
  MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
42
  ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
43
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
 
 
 
 
 
 
 
44
 
45
  class RequestIdFilter(logging.Filter):
46
  def filter(self, record):
 
120
  raise HTTPException(413, "File too large")
121
  return tmp_path
122
 
123
+ class OCRProcessor:
124
+ """RapidOCR-based OCR processor with enhanced accuracy"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ def __init__(self):
127
+ """Initialize RapidOCR engine"""
128
+ self.engine = RapidOCR()
129
+ logger.info("RapidOCR engine initialized successfully")
130
 
131
+ def _extract_text_from_image(self, image_path: str) -> dict:
132
+ """
133
+ Extract text from a single image using RapidOCR
 
 
134
 
135
+ Args:
136
+ image_path: Path to image file
 
 
 
 
 
 
 
 
 
 
 
137
 
138
+ Returns:
139
+ dict: Contains text, confidence, and line count
140
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  try:
142
+ # Perform OCR - RapidOCR returns (result_object, elapse_list)
143
+ ocr_result, elapse = self.engine(image_path)
144
 
145
+ # Handle result object
146
+ if hasattr(ocr_result, '__iter__') and not isinstance(ocr_result, str):
147
+ result = list(ocr_result)
148
+ else:
149
+ result = ocr_result
150
 
151
  if result is None or len(result) == 0:
152
+ logger.warning(f"No text detected in image: {image_path}")
153
  return {
154
+ 'text': '',
155
+ 'confidence': 0.0,
156
+ 'lines_detected': 0
157
  }
158
 
159
  # Parse results
160
  texts = []
161
  confidences = []
162
 
163
+ for idx, line in enumerate(result):
164
  try:
165
  if isinstance(line, (list, tuple)):
166
  if len(line) == 2:
167
+ # Format: [box, text] or [text, confidence]
168
  if isinstance(line[0], (list, tuple)):
169
+ box, text = line
170
  confidence = 1.0
171
  else:
172
  text, confidence = line
173
+ box = []
174
  elif len(line) == 3:
175
+ # Format: [box, text, confidence]
176
+ box, text, confidence = line
177
  elif len(line) >= 4:
178
+ # Format: [box, text, confidence, something_else]
179
+ box, text, confidence = line[0], line[1], line[2]
180
  else:
181
  continue
182
+ else:
183
+ continue
184
+
185
+ texts.append(str(text))
186
+ confidences.append(float(confidence) if confidence is not None else 1.0)
187
+
188
  except Exception as e:
189
+ logger.debug(f"Skipping malformed line {idx}: {e}")
190
  continue
191
 
192
  if not texts:
193
  return {
194
+ 'text': '',
195
+ 'confidence': 0.0,
196
+ 'lines_detected': 0
197
  }
198
 
199
  combined_text = '\n'.join(texts)
200
  avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
201
 
202
+ logger.debug(f"Extracted {len(texts)} lines with avg confidence: {avg_confidence:.2%}")
203
+
204
  return {
205
+ 'text': combined_text,
206
+ 'confidence': avg_confidence,
207
+ 'lines_detected': len(texts)
208
  }
209
 
210
  except Exception as e:
 
212
  raise ValueError(f"OCR extraction error: {str(e)}")
213
 
214
  def process_file(self, file_path: str, content_type: str) -> dict:
215
+ """
216
+ Process PDF or image file and extract text
217
+
218
+ Args:
219
+ file_path: Path to the file
220
+ content_type: MIME type of the file
221
+
222
+ Returns:
223
+ dict: Processing results with pages content
224
+ """
225
  start = time.perf_counter()
226
  pages_content = []
227
  all_confidences = []
 
238
  page_num = idx + 1
239
  logger.info(f"Scanning Page {page_num}/{total}")
240
 
241
+ # Save PIL Image to temp file for RapidOCR
242
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
243
+ img.save(tmp_img.name, 'PNG')
244
+ temp_img_path = tmp_img.name
 
 
 
 
 
 
 
 
 
245
 
246
+ try:
247
+ # Extract text from temp image
248
+ ocr_result = self._extract_text_from_image(temp_img_path)
249
+
250
+ pages_content.append({
251
+ "index": idx,
252
+ "page_number": page_num,
253
+ "text": ocr_result["text"],
254
+ "confidence": ocr_result["confidence"],
255
+ "lines_detected": ocr_result["lines_detected"]
256
+ })
257
+
258
+ if ocr_result["confidence"] > 0:
259
+ all_confidences.append(ocr_result["confidence"])
260
+ finally:
261
+ # Clean up temp image
262
+ try:
263
+ os.remove(temp_img_path)
264
+ except:
265
+ pass
266
  else:
267
  logger.info("Scanning Single Image...")
268
 
269
+ # Extract text from image
270
+ ocr_result = self._extract_text_from_image(file_path)
 
 
 
 
 
271
 
272
  pages_content.append({
273
  "index": 0,
 
281
  all_confidences.append(ocr_result["confidence"])
282
 
283
  avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
284
+ processing_time = (time.perf_counter() - start) * 1000
285
 
286
+ logger.info(f"OCR Complete in {processing_time:.2f}ms | Avg Confidence: {avg_confidence:.2%}")
287
 
288
  return {
289
  "total_pages": len(pages_content),
 
357
  async def health_check(request: Request):
358
  """Health check endpoint"""
359
  try:
 
 
360
  return {
361
  "request_id": request.state.request_id,
362
  "status": StatusEnum.SUCCESS,
363
  "message": "Service healthy",
364
+ "ocr_engine": "RapidOCR"
365
  }
366
  except Exception as e:
367
  return JSONResponse(
 
388
  FileValidator.validate(file)
389
  tmp_path = FileValidator.check_size_and_save(file)
390
 
391
+ # Initialize OCR processor and run in thread pool
392
  # ContextVars are automatically copied to the thread
393
  processor = OCRProcessor()
394
  result = await run_in_threadpool(
 
440
  @app.on_event("startup")
441
  async def startup_event():
442
  """Initialize OCR engine on startup"""
443
+ logger.info("Starting RapidOCR API...")
444
  try:
445
+ # Test initialize the engine
446
+ test_processor = OCRProcessor()
447
+ logger.info("RapidOCR engine ready for processing")
448
  except Exception as e:
449
  logger.error(f"Failed to initialize OCR engine: {str(e)}")
450
  raise