Soumik Bose committed on
Commit ·
6536e0d
1
Parent(s): 3ddb265
go
Browse files
main.py
CHANGED
|
@@ -41,13 +41,6 @@ class Config:
|
|
| 41 |
MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
|
| 42 |
ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
|
| 43 |
ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
|
| 44 |
-
|
| 45 |
-
# RapidOCR Settings
|
| 46 |
-
USE_ANGLE_CLS = os.getenv("OCR_USE_ANGLE_CLS", "true").lower() == "true"
|
| 47 |
-
USE_TEXT_SCORE = os.getenv("OCR_USE_TEXT_SCORE", "true").lower() == "true"
|
| 48 |
-
MIN_HEIGHT = int(os.getenv("OCR_MIN_HEIGHT", "30"))
|
| 49 |
-
TEXT_SCORE_THRESHOLD = float(os.getenv("OCR_TEXT_SCORE", "0.5"))
|
| 50 |
-
ENABLE_PREPROCESSING = os.getenv("OCR_PREPROCESSING", "true").lower() == "true"
|
| 51 |
|
| 52 |
class RequestIdFilter(logging.Filter):
|
| 53 |
def filter(self, record):
|
|
@@ -127,135 +120,91 @@ class FileValidator:
|
|
| 127 |
raise HTTPException(413, "File too large")
|
| 128 |
return tmp_path
|
| 129 |
|
| 130 |
-
class
|
| 131 |
-
"""
|
| 132 |
-
_instance = None
|
| 133 |
-
_engine = None
|
| 134 |
-
|
| 135 |
-
def __new__(cls):
|
| 136 |
-
if cls._instance is None:
|
| 137 |
-
cls._instance = super().__new__(cls)
|
| 138 |
-
cls._instance._initialize_engine()
|
| 139 |
-
return cls._instance
|
| 140 |
-
|
| 141 |
-
def _initialize_engine(self):
|
| 142 |
-
"""Initialize RapidOCR with optimized settings"""
|
| 143 |
-
try:
|
| 144 |
-
self._engine = RapidOCR(
|
| 145 |
-
det_use_cuda=False,
|
| 146 |
-
cls_use_cuda=False,
|
| 147 |
-
rec_use_cuda=False,
|
| 148 |
-
use_angle_cls=Config.USE_ANGLE_CLS,
|
| 149 |
-
use_text_score=Config.USE_TEXT_SCORE,
|
| 150 |
-
print_verbose=False,
|
| 151 |
-
min_height=Config.MIN_HEIGHT,
|
| 152 |
-
text_score=Config.TEXT_SCORE_THRESHOLD
|
| 153 |
-
)
|
| 154 |
-
logger.info("RapidOCR engine initialized successfully")
|
| 155 |
-
except Exception as e:
|
| 156 |
-
logger.error(f"Failed to initialize RapidOCR: {str(e)}")
|
| 157 |
-
raise
|
| 158 |
|
| 159 |
-
def
|
| 160 |
-
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
if not Config.ENABLE_PREPROCESSING:
|
| 166 |
-
return img_array
|
| 167 |
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
if len(img_array.shape) == 3:
|
| 171 |
-
gray = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
|
| 172 |
-
else:
|
| 173 |
-
gray = img_array
|
| 174 |
-
|
| 175 |
-
# Denoise
|
| 176 |
-
denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
|
| 177 |
-
|
| 178 |
-
# Enhance contrast using CLAHE
|
| 179 |
-
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 180 |
-
contrast = clahe.apply(denoised)
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
# Adaptive threshold
|
| 187 |
-
processed = cv2.adaptiveThreshold(
|
| 188 |
-
sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
| 189 |
-
cv2.THRESH_BINARY, 11, 2
|
| 190 |
-
)
|
| 191 |
-
|
| 192 |
-
return processed
|
| 193 |
-
except Exception as e:
|
| 194 |
-
logger.warning(f"Preprocessing failed, using original image: {str(e)}")
|
| 195 |
-
return img_array
|
| 196 |
-
|
| 197 |
-
class OCRProcessor:
|
| 198 |
-
def __init__(self):
|
| 199 |
-
self.ocr_engine = RapidOCREngine().get_engine()
|
| 200 |
-
|
| 201 |
-
def _extract_from_image(self, img_array) -> dict:
|
| 202 |
-
"""Extract text from a single image using RapidOCR"""
|
| 203 |
try:
|
| 204 |
-
#
|
| 205 |
-
|
| 206 |
|
| 207 |
-
#
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
if result is None or len(result) == 0:
|
|
|
|
| 211 |
return {
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
}
|
| 216 |
|
| 217 |
# Parse results
|
| 218 |
texts = []
|
| 219 |
confidences = []
|
| 220 |
|
| 221 |
-
for line in result:
|
| 222 |
try:
|
| 223 |
if isinstance(line, (list, tuple)):
|
| 224 |
if len(line) == 2:
|
| 225 |
-
# [box, text] or [text, confidence]
|
| 226 |
if isinstance(line[0], (list, tuple)):
|
| 227 |
-
|
| 228 |
confidence = 1.0
|
| 229 |
else:
|
| 230 |
text, confidence = line
|
|
|
|
| 231 |
elif len(line) == 3:
|
| 232 |
-
# [box, text, confidence]
|
| 233 |
-
|
| 234 |
elif len(line) >= 4:
|
| 235 |
-
|
|
|
|
| 236 |
else:
|
| 237 |
continue
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
| 241 |
except Exception as e:
|
| 242 |
-
logger.debug(f"Skipping malformed line: {e}")
|
| 243 |
continue
|
| 244 |
|
| 245 |
if not texts:
|
| 246 |
return {
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
}
|
| 251 |
|
| 252 |
combined_text = '\n'.join(texts)
|
| 253 |
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
| 254 |
|
|
|
|
|
|
|
| 255 |
return {
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
}
|
| 260 |
|
| 261 |
except Exception as e:
|
|
@@ -263,7 +212,16 @@ class OCRProcessor:
|
|
| 263 |
raise ValueError(f"OCR extraction error: {str(e)}")
|
| 264 |
|
| 265 |
def process_file(self, file_path: str, content_type: str) -> dict:
|
| 266 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
start = time.perf_counter()
|
| 268 |
pages_content = []
|
| 269 |
all_confidences = []
|
|
@@ -280,32 +238,36 @@ class OCRProcessor:
|
|
| 280 |
page_num = idx + 1
|
| 281 |
logger.info(f"Scanning Page {page_num}/{total}")
|
| 282 |
|
| 283 |
-
#
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
ocr_result = self._extract_from_image(img_array)
|
| 288 |
-
|
| 289 |
-
pages_content.append({
|
| 290 |
-
"index": idx,
|
| 291 |
-
"page_number": page_num,
|
| 292 |
-
"text": ocr_result["text"],
|
| 293 |
-
"confidence": ocr_result["confidence"],
|
| 294 |
-
"lines_detected": ocr_result["lines_detected"]
|
| 295 |
-
})
|
| 296 |
|
| 297 |
-
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
else:
|
| 300 |
logger.info("Scanning Single Image...")
|
| 301 |
|
| 302 |
-
#
|
| 303 |
-
|
| 304 |
-
if img_array is None:
|
| 305 |
-
raise ValueError("Failed to load image file")
|
| 306 |
-
|
| 307 |
-
# Extract text
|
| 308 |
-
ocr_result = self._extract_from_image(img_array)
|
| 309 |
|
| 310 |
pages_content.append({
|
| 311 |
"index": 0,
|
|
@@ -319,8 +281,9 @@ class OCRProcessor:
|
|
| 319 |
all_confidences.append(ocr_result["confidence"])
|
| 320 |
|
| 321 |
avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
|
|
|
|
| 322 |
|
| 323 |
-
logger.info(f"OCR Complete in {
|
| 324 |
|
| 325 |
return {
|
| 326 |
"total_pages": len(pages_content),
|
|
@@ -394,13 +357,11 @@ async def root(request: Request):
|
|
| 394 |
async def health_check(request: Request):
|
| 395 |
"""Health check endpoint"""
|
| 396 |
try:
|
| 397 |
-
# Verify OCR engine is initialized
|
| 398 |
-
engine = RapidOCREngine().get_engine()
|
| 399 |
return {
|
| 400 |
"request_id": request.state.request_id,
|
| 401 |
"status": StatusEnum.SUCCESS,
|
| 402 |
"message": "Service healthy",
|
| 403 |
-
"ocr_engine": "
|
| 404 |
}
|
| 405 |
except Exception as e:
|
| 406 |
return JSONResponse(
|
|
@@ -427,7 +388,7 @@ async def extract_data(
|
|
| 427 |
FileValidator.validate(file)
|
| 428 |
tmp_path = FileValidator.check_size_and_save(file)
|
| 429 |
|
| 430 |
-
#
|
| 431 |
# ContextVars are automatically copied to the thread
|
| 432 |
processor = OCRProcessor()
|
| 433 |
result = await run_in_threadpool(
|
|
@@ -479,10 +440,11 @@ async def extract_data(
|
|
| 479 |
@app.on_event("startup")
|
| 480 |
async def startup_event():
|
| 481 |
"""Initialize OCR engine on startup"""
|
| 482 |
-
logger.info("Starting
|
| 483 |
try:
|
| 484 |
-
|
| 485 |
-
|
|
|
|
| 486 |
except Exception as e:
|
| 487 |
logger.error(f"Failed to initialize OCR engine: {str(e)}")
|
| 488 |
raise
|
|
|
|
| 41 |
MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
|
| 42 |
ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
|
| 43 |
ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
class RequestIdFilter(logging.Filter):
|
| 46 |
def filter(self, record):
|
|
|
|
| 120 |
raise HTTPException(413, "File too large")
|
| 121 |
return tmp_path
|
| 122 |
|
| 123 |
+
class OCRProcessor:
|
| 124 |
+
"""RapidOCR-based OCR processor with enhanced accuracy"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
+
def __init__(self):
|
| 127 |
+
"""Initialize RapidOCR engine"""
|
| 128 |
+
self.engine = RapidOCR()
|
| 129 |
+
logger.info("RapidOCR engine initialized successfully")
|
| 130 |
|
| 131 |
+
def _extract_text_from_image(self, image_path: str) -> dict:
|
| 132 |
+
"""
|
| 133 |
+
Extract text from a single image using RapidOCR
|
|
|
|
|
|
|
| 134 |
|
| 135 |
+
Args:
|
| 136 |
+
image_path: Path to image file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
+
Returns:
|
| 139 |
+
dict: Contains text, confidence, and line count
|
| 140 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
try:
|
| 142 |
+
# Perform OCR - RapidOCR returns (result_object, elapse_list)
|
| 143 |
+
ocr_result, elapse = self.engine(image_path)
|
| 144 |
|
| 145 |
+
# Handle result object
|
| 146 |
+
if hasattr(ocr_result, '__iter__') and not isinstance(ocr_result, str):
|
| 147 |
+
result = list(ocr_result)
|
| 148 |
+
else:
|
| 149 |
+
result = ocr_result
|
| 150 |
|
| 151 |
if result is None or len(result) == 0:
|
| 152 |
+
logger.warning(f"No text detected in image: {image_path}")
|
| 153 |
return {
|
| 154 |
+
'text': '',
|
| 155 |
+
'confidence': 0.0,
|
| 156 |
+
'lines_detected': 0
|
| 157 |
}
|
| 158 |
|
| 159 |
# Parse results
|
| 160 |
texts = []
|
| 161 |
confidences = []
|
| 162 |
|
| 163 |
+
for idx, line in enumerate(result):
|
| 164 |
try:
|
| 165 |
if isinstance(line, (list, tuple)):
|
| 166 |
if len(line) == 2:
|
| 167 |
+
# Format: [box, text] or [text, confidence]
|
| 168 |
if isinstance(line[0], (list, tuple)):
|
| 169 |
+
box, text = line
|
| 170 |
confidence = 1.0
|
| 171 |
else:
|
| 172 |
text, confidence = line
|
| 173 |
+
box = []
|
| 174 |
elif len(line) == 3:
|
| 175 |
+
# Format: [box, text, confidence]
|
| 176 |
+
box, text, confidence = line
|
| 177 |
elif len(line) >= 4:
|
| 178 |
+
# Format: [box, text, confidence, something_else]
|
| 179 |
+
box, text, confidence = line[0], line[1], line[2]
|
| 180 |
else:
|
| 181 |
continue
|
| 182 |
+
else:
|
| 183 |
+
continue
|
| 184 |
+
|
| 185 |
+
texts.append(str(text))
|
| 186 |
+
confidences.append(float(confidence) if confidence is not None else 1.0)
|
| 187 |
+
|
| 188 |
except Exception as e:
|
| 189 |
+
logger.debug(f"Skipping malformed line {idx}: {e}")
|
| 190 |
continue
|
| 191 |
|
| 192 |
if not texts:
|
| 193 |
return {
|
| 194 |
+
'text': '',
|
| 195 |
+
'confidence': 0.0,
|
| 196 |
+
'lines_detected': 0
|
| 197 |
}
|
| 198 |
|
| 199 |
combined_text = '\n'.join(texts)
|
| 200 |
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
| 201 |
|
| 202 |
+
logger.debug(f"Extracted {len(texts)} lines with avg confidence: {avg_confidence:.2%}")
|
| 203 |
+
|
| 204 |
return {
|
| 205 |
+
'text': combined_text,
|
| 206 |
+
'confidence': avg_confidence,
|
| 207 |
+
'lines_detected': len(texts)
|
| 208 |
}
|
| 209 |
|
| 210 |
except Exception as e:
|
|
|
|
| 212 |
raise ValueError(f"OCR extraction error: {str(e)}")
|
| 213 |
|
| 214 |
def process_file(self, file_path: str, content_type: str) -> dict:
|
| 215 |
+
"""
|
| 216 |
+
Process PDF or image file and extract text
|
| 217 |
+
|
| 218 |
+
Args:
|
| 219 |
+
file_path: Path to the file
|
| 220 |
+
content_type: MIME type of the file
|
| 221 |
+
|
| 222 |
+
Returns:
|
| 223 |
+
dict: Processing results with pages content
|
| 224 |
+
"""
|
| 225 |
start = time.perf_counter()
|
| 226 |
pages_content = []
|
| 227 |
all_confidences = []
|
|
|
|
| 238 |
page_num = idx + 1
|
| 239 |
logger.info(f"Scanning Page {page_num}/{total}")
|
| 240 |
|
| 241 |
+
# Save PIL Image to temp file for RapidOCR
|
| 242 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
|
| 243 |
+
img.save(tmp_img.name, 'PNG')
|
| 244 |
+
temp_img_path = tmp_img.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
+
try:
|
| 247 |
+
# Extract text from temp image
|
| 248 |
+
ocr_result = self._extract_text_from_image(temp_img_path)
|
| 249 |
+
|
| 250 |
+
pages_content.append({
|
| 251 |
+
"index": idx,
|
| 252 |
+
"page_number": page_num,
|
| 253 |
+
"text": ocr_result["text"],
|
| 254 |
+
"confidence": ocr_result["confidence"],
|
| 255 |
+
"lines_detected": ocr_result["lines_detected"]
|
| 256 |
+
})
|
| 257 |
+
|
| 258 |
+
if ocr_result["confidence"] > 0:
|
| 259 |
+
all_confidences.append(ocr_result["confidence"])
|
| 260 |
+
finally:
|
| 261 |
+
# Clean up temp image
|
| 262 |
+
try:
|
| 263 |
+
os.remove(temp_img_path)
|
| 264 |
+
except:
|
| 265 |
+
pass
|
| 266 |
else:
|
| 267 |
logger.info("Scanning Single Image...")
|
| 268 |
|
| 269 |
+
# Extract text from image
|
| 270 |
+
ocr_result = self._extract_text_from_image(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
pages_content.append({
|
| 273 |
"index": 0,
|
|
|
|
| 281 |
all_confidences.append(ocr_result["confidence"])
|
| 282 |
|
| 283 |
avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
|
| 284 |
+
processing_time = (time.perf_counter() - start) * 1000
|
| 285 |
|
| 286 |
+
logger.info(f"OCR Complete in {processing_time:.2f}ms | Avg Confidence: {avg_confidence:.2%}")
|
| 287 |
|
| 288 |
return {
|
| 289 |
"total_pages": len(pages_content),
|
|
|
|
| 357 |
async def health_check(request: Request):
|
| 358 |
"""Health check endpoint"""
|
| 359 |
try:
|
|
|
|
|
|
|
| 360 |
return {
|
| 361 |
"request_id": request.state.request_id,
|
| 362 |
"status": StatusEnum.SUCCESS,
|
| 363 |
"message": "Service healthy",
|
| 364 |
+
"ocr_engine": "RapidOCR"
|
| 365 |
}
|
| 366 |
except Exception as e:
|
| 367 |
return JSONResponse(
|
|
|
|
| 388 |
FileValidator.validate(file)
|
| 389 |
tmp_path = FileValidator.check_size_and_save(file)
|
| 390 |
|
| 391 |
+
# Initialize OCR processor and run in thread pool
|
| 392 |
# ContextVars are automatically copied to the thread
|
| 393 |
processor = OCRProcessor()
|
| 394 |
result = await run_in_threadpool(
|
|
|
|
| 440 |
@app.on_event("startup")
|
| 441 |
async def startup_event():
|
| 442 |
"""Initialize OCR engine on startup"""
|
| 443 |
+
logger.info("Starting RapidOCR API...")
|
| 444 |
try:
|
| 445 |
+
# Test initialize the engine
|
| 446 |
+
test_processor = OCRProcessor()
|
| 447 |
+
logger.info("RapidOCR engine ready for processing")
|
| 448 |
except Exception as e:
|
| 449 |
logger.error(f"Failed to initialize OCR engine: {str(e)}")
|
| 450 |
raise
|