Spaces:

xce009
/

ocr-api

Running

App Files Community

Soumik Bose committed on Jan 25

Commit

6536e0d

1 Parent(s): 3ddb265

go

Browse files

Files changed (1) hide show

main.py +92 -130

main.py CHANGED Viewed

@@ -41,13 +41,6 @@ class Config:
     MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
     ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
     ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
-    # RapidOCR Settings
-    USE_ANGLE_CLS = os.getenv("OCR_USE_ANGLE_CLS", "true").lower() == "true"
-    USE_TEXT_SCORE = os.getenv("OCR_USE_TEXT_SCORE", "true").lower() == "true"
-    MIN_HEIGHT = int(os.getenv("OCR_MIN_HEIGHT", "30"))
-    TEXT_SCORE_THRESHOLD = float(os.getenv("OCR_TEXT_SCORE", "0.5"))
-    ENABLE_PREPROCESSING = os.getenv("OCR_PREPROCESSING", "true").lower() == "true"
 class RequestIdFilter(logging.Filter):
     def filter(self, record):
@@ -127,135 +120,91 @@ class FileValidator:
             raise HTTPException(413, "File too large")
         return tmp_path
-class RapidOCREngine:
-    """Singleton RapidOCR engine for efficient reuse"""
-    _instance = None
-    _engine = None
-    def __new__(cls):
-        if cls._instance is None:
-            cls._instance = super().__new__(cls)
-            cls._instance._initialize_engine()
-        return cls._instance
-    def _initialize_engine(self):
-        """Initialize RapidOCR with optimized settings"""
-        try:
-            self._engine = RapidOCR(
-                det_use_cuda=False,
-                cls_use_cuda=False,
-                rec_use_cuda=False,
-                use_angle_cls=Config.USE_ANGLE_CLS,
-                use_text_score=Config.USE_TEXT_SCORE,
-                print_verbose=False,
-                min_height=Config.MIN_HEIGHT,
-                text_score=Config.TEXT_SCORE_THRESHOLD
-            )
-            logger.info("RapidOCR engine initialized successfully")
-        except Exception as e:
-            logger.error(f"Failed to initialize RapidOCR: {str(e)}")
-            raise
-    def get_engine(self):
-        return self._engine
-    @staticmethod
-    def preprocess_image(img_array):
-        """Enhanced preprocessing for better accuracy"""
-        if not Config.ENABLE_PREPROCESSING:
-            return img_array
-        try:
-            # Convert to grayscale if needed
-            if len(img_array.shape) == 3:
-                gray = cv2.cvtColor(img_array, cv2.COLOR_BGR2GRAY)
-            else:
-                gray = img_array
-            # Denoise
-            denoised = cv2.fastNlMeansDenoising(gray, None, 10, 7, 21)
-            # Enhance contrast using CLAHE
-            clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
-            contrast = clahe.apply(denoised)
-            # Sharpen
-            kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
-            sharpened = cv2.filter2D(contrast, -1, kernel)
-            # Adaptive threshold
-            processed = cv2.adaptiveThreshold(
-                sharpened, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-                cv2.THRESH_BINARY, 11, 2
-            )
-            return processed
-        except Exception as e:
-            logger.warning(f"Preprocessing failed, using original image: {str(e)}")
-            return img_array
-class OCRProcessor:
-    def __init__(self):
-        self.ocr_engine = RapidOCREngine().get_engine()
-    def _extract_from_image(self, img_array) -> dict:
-        """Extract text from a single image using RapidOCR"""
         try:
-            # Preprocess image
-            processed_img = RapidOCREngine.preprocess_image(img_array)
-            # Perform OCR
-            result, elapse = self.ocr_engine(processed_img)
             if result is None or len(result) == 0:
                 return {
-                    "text": "",
-                    "confidence": 0.0,
-                    "lines_detected": 0
                 }
             # Parse results
             texts = []
             confidences = []
-            for line in result:
                 try:
                     if isinstance(line, (list, tuple)):
                         if len(line) == 2:
-                            # [box, text] or [text, confidence]
                             if isinstance(line[0], (list, tuple)):
-                                _, text = line
                                 confidence = 1.0
                             else:
                                 text, confidence = line
                         elif len(line) == 3:
-                            # [box, text, confidence]
-                            _, text, confidence = line
                         elif len(line) >= 4:
-                            _, text, confidence = line[0], line[1], line[2]
                         else:
                             continue
-                        texts.append(str(text))
-                        confidences.append(float(confidence) if confidence is not None else 1.0)
                 except Exception as e:
-                    logger.debug(f"Skipping malformed line: {e}")
                     continue
             if not texts:
                 return {
-                    "text": "",
-                    "confidence": 0.0,
-                    "lines_detected": 0
                 }
             combined_text = '\n'.join(texts)
             avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
             return {
-                "text": combined_text,
-                "confidence": avg_confidence,
-                "lines_detected": len(texts)
             }
         except Exception as e:
@@ -263,7 +212,16 @@ class OCRProcessor:
             raise ValueError(f"OCR extraction error: {str(e)}")
     def process_file(self, file_path: str, content_type: str) -> dict:
-        """Process PDF or image file and extract text"""
         start = time.perf_counter()
         pages_content = []
         all_confidences = []
@@ -280,32 +238,36 @@ class OCRProcessor:
                     page_num = idx + 1
                     logger.info(f"Scanning Page {page_num}/{total}")
-                    # Convert PIL Image to numpy array for OpenCV
-                    img_array = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
-                    # Extract text
-                    ocr_result = self._extract_from_image(img_array)
-                    pages_content.append({
-                        "index": idx,
-                        "page_number": page_num,
-                        "text": ocr_result["text"],
-                        "confidence": ocr_result["confidence"],
-                        "lines_detected": ocr_result["lines_detected"]
-                    })
-                    if ocr_result["confidence"] > 0:
-                        all_confidences.append(ocr_result["confidence"])
             else:
                 logger.info("Scanning Single Image...")
-                # Read image with OpenCV
-                img_array = cv2.imread(file_path)
-                if img_array is None:
-                    raise ValueError("Failed to load image file")
-                # Extract text
-                ocr_result = self._extract_from_image(img_array)
                 pages_content.append({
                     "index": 0,
@@ -319,8 +281,9 @@ class OCRProcessor:
                     all_confidences.append(ocr_result["confidence"])
             avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
-            logger.info(f"OCR Complete in {(time.perf_counter()-start)*1000:.2f}ms | Avg Confidence: {avg_confidence:.2%}")
             return {
                 "total_pages": len(pages_content),
@@ -394,13 +357,11 @@ async def root(request: Request):
 async def health_check(request: Request):
     """Health check endpoint"""
     try:
-        # Verify OCR engine is initialized
-        engine = RapidOCREngine().get_engine()
         return {
             "request_id": request.state.request_id,
             "status": StatusEnum.SUCCESS,
             "message": "Service healthy",
-            "ocr_engine": "ready"
         }
     except Exception as e:
         return JSONResponse(
@@ -427,7 +388,7 @@ async def extract_data(
         FileValidator.validate(file)
         tmp_path = FileValidator.check_size_and_save(file)
-        # CPU heavy task run in thread pool
         # ContextVars are automatically copied to the thread
         processor = OCRProcessor()
         result = await run_in_threadpool(
@@ -479,10 +440,11 @@ async def extract_data(
 @app.on_event("startup")
 async def startup_event():
     """Initialize OCR engine on startup"""
-    logger.info("Starting OCR API with RapidOCR engine...")
     try:
-        RapidOCREngine()  # Initialize singleton
-        logger.info("RapidOCR engine ready")
     except Exception as e:
         logger.error(f"Failed to initialize OCR engine: {str(e)}")
         raise

     MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
     ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
     ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
 class RequestIdFilter(logging.Filter):
     def filter(self, record):
             raise HTTPException(413, "File too large")
         return tmp_path
+class OCRProcessor:
+    """RapidOCR-based OCR processor with enhanced accuracy"""
+    def __init__(self):
+        """Initialize RapidOCR engine"""
+        self.engine = RapidOCR()
+        logger.info("RapidOCR engine initialized successfully")
+    def _extract_text_from_image(self, image_path: str) -> dict:
+        """
+        Extract text from a single image using RapidOCR
+        Args:
+            image_path: Path to image file
+        Returns:
+            dict: Contains text, confidence, and line count
+        """
         try:
+            # Perform OCR - RapidOCR returns (result_object, elapse_list)
+            ocr_result, elapse = self.engine(image_path)
+            # Handle result object
+            if hasattr(ocr_result, '__iter__') and not isinstance(ocr_result, str):
+                result = list(ocr_result)
+            else:
+                result = ocr_result
             if result is None or len(result) == 0:
+                logger.warning(f"No text detected in image: {image_path}")
                 return {
+                    'text': '',
+                    'confidence': 0.0,
+                    'lines_detected': 0
                 }
             # Parse results
             texts = []
             confidences = []
+            for idx, line in enumerate(result):
                 try:
                     if isinstance(line, (list, tuple)):
                         if len(line) == 2:
+                            # Format: [box, text] or [text, confidence]
                             if isinstance(line[0], (list, tuple)):
+                                box, text = line
                                 confidence = 1.0
                             else:
                                 text, confidence = line
+                                box = []
                         elif len(line) == 3:
+                            # Format: [box, text, confidence]
+                            box, text, confidence = line
                         elif len(line) >= 4:
+                            # Format: [box, text, confidence, something_else]
+                            box, text, confidence = line[0], line[1], line[2]
                         else:
                             continue
+                    else:
+                        continue
+                    texts.append(str(text))
+                    confidences.append(float(confidence) if confidence is not None else 1.0)
                 except Exception as e:
+                    logger.debug(f"Skipping malformed line {idx}: {e}")
                     continue
             if not texts:
                 return {
+                    'text': '',
+                    'confidence': 0.0,
+                    'lines_detected': 0
                 }
             combined_text = '\n'.join(texts)
             avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+            logger.debug(f"Extracted {len(texts)} lines with avg confidence: {avg_confidence:.2%}")
             return {
+                'text': combined_text,
+                'confidence': avg_confidence,
+                'lines_detected': len(texts)
             }
         except Exception as e:
             raise ValueError(f"OCR extraction error: {str(e)}")
     def process_file(self, file_path: str, content_type: str) -> dict:
+        """
+        Process PDF or image file and extract text
+        Args:
+            file_path: Path to the file
+            content_type: MIME type of the file
+        Returns:
+            dict: Processing results with pages content
+        """
         start = time.perf_counter()
         pages_content = []
         all_confidences = []
                     page_num = idx + 1
                     logger.info(f"Scanning Page {page_num}/{total}")
+                    # Save PIL Image to temp file for RapidOCR
+                    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp_img:
+                        img.save(tmp_img.name, 'PNG')
+                        temp_img_path = tmp_img.name
+                    try:
+                        # Extract text from temp image
+                        ocr_result = self._extract_text_from_image(temp_img_path)
+                        pages_content.append({
+                            "index": idx,
+                            "page_number": page_num,
+                            "text": ocr_result["text"],
+                            "confidence": ocr_result["confidence"],
+                            "lines_detected": ocr_result["lines_detected"]
+                        })
+                        if ocr_result["confidence"] > 0:
+                            all_confidences.append(ocr_result["confidence"])
+                    finally:
+                        # Clean up temp image
+                        try:
+                            os.remove(temp_img_path)
+                        except:
+                            pass
             else:
                 logger.info("Scanning Single Image...")
+                # Extract text from image
+                ocr_result = self._extract_text_from_image(file_path)
                 pages_content.append({
                     "index": 0,
                     all_confidences.append(ocr_result["confidence"])
             avg_confidence = sum(all_confidences) / len(all_confidences) if all_confidences else 0.0
+            processing_time = (time.perf_counter() - start) * 1000
+            logger.info(f"OCR Complete in {processing_time:.2f}ms | Avg Confidence: {avg_confidence:.2%}")
             return {
                 "total_pages": len(pages_content),
 async def health_check(request: Request):
     """Health check endpoint"""
     try:
         return {
             "request_id": request.state.request_id,
             "status": StatusEnum.SUCCESS,
             "message": "Service healthy",
+            "ocr_engine": "RapidOCR"
         }
     except Exception as e:
         return JSONResponse(
         FileValidator.validate(file)
         tmp_path = FileValidator.check_size_and_save(file)
+        # Initialize OCR processor and run in thread pool
         # ContextVars are automatically copied to the thread
         processor = OCRProcessor()
         result = await run_in_threadpool(
 @app.on_event("startup")
 async def startup_event():
     """Initialize OCR engine on startup"""
+    logger.info("Starting RapidOCR API...")
     try:
+        # Test initialize the engine
+        test_processor = OCRProcessor()
+        logger.info("RapidOCR engine ready for processing")
     except Exception as e:
         logger.error(f"Failed to initialize OCR engine: {str(e)}")
         raise