Spaces:

Abhisesh7
/

Invoice-Fraud-Detection

Sleeping

App Files Files Community

Abhisesh7 committed on May 22, 2025

Commit

f3645fd

verified ·

1 Parent(s): 23ace93

Update image_extraction.py

Browse files

Files changed (1) hide show

image_extraction.py +40 -18

image_extraction.py CHANGED Viewed

@@ -29,13 +29,13 @@ def initialize_paddle_ocr():
                 lang='en',
                 use_gpu=False,
                 show_log=False,  # Suppress PaddleOCR logs to reduce noise
-                det_max_side_len=3000,  # Further increase max side length for better detection
                 rec_batch_num=1,  # Process one image at a time for stability
                 det_db_score_mode='slow',  # Use most accurate detection
-                det_db_box_thresh=0.3,  # Lower threshold for better text detection
-                det_db_unclip_ratio=3.0,  # Increase ratio for better text region detection
-                drop_score=0.2,  # Lower drop score to retain more text
-                det_db_thresh=0.2  # Lower threshold for detection
             )
             logger.info("PaddleOCR initialized successfully.")
             return ocr
@@ -51,19 +51,20 @@ def initialize_paddle_ocr():
 # Initialize PaddleOCR at module level
 ocr = initialize_paddle_ocr()
-def preprocess_image(img):
     """
-    Preprocess the image to maximize OCR accuracy.
     Args:
         img (PIL.Image): Input image.
     Returns:
         PIL.Image: Preprocessed image.
     """
     try:
         # Resize image to a higher resolution for better OCR
-        max_size = (2500, 2500)
         img.thumbnail(max_size, Image.Resampling.LANCZOS)
         # Convert to grayscale
@@ -71,26 +72,26 @@ def preprocess_image(img):
         # Increase contrast
         enhancer = ImageEnhance.Contrast(img)
-        img = enhancer.enhance(4.0)
         # Sharpen the image
         img = img.filter(ImageFilter.SHARPEN)
         # Reduce noise with a stronger filter
-        img = img.filter(ImageFilter.MedianFilter(size=5))
         # Apply adaptive thresholding
         img_array = np.array(img)
-        thresh = 150  # Adjusted threshold for better binarization
         img_array = np.where(img_array > thresh, 255, 0).astype(np.uint8)
         img = Image.fromarray(img_array)
         # Apply dilation to connect broken characters
-        img = img.filter(ImageFilter.MaxFilter(size=3))
         return img
     except Exception as e:
-        logger.error(f"Failed to preprocess image: {str(e)}")
         return img
 def validate_image(image_file):
@@ -123,7 +124,7 @@ def validate_image(image_file):
 def extract_text_from_image(image_file):
     """
-    Extract text from an image using PaddleOCR with maximum accuracy.
     Args:
         image_file (str): Path to the image file.
@@ -146,13 +147,15 @@ def extract_text_from_image(image_file):
         logger.info(f"Extracting text from image: {image_file}")
         # Convert image file to a format PaddleOCR can process
         img = Image.open(image_file)
-        # Preprocess the image
-        img = preprocess_image(img)
         img_byte_arr = io.BytesIO()
-        img.save(img_byte_arr, format='PNG')
         img_byte_arr = img_byte_arr.getvalue()
-        # Perform OCR with error handling for resource constraints
         result = ocr.ocr(img_byte_arr, cls=True)
         # Extract text from OCR result
@@ -163,6 +166,25 @@ def extract_text_from_image(image_file):
                     for word_info in line:
                         text += word_info[1][0] + "\n"
         logger.info("Successfully extracted text from image.")
         logger.debug(f"Extracted text:\n{text}")
         return text.strip()

                 lang='en',
                 use_gpu=False,
                 show_log=False,  # Suppress PaddleOCR logs to reduce noise
+                det_max_side_len=3500,  # Increase max side length for better detection
                 rec_batch_num=1,  # Process one image at a time for stability
                 det_db_score_mode='slow',  # Use most accurate detection
+                det_db_box_thresh=0.2,  # Lower threshold for better text detection
+                det_db_unclip_ratio=3.5,  # Increase ratio for better text region detection
+                drop_score=0.1,  # Lower drop score to retain more text
+                det_db_thresh=0.1  # Lower threshold for detection
             )
             logger.info("PaddleOCR initialized successfully.")
             return ocr
 # Initialize PaddleOCR at module level
 ocr = initialize_paddle_ocr()
+def preprocess_image(img, attempt=1):
     """
+    Preprocess the image to maximize OCR accuracy with multiple attempts.
     Args:
         img (PIL.Image): Input image.
+        attempt (int): Preprocessing attempt number (1 or 2 for different settings).
     Returns:
         PIL.Image: Preprocessed image.
     """
     try:
         # Resize image to a higher resolution for better OCR
+        max_size = (3000, 3000)
         img.thumbnail(max_size, Image.Resampling.LANCZOS)
         # Convert to grayscale
         # Increase contrast
         enhancer = ImageEnhance.Contrast(img)
+        img = enhancer.enhance(5.0 if attempt == 1 else 3.0)
         # Sharpen the image
         img = img.filter(ImageFilter.SHARPEN)
         # Reduce noise with a stronger filter
+        img = img.filter(ImageFilter.MedianFilter(size=5 if attempt == 1 else 3))
         # Apply adaptive thresholding
         img_array = np.array(img)
+        thresh = 120 if attempt == 1 else 150  # Different thresholds for different attempts
         img_array = np.where(img_array > thresh, 255, 0).astype(np.uint8)
         img = Image.fromarray(img_array)
         # Apply dilation to connect broken characters
+        img = img.filter(ImageFilter.MaxFilter(size=3 if attempt == 1 else 5))
         return img
     except Exception as e:
+        logger.error(f"Failed to preprocess image (Attempt {attempt}): {str(e)}")
         return img
 def validate_image(image_file):
 def extract_text_from_image(image_file):
     """
+    Extract text from an image using PaddleOCR with multiple attempts for accuracy.
     Args:
         image_file (str): Path to the image file.
         logger.info(f"Extracting text from image: {image_file}")
         # Convert image file to a format PaddleOCR can process
         img = Image.open(image_file)
+        # First attempt with default preprocessing
+        logger.info("Attempt 1: Extracting text with default preprocessing...")
+        img_processed = preprocess_image(img, attempt=1)
         img_byte_arr = io.BytesIO()
+        img_processed.save(img_byte_arr, format='PNG')
         img_byte_arr = img_byte_arr.getvalue()
+        # Perform OCR
         result = ocr.ocr(img_byte_arr, cls=True)
         # Extract text from OCR result
                     for word_info in line:
                         text += word_info[1][0] + "\n"
+        # If text is empty or contains obvious errors, try a second attempt
+        if not text.strip() or len(text.splitlines()) < 5:  # Arbitrary threshold for "too little text"
+            logger.warning("First OCR attempt yielded insufficient text. Trying second attempt with different preprocessing...")
+            img_processed = preprocess_image(img, attempt=2)
+            img_byte_arr = io.BytesIO()
+            img_processed.save(img_byte_arr, format='PNG')
+            img_byte_arr = img_byte_arr.getvalue()
+            # Perform OCR again
+            result = ocr.ocr(img_byte_arr, cls=True)
+            # Extract text from second attempt
+            text = ""
+            if result:
+                for line in result:
+                    if line:  # Check if line is not None
+                        for word_info in line:
+                            text += word_info[1][0] + "\n"
         logger.info("Successfully extracted text from image.")
         logger.debug(f"Extracted text:\n{text}")
         return text.strip()