Abhisesh7 committed on
Commit
f3645fd
·
verified ·
1 Parent(s): 23ace93

Update image_extraction.py

Browse files
Files changed (1) hide show
  1. image_extraction.py +40 -18
image_extraction.py CHANGED
@@ -29,13 +29,13 @@ def initialize_paddle_ocr():
29
  lang='en',
30
  use_gpu=False,
31
  show_log=False, # Suppress PaddleOCR logs to reduce noise
32
- det_max_side_len=3000, # Further increase max side length for better detection
33
  rec_batch_num=1, # Process one image at a time for stability
34
  det_db_score_mode='slow', # Use most accurate detection
35
- det_db_box_thresh=0.3, # Lower threshold for better text detection
36
- det_db_unclip_ratio=3.0, # Increase ratio for better text region detection
37
- drop_score=0.2, # Lower drop score to retain more text
38
- det_db_thresh=0.2 # Lower threshold for detection
39
  )
40
  logger.info("PaddleOCR initialized successfully.")
41
  return ocr
@@ -51,19 +51,20 @@ def initialize_paddle_ocr():
51
  # Initialize PaddleOCR at module level
52
  ocr = initialize_paddle_ocr()
53
 
54
- def preprocess_image(img):
55
  """
56
- Preprocess the image to maximize OCR accuracy.
57
 
58
  Args:
59
  img (PIL.Image): Input image.
 
60
 
61
  Returns:
62
  PIL.Image: Preprocessed image.
63
  """
64
  try:
65
  # Resize image to a higher resolution for better OCR
66
- max_size = (2500, 2500)
67
  img.thumbnail(max_size, Image.Resampling.LANCZOS)
68
 
69
  # Convert to grayscale
@@ -71,26 +72,26 @@ def preprocess_image(img):
71
 
72
  # Increase contrast
73
  enhancer = ImageEnhance.Contrast(img)
74
- img = enhancer.enhance(4.0)
75
 
76
  # Sharpen the image
77
  img = img.filter(ImageFilter.SHARPEN)
78
 
79
  # Reduce noise with a stronger filter
80
- img = img.filter(ImageFilter.MedianFilter(size=5))
81
 
82
  # Apply adaptive thresholding
83
  img_array = np.array(img)
84
- thresh = 150 # Adjusted threshold for better binarization
85
  img_array = np.where(img_array > thresh, 255, 0).astype(np.uint8)
86
  img = Image.fromarray(img_array)
87
 
88
  # Apply dilation to connect broken characters
89
- img = img.filter(ImageFilter.MaxFilter(size=3))
90
 
91
  return img
92
  except Exception as e:
93
- logger.error(f"Failed to preprocess image: {str(e)}")
94
  return img
95
 
96
  def validate_image(image_file):
@@ -123,7 +124,7 @@ def validate_image(image_file):
123
 
124
  def extract_text_from_image(image_file):
125
  """
126
- Extract text from an image using PaddleOCR with maximum accuracy.
127
 
128
  Args:
129
  image_file (str): Path to the image file.
@@ -146,13 +147,15 @@ def extract_text_from_image(image_file):
146
  logger.info(f"Extracting text from image: {image_file}")
147
  # Convert image file to a format PaddleOCR can process
148
  img = Image.open(image_file)
149
- # Preprocess the image
150
- img = preprocess_image(img)
 
 
151
  img_byte_arr = io.BytesIO()
152
- img.save(img_byte_arr, format='PNG')
153
  img_byte_arr = img_byte_arr.getvalue()
154
 
155
- # Perform OCR with error handling for resource constraints
156
  result = ocr.ocr(img_byte_arr, cls=True)
157
 
158
  # Extract text from OCR result
@@ -163,6 +166,25 @@ def extract_text_from_image(image_file):
163
  for word_info in line:
164
  text += word_info[1][0] + "\n"
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  logger.info("Successfully extracted text from image.")
167
  logger.debug(f"Extracted text:\n{text}")
168
  return text.strip()
 
29
  lang='en',
30
  use_gpu=False,
31
  show_log=False, # Suppress PaddleOCR logs to reduce noise
32
+ det_max_side_len=3500, # Increase max side length for better detection
33
  rec_batch_num=1, # Process one image at a time for stability
34
  det_db_score_mode='slow', # Use most accurate detection
35
+ det_db_box_thresh=0.2, # Lower threshold for better text detection
36
+ det_db_unclip_ratio=3.5, # Increase ratio for better text region detection
37
+ drop_score=0.1, # Lower drop score to retain more text
38
+ det_db_thresh=0.1 # Lower threshold for detection
39
  )
40
  logger.info("PaddleOCR initialized successfully.")
41
  return ocr
 
51
  # Initialize PaddleOCR at module level
52
  ocr = initialize_paddle_ocr()
53
 
54
+ def preprocess_image(img, attempt=1):
55
  """
56
+ Preprocess the image to maximize OCR accuracy with multiple attempts.
57
 
58
  Args:
59
  img (PIL.Image): Input image.
60
+ attempt (int): Preprocessing attempt number (1 or 2 for different settings).
61
 
62
  Returns:
63
  PIL.Image: Preprocessed image.
64
  """
65
  try:
66
  # Resize image to a higher resolution for better OCR
67
+ max_size = (3000, 3000)
68
  img.thumbnail(max_size, Image.Resampling.LANCZOS)
69
 
70
  # Convert to grayscale
 
72
 
73
  # Increase contrast
74
  enhancer = ImageEnhance.Contrast(img)
75
+ img = enhancer.enhance(5.0 if attempt == 1 else 3.0)
76
 
77
  # Sharpen the image
78
  img = img.filter(ImageFilter.SHARPEN)
79
 
80
  # Reduce noise with a stronger filter
81
+ img = img.filter(ImageFilter.MedianFilter(size=5 if attempt == 1 else 3))
82
 
83
  # Apply adaptive thresholding
84
  img_array = np.array(img)
85
+ thresh = 120 if attempt == 1 else 150 # Different thresholds for different attempts
86
  img_array = np.where(img_array > thresh, 255, 0).astype(np.uint8)
87
  img = Image.fromarray(img_array)
88
 
89
  # Apply dilation to connect broken characters
90
+ img = img.filter(ImageFilter.MaxFilter(size=3 if attempt == 1 else 5))
91
 
92
  return img
93
  except Exception as e:
94
+ logger.error(f"Failed to preprocess image (Attempt {attempt}): {str(e)}")
95
  return img
96
 
97
  def validate_image(image_file):
 
124
 
125
  def extract_text_from_image(image_file):
126
  """
127
+ Extract text from an image using PaddleOCR with multiple attempts for accuracy.
128
 
129
  Args:
130
  image_file (str): Path to the image file.
 
147
  logger.info(f"Extracting text from image: {image_file}")
148
  # Convert image file to a format PaddleOCR can process
149
  img = Image.open(image_file)
150
+
151
+ # First attempt with default preprocessing
152
+ logger.info("Attempt 1: Extracting text with default preprocessing...")
153
+ img_processed = preprocess_image(img, attempt=1)
154
  img_byte_arr = io.BytesIO()
155
+ img_processed.save(img_byte_arr, format='PNG')
156
  img_byte_arr = img_byte_arr.getvalue()
157
 
158
+ # Perform OCR
159
  result = ocr.ocr(img_byte_arr, cls=True)
160
 
161
  # Extract text from OCR result
 
166
  for word_info in line:
167
  text += word_info[1][0] + "\n"
168
 
169
+ # If text is empty or contains obvious errors, try a second attempt
170
+ if not text.strip() or len(text.splitlines()) < 5: # Arbitrary threshold for "too little text"
171
+ logger.warning("First OCR attempt yielded insufficient text. Trying second attempt with different preprocessing...")
172
+ img_processed = preprocess_image(img, attempt=2)
173
+ img_byte_arr = io.BytesIO()
174
+ img_processed.save(img_byte_arr, format='PNG')
175
+ img_byte_arr = img_byte_arr.getvalue()
176
+
177
+ # Perform OCR again
178
+ result = ocr.ocr(img_byte_arr, cls=True)
179
+
180
+ # Extract text from second attempt
181
+ text = ""
182
+ if result:
183
+ for line in result:
184
+ if line: # Check if line is not None
185
+ for word_info in line:
186
+ text += word_info[1][0] + "\n"
187
+
188
  logger.info("Successfully extracted text from image.")
189
  logger.debug(f"Extracted text:\n{text}")
190
  return text.strip()