dev2607 committed on
Commit
2d99a85
·
verified ·
1 Parent(s): 447cd51

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -22
app.py CHANGED
@@ -146,41 +146,102 @@ def extract_text_from_image(image):
146
  except (subprocess.SubprocessError, FileNotFoundError):
147
  return "Tesseract OCR is not installed or not properly configured. Please check installation."
148
 
149
- # Image preprocessing for better OCR
150
  import cv2
151
  import numpy as np
 
152
 
153
- # Convert PIL image to OpenCV format
 
 
 
 
 
 
 
154
  img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
155
 
156
  # Convert to grayscale
157
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
158
 
159
- # Apply thresholding to get black and white image
160
- _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
161
-
162
- # Noise removal
163
- kernel = np.ones((1, 1), np.uint8)
164
- binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)
165
-
166
- # Dilate to connect text
167
- binary = cv2.dilate(binary, kernel, iterations=1)
168
-
169
- # Convert back to PIL image for tesseract
170
- binary_pil = Image.fromarray(cv2.bitwise_not(binary))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
- # Run OCR with improved configuration
173
- custom_config = r'--oem 3 --psm 6 -l eng'
174
- text = pytesseract.image_to_string(binary_pil, config=custom_config)
175
 
176
- if not text.strip():
177
- # Try original image as fallback
178
- text = pytesseract.image_to_string(image, config=custom_config)
 
 
179
 
180
- if not text.strip():
181
  return "No text could be extracted. Ensure image is clear and readable."
182
 
183
- return text
184
  except Exception as e:
185
  return f"Error extracting text: {str(e)}"
186
 
 
146
  except (subprocess.SubprocessError, FileNotFoundError):
147
  return "Tesseract OCR is not installed or not properly configured. Please check installation."
148
 
149
+ # Import necessary libraries
150
  import cv2
151
  import numpy as np
152
+ from PIL import Image, ImageOps, ImageEnhance
153
 
154
+ # First approach: Invert the image for light text on dark background
155
+ inverted_image = ImageOps.invert(image)
156
+
157
+ # Try OCR on inverted image
158
+ custom_config = r'--oem 3 --psm 6 -l eng --dpi 300'
159
+ inverted_text = pytesseract.image_to_string(inverted_image, config=custom_config)
160
+
161
+ # Second approach: OpenCV processing for colored backgrounds
162
  img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
163
 
164
  # Convert to grayscale
165
  gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
166
 
167
+ # Apply bilateral filter to preserve edges while reducing noise
168
+ filtered = cv2.bilateralFilter(gray, 11, 17, 17)
169
+
170
+ # Adaptive thresholding to handle varied lighting
171
+ thresh = cv2.adaptiveThreshold(filtered, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
172
+ cv2.THRESH_BINARY, 11, 2)
173
+
174
+ # Invert the image (if text is light on dark background)
175
+ inverted_thresh = cv2.bitwise_not(thresh)
176
+
177
+ # Try OCR on processed image
178
+ cv_text = pytesseract.image_to_string(
179
+ Image.fromarray(inverted_thresh),
180
+ config=custom_config
181
+ )
182
+
183
+ # Third approach: Color filtering to isolate text from colored background
184
+ # Convert to HSV color space to better isolate colors
185
+ hsv = cv2.cvtColor(img_cv, cv2.COLOR_BGR2HSV)
186
+
187
+ # Create a mask to extract light colored text (assuming white/light text)
188
+ lower_white = np.array([0, 0, 150])
189
+ upper_white = np.array([180, 30, 255])
190
+ mask = cv2.inRange(hsv, lower_white, upper_white)
191
+
192
+ # Apply morphological operations to clean up the mask
193
+ kernel = np.ones((2, 2), np.uint8)
194
+ mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
195
+ mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
196
+
197
+ # Improve character connectivity
198
+ mask = cv2.dilate(mask, kernel, iterations=1)
199
+
200
+ # Try OCR on color filtered image
201
+ color_text = pytesseract.image_to_string(
202
+ Image.fromarray(mask),
203
+ config=r'--oem 3 --psm 6 -l eng --dpi 300'
204
+ )
205
+
206
+ # Fourth approach: Try directly with the image but with different configs
207
+ direct_text = pytesseract.image_to_string(
208
+ image,
209
+ config=r'--oem 3 --psm 11 -l eng --dpi 300'
210
+ )
211
+
212
+ # Compare results and select the best one
213
+ results = [inverted_text, cv_text, color_text, direct_text]
214
+
215
+ # Select the result with the most alphanumeric characters
216
+ def count_alphanumeric(text):
217
+ return sum(c.isalnum() for c in text)
218
+
219
+ best_text = max(results, key=count_alphanumeric)
220
+
221
+ # If results are still poor, retry on the original image with Tesseract's heavy noise removal enabled
222
+ if count_alphanumeric(best_text) < 20:
223
+ # Retry with textord_heavy_nr=1 (heavy noise removal); note this config does not invert the image
224
+ neg_text = pytesseract.image_to_string(
225
+ image,
226
+ config=r'--oem 3 --psm 6 -c textord_heavy_nr=1 -c textord_debug_printable=0 -l eng --dpi 300'
227
+ )
228
+ if count_alphanumeric(neg_text) > count_alphanumeric(best_text):
229
+ best_text = neg_text
230
 
231
+ # Clean up the text
232
+ best_text = re.sub(r'[^\w\s,;:%.()\n\'-]', '', best_text)
233
+ best_text = best_text.replace('\n\n', '\n')
234
 
235
+ # Special case for ingredients list format
236
+ if "ingredient" in best_text.lower() or any(x in best_text.lower() for x in ["sugar", "cocoa", "milk", "contain"]):
237
+ # Specific cleaning for ingredient lists
238
+ best_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', best_text) # Add space between lowercase and uppercase
239
+ best_text = re.sub(r'(\d+)([a-zA-Z])', r'\1 \2', best_text) # Add space between number and letter
240
 
241
+ if not best_text.strip():
242
  return "No text could be extracted. Ensure image is clear and readable."
243
 
244
+ return best_text.strip()
245
  except Exception as e:
246
  return f"Error extracting text: {str(e)}"
247