omthakur1 committed on
Commit
19d3640
·
1 Parent(s): 5ec4d20

feat: Advanced OCR with image preprocessing for accurate text extraction

Browse files
Files changed (2) hide show
  1. app.py +60 -3
  2. requirements.txt +2 -0
app.py CHANGED
@@ -257,7 +257,7 @@ def pdf_to_word():
257
 
258
  @app.route('/image-to-text', methods=['POST'])
259
  def image_to_text():
260
- """Extract text from image using Tesseract OCR"""
261
 
262
  if 'file' not in request.files:
263
  return jsonify({'error': 'No file provided'}), 400
@@ -273,8 +273,65 @@ def image_to_text():
273
 
274
  logger.info(f"Extracting text from image ({image.size})...")
275
 
276
- # Perform OCR
277
- text = pytesseract.image_to_string(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  logger.info(f"OCR successful! Extracted {len(text)} characters")
280
 
 
257
 
258
  @app.route('/image-to-text', methods=['POST'])
259
  def image_to_text():
260
+ """Extract text from image using Tesseract OCR with advanced preprocessing"""
261
 
262
  if 'file' not in request.files:
263
  return jsonify({'error': 'No file provided'}), 400
 
273
 
274
  logger.info(f"Extracting text from image ({image.size})...")
275
 
276
+ # Convert to RGB if necessary
277
+ if image.mode != 'RGB':
278
+ image = image.convert('RGB')
279
+
280
+ # Convert PIL Image to numpy array for preprocessing
281
+ import numpy as np
282
+ import cv2
283
+
284
+ img_array = np.array(image)
285
+
286
+ # Image preprocessing for better OCR
287
+ # 1. Convert to grayscale
288
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
289
+
290
+ # 2. Apply slight Gaussian blur to reduce noise
291
+ blurred = cv2.GaussianBlur(gray, (3, 3), 0)
292
+
293
+ # 3. Apply adaptive thresholding for better contrast
294
+ thresh = cv2.adaptiveThreshold(
295
+ blurred, 255,
296
+ cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
297
+ cv2.THRESH_BINARY,
298
+ 11, 2
299
+ )
300
+
301
+ # 4. Apply morphological operations to remove small noise
302
+ kernel = np.ones((1, 1), np.uint8)
303
+ processed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
304
+ processed = cv2.medianBlur(processed, 1)
305
+
306
+ # Convert back to PIL Image for Tesseract
307
+ processed_image = Image.fromarray(processed)
308
+
309
+ # Configure Tesseract for better accuracy
310
+ # PSM 3 = Fully automatic page segmentation, but no OSD
311
+ # PSM 6 = Assume a single uniform block of text
312
+ # OEM 3 = Default, based on what is available (LSTM + Legacy)
313
+ custom_config = r'--oem 3 --psm 3 -c preserve_interword_spaces=1'
314
+
315
+ # Perform OCR with configuration
316
+ text = pytesseract.image_to_string(processed_image, config=custom_config, lang='eng')
317
+
318
+ # Clean up the extracted text
319
+ # Remove excessive whitespace and empty lines
320
+ lines = [line.strip() for line in text.split('\n')]
321
+ cleaned_lines = [line for line in lines if line and len(line) > 0]
322
+
323
+ # Filter out lines with too many special characters (likely errors)
324
+ filtered_lines = []
325
+ for line in cleaned_lines:
326
+ # Count alphanumeric vs special chars
327
+ alnum_count = sum(c.isalnum() or c.isspace() for c in line)
328
+ special_count = len(line) - alnum_count
329
+
330
+ # Keep line if it has reasonable ratio of alphanumeric characters
331
+ if len(line) > 0 and (alnum_count / len(line)) > 0.5:
332
+ filtered_lines.append(line)
333
+
334
+ text = '\n'.join(filtered_lines)
335
 
336
  logger.info(f"OCR successful! Extracted {len(text)} characters")
337
 
requirements.txt CHANGED
@@ -6,3 +6,5 @@ PyPDF2==3.0.1
6
  pytesseract==0.3.10
7
  Pillow==10.2.0
8
  pdf2docx==0.5.8
 
 
 
6
  pytesseract==0.3.10
7
  Pillow==10.2.0
8
  pdf2docx==0.5.8
9
+ opencv-python-headless==4.8.1.78
10
+ numpy==1.24.3