Vachudev committed on
Commit
006541d
·
verified ·
1 Parent(s): 7880958

added ocr_preprocessing_engine call

Browse files
Files changed (1) hide show
  1. ocr_engine.py +50 -15
ocr_engine.py CHANGED
@@ -3,41 +3,76 @@ from pdf2image import convert_from_path
3
  from PIL import Image
4
  import os
5
  import logging
 
 
 
 
 
 
 
 
 
 
6
  logger = logging.getLogger("ocr_engine")
 
7
  def extract_text_from_file(file_path: str) -> str:
8
  """
9
- Extracts text from a PDF or Image file using Tesseract.
 
 
 
 
10
  """
11
  if not os.path.exists(file_path):
12
  return ""
13
 
14
  text_content = ""
15
-
 
16
  try:
17
- # Handle PDF
 
18
  if file_path.lower().endswith('.pdf'):
19
  try:
20
- # Convert PDF pages to images
21
- images = convert_from_path(file_path)
22
- for i, image in enumerate(images):
23
- page_text = pytesseract.image_to_string(image)
24
- text_content += f"--- Page {i+1} ---\n{page_text}\n"
25
  except Exception as e:
26
- logger.error(f"Error converting PDF: {e}")
27
  return f"Error reading PDF: {str(e)}"
28
-
29
- # Handle Images (JPG, PNG, etc.)
30
  elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
31
  try:
32
- image = Image.open(file_path)
33
- text_content = pytesseract.image_to_string(image)
34
  except Exception as e:
35
- logger.error(f"Error reading image: {e}")
36
  return f"Error reading image: {str(e)}"
37
-
38
  else:
39
  return "Unsupported file format. Please upload PDF or Image."
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  except Exception as e:
42
  logger.error(f"OCR Critical Error: {e}")
43
  return f"OCR Failed: {str(e)}"
 
3
  from PIL import Image
4
  import os
5
  import logging
6
+
7
+ # Import the Robust Vision logic
8
+ # Ensure ocr_preprocessing_engine.py is in the same directory
9
+ try:
10
+ from ocr_preprocessing_engine import preprocess_image
11
+ except ImportError:
12
+ # Fail-safe if the module is missing
13
+ logging.warning("ocr_preprocessing_engine not found. Using raw OCR only.")
14
+ def preprocess_image(img, page_num): return img
15
+
16
  logger = logging.getLogger("ocr_engine")
17
+
18
  def extract_text_from_file(file_path: str) -> str:
19
  """
20
+ Extracts text using a Hybrid Pipeline:
21
+ 1. Attempt Robust Preprocessing (Deskew -> Denoise -> Adaptive Threshold).
22
+ 2. Fallback to Raw Image if preprocessing yields fewer than 10 characters of (stripped) text.
23
+
24
+ Ref: Tesseract best practices for DPI and Preprocessing [3], [1].
25
  """
26
  if not os.path.exists(file_path):
27
  return ""
28
 
29
  text_content = ""
30
+ images = []
31
+
32
  try:
33
+ # 1. Image Loading & DPI Scaling
34
+ # Tesseract works best at 300 DPI [3].
35
  if file_path.lower().endswith('.pdf'):
36
  try:
37
+ images = convert_from_path(file_path, dpi=300)
 
 
 
 
38
  except Exception as e:
 
39
  return f"Error reading PDF: {str(e)}"
 
 
40
  elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp')):
41
  try:
42
+ images = [Image.open(file_path)]
 
43
  except Exception as e:
 
44
  return f"Error reading image: {str(e)}"
 
45
  else:
46
  return "Unsupported file format. Please upload PDF or Image."
47
 
48
+ # 2. Page-by-Page Extraction
49
+ for i, raw_img in enumerate(images):
50
+ page_num = i + 1
51
+
52
+ # Tesseract Configuration
53
+ # --psm 4: Assume a single column of text of variable sizes (good for single-column invoices) [4]
54
+ # --oem 3: Default engine mode, selected from whatever engines are available (--oem 1 would force LSTM only)
55
+ custom_config = r'--oem 3 --psm 4'
56
+
57
+ page_text = ""
58
+
59
+ # --- STRATEGY A: ROBUST PREPROCESSING ---
60
+ try:
61
+ # Apply the "Make OCR Work" pipeline (Deskew, Denoise, Threshold) [5], [6]
62
+ processed_img = preprocess_image(raw_img, page_num)
63
+ page_text = pytesseract.image_to_string(processed_img, config=custom_config)
64
+ except Exception as e:
65
+ logger.warning(f"Page {page_num}: Preprocessing failed ({e}). Skipping to fallback.")
66
+
67
+ # --- STRATEGY B: FALLBACK MECHANISM ---
68
+ # If preprocessing was too aggressive (e.g., thresholding wiped the text),
69
+ # rely on Tesseract's internal Otsu binarization [3], [1].
70
+ if len(page_text.strip()) < 10:
71
+ logger.info(f"Page {page_num}: Low confidence extraction. Retrying with raw image...")
72
+ page_text = pytesseract.image_to_string(raw_img, config=custom_config)
73
+
74
+ text_content += f"--- Page {page_num} ---\n{page_text}\n"
75
+
76
  except Exception as e:
77
  logger.error(f"OCR Critical Error: {e}")
78
  return f"OCR Failed: {str(e)}"