rahulrana0001 committed on
Commit
d4d012d
·
1 Parent(s): af3bb1a

Feature: Integrate Tesseract as the ultimate tertiary OCR engine for maximum text extraction power

Browse files
Files changed (3) hide show
  1. packages.txt +3 -0
  2. pipeline/ocr.py +16 -1
  3. requirements.txt +1 -0
packages.txt CHANGED
@@ -1,2 +1,5 @@
1
  libgl1
2
  libglib2.0-0
 
 
 
 
1
  libgl1
2
  libglib2.0-0
3
+ tesseract-ocr
4
+ tesseract-ocr-eng
5
+ tesseract-ocr-tam
pipeline/ocr.py CHANGED
@@ -89,8 +89,23 @@ def extract_text_from_image(image_path: str) -> str:
89
  all_text.extend(result)
90
  except Exception as e:
91
  print(f"DEBUG: EasyOCR Failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  else:
93
- # Use EasyOCR Fallback
94
  reader = get_easy_reader()
95
  if not reader: return "OCR Engine Error"
96
 
 
89
  all_text.extend(result)
90
  except Exception as e:
91
  print(f"DEBUG: EasyOCR Failed: {e}")
92
+
93
+ try:
94
+ # --- TERTIARY: Tesseract OCR (Ultimate Open-Source Backup) ---
95
+ print("DEBUG: Falling back to Tesseract AI...")
96
+ import pytesseract
97
+ from PIL import Image
98
+ # Tesseract usually performs best with PIL images
99
+ pil_img = Image.fromarray(cv2.cvtColor(thresh, cv2.COLOR_GRAY2RGB))
100
+ # psm 6 assumes a single uniform block of text
101
+ tess_text = pytesseract.image_to_string(pil_img, lang='eng', config='--psm 6')
102
+ if tess_text.strip():
103
+ all_text.append(tess_text.strip())
104
+ except Exception as e:
105
+ print(f"DEBUG: Tesseract Failed (Ensure tesseract is installed on system): {e}")
106
+
107
  else:
108
+ # Use EasyOCR Fallback if Paddle isn't available
109
  reader = get_easy_reader()
110
  if not reader: return "OCR Engine Error"
111
 
requirements.txt CHANGED
@@ -14,3 +14,4 @@ deep-translator
14
  psutil
15
  opencv-python
16
  nest_asyncio
 
 
14
  psutil
15
  opencv-python
16
  nest_asyncio
17
+ pytesseract