Commit ·
d4d012d
1
Parent(s): af3bb1a
Feature: Integrate Tesseract as a tertiary fallback OCR engine to improve text extraction coverage
Browse files
- packages.txt +3 -0
- pipeline/ocr.py +16 -1
- requirements.txt +1 -0
packages.txt
CHANGED
|
@@ -1,2 +1,5 @@
|
|
| 1 |
libgl1
|
| 2 |
libglib2.0-0
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
libgl1
|
| 2 |
libglib2.0-0
|
| 3 |
+
tesseract-ocr
|
| 4 |
+
tesseract-ocr-eng
|
| 5 |
+
tesseract-ocr-tam
|
pipeline/ocr.py
CHANGED
|
@@ -89,8 +89,23 @@ def extract_text_from_image(image_path: str) -> str:
|
|
| 89 |
all_text.extend(result)
|
| 90 |
except Exception as e:
|
| 91 |
print(f"DEBUG: EasyOCR Failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
else:
|
| 93 |
-
# Use EasyOCR Fallback
|
| 94 |
reader = get_easy_reader()
|
| 95 |
if not reader: return "OCR Engine Error"
|
| 96 |
|
|
|
|
| 89 |
all_text.extend(result)
|
| 90 |
except Exception as e:
|
| 91 |
print(f"DEBUG: EasyOCR Failed: {e}")
|
| 92 |
+
|
| 93 |
+
try:
|
| 94 |
+
# --- TERTIARY: Tesseract OCR (Ultimate Open-Source Backup) ---
|
| 95 |
+
print("DEBUG: Falling back to Tesseract AI...")
|
| 96 |
+
import pytesseract
|
| 97 |
+
from PIL import Image
|
| 98 |
+
# Tesseract usually performs best with PIL images
|
| 99 |
+
pil_img = Image.fromarray(cv2.cvtColor(thresh, cv2.COLOR_GRAY2RGB))
|
| 100 |
+
# psm 6 assumes a single uniform block of text
|
| 101 |
+
tess_text = pytesseract.image_to_string(pil_img, lang='eng', config='--psm 6')
|
| 102 |
+
if tess_text.strip():
|
| 103 |
+
all_text.append(tess_text.strip())
|
| 104 |
+
except Exception as e:
|
| 105 |
+
print(f"DEBUG: Tesseract Failed (Ensure tesseract is installed on system): {e}")
|
| 106 |
+
|
| 107 |
else:
|
| 108 |
+
# Use EasyOCR Fallback if Paddle isn't available
|
| 109 |
reader = get_easy_reader()
|
| 110 |
if not reader: return "OCR Engine Error"
|
| 111 |
|
requirements.txt
CHANGED
|
@@ -14,3 +14,4 @@ deep-translator
|
|
| 14 |
psutil
|
| 15 |
opencv-python
|
| 16 |
nest_asyncio
|
|
|
|
|
|
| 14 |
psutil
|
| 15 |
opencv-python
|
| 16 |
nest_asyncio
|
| 17 |
+
pytesseract
|