Spaces:
Sleeping
Sleeping
Update src/pdf_parser.py
Browse files- src/pdf_parser.py +31 -9
src/pdf_parser.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
PDF Parser Module with
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
@@ -20,8 +20,24 @@ class PDFParser:
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
if self.debug:
|
| 24 |
-
print("✅ PDFParser initialized with
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def _debug_print(self, label: str, data: any):
|
| 27 |
"""Print debug information"""
|
|
@@ -80,7 +96,7 @@ class PDFParser:
|
|
| 80 |
return text
|
| 81 |
|
| 82 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 83 |
-
"""Extract images from PDF pages with
|
| 84 |
images_data = []
|
| 85 |
try:
|
| 86 |
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
|
@@ -96,15 +112,21 @@ class PDFParser:
|
|
| 96 |
image.save(image_path)
|
| 97 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 98 |
|
| 99 |
-
# Extract text using OCR
|
| 100 |
-
self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
|
| 101 |
|
| 102 |
try:
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
if not ocr_text or len(ocr_text.strip()) < 5:
|
| 107 |
-
self._debug_print(f"Image {idx} WARNING", "⚠️ OCR returned empty or very short text!")
|
| 108 |
except Exception as ocr_error:
|
| 109 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 110 |
ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"
|
|
|
|
| 1 |
"""
|
| 2 |
+
PDF Parser Module with FIXED Russian OCR support
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
| 23 |
+
# Configure Tesseract for Russian + English
|
| 24 |
+
self._configure_tesseract()
|
| 25 |
+
|
| 26 |
if self.debug:
|
| 27 |
+
print("✅ PDFParser initialized with Russian OCR support")
|
| 28 |
+
|
| 29 |
+
def _configure_tesseract(self):
|
| 30 |
+
"""Configure Tesseract with proper paths and language support"""
|
| 31 |
+
try:
|
| 32 |
+
# Windows specific path
|
| 33 |
+
if os.name == 'nt':
|
| 34 |
+
pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 35 |
+
|
| 36 |
+
# Test Tesseract
|
| 37 |
+
pytesseract.get_tesseract_version()
|
| 38 |
+
print("✅ Tesseract configured successfully")
|
| 39 |
+
except Exception as e:
|
| 40 |
+
print(f"⚠️ Tesseract configuration warning: {e}")
|
| 41 |
|
| 42 |
def _debug_print(self, label: str, data: any):
|
| 43 |
"""Print debug information"""
|
|
|
|
| 96 |
return text
|
| 97 |
|
| 98 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 99 |
+
"""Extract images from PDF pages with Russian OCR support"""
|
| 100 |
images_data = []
|
| 101 |
try:
|
| 102 |
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
|
|
|
| 112 |
image.save(image_path)
|
| 113 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 114 |
|
| 115 |
+
# Extract text using OCR with Russian support
|
| 116 |
+
self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
|
| 117 |
|
| 118 |
try:
|
| 119 |
+
# CRITICAL: Use 'rus+eng' for Russian + English support
|
| 120 |
+
ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
|
| 121 |
+
|
| 122 |
+
# Clean up text
|
| 123 |
+
ocr_text = ocr_text.strip()
|
| 124 |
+
|
| 125 |
+
if not ocr_text or len(ocr_text) < 5:
|
| 126 |
+
self._debug_print(f"Image {idx} OCR Result", f"⚠️ EMPTY or very short ({len(ocr_text)} chars)")
|
| 127 |
+
else:
|
| 128 |
+
self._debug_print(f"Image {idx} OCR Result", f"✅ Success - {len(ocr_text)} chars: {ocr_text[:150]}")
|
| 129 |
|
|
|
|
|
|
|
| 130 |
except Exception as ocr_error:
|
| 131 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 132 |
ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"
|