dnj0 commited on
Commit
dd7abcc
·
verified ·
1 Parent(s): bd42160

Update src/pdf_parser.py

Browse files
Files changed (1) hide show
  1. src/pdf_parser.py +31 -9
src/pdf_parser.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- PDF Parser Module with DEBUG for image extraction
3
  """
4
  import os
5
  import json
@@ -20,8 +20,24 @@ class PDFParser:
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
 
 
 
 
23
  if self.debug:
24
- print("✅ PDFParser initialized with DEBUG mode ON")
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  def _debug_print(self, label: str, data: any):
27
  """Print debug information"""
@@ -80,7 +96,7 @@ class PDFParser:
80
  return text
81
 
82
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
83
- """Extract images from PDF pages with detailed debugging"""
84
  images_data = []
85
  try:
86
  self._debug_print("Image Extraction Started", f"File: {pdf_path}")
@@ -96,15 +112,21 @@ class PDFParser:
96
  image.save(image_path)
97
  self._debug_print(f"Image {idx} Saved", str(image_path))
98
 
99
- # Extract text using OCR
100
- self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
101
 
102
  try:
103
- ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
104
- self._debug_print(f"Image {idx} OCR Result", f"Length: {len(ocr_text)}, Content: {ocr_text[:200] if ocr_text else 'EMPTY'}")
 
 
 
 
 
 
 
 
105
 
106
- if not ocr_text or len(ocr_text.strip()) < 5:
107
- self._debug_print(f"Image {idx} WARNING", "⚠️ OCR returned empty or very short text!")
108
  except Exception as ocr_error:
109
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
110
  ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"
 
1
  """
2
+ PDF Parser Module with FIXED Russian OCR support
3
  """
4
  import os
5
  import json
 
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
 
23
+ # Configure Tesseract for Russian + English
24
+ self._configure_tesseract()
25
+
26
  if self.debug:
27
+ print("✅ PDFParser initialized with Russian OCR support")
28
+
29
+ def _configure_tesseract(self):
30
+ """Configure Tesseract with proper paths and language support"""
31
+ try:
32
+ # Windows specific path
33
+ if os.name == 'nt':
34
+ pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
35
+
36
+ # Test Tesseract
37
+ pytesseract.get_tesseract_version()
38
+ print("✅ Tesseract configured successfully")
39
+ except Exception as e:
40
+ print(f"⚠️ Tesseract configuration warning: {e}")
41
 
42
  def _debug_print(self, label: str, data: any):
43
  """Print debug information"""
 
96
  return text
97
 
98
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
99
+ """Extract images from PDF pages with Russian OCR support"""
100
  images_data = []
101
  try:
102
  self._debug_print("Image Extraction Started", f"File: {pdf_path}")
 
112
  image.save(image_path)
113
  self._debug_print(f"Image {idx} Saved", str(image_path))
114
 
115
+ # Extract text using OCR with Russian support
116
+ self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
117
 
118
  try:
119
+ # CRITICAL: Use 'rus+eng' for Russian + English support
120
+ ocr_text = pytesseract.image_to_string(image, lang='rus')
121
+
122
+ # Clean up text
123
+ ocr_text = ocr_text.strip()
124
+
125
+ if not ocr_text or len(ocr_text) < 5:
126
+ self._debug_print(f"Image {idx} OCR Result", f"⚠️ EMPTY or very short ({len(ocr_text)} chars)")
127
+ else:
128
+ self._debug_print(f"Image {idx} OCR Result", f"✅ Success - {len(ocr_text)} chars: {ocr_text[:150]}")
129
 
 
 
130
  except Exception as ocr_error:
131
  self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
132
  ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"