final_project2

Sleeping

App Files Files Community

dnj0 committed on Nov 18, 2025

Commit

dd7abcc

verified ·

1 Parent(s): bd42160

Update src/pdf_parser.py

Browse files

Files changed (1) hide show

src/pdf_parser.py +31 -9

src/pdf_parser.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-PDF Parser Module with DEBUG for image extraction
 """
 import os
 import json
@@ -20,8 +20,24 @@ class PDFParser:
         self.processed_files = self._load_processed_files()
         self.debug = debug
         if self.debug:
-            print("✅ PDFParser initialized with DEBUG mode ON")
     def _debug_print(self, label: str, data: any):
         """Print debug information"""
@@ -80,7 +96,7 @@ class PDFParser:
         return text
     def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
-        """Extract images from PDF pages with detailed debugging"""
         images_data = []
         try:
             self._debug_print("Image Extraction Started", f"File: {pdf_path}")
@@ -96,15 +112,21 @@ class PDFParser:
                 image.save(image_path)
                 self._debug_print(f"Image {idx} Saved", str(image_path))
-                # Extract text using OCR
-                self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
                 try:
-                    ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
-                    self._debug_print(f"Image {idx} OCR Result", f"Length: {len(ocr_text)}, Content: {ocr_text[:200] if ocr_text else 'EMPTY'}")
-                    if not ocr_text or len(ocr_text.strip()) < 5:
-                        self._debug_print(f"Image {idx} WARNING", "⚠️ OCR returned empty or very short text!")
                 except Exception as ocr_error:
                     self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
                     ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"

 """
+PDF Parser Module with FIXED Russian OCR support
 """
 import os
 import json
         self.processed_files = self._load_processed_files()
         self.debug = debug
+        # Configure Tesseract for Russian + English
+        self._configure_tesseract()
         if self.debug:
+            print("✅ PDFParser initialized with Russian OCR support")
+    def _configure_tesseract(self):
+        """Configure Tesseract with proper paths and language support"""
+        try:
+            # Windows specific path
+            if os.name == 'nt':
+                pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
+            # Test Tesseract
+            pytesseract.get_tesseract_version()
+            print("✅ Tesseract configured successfully")
+        except Exception as e:
+            print(f"⚠️  Tesseract configuration warning: {e}")
     def _debug_print(self, label: str, data: any):
         """Print debug information"""
         return text
     def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
+        """Extract images from PDF pages with Russian OCR support"""
         images_data = []
         try:
             self._debug_print("Image Extraction Started", f"File: {pdf_path}")
                 image.save(image_path)
                 self._debug_print(f"Image {idx} Saved", str(image_path))
+                # Extract text using OCR with Russian support
+                self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
                 try:
+                    # CRITICAL: Use 'rus+eng' for Russian + English support
+                    ocr_text = pytesseract.image_to_string(image, lang='rus+eng')
+                    # Clean up text
+                    ocr_text = ocr_text.strip()
+                    if not ocr_text or len(ocr_text) < 5:
+                        self._debug_print(f"Image {idx} OCR Result", f"⚠️ EMPTY or very short ({len(ocr_text)} chars)")
+                    else:
+                        self._debug_print(f"Image {idx} OCR Result", f"✅ Success - {len(ocr_text)} chars: {ocr_text[:150]}")
                 except Exception as ocr_error:
                     self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
                     ocr_text = f"[Image {idx}: OCR failed - {str(ocr_error)}]"