final_project2

Sleeping

App Files Files Community

dnj0 committed on Nov 20, 2025

Commit

53a61da

verified ·

1 Parent(s): e4ac86d

Update src/pdf_parser.py

Browse files

Files changed (1) hide show

src/pdf_parser.py +4 -29

src/pdf_parser.py CHANGED Viewed

@@ -1,6 +1,4 @@
-"""
-PDF Parser Module with FIXED Russian OCR support
-"""
 import os
 import json
 import hashlib
@@ -20,16 +18,13 @@ class PDFParser:
         self.processed_files = self._load_processed_files()
         self.debug = debug
-        # Configure Tesseract for Russian + English
         self._configure_tesseract()
         if self.debug:
-            print("✅ PDFParser initialized with Russian OCR support")
     def _configure_tesseract(self):
-        """Configure Tesseract with proper paths and language support"""
         try:
-            # Windows specific path
             if os.name == 'nt':
                 pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
@@ -40,7 +35,6 @@ class PDFParser:
             print(f"⚠️  Tesseract configuration warning: {e}")
     def _debug_print(self, label: str, data: any):
-        """Print debug information"""
         if self.debug:
             print(f"\n🔍 [PDF Parser] {label}")
             if isinstance(data, dict):
@@ -54,7 +48,6 @@ class PDFParser:
                 print(f"  {data}")
     def _load_processed_files(self) -> Dict[str, str]:
-        """Load list of already processed files with their hashes"""
         if os.path.exists(PROCESSED_FILES_LOG):
             try:
                 with open(PROCESSED_FILES_LOG, 'r') as f:
@@ -64,12 +57,10 @@ class PDFParser:
         return {}
     def _save_processed_files(self):
-        """Save processed files list to disk"""
         with open(PROCESSED_FILES_LOG, 'w') as f:
             json.dump(self.processed_files, f, indent=2)
     def _get_file_hash(self, file_path: str) -> str:
-        """Generate hash of file to detect changes"""
         hash_md5 = hashlib.md5()
         with open(file_path, "rb") as f:
             for chunk in iter(lambda: f.read(4096), b""):
@@ -77,7 +68,6 @@ class PDFParser:
         return hash_md5.hexdigest()
     def _extract_text_from_pdf(self, pdf_path: str) -> str:
-        """Extract text from PDF using PyPDF2"""
         text = ""
         try:
             with open(pdf_path, 'rb') as file:
@@ -96,7 +86,6 @@ class PDFParser:
         return text
     def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
-        """Extract images from PDF pages with Russian OCR support"""
         images_data = []
         try:
             self._debug_print("Image Extraction Started", f"File: {pdf_path}")
@@ -107,19 +96,15 @@ class PDFParser:
             for idx, image in enumerate(images):
                 self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
-                # Save image
                 image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
                 image.save(image_path)
                 self._debug_print(f"Image {idx} Saved", str(image_path))
-                # Extract text using OCR with Russian support
-                self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
                 try:
-                    # CRITICAL: Use 'rus+eng' for Russian + English support
                     ocr_text = pytesseract.image_to_string(image, lang='rus')
-                    # Clean up text
                     ocr_text = ocr_text.strip()
                     if not ocr_text or len(ocr_text) < 5:
@@ -144,7 +129,6 @@ class PDFParser:
         return images_data
     def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
-        """Extract table content from PDF"""
         tables_data = []
         try:
             text = self._extract_text_from_pdf(pdf_path)
@@ -177,26 +161,22 @@ class PDFParser:
         return tables_data
     def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
-        """Parse PDF and extract text, images, and tables with debug output"""
         file_hash = self._get_file_hash(pdf_path)
         doc_id = Path(pdf_path).stem
         self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
-        # Check if file was already processed
         if doc_id in self.processed_files:
             if self.processed_files[doc_id] == file_hash:
-                self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
                 return self._load_extracted_data(doc_id)
         print(f"\n📄 Processing PDF: {doc_id}")
-        # Extract content
         text = self._extract_text_from_pdf(pdf_path)
         images = self._extract_images_from_pdf(pdf_path, doc_id)
         tables = self._extract_tables_from_pdf(pdf_path, doc_id)
-        # Summary
         self._debug_print("Extraction Summary", {
             'text_length': len(text),
             'images_count': len(images),
@@ -204,17 +184,14 @@ class PDFParser:
             'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
         })
-        # Save extracted data
         self._save_extracted_data(doc_id, text, images, tables)
-        # Update processed files log
         self.processed_files[doc_id] = file_hash
         self._save_processed_files()
         return text, images, tables
     def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
-        """Save extracted data to docstore"""
         data = {
             'text': text,
             'images': images,
@@ -227,7 +204,6 @@ class PDFParser:
         self._debug_print("Data Saved", str(data_path))
     def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
-        """Load previously extracted data from docstore"""
         data_path = self.docstore_path / f"{doc_id}_data.json"
         try:
             with open(data_path, 'r', encoding='utf-8') as f:
@@ -237,7 +213,6 @@ class PDFParser:
             return "", [], []
     def get_all_documents(self) -> Dict:
-        """Load all processed documents from docstore"""
         all_docs = {}
         for json_file in self.docstore_path.glob("*_data.json"):
             doc_id = json_file.stem.replace("_data", "")

 import os
 import json
 import hashlib
         self.processed_files = self._load_processed_files()
         self.debug = debug
         self._configure_tesseract()
         if self.debug:
+            print("✅ PDFParser initialized")
     def _configure_tesseract(self):
         try:
             if os.name == 'nt':
                 pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
             print(f"⚠️  Tesseract configuration warning: {e}")
     def _debug_print(self, label: str, data: any):
         if self.debug:
             print(f"\n🔍 [PDF Parser] {label}")
             if isinstance(data, dict):
                 print(f"  {data}")
     def _load_processed_files(self) -> Dict[str, str]:
         if os.path.exists(PROCESSED_FILES_LOG):
             try:
                 with open(PROCESSED_FILES_LOG, 'r') as f:
         return {}
     def _save_processed_files(self):
         with open(PROCESSED_FILES_LOG, 'w') as f:
             json.dump(self.processed_files, f, indent=2)
     def _get_file_hash(self, file_path: str) -> str:
         hash_md5 = hashlib.md5()
         with open(file_path, "rb") as f:
             for chunk in iter(lambda: f.read(4096), b""):
         return hash_md5.hexdigest()
     def _extract_text_from_pdf(self, pdf_path: str) -> str:
         text = ""
         try:
             with open(pdf_path, 'rb') as file:
         return text
     def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
         images_data = []
         try:
             self._debug_print("Image Extraction Started", f"File: {pdf_path}")
             for idx, image in enumerate(images):
                 self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
                 image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
                 image.save(image_path)
                 self._debug_print(f"Image {idx} Saved", str(image_path))
+                self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
                 try:
                     ocr_text = pytesseract.image_to_string(image, lang='rus')
                     ocr_text = ocr_text.strip()
                     if not ocr_text or len(ocr_text) < 5:
         return images_data
     def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
         tables_data = []
         try:
             text = self._extract_text_from_pdf(pdf_path)
         return tables_data
     def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
         file_hash = self._get_file_hash(pdf_path)
         doc_id = Path(pdf_path).stem
         self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
         if doc_id in self.processed_files:
             if self.processed_files[doc_id] == file_hash:
+                self._debug_print("Status", f"File {doc_id} already processed")
                 return self._load_extracted_data(doc_id)
         print(f"\n📄 Processing PDF: {doc_id}")
         text = self._extract_text_from_pdf(pdf_path)
         images = self._extract_images_from_pdf(pdf_path, doc_id)
         tables = self._extract_tables_from_pdf(pdf_path, doc_id)
         self._debug_print("Extraction Summary", {
             'text_length': len(text),
             'images_count': len(images),
             'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
         })
         self._save_extracted_data(doc_id, text, images, tables)
         self.processed_files[doc_id] = file_hash
         self._save_processed_files()
         return text, images, tables
     def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
         data = {
             'text': text,
             'images': images,
         self._debug_print("Data Saved", str(data_path))
     def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
         data_path = self.docstore_path / f"{doc_id}_data.json"
         try:
             with open(data_path, 'r', encoding='utf-8') as f:
             return "", [], []
     def get_all_documents(self) -> Dict:
         all_docs = {}
         for json_file in self.docstore_path.glob("*_data.json"):
             doc_id = json_file.stem.replace("_data", "")