dnj0 committed on
Commit
53a61da
·
verified ·
1 Parent(s): e4ac86d

Update src/pdf_parser.py

Browse files
Files changed (1) hide show
  1. src/pdf_parser.py +4 -29
src/pdf_parser.py CHANGED
@@ -1,6 +1,4 @@
1
- """
2
- PDF Parser Module with FIXED Russian OCR support
3
- """
4
  import os
5
  import json
6
  import hashlib
@@ -20,16 +18,13 @@ class PDFParser:
20
  self.processed_files = self._load_processed_files()
21
  self.debug = debug
22
 
23
- # Configure Tesseract for Russian + English
24
  self._configure_tesseract()
25
 
26
  if self.debug:
27
- print("✅ PDFParser initialized with Russian OCR support")
28
 
29
  def _configure_tesseract(self):
30
- """Configure Tesseract with proper paths and language support"""
31
  try:
32
- # Windows specific path
33
  if os.name == 'nt':
34
  pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
35
 
@@ -40,7 +35,6 @@ class PDFParser:
40
  print(f"⚠️ Tesseract configuration warning: {e}")
41
 
42
  def _debug_print(self, label: str, data: any):
43
- """Print debug information"""
44
  if self.debug:
45
  print(f"\n🔍 [PDF Parser] {label}")
46
  if isinstance(data, dict):
@@ -54,7 +48,6 @@ class PDFParser:
54
  print(f" {data}")
55
 
56
  def _load_processed_files(self) -> Dict[str, str]:
57
- """Load list of already processed files with their hashes"""
58
  if os.path.exists(PROCESSED_FILES_LOG):
59
  try:
60
  with open(PROCESSED_FILES_LOG, 'r') as f:
@@ -64,12 +57,10 @@ class PDFParser:
64
  return {}
65
 
66
  def _save_processed_files(self):
67
- """Save processed files list to disk"""
68
  with open(PROCESSED_FILES_LOG, 'w') as f:
69
  json.dump(self.processed_files, f, indent=2)
70
 
71
  def _get_file_hash(self, file_path: str) -> str:
72
- """Generate hash of file to detect changes"""
73
  hash_md5 = hashlib.md5()
74
  with open(file_path, "rb") as f:
75
  for chunk in iter(lambda: f.read(4096), b""):
@@ -77,7 +68,6 @@ class PDFParser:
77
  return hash_md5.hexdigest()
78
 
79
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
80
- """Extract text from PDF using PyPDF2"""
81
  text = ""
82
  try:
83
  with open(pdf_path, 'rb') as file:
@@ -96,7 +86,6 @@ class PDFParser:
96
  return text
97
 
98
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
99
- """Extract images from PDF pages with Russian OCR support"""
100
  images_data = []
101
  try:
102
  self._debug_print("Image Extraction Started", f"File: {pdf_path}")
@@ -107,19 +96,15 @@ class PDFParser:
107
  for idx, image in enumerate(images):
108
  self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
109
 
110
- # Save image
111
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
112
  image.save(image_path)
113
  self._debug_print(f"Image {idx} Saved", str(image_path))
114
 
115
- # Extract text using OCR with Russian support
116
- self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
117
 
118
  try:
119
- # CRITICAL: Use 'rus+eng' for Russian + English support
120
  ocr_text = pytesseract.image_to_string(image, lang='rus')
121
 
122
- # Clean up text
123
  ocr_text = ocr_text.strip()
124
 
125
  if not ocr_text or len(ocr_text) < 5:
@@ -144,7 +129,6 @@ class PDFParser:
144
  return images_data
145
 
146
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
147
- """Extract table content from PDF"""
148
  tables_data = []
149
  try:
150
  text = self._extract_text_from_pdf(pdf_path)
@@ -177,26 +161,22 @@ class PDFParser:
177
  return tables_data
178
 
179
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
180
- """Parse PDF and extract text, images, and tables with debug output"""
181
  file_hash = self._get_file_hash(pdf_path)
182
  doc_id = Path(pdf_path).stem
183
 
184
  self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
185
 
186
- # Check if file was already processed
187
  if doc_id in self.processed_files:
188
  if self.processed_files[doc_id] == file_hash:
189
- self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
190
  return self._load_extracted_data(doc_id)
191
 
192
  print(f"\n📄 Processing PDF: {doc_id}")
193
 
194
- # Extract content
195
  text = self._extract_text_from_pdf(pdf_path)
196
  images = self._extract_images_from_pdf(pdf_path, doc_id)
197
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
198
 
199
- # Summary
200
  self._debug_print("Extraction Summary", {
201
  'text_length': len(text),
202
  'images_count': len(images),
@@ -204,17 +184,14 @@ class PDFParser:
204
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
205
  })
206
 
207
- # Save extracted data
208
  self._save_extracted_data(doc_id, text, images, tables)
209
 
210
- # Update processed files log
211
  self.processed_files[doc_id] = file_hash
212
  self._save_processed_files()
213
 
214
  return text, images, tables
215
 
216
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
217
- """Save extracted data to docstore"""
218
  data = {
219
  'text': text,
220
  'images': images,
@@ -227,7 +204,6 @@ class PDFParser:
227
  self._debug_print("Data Saved", str(data_path))
228
 
229
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
230
- """Load previously extracted data from docstore"""
231
  data_path = self.docstore_path / f"{doc_id}_data.json"
232
  try:
233
  with open(data_path, 'r', encoding='utf-8') as f:
@@ -237,7 +213,6 @@ class PDFParser:
237
  return "", [], []
238
 
239
  def get_all_documents(self) -> Dict:
240
- """Load all processed documents from docstore"""
241
  all_docs = {}
242
  for json_file in self.docstore_path.glob("*_data.json"):
243
  doc_id = json_file.stem.replace("_data", "")
 
1
+
 
 
2
  import os
3
  import json
4
  import hashlib
 
18
  self.processed_files = self._load_processed_files()
19
  self.debug = debug
20
 
 
21
  self._configure_tesseract()
22
 
23
  if self.debug:
24
+ print("✅ PDFParser initialized")
25
 
26
  def _configure_tesseract(self):
 
27
  try:
 
28
  if os.name == 'nt':
29
  pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
30
 
 
35
  print(f"⚠️ Tesseract configuration warning: {e}")
36
 
37
  def _debug_print(self, label: str, data: any):
 
38
  if self.debug:
39
  print(f"\n🔍 [PDF Parser] {label}")
40
  if isinstance(data, dict):
 
48
  print(f" {data}")
49
 
50
  def _load_processed_files(self) -> Dict[str, str]:
 
51
  if os.path.exists(PROCESSED_FILES_LOG):
52
  try:
53
  with open(PROCESSED_FILES_LOG, 'r') as f:
 
57
  return {}
58
 
59
  def _save_processed_files(self):
 
60
  with open(PROCESSED_FILES_LOG, 'w') as f:
61
  json.dump(self.processed_files, f, indent=2)
62
 
63
  def _get_file_hash(self, file_path: str) -> str:
 
64
  hash_md5 = hashlib.md5()
65
  with open(file_path, "rb") as f:
66
  for chunk in iter(lambda: f.read(4096), b""):
 
68
  return hash_md5.hexdigest()
69
 
70
  def _extract_text_from_pdf(self, pdf_path: str) -> str:
 
71
  text = ""
72
  try:
73
  with open(pdf_path, 'rb') as file:
 
86
  return text
87
 
88
  def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
89
  images_data = []
90
  try:
91
  self._debug_print("Image Extraction Started", f"File: {pdf_path}")
 
96
  for idx, image in enumerate(images):
97
  self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
98
 
 
99
  image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
100
  image.save(image_path)
101
  self._debug_print(f"Image {idx} Saved", str(image_path))
102
 
103
+ self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR...")
 
104
 
105
  try:
 
106
  ocr_text = pytesseract.image_to_string(image, lang='rus')
107
 
 
108
  ocr_text = ocr_text.strip()
109
 
110
  if not ocr_text or len(ocr_text) < 5:
 
129
  return images_data
130
 
131
  def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
 
132
  tables_data = []
133
  try:
134
  text = self._extract_text_from_pdf(pdf_path)
 
161
  return tables_data
162
 
163
  def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
 
164
  file_hash = self._get_file_hash(pdf_path)
165
  doc_id = Path(pdf_path).stem
166
 
167
  self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
168
 
 
169
  if doc_id in self.processed_files:
170
  if self.processed_files[doc_id] == file_hash:
171
+ self._debug_print("Status", f"File {doc_id} already processed")
172
  return self._load_extracted_data(doc_id)
173
 
174
  print(f"\n📄 Processing PDF: {doc_id}")
175
 
 
176
  text = self._extract_text_from_pdf(pdf_path)
177
  images = self._extract_images_from_pdf(pdf_path, doc_id)
178
  tables = self._extract_tables_from_pdf(pdf_path, doc_id)
179
 
 
180
  self._debug_print("Extraction Summary", {
181
  'text_length': len(text),
182
  'images_count': len(images),
 
184
  'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
185
  })
186
 
 
187
  self._save_extracted_data(doc_id, text, images, tables)
188
 
 
189
  self.processed_files[doc_id] = file_hash
190
  self._save_processed_files()
191
 
192
  return text, images, tables
193
 
194
  def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
 
195
  data = {
196
  'text': text,
197
  'images': images,
 
204
  self._debug_print("Data Saved", str(data_path))
205
 
206
  def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
 
207
  data_path = self.docstore_path / f"{doc_id}_data.json"
208
  try:
209
  with open(data_path, 'r', encoding='utf-8') as f:
 
213
  return "", [], []
214
 
215
  def get_all_documents(self) -> Dict:
 
216
  all_docs = {}
217
  for json_file in self.docstore_path.glob("*_data.json"):
218
  doc_id = json_file.stem.replace("_data", "")