triflix committed on
Commit
f8f2a5f
·
verified ·
1 Parent(s): 6f9d4a7

Update logiccode.py

Browse files
Files changed (1) hide show
  1. logiccode.py +64 -35
logiccode.py CHANGED
@@ -20,6 +20,7 @@ from paddleocr import PaddleOCR
20
  import difflib
21
  from concurrent.futures import ThreadPoolExecutor
22
  import multiprocessing
 
23
 
24
  # Optional PDF support
25
  try:
@@ -165,13 +166,17 @@ def pdf_to_images(pdf_path, max_pages=3):
165
 
166
  def process_page_ocr(img_path, page_num, ocr, debug):
167
  """Process a single page with OCR (for parallel execution)"""
168
- if debug:
169
- print(f"\n--- Processing PDF Page {page_num} ---")
170
- result = ocr.predict(input=img_path)
171
- texts = []
172
- for res in result:
173
- texts.extend(res['rec_texts'])
174
- return texts
 
 
 
 
175
 
176
  def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
177
  """Process image or PDF with OCR, returning all extracted text lines"""
@@ -187,26 +192,23 @@ def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
187
  image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
188
  print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")
189
 
190
- # NEW: Process pages in parallel with ThreadPoolExecutor
191
- max_workers = min(len(image_paths), 4) # Max 4 parallel pages
192
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
193
- # Submit all pages
194
  future_to_page = {
195
  executor.submit(process_page_ocr, img_path, i+1, ocr, debug): i
196
  for i, img_path in enumerate(image_paths)
197
  }
198
 
199
- # Collect results in order
200
  page_results = [None] * len(image_paths)
201
  for future in future_to_page:
202
  page_idx = future_to_page[future]
203
  try:
204
  page_results[page_idx] = future.result()
205
  except Exception as e:
206
- print(f"Error processing page {page_idx+1}: {e}")
207
  page_results[page_idx] = []
208
 
209
- # Combine results in correct order
210
  for texts in page_results:
211
  all_texts.extend(texts)
212
  else:
@@ -214,6 +216,9 @@ def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
214
  for res in result:
215
  all_texts.extend(res['rec_texts'])
216
 
 
 
 
217
  finally:
218
  if temp_dir and os.path.exists(temp_dir):
219
  import shutil
@@ -261,7 +266,7 @@ def calculate_doc_type(ocr_tokens, debug=False):
261
  ocr_combined = " ".join(ocr_tokens)
262
  scores = {}
263
 
264
- # NEW: Pre-calculate keyword sets once
265
  doc_keyword_sets = {}
266
  for doc_type, keywords in DOC_KEYWORDS.items():
267
  doc_keyword_sets[doc_type] = set(k.lower() for k in keywords)
@@ -295,7 +300,7 @@ def calculate_doc_type(ocr_tokens, debug=False):
295
  sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
296
  best_type, best_score = sorted_scores[0]
297
 
298
- # Tie-breaking logic (unchanged)
299
  if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
300
  if debug:
301
  print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")
@@ -335,7 +340,7 @@ def calculate_doc_type(ocr_tokens, debug=False):
335
 
336
  def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
337
  """
338
- FIXED: Sequence-aware matching for multi-keyword inputs.
339
  Checks if keywords appear consecutively in OCR text first.
340
  """
341
  ocr_set = set(ocr_tokens)
@@ -413,16 +418,25 @@ def main():
413
 
414
  required_set = set(required_list)
415
 
416
- # NEW: Initialize OCR once, reuse for all files
417
  print("Initializing OCR engine (first run may take a few seconds)...")
418
- ocr_engine = PaddleOCR(
419
- lang="mr",
420
- use_doc_orientation_classify=False,
421
- use_doc_unwarping=False,
422
- use_textline_orientation=False,
423
- max_batch_size=16, # Process multiple images in parallel
424
- num_workers=min(4, multiprocessing.cpu_count()), # CPU workers for preprocessing
425
- )
 
 
 
 
 
 
 
 
 
426
 
427
  # Process each file and collect results
428
  file_results = []
@@ -436,11 +450,28 @@ def main():
436
  for idx, file_path in enumerate(args.file, 1):
437
  print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")
438
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  # Extract text from file
440
  ocr_texts = get_ocr_text(file_path, ocr_engine, args.pages, args.debug)
441
 
442
  if not ocr_texts:
443
- print(f"⚠️ No text extracted from {file_path}\n")
 
 
 
 
 
444
  file_results.append({
445
  'file': file_path,
446
  'doc_type': 'Unknown',
@@ -450,18 +481,16 @@ def main():
450
  })
451
  continue
452
 
453
- # Debug: Show raw OCR
454
- if args.debug:
455
- print("\n" + "="*60)
456
- print("RAW OCR EXTRACTED TEXT:")
457
- print("="*60)
458
- for i, text in enumerate(ocr_texts, 1):
459
- print(f"{i:3d}. {text}")
460
- print("="*60 + "\n")
461
 
462
  # Normalize tokens
463
  ocr_tokens = normalize_text(" ".join(ocr_texts))
464
 
 
 
 
465
  # Debug: Show normalized tokens
466
  if args.debug:
467
  print("="*60)
@@ -562,7 +591,7 @@ def main():
562
  print(f"✅ All keywords found across uploaded documents!")
563
  keywords_status = "VERIFIED"
564
 
565
- # Overall status: BOTH documents and keywords must be verified
566
  overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"
567
 
568
  print(f"\n{'='*60}")
 
20
  import difflib
21
  from concurrent.futures import ThreadPoolExecutor
22
  import multiprocessing
23
+ import sys
24
 
25
  # Optional PDF support
26
  try:
 
166
 
167
  def process_page_ocr(img_path, page_num, ocr, debug):
168
  """Process a single page with OCR (for parallel execution)"""
169
+ try:
170
+ if debug:
171
+ print(f"\n--- Processing PDF Page {page_num} ---")
172
+ result = ocr.predict(input=img_path)
173
+ texts = []
174
+ for res in result:
175
+ texts.extend(res['rec_texts'])
176
+ return texts
177
+ except Exception as e:
178
+ print(f"❌ ERROR: OCR failed on page {page_num}: {str(e)}")
179
+ return []
180
 
181
  def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
182
  """Process image or PDF with OCR, returning all extracted text lines"""
 
192
  image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
193
  print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")
194
 
195
+ # Process pages in parallel
196
+ max_workers = min(len(image_paths), 4)
197
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
 
198
  future_to_page = {
199
  executor.submit(process_page_ocr, img_path, i+1, ocr, debug): i
200
  for i, img_path in enumerate(image_paths)
201
  }
202
 
 
203
  page_results = [None] * len(image_paths)
204
  for future in future_to_page:
205
  page_idx = future_to_page[future]
206
  try:
207
  page_results[page_idx] = future.result()
208
  except Exception as e:
209
+ print(f" ERROR: Failed to process page {page_idx+1}: {str(e)}")
210
  page_results[page_idx] = []
211
 
 
212
  for texts in page_results:
213
  all_texts.extend(texts)
214
  else:
 
216
  for res in result:
217
  all_texts.extend(res['rec_texts'])
218
 
219
+ except Exception as e:
220
+ print(f"❌ ERROR: Failed to process file {file_path}: {str(e)}")
221
+ return []
222
  finally:
223
  if temp_dir and os.path.exists(temp_dir):
224
  import shutil
 
266
  ocr_combined = " ".join(ocr_tokens)
267
  scores = {}
268
 
269
+ # Pre-calculate keyword sets once
270
  doc_keyword_sets = {}
271
  for doc_type, keywords in DOC_KEYWORDS.items():
272
  doc_keyword_sets[doc_type] = set(k.lower() for k in keywords)
 
300
  sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
301
  best_type, best_score = sorted_scores[0]
302
 
303
+ # Tie-breaking logic
304
  if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
305
  if debug:
306
  print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")
 
340
 
341
  def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
342
  """
343
+ Sequence-aware matching for multi-keyword inputs.
344
  Checks if keywords appear consecutively in OCR text first.
345
  """
346
  ocr_set = set(ocr_tokens)
 
418
 
419
  required_set = set(required_list)
420
 
421
+ # Initialize OCR once, reuse for all files
422
  print("Initializing OCR engine (first run may take a few seconds)...")
423
+ try:
424
+ ocr_engine = PaddleOCR(
425
+ lang="mr",
426
+ use_doc_orientation_classify=False,
427
+ use_doc_unwarping=False,
428
+ use_textline_orientation=False,
429
+ max_batch_size=16,
430
+ num_workers=min(4, multiprocessing.cpu_count()),
431
+ )
432
+ # Test if OCR is working
433
+ test_result = ocr_engine.predict(input="")
434
+ if not test_result:
435
+ print("⚠️ WARNING: OCR engine test returned empty result. Models may not be loaded correctly.")
436
+ except Exception as e:
437
+ print(f"❌ CRITICAL ERROR: Failed to initialize OCR engine: {str(e)}")
438
+ print("Please ensure PaddleOCR is installed correctly and models are downloaded.")
439
+ sys.exit(1)
440
 
441
  # Process each file and collect results
442
  file_results = []
 
450
  for idx, file_path in enumerate(args.file, 1):
451
  print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")
452
 
453
+ # Check if file exists
454
+ if not os.path.exists(file_path):
455
+ print(f"❌ ERROR: File not found: {file_path}\n")
456
+ file_results.append({
457
+ 'file': file_path,
458
+ 'doc_type': 'Unknown',
459
+ 'doc_score': 0,
460
+ 'keywords_matched': [],
461
+ 'status': 'ERROR'
462
+ })
463
+ continue
464
+
465
  # Extract text from file
466
  ocr_texts = get_ocr_text(file_path, ocr_engine, args.pages, args.debug)
467
 
468
  if not ocr_texts:
469
+ print(f"⚠️ No text extracted from {file_path}")
470
+ print(" Possible causes:")
471
+ print(" - File is corrupted or empty")
472
+ print(" - OCR engine failed to process the file")
473
+ print(" - Text is not in supported language/format")
474
+ print(" Try running with --debug flag to see detailed OCR output\n")
475
  file_results.append({
476
  'file': file_path,
477
  'doc_type': 'Unknown',
 
481
  })
482
  continue
483
 
484
+ # Show OCR summary even without debug if text is very short
485
+ if len(ocr_texts) < 5 and not args.debug:
486
+ print(f" ℹ️ Only {len(ocr_texts)} lines of text extracted. Run with --debug to see details.")
 
 
 
 
 
487
 
488
  # Normalize tokens
489
  ocr_tokens = normalize_text(" ".join(ocr_texts))
490
 
491
+ # Show token count
492
+ print(f" Extracted {len(ocr_tokens)} valid tokens from OCR text")
493
+
494
  # Debug: Show normalized tokens
495
  if args.debug:
496
  print("="*60)
 
591
  print(f"✅ All keywords found across uploaded documents!")
592
  keywords_status = "VERIFIED"
593
 
594
+ # Overall status
595
  overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"
596
 
597
  print(f"\n{'='*60}")