Spaces:

triflix
/

DocumentVerification

Paused

App Files Community

triflix committed on Nov 19, 2025

Commit

f8f2a5f

verified ·

1 Parent(s): 6f9d4a7

Update logiccode.py

Browse files

Files changed (1) hide show

logiccode.py +64 -35

logiccode.py CHANGED Viewed

@@ -20,6 +20,7 @@ from paddleocr import PaddleOCR
 import difflib
 from concurrent.futures import ThreadPoolExecutor
 import multiprocessing
 # Optional PDF support
 try:
@@ -165,13 +166,17 @@ def pdf_to_images(pdf_path, max_pages=3):
 def process_page_ocr(img_path, page_num, ocr, debug):
     """Process a single page with OCR (for parallel execution)"""
-    if debug:
-        print(f"\n--- Processing PDF Page {page_num} ---")
-    result = ocr.predict(input=img_path)
-    texts = []
-    for res in result:
-        texts.extend(res['rec_texts'])
-    return texts
 def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
     """Process image or PDF with OCR, returning all extracted text lines"""
@@ -187,26 +192,23 @@ def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
             image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
             print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")
-            # NEW: Process pages in parallel with ThreadPoolExecutor
-            max_workers = min(len(image_paths), 4)  # Max 4 parallel pages
             with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                # Submit all pages
                 future_to_page = {
                     executor.submit(process_page_ocr, img_path, i+1, ocr, debug): i
                     for i, img_path in enumerate(image_paths)
                 }
-                # Collect results in order
                 page_results = [None] * len(image_paths)
                 for future in future_to_page:
                     page_idx = future_to_page[future]
                     try:
                         page_results[page_idx] = future.result()
                     except Exception as e:
-                        print(f"Error processing page {page_idx+1}: {e}")
                         page_results[page_idx] = []
-                # Combine results in correct order
                 for texts in page_results:
                     all_texts.extend(texts)
         else:
@@ -214,6 +216,9 @@ def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
             for res in result:
                 all_texts.extend(res['rec_texts'])
     finally:
         if temp_dir and os.path.exists(temp_dir):
             import shutil
@@ -261,7 +266,7 @@ def calculate_doc_type(ocr_tokens, debug=False):
     ocr_combined = " ".join(ocr_tokens)
     scores = {}
-    # NEW: Pre-calculate keyword sets once
     doc_keyword_sets = {}
     for doc_type, keywords in DOC_KEYWORDS.items():
         doc_keyword_sets[doc_type] = set(k.lower() for k in keywords)
@@ -295,7 +300,7 @@ def calculate_doc_type(ocr_tokens, debug=False):
     sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
     best_type, best_score = sorted_scores[0]
-    # Tie-breaking logic (unchanged)
     if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
         if debug:
             print(f"\n⚠️  Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")
@@ -335,7 +340,7 @@ def calculate_doc_type(ocr_tokens, debug=False):
 def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
     """
-    FIXED: Sequence-aware matching for multi-keyword inputs.
     Checks if keywords appear consecutively in OCR text first.
     """
     ocr_set = set(ocr_tokens)
@@ -413,16 +418,25 @@ def main():
     required_set = set(required_list)
-    # NEW: Initialize OCR once, reuse for all files
     print("Initializing OCR engine (first run may take a few seconds)...")
-    ocr_engine = PaddleOCR(
-        lang="mr",
-        use_doc_orientation_classify=False,
-        use_doc_unwarping=False,
-        use_textline_orientation=False,
-        max_batch_size=16,  # Process multiple images in parallel
-        num_workers=min(4, multiprocessing.cpu_count()),  # CPU workers for preprocessing
-    )
     # Process each file and collect results
     file_results = []
@@ -436,11 +450,28 @@ def main():
     for idx, file_path in enumerate(args.file, 1):
         print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")
         # Extract text from file
         ocr_texts = get_ocr_text(file_path, ocr_engine, args.pages, args.debug)
         if not ocr_texts:
-            print(f"⚠️  No text extracted from {file_path}\n")
             file_results.append({
                 'file': file_path,
                 'doc_type': 'Unknown',
@@ -450,18 +481,16 @@ def main():
             })
             continue
-        # Debug: Show raw OCR
-        if args.debug:
-            print("\n" + "="*60)
-            print("RAW OCR EXTRACTED TEXT:")
-            print("="*60)
-            for i, text in enumerate(ocr_texts, 1):
-                print(f"{i:3d}. {text}")
-            print("="*60 + "\n")
         # Normalize tokens
         ocr_tokens = normalize_text(" ".join(ocr_texts))
         # Debug: Show normalized tokens
         if args.debug:
             print("="*60)
@@ -562,7 +591,7 @@ def main():
         print(f"✅ All keywords found across uploaded documents!")
         keywords_status = "VERIFIED"
-    # Overall status: BOTH documents and keywords must be verified
     overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"
     print(f"\n{'='*60}")

 import difflib
 from concurrent.futures import ThreadPoolExecutor
 import multiprocessing
+import sys
 # Optional PDF support
 try:
 def process_page_ocr(img_path, page_num, ocr, debug):
     """Process a single page with OCR (for parallel execution)"""
+    try:
+        if debug:
+            print(f"\n--- Processing PDF Page {page_num} ---")
+        result = ocr.predict(input=img_path)
+        texts = []
+        for res in result:
+            texts.extend(res['rec_texts'])
+        return texts
+    except Exception as e:
+        print(f"❌ ERROR: OCR failed on page {page_num}: {str(e)}")
+        return []
 def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
     """Process image or PDF with OCR, returning all extracted text lines"""
             image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
             print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")
+            # Process pages in parallel
+            max_workers = min(len(image_paths), 4)
             with ThreadPoolExecutor(max_workers=max_workers) as executor:
                 future_to_page = {
                     executor.submit(process_page_ocr, img_path, i+1, ocr, debug): i
                     for i, img_path in enumerate(image_paths)
                 }
                 page_results = [None] * len(image_paths)
                 for future in future_to_page:
                     page_idx = future_to_page[future]
                     try:
                         page_results[page_idx] = future.result()
                     except Exception as e:
+                        print(f"❌ ERROR: Failed to process page {page_idx+1}: {str(e)}")
                         page_results[page_idx] = []
                 for texts in page_results:
                     all_texts.extend(texts)
         else:
             for res in result:
                 all_texts.extend(res['rec_texts'])
+    except Exception as e:
+        print(f"❌ ERROR: Failed to process file {file_path}: {str(e)}")
+        return []
     finally:
         if temp_dir and os.path.exists(temp_dir):
             import shutil
     ocr_combined = " ".join(ocr_tokens)
     scores = {}
+    # Pre-calculate keyword sets once
     doc_keyword_sets = {}
     for doc_type, keywords in DOC_KEYWORDS.items():
         doc_keyword_sets[doc_type] = set(k.lower() for k in keywords)
     sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
     best_type, best_score = sorted_scores[0]
+    # Tie-breaking logic
     if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
         if debug:
             print(f"\n⚠️  Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")
 def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
     """
+    Sequence-aware matching for multi-keyword inputs.
     Checks if keywords appear consecutively in OCR text first.
     """
     ocr_set = set(ocr_tokens)
     required_set = set(required_list)
+    # Initialize OCR once, reuse for all files
     print("Initializing OCR engine (first run may take a few seconds)...")
+    try:
+        ocr_engine = PaddleOCR(
+            lang="mr",
+            use_doc_orientation_classify=False,
+            use_doc_unwarping=False,
+            use_textline_orientation=False,
+            max_batch_size=16,
+            num_workers=min(4, multiprocessing.cpu_count()),
+        )
+        # Test if OCR is working
+        test_result = ocr_engine.predict(input="")
+        if not test_result:
+            print("⚠️  WARNING: OCR engine test returned empty result. Models may not be loaded correctly.")
+    except Exception as e:
+        print(f"❌ CRITICAL ERROR: Failed to initialize OCR engine: {str(e)}")
+        print("Please ensure PaddleOCR is installed correctly and models are downloaded.")
+        sys.exit(1)
     # Process each file and collect results
     file_results = []
     for idx, file_path in enumerate(args.file, 1):
         print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")
+        # Check if file exists
+        if not os.path.exists(file_path):
+            print(f"❌ ERROR: File not found: {file_path}\n")
+            file_results.append({
+                'file': file_path,
+                'doc_type': 'Unknown',
+                'doc_score': 0,
+                'keywords_matched': [],
+                'status': 'ERROR'
+            })
+            continue
         # Extract text from file
         ocr_texts = get_ocr_text(file_path, ocr_engine, args.pages, args.debug)
         if not ocr_texts:
+            print(f"⚠️  No text extracted from {file_path}")
+            print("   Possible causes:")
+            print("   - File is corrupted or empty")
+            print("   - OCR engine failed to process the file")
+            print("   - Text is not in supported language/format")
+            print("   Try running with --debug flag to see detailed OCR output\n")
             file_results.append({
                 'file': file_path,
                 'doc_type': 'Unknown',
             })
             continue
+        # Show OCR summary even without debug if text is very short
+        if len(ocr_texts) < 5 and not args.debug:
+            print(f"   ℹ️  Only {len(ocr_texts)} lines of text extracted. Run with --debug to see details.")
         # Normalize tokens
         ocr_tokens = normalize_text(" ".join(ocr_texts))
+        # Show token count
+        print(f"   Extracted {len(ocr_tokens)} valid tokens from OCR text")
         # Debug: Show normalized tokens
         if args.debug:
             print("="*60)
         print(f"✅ All keywords found across uploaded documents!")
         keywords_status = "VERIFIED"
+    # Overall status
     overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"
     print(f"\n{'='*60}")