triflix committed
Commit bf3efa4 · verified · 1 Parent(s): 795714c

Delete logiccode.py

Files changed (1)
  1. logiccode.py +0 -490
logiccode.py DELETED
@@ -1,490 +0,0 @@
#!/usr/bin/env python3
"""
OCR Document Verification with Batch Processing & Required Document Checklist

Usage:
    # Single file (backward compatible)
    python ocrupdated2.py --file image.jpg --inputkeywords "keyword1 keyword2" --fuzzy --debug

    # Multiple files with required document checklist
    python ocrupdated2.py --file doc1.pdf doc2.jpg doc3.png --inputkeywords "Shaikh Anisa Rahat" --required PAN HSC AgeNationalityDomicile --fuzzy --debug

NOTE: Use spaces to separate required document types, NOT commas:
    ✅ --required PAN Aadhaar HSC
    ❌ --required PAN, Aadhaar, HSC
"""

import argparse
import re
import os
import tempfile
from collections import defaultdict
from paddleocr import PaddleOCR
import difflib

# Optional PDF support
try:
    import fitz  # PyMuPDF
    PDF_SUPPORT = True
except ImportError:
    PDF_SUPPORT = False
    print("Warning: PyMuPDF not installed. PDF support disabled. Install with: pip install PyMuPDF")

# Keyword lists used to classify each document type
DOC_KEYWORDS = {
    "Aadhaar": [
        "uidai", "aadhaar", "aadhar", "government of india", "भारत सरकार",
        "आधार", "यूआईडीएआई", "प्रधानमंत्री", "जन्म तिथि", "पता", "लिंग",
        "unique identification authority", "aadhaar number", "enrollment number"
    ],
    "PAN": [
        "permanent account number", "income tax", "incometaxindia", "pan",
        "income tax department", "आयकर विभाग", "स्थायी खाता संख्या",
        "taxpayer", "father's name", "पिता का नाम", "signature", "inc"
    ],
    "Driving_License": [
        "driving licence", "motor vehicles act", "rto", "mcwg", "lmv",
        "transport department", "licence no", "valid till", "date of issue",
        "ड्राइविंग लाइसेंस", "परिवहन विभाग", "challan", "regional transport office"
    ],
    "Passport": [
        "passport", "republic of india", "ministry of external affairs",
        "passport number", "date of issue", "date of expiry", "surname",
        "given names", "nationality indian", "पासपोर्ट", "गणराज्य", "विदेश मंत्रालय",
        "consular", "visa"
    ],
    "SSC": [
        "secondary school certificate", "statement of marks", "ssc", "10th", "class x",
        "board of secondary education", "maharashtra state board", "matriculation",
        "roll number", "seat number", "subject code", "marks obtained", "grade", "pass"
    ],
    "HSC": [
        "higher secondary certificate", "statement of marks", "hsc", "12th", "class xii",
        "board of higher secondary education", "maharashtra state board", "intermediate",
        "stream", "science", "commerce", "arts", "marks obtained", "grade", "percentage"
    ],
    "AgeNationalityDomicile": [
        "certificate of age nationality and domicile", "domicile certificate",
        "age nationality domicile", "tehsildar", "executive magistrate", "collector",
        "certificate of residence", "domiciled in the state of", "citizen of india",
        "residence proof", "maharashtra domicile", "satara", "karad", "taluka", "district"
    ],
    "Ration_Card": [
        "ration card", "food and civil supplies", "apl", "bpl", "aay", "antyodaya",
        "ration card number", "family members", "head of family",
        "राशन कार्ड", "खाद्य पुरवठा", "नागरी पुरवठा विभाग", "fps", "fair price shop"
    ],
    "Cast_Certificate": [
        "CASTE CERTIFICATE",
        "FORM - 8",
        "Rule No. 5(6)",
        "De-Notified Tribe (Vimukt Jati)",
        "Nomadic Tribe/Other Backward Class",
        "Special Backward Category",
        "recognised as",
        "Government Resolution",
        "Sub Divisional Officer",
        "belonging to the State of Maharashtra"
    ],
    "Income_Certificate": [
        "१ वर्षासाठी उत्पन्नाचे प्रमाणपत्र",
        "ऑफिस ऑफ नायब तहसीलदार",
        "वार्षिक उत्पन्न",
        "मिळालेले १ वर्षाचे उत्पन्न",
        "कुटुंबातील सर्व सदस्यांचे",
        "प्रमाणित करण्यात येते की",
        "वैध राहील",
        "Signature valid",
        "Digitally Signed by"
    ],
    "PCM_Score_Card": [
        "MAH-MHT CET (PCM Group)",
        "State Common Entrance Test Cell",
        "Score Card",
        "Physics",
        "Chemistry",
        "Mathematics",
        "Total Percentile",
        "Normalization document",
        "Centralized Admission Process (CAP)",
        "IP address of the Computer"
    ]
}

# Validate keyword uniqueness (optional debug output)
_keyword_sets = {k: set(v) for k, v in DOC_KEYWORDS.items()}
for doc1 in DOC_KEYWORDS:
    for doc2 in DOC_KEYWORDS:
        if doc1 < doc2:
            overlap = _keyword_sets[doc1].intersection(_keyword_sets[doc2])
            if overlap:
                print(f"⚠️ Warning: Overlap between {doc1} and {doc2}: {overlap}")

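# Example (derived from the lists above): SSC and HSC share the keywords
# {"statement of marks", "maharashtra state board", "marks obtained", "grade"},
# so the loop above prints an overlap warning for that pair at import time.
# Overlapping keywords weaken classification, which is why the tie-breaker in
# calculate_doc_type() later scores only the keywords unique to each candidate.
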
def normalize_text(text):
    """Robust multilingual tokenization with noise filtering"""
    text = text.lower()
    # Extract Hindi Devanagari (2+ chars) OR English alphanumeric (3+ chars)
    tokens = re.findall(r'[\u0900-\u097F]{2,}|\w{3,}', text)

    # Remove common English stopwords
    stopwords = {'the', 'and', 'of', 'in', 'to', 'for', 'is', 'on', 'by', 'with', 'at', 'from', 'a', 'an', 'this'}
    tokens = [t for t in tokens if t not in stopwords]

    # Remove OCR noise (4+ consecutive consonants = garbage)
    noise_pattern = re.compile(r'^[b-df-hj-np-tv-xz]{4,}$')
    tokens = [t for t in tokens if not noise_pattern.match(t)]

    return tokens

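# Illustration of the tokenization rules above:
#   normalize_text("The Income Tax Department of India")
#   -> ['income', 'tax', 'department', 'india']
# ('the' is a stopword, 'of' is under three characters, and a junk token such
# as 'xkcdq' would be dropped by the consonant-run noise filter).
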
def pdf_to_images(pdf_path, max_pages=3):
    """Convert PDF pages to high-resolution temporary images"""
    if not PDF_SUPPORT:
        raise ValueError("PDF support not available. Install PyMuPDF")

    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    pages_to_process = min(total_pages, max_pages)

    image_paths = []
    temp_dir = tempfile.mkdtemp(prefix="ocr_pdf_")

    # NOTE: the rendering loop was missing from the committed file; this is a
    # minimal reconstruction using the standard PyMuPDF API (the 2x zoom
    # factor is an assumption).
    for page_num in range(pages_to_process):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
        image_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
        pix.save(image_path)
        image_paths.append(image_path)

    doc.close()
    return image_paths

def get_ocr_text(file_path, max_pages=3):
    """Run OCR on an image or PDF and return the recognized text lines.

    Reconstructed helper: main() calls it, but its definition was missing
    from the committed file. This sketch assumes the classic PaddleOCR
    result layout of [box, (text, confidence)] per detected line.
    """
    is_pdf = os.path.splitext(file_path)[1].lower() == '.pdf'
    image_paths = pdf_to_images(file_path, max_pages) if is_pdf else [file_path]

    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    texts = []
    for image_path in image_paths:
        result = ocr.ocr(image_path, cls=True)
        for page in result or []:
            for line in page or []:
                texts.append(line[1][0])  # line = [box, (text, confidence)]
    return texts

def fuzzy_match(token, target_set, threshold=0.8):
    """Match a token against a set of OCR tokens: exact match, then difflib
    close match, then substring, then Devanagari similarity.

    (Signature reconstructed from the call sites; the 0.8 threshold default
    is an assumption.)
    """
    # Exact match
    if token in target_set:
        return token

    # Levenshtein-style distance match
    matches = difflib.get_close_matches(token, target_set, n=1, cutoff=threshold)
    if matches:
        return matches[0]

    # Substring match (handles concatenated words)
    for ocr_token in target_set:
        if token in ocr_token or ocr_token in token:
            return ocr_token

    # Hindi-specific fuzzy matching (handles OCR errors like सत्पमेव → सत्यमेव)
    if any('\u0900' <= c <= '\u097F' for c in token):
        for ocr_token in target_set:
            if len(ocr_token) > 3:
                similarity = difflib.SequenceMatcher(None, token, ocr_token).ratio()
                if similarity > threshold:
                    return ocr_token

    return None

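# Example of the matching cascade (assuming the 0.8 default threshold above):
#   fuzzy_match("aadhaar", {"aadhar"})  -> "aadhar"   via the difflib step
#       (similarity 2*6/(7+6) ≈ 0.92 >= 0.8)
#   fuzzy_match("pan", {"pancard"})     -> "pancard"  via the substring step
#       (difflib ratio 2*3/(3+7) = 0.6 fails first, then "pan" in "pancard")
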
def calculate_doc_type(ocr_tokens, debug=False):
    """
    Document classification with tie-breaking that only compares documents
    that are actually tied (within a 5% score margin).
    """
    ocr_set = set(ocr_tokens)
    ocr_combined = " ".join(ocr_tokens)
    scores = {}

    for doc_type, keywords in DOC_KEYWORDS.items():
        kw_set = set(k.lower() for k in keywords)

        # Primary: exact/fuzzy token matches (weighted 2 for exact, 1.5 for fuzzy)
        primary_matches = sum(2 if kw in ocr_set else 1.5 if fuzzy_match(kw, ocr_set) else 0
                              for kw in kw_set)

        # Secondary: multi-word phrase matches in the combined text
        phrase_matches = sum(1 for kw in kw_set if " " in kw and kw in ocr_combined)

        # Tertiary: title keyword bonus (certificate, card, licence, passport)
        title_keywords = [kw for kw in kw_set if any(word in kw for word in ["certificate", "card", "licence", "passport"])]
        title_match = sum(1 for kw in title_keywords if kw in ocr_combined)

        # Weighted score (max possible = len(kw_set) * 2)
        max_possible = len(kw_set) * 2
        weighted_score = ((primary_matches + phrase_matches + title_match) / max_possible) * 100

        scores[doc_type] = weighted_score

        if debug:
            print(f"  {doc_type:<25}: {weighted_score:>6.1f}% ({primary_matches:.1f} + {phrase_matches} + {title_match})")

    # Sort by score descending
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    best_type, best_score = sorted_scores[0]

    # Only trigger tie-breaking when the top TWO scores are close (within 5%)
    if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
        if debug:
            print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")

        # Get ONLY the tied documents (within 5% of the top score)
        tied_docs = [(doc_type, score) for doc_type, score in sorted_scores
                     if (best_score - score) < 5]

        if debug:
            print(f"Tied documents: {[f'{doc}({score:.1f}%)' for doc, score in tied_docs]}")

        # Count matches on keywords unique to each tied document
        unique_counts = {}
        for doc_type, _ in tied_docs:
            kw_set = set(k.lower() for k in DOC_KEYWORDS[doc_type])

            # Keywords belonging to the OTHER tied documents only
            other_tied_keywords = set()
            for other_doc, _ in tied_docs:
                if other_doc != doc_type:
                    other_tied_keywords.update(k.lower() for k in DOC_KEYWORDS[other_doc])

            unique_keywords = kw_set - other_tied_keywords
            unique_matches = sum(1 for kw in unique_keywords if fuzzy_match(kw, ocr_set))
            unique_counts[doc_type] = unique_matches

            if debug:
                print(f"  {doc_type:<25}: {unique_matches} unique matches ({len(unique_keywords)} available)")

        # Only apply the tie-breaker if there is a clear winner
        if unique_counts and max(unique_counts.values()) > 0:
            sorted_unique = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
            if len(sorted_unique) > 1 and sorted_unique[0][1] > sorted_unique[1][1]:
                best_type = sorted_unique[0][0]
                best_score = scores[best_type]

                if debug:
                    print(f"✓ Tie broken: {best_type} wins with {unique_counts[best_type]} unique matches")

    return best_type, best_score

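# Worked example of the weighting above: PAN has 12 keywords, so
# max_possible = 24. A scan matching 5 PAN keywords exactly and 2 more
# fuzzily, with no phrase or title bonuses, scores
# (5*2 + 2*1.5) / 24 * 100 ≈ 54.2%, and tie-breaking runs only if the
# runner-up document lands within 5 points of that.
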
def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
    """
    Sequence-aware matching for multi-keyword inputs (names, addresses).
    Checks whether the keywords appear consecutively in the OCR text first,
    then falls back to individual keyword matching.
    """
    ocr_set = set(ocr_tokens)
    ocr_combined = " ".join(ocr_tokens)
    results = []

    # For multi-keyword inputs, check for a SEQUENCE match first
    if len(user_keywords) > 1:
        # Build the phrase as it should appear in OCR (lowercase ASCII keywords)
        user_phrase = " ".join([kw.lower() if all(ord(c) < 128 for c in kw) else kw for kw in user_keywords])

        # Check whether the entire phrase exists in the OCR text
        if user_phrase in ocr_combined:
            if args.debug:
                print(f"\n✓ Sequence match: '{user_phrase}' found in OCR text")
            # All keywords matched in the correct order
            for kw in user_keywords:
                results.append({
                    'keyword': kw,
                    'matched': True,
                    'matched_text': kw
                })
            return results

        # Fuzzy phrase matching if enabled
        if use_fuzzy:
            # Create n-grams from the OCR tokens matching the keyword count
            n = len(user_keywords)
            ocr_phrases = [" ".join(ocr_tokens[i:i+n]) for i in range(len(ocr_tokens) - n + 1)]

            phrase_match = fuzzy_match(user_phrase, set(ocr_phrases))
            if phrase_match:
                if args.debug:
                    print(f"\n✓ Fuzzy sequence match: '{user_phrase}' ~ '{phrase_match}'")
                for kw in user_keywords:
                    results.append({
                        'keyword': kw,
                        'matched': True,
                        'matched_text': kw
                    })
                return results

    # Fallback to individual keyword matching
    for kw in user_keywords:
        kw_processed = kw.lower() if all(ord(c) < 128 for c in kw) else kw
        matched = False
        matched_text = None

        if kw_processed in ocr_set:
            matched = True
            matched_text = kw_processed
        elif " " in kw_processed and kw_processed in ocr_combined:
            matched = True
            matched_text = kw_processed
        elif use_fuzzy:
            matched_text = fuzzy_match(kw_processed, ocr_set)
            if matched_text:
                matched = True

        results.append({
            'keyword': kw,
            'matched': matched,
            # matched_text is always set whenever matched is True
            'matched_text': matched_text if matched else None
        })

    return results

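# Example of the sequence branch: --inputkeywords "Shaikh Anisa Rahat" yields
# user_keywords ['Shaikh', 'Anisa', 'Rahat']. If the normalized OCR stream
# contains "... shaikh anisa rahat ...", the phrase check marks all three
# keywords matched at once; otherwise each keyword falls back to individual
# (and, with --fuzzy, approximate) matching.
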
def main():
    parser = argparse.ArgumentParser(description='OCR Document Verification with PDF Support')
    parser.add_argument('--file', nargs='+', required=True, help='Paths to image or PDF files')
    parser.add_argument('--inputkeywords', required=True, help='Space-separated keywords to verify')
    parser.add_argument('--required', nargs='+', help='Required document types (space-separated, e.g., PAN Aadhaar HSC)')
    parser.add_argument('--fuzzy', action='store_true', help='Enable fuzzy matching')
    parser.add_argument('--debug', action='store_true', help='Show detailed OCR and scoring output')
    parser.add_argument('--pages', type=int, default=3, help='Max pages to process for PDFs (default: 3)')
    global args  # verify_keywords() reads args.debug through this module-level global
    args = parser.parse_args()

    # Clean the required list by splitting on commas and stripping whitespace,
    # so comma-separated input still works despite the usage note
    required_list = []
    if args.required:
        for item in args.required:
            # Split on commas and strip whitespace from each part
            parts = [part.strip() for part in item.split(',') if part.strip()]
            required_list.extend(parts)

    required_set = set(required_list)

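    # Example: --required "PAN, Aadhaar" HSC ends up as
    # required_set = {'PAN', 'Aadhaar', 'HSC'} after the comma cleanup above.
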
    # Process each file and collect results
    file_results = []
    found_documents = set()
    all_matched_keywords_per_file = []

    print(f"\n{'='*60}")
    print(f"PROCESSING {len(args.file)} FILES")
    print(f"{'='*60}\n")

    for idx, file_path in enumerate(args.file, 1):
        print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")

        # Extract text from the file
        ocr_texts = get_ocr_text(file_path, args.pages)

        if not ocr_texts:
            print(f"⚠️ No text extracted from {file_path}\n")
            file_results.append({
                'file': file_path,
                'doc_type': 'Unknown',
                'doc_score': 0,
                'keywords_matched': [],
                'status': 'ERROR'
            })
            continue

        # Debug: show raw OCR
        if args.debug:
            print("\n" + "="*60)
            print("RAW OCR EXTRACTED TEXT:")
            print("="*60)
            for i, text in enumerate(ocr_texts, 1):
                print(f"{i:3d}. {text}")
            print("="*60 + "\n")

        # Normalize tokens
        ocr_tokens = normalize_text(" ".join(ocr_texts))

        # Debug: show normalized tokens
        if args.debug:
            print("="*60)
            print("NORMALIZED TOKENS:")
            print("="*60)
            print(f"Total tokens: {len(ocr_tokens)}")
            print(f"First 50 tokens: {', '.join(ocr_tokens[:50])}{'...' if len(ocr_tokens) > 50 else ''}")
            print("="*60 + "\n")

        # Document classification
        if args.debug:
            print("="*60)
            print("DOCUMENT TYPE SCORING:")
            print("="*60)

        doc_type, doc_score = calculate_doc_type(ocr_tokens, debug=args.debug)
        found_documents.add(doc_type)

        if args.debug:
            print("="*60 + "\n")

        # Keyword verification
        user_keywords = [kw.strip() for kw in args.inputkeywords.split()]
        verification_results = verify_keywords(ocr_tokens, user_keywords, args.fuzzy)

        # Status: ALL keywords must match in this file
        all_matched = all(r['matched'] for r in verification_results)
        status = "VERIFIED" if all_matched else "NOT VERIFIED"

        # Store results for this file
        file_results.append({
            'file': file_path,
            'doc_type': doc_type,
            'doc_score': doc_score,
            'keywords_matched': verification_results,
            'status': status,
            'all_keywords_matched': all_matched
        })

        # Track which keywords were matched in this file
        matched_keywords_in_file = {r['keyword'] for r in verification_results if r['matched']}
        all_matched_keywords_per_file.append(matched_keywords_in_file)

        # Per-file output
        print(f"\n{'='*60}")
        print(f"Document Type: {doc_type} ({doc_score:.1f}% confidence)")
        print(f"{'='*60}")
        print(f"{'Keyword':<25} | {'Status':<10} | {'Matched Text'}")
        print(f"{'-'*60}")

        for r in verification_results:
            status_icon = "✓" if r['matched'] else "✗"
            matched_text = r['matched_text'] if r['matched_text'] else "Not found"
            print(f"{r['keyword']:<25} | {status_icon:<10} | {matched_text}")

        print(f"{'='*60}")
        print(f"File Status: {status}")
        print(f"{'='*60}\n")

    # FINAL SUMMARY
    print(f"\n{'='*60}")
    print(f"FINAL SUMMARY")
    print(f"{'='*60}")

    # Required documents check
    if required_set:
        missing_docs = required_set - found_documents

        print(f"\nRequired Documents: {', '.join(sorted(required_set))}")
        print(f"Found Documents: {', '.join(sorted(found_documents)) if found_documents else 'None'}")

        if missing_docs:
            print(f"❌ Missing Documents: {', '.join(sorted(missing_docs))}")
            docs_status = "NOT VERIFIED"
        else:
            print(f"✅ All required documents found!")
            docs_status = "VERIFIED"
    else:
        docs_status = "N/A (no required list specified)"
        missing_docs = set()

    # Overall keyword verification across ALL files:
    # every keyword must appear in at least one file
    all_user_keywords = set(args.inputkeywords.split())
    keywords_found_across_files = set()

    for file_keyword_set in all_matched_keywords_per_file:
        keywords_found_across_files.update(file_keyword_set)

    missing_keywords = all_user_keywords - keywords_found_across_files

    print(f"\nKeywords to Find: {', '.join(sorted(all_user_keywords))}")
    print(f"Keywords Found (across all files): {', '.join(sorted(keywords_found_across_files)) if keywords_found_across_files else 'None'}")

    if missing_keywords:
        print(f"❌ Missing Keywords: {', '.join(sorted(missing_keywords))}")
        keywords_status = "NOT VERIFIED"
    else:
        print(f"✅ All keywords found across uploaded documents!")
        keywords_status = "VERIFIED"

    # Overall status: BOTH documents and keywords must be verified
    overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"

    print(f"\n{'='*60}")
    print(f"Documents Status: {docs_status}")
    print(f"Keywords Status: {keywords_status}")
    print(f"OVERALL STATUS: {overall_status}")
    print(f"{'='*60}")

if __name__ == "__main__":
    main()