Upload 2 files

- app.py +294 -0
- logiccode.py +549 -0

app.py
ADDED
@@ -0,0 +1,294 @@
import gradio as gr
import os
import pandas as pd
import shutil
import sys

# ---------------------------------------------------------
# IMPORT LOGICCODE
# ---------------------------------------------------------
# We expect logiccode.py to be in the same directory
try:
    import logiccode
except ImportError as e:
    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
    sys.exit(1)

# ---------------------------------------------------------
# MOCK ARGUMENTS
# ---------------------------------------------------------
# This class mimics the argparse namespace that logiccode expects
class MockArgs:
    def __init__(self):
        self.debug = False
        self.pages = 3
        self.file = []
        self.inputkeywords = ""
        self.required = []
        self.fuzzy = True
        self.visualize = False

# Initialize args in logiccode if not already present
if not hasattr(logiccode, 'args'):
    logiccode.args = MockArgs()

# ---------------------------------------------------------
# CORE PROCESSING FUNCTION
# ---------------------------------------------------------
def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
    """
    Process uploaded files using the imported logiccode module.
    """
    # 1. Update global args in logiccode based on UI inputs
    logiccode.args.debug = debug_enabled
    logiccode.args.fuzzy = fuzzy_match_enabled

    # Initialize output containers
    results = []
    gallery_images = []
    logs = []

    # Parse keywords
    user_keywords = [kw.strip() for kw in keywords_input.split() if kw.strip()]

    # Track found documents for the "Required" check
    found_documents = set()
    all_matched_keywords_per_file = []

    if not files:
        return "<h3>⚠️ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."

    logs.append(f"Starting processing of {len(files)} files...")
    logs.append(f"Target Keywords: {user_keywords}")
    logs.append(f"Required Documents: {required_docs}")

    # 2. Iterate through uploaded files
    for file_obj in files:
        file_path = file_obj.name
        filename = os.path.basename(file_path)

        logs.append(f"\n--- Processing: {filename} ---")

        # --- A. Generate Previews for Gallery ---
        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
            gallery_images.append((file_path, filename))

        elif file_path.lower().endswith('.pdf'):
            try:
                # Use logiccode's utility to get a preview of the 1st page
                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
                if preview_pages:
                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
                    logs.append(f"Generated PDF preview for {filename}")
            except Exception as e:
                logs.append(f"⚠️ PDF Preview failed for {filename}: {e}")

        # --- B. Text Extraction & Analysis ---
        try:
            # Extract text (logiccode handles PDF vs image internally)
            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)

            if not ocr_texts:
                logs.append(f"⚠️ Warning: No text extracted from {filename}")
                results.append({
                    "File": filename, "Type": "Unreadable", "Score": 0,
                    "Status": "FAILED", "Matched Keywords": ""
                })
                continue

            # Normalize text
            full_text = " ".join(ocr_texts)
            ocr_tokens = logiccode.normalize_text(full_text)

            # Classify document
            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
            found_documents.add(doc_type)
            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")

            # Verify keywords
            # logiccode.verify_keywords returns [{'keyword': 'x', 'matched': True/False, ...}]
            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)

            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
            all_matched_keywords_per_file.append(set(matched_kws))

            # Determine file status
            # If keywords were provided, all of them must match for "VERIFIED"
            if user_keywords:
                file_status = "VERIFIED" if len(matched_kws) == len(user_keywords) else "PARTIAL"
                if not matched_kws:
                    file_status = "FAILED"
            else:
                file_status = "INFO ONLY"

            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")

            results.append({
                "File": filename,
                "Type": doc_type,
                "Score": f"{doc_score:.1f}%",
                "Status": file_status,
                "Matched Keywords": ", ".join(matched_kws)
            })

        except Exception as e:
            error_msg = f"Error processing {filename}: {e}"
            logs.append(error_msg)
            if debug_enabled:
                import traceback
                logs.append(traceback.format_exc())

            results.append({
                "File": filename, "Type": "Error", "Score": 0,
                "Status": "ERROR", "Matched Keywords": str(e)
            })

    # 3. Calculate summary logic
    required_set = set(required_docs)
    missing_docs = required_set - found_documents

    all_user_keywords = set(user_keywords)
    keywords_found_across_all_files = set()
    for file_kw_set in all_matched_keywords_per_file:
        keywords_found_across_all_files.update(file_kw_set)

    missing_keywords = all_user_keywords - keywords_found_across_all_files

    # 4. Build HTML report
    return build_html_summary(required_set, missing_docs, missing_keywords), gallery_images, pd.DataFrame(results), "\n".join(logs)

def build_html_summary(required_set, missing_docs, missing_keywords):
    html = """
    <div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>
        <h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>
    """

    # Document status
    doc_status_bool = True
    if required_set:
        if missing_docs:
            doc_status_bool = False
            html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Documents:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_docs))}</span></div>"
        else:
            html += "<div style='margin-bottom: 8px;'>✅ <b>Documents:</b> All required types found.</div>"
    else:
        html += "<div style='margin-bottom: 8px; color: #666;'>ℹ️ No specific document types required.</div>"

    # Keyword status
    kw_status_bool = True
    if missing_keywords:
        kw_status_bool = False
        html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Keywords:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_keywords))}</span></div>"
    else:
        html += "<div style='margin-bottom: 8px;'>✅ <b>Keywords:</b> All keywords found.</div>"

    # Final status
    overall_color = "#10b981" if (doc_status_bool and kw_status_bool) else "#ef4444"
    overall_text = "VERIFIED" if (doc_status_bool and kw_status_bool) else "ACTION REQUIRED"

    html += "<hr style='margin: 15px 0; border-color: #eee;'>"
    html += f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>"
    html += "</div>"
    return html

# ---------------------------------------------------------
# GRADIO UI SETUP
# ---------------------------------------------------------
theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
).set(
    body_background_fill="#f9fafb",
    block_background_fill="white",
    block_border_width="1px"
)

with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
    gr.Markdown(
        """
        # 📄 Intelligent Document Verification
        Upload documents, specify required types, and verify content matches automatically.
        """
    )

    with gr.Row():
        # Left column: inputs
        with gr.Column(scale=4):
            files_input = gr.File(
                file_count="multiple",
                label="1. Upload Documents",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
                height=250
            )

            keywords_input = gr.Textbox(
                label="2. Keywords to Verify",
                placeholder="Name, ID Number, Date of Birth...",
                info="Enter values that MUST appear in the documents (space separated)",
                lines=2
            )

        # Right column: configuration
        with gr.Column(scale=3):
            # Fetch doc types dynamically from logiccode
            available_types = sorted(logiccode.DOC_KEYWORDS.keys()) if hasattr(logiccode, 'DOC_KEYWORDS') else []

            required_docs_input = gr.Dropdown(
                choices=available_types,
                multiselect=True,
                label="3. Required Document Types",
                info="Which documents are mandatory?",
                value=[]
            )

            with gr.Group():
                gr.Markdown("### Settings")
                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (Approximate spelling)")
                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")

    verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")

    gr.Markdown("---")

    # Results area
    with gr.Row():
        # Summary box
        with gr.Column(scale=1):
            status_output = gr.HTML(label="Overall Status")

        # Detailed tabs
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("📊 Results Table"):
                    results_df = gr.Dataframe(
                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
                        interactive=False
                    )

                with gr.TabItem("🖼️ Document Gallery"):
                    gallery = gr.Gallery(
                        label="Processed Images",
                        show_label=False,
                        columns=[3], rows=[2],
                        object_fit="contain",
                        height="auto"
                    )

                with gr.TabItem("📝 System Logs"):
                    logs_output = gr.Textbox(
                        label="Processing Logs",
                        lines=15,
                        interactive=False,
                        show_copy_button=True
                    )

    # Event trigger
    verify_btn.click(
        fn=process_documents,
        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
        outputs=[status_output, gallery, results_df, logs_output]
    )

if __name__ == "__main__":
    # Bind to all interfaces (as on Spaces); set share=True for a public link
    demo.launch(share=False, server_name="0.0.0.0")
logiccode.py
ADDED
@@ -0,0 +1,549 @@
#!/usr/bin/env python3
"""
OCR Document Verification with Batch Processing & Required Document Checklist

Usage:
    # Single file (backward compatible)
    python logiccode.py --file image.jpg --inputkeywords "keyword1 keyword2" --fuzzy --debug

    # Multiple files with required document checklist
    python logiccode.py --file doc1.pdf doc2.jpg doc3.png --inputkeywords "Shaikh Anisa Rahat" --required PAN HSC AgeNationalityDomicile --fuzzy --debug

NOTE: Use spaces to separate required document types, NOT commas:
    ✅ --required PAN Aadhaar HSC
    ❌ --required PAN, Aadhaar, HSC
"""

import argparse
import re
import os
import tempfile
from collections import defaultdict
from paddleocr import PaddleOCR
import difflib

# Optional PDF support
try:
    import fitz  # PyMuPDF
    PDF_SUPPORT = True
except ImportError:
    PDF_SUPPORT = False
    print("Warning: PyMuPDF not installed. PDF support disabled. Install with: pip install PyMuPDF")

# Keyword lexicon per document type (English plus Hindi/Marathi OCR targets)
DOC_KEYWORDS = {
    "Aadhaar": [
        "uidai", "aadhaar", "aadhar", "government of india", "भारत सरकार",
        "आधार", "यूआईडीएआई", "प्रधानमंत्री", "जन्म तिथि", "पता", "लिंग",
        "unique identification authority", "aadhaar number", "enrollment number"
    ],
    "PAN": [
        "permanent account number", "income tax", "incometaxindia", "pan",
        "income tax department", "आयकर विभाग", "स्थायी खाता संख्या",
        "taxpayer", "father's name", "पिता का नाम", "signature", "inc"
    ],
    "Driving_License": [
        "driving licence", "motor vehicles act", "rto", "mcwg", "lmv",
        "transport department", "licence no", "valid till", "date of issue",
        "ड्राइविंग लाइसेंस", "परिवहन विभाग", "challan", "regional transport office"
    ],
    "Passport": [
        "passport", "republic of india", "ministry of external affairs",
        "passport number", "date of issue", "date of expiry", "surname",
        "given names", "nationality indian", "पासपोर्ट", "गणराज्य", "विदेश मंत्रालय",
        "consular", "visa"
    ],
    "SSC": [
        "secondary school certificate", "statement of marks", "ssc", "10th", "class x",
        "board of secondary education", "maharashtra state board", "matriculation",
        "roll number", "seat number", "subject code", "marks obtained", "grade", "pass"
    ],
    "HSC": [
        "higher secondary certificate", "statement of marks", "hsc", "12th", "class xii",
        "board of higher secondary education", "maharashtra state board", "intermediate",
        "stream", "science", "commerce", "arts", "marks obtained", "grade", "percentage"
    ],
    "AgeNationalityDomicile": [
        "certificate of age nationality and domicile", "domicile certificate",
        "age nationality domicile", "tehsildar", "executive magistrate", "collector",
        "certificate of residence", "domiciled in the state of", "citizen of india",
        "residence proof", "maharashtra domicile", "satara", "karad", "taluka", "district"
    ],
    "Ration_Card": [
        "ration card", "food and civil supplies", "apl", "bpl", "aay", "antyodaya",
        "ration card number", "family members", "head of family",
        "राशन कार्ड", "खाद्य पुरवठा", "नागरी पुरवठा विभाग", "fps", "fair price shop"
    ],
    "Cast_Certificate": [
        "CASTE CERTIFICATE",
        "FORM - 8",
        "Rule No. 5(6)",
        "De-Notified Tribe (Vimukt Jati)",
        "Nomadic Tribe/Other Backward Class",
        "Special Backward Category",
        "recognised as",
        "Government Resolution",
        "Sub Divisional Officer",
        "belonging to the State of Maharashtra"
    ],
    "Income_Certificate": [
        "१ वर्षासाठी उत्पन्नाचे प्रमाणपत्र",
        "ऑफिस ऑफ नायब तहसीलदार",
        "वार्षिक उत्पन्न",
        "मिळालेले १ वर्षाचे उत्पन्न",
        "कुटुंबातील सर्व सदस्यांचे",
        "प्रमाणित करण्यात येते की",
        "वैध राहील",
        "Signature valid",
        "Digitally Signed by"
    ],
    "PCM_Score_Card": [
        "MAH-MHT CET (PCM Group)",
        "State Common Entrance Test Cell",
        "Score Card",
        "Physics",
        "Chemistry",
        "Mathematics",
        "Total Percentile",
        "Normalization document",
        "Centralized Admission Process (CAP)",
        "IP address of the Computer"
    ]
}

# Validate keyword uniqueness (optional debug output)
_keyword_sets = {k: set(v) for k, v in DOC_KEYWORDS.items()}
for doc1 in DOC_KEYWORDS:
    for doc2 in DOC_KEYWORDS:
        if doc1 < doc2:
            overlap = _keyword_sets[doc1].intersection(_keyword_sets[doc2])
            if overlap:
                print(f"⚠️ Warning: Overlap between {doc1} and {doc2}: {overlap}")

def normalize_text(text):
    """Robust multilingual tokenization with noise filtering."""
    text = text.lower()
    # Extract Hindi Devanagari runs (2+ chars) OR English alphanumeric words (3+ chars)
    tokens = re.findall(r'[\u0900-\u097F]{2,}|\w{3,}', text)

    # Remove common English stopwords
    stopwords = {'the', 'and', 'of', 'in', 'to', 'for', 'is', 'on', 'by', 'with', 'at', 'from', 'a', 'an', 'this'}
    tokens = [t for t in tokens if t not in stopwords]

    # Remove OCR noise (4+ consecutive consonants = garbage)
    noise_pattern = re.compile(r'^[b-df-hj-np-tv-xz]{4,}$')
    tokens = [t for t in tokens if not noise_pattern.match(t)]

    return tokens

def pdf_to_images(pdf_path, max_pages=3):
    """Convert PDF pages to high-resolution temporary images."""
    if not PDF_SUPPORT:
        raise ValueError("PDF support not available. Install PyMuPDF")

    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    pages_to_process = min(total_pages, max_pages)

    image_paths = []
    temp_dir = tempfile.mkdtemp(prefix="ocr_pdf_")

    for page_num in range(pages_to_process):
        page = doc.load_page(page_num)
        zoom = 2  # 2x resolution for better OCR
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)

        img_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
        pix.save(img_path)
        image_paths.append(img_path)

    doc.close()
    return image_paths, total_pages, temp_dir

def get_ocr_text(file_path, max_pages=3):
    """Process an image or PDF with OCR, returning all extracted text lines."""
    ocr = PaddleOCR(
        lang="mr",
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False)

    all_texts = []
    temp_dir = None

    try:
        if file_path.lower().endswith('.pdf'):
            if not PDF_SUPPORT:
                print("Error: PDF file provided but PyMuPDF not installed")
                return []

            image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
            print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")

            for i, img_path in enumerate(image_paths, 1):
                if args.debug:
                    print(f"\n--- Processing PDF Page {i} ---")
                result = ocr.predict(input=img_path)
                for res in result:
                    all_texts.extend(res['rec_texts'])
        else:
            result = ocr.predict(input=file_path)
            for res in result:
                all_texts.extend(res['rec_texts'])

    finally:
        # Clean up temporary page images rendered from the PDF
        if temp_dir and os.path.exists(temp_dir):
            import shutil
            shutil.rmtree(temp_dir)

    return all_texts

def fuzzy_match(token, target_set, threshold=0.75):
    """
    Multi-level matching for OCR errors:
    1. Exact match
    2. Levenshtein distance
    3. Substring containment
    4. Hindi character-level similarity
    """
    if token in target_set:
        return token

    # Levenshtein distance match
    matches = difflib.get_close_matches(token, target_set, n=1, cutoff=threshold)
    if matches:
        return matches[0]

    # Substring match (handles concatenated words)
    for ocr_token in target_set:
        if token in ocr_token or ocr_token in token:
            return ocr_token

    # Hindi-specific fuzzy matching (handles OCR errors like सत्पमेव → सत्यमेव)
    if any('\u0900' <= c <= '\u097F' for c in token):
        for ocr_token in target_set:
            if len(ocr_token) > 3:
                similarity = difflib.SequenceMatcher(None, token, ocr_token).ratio()
                if similarity > threshold:
                    return ocr_token

    return None

def calculate_doc_type(ocr_tokens, debug=False):
    """
    Enhanced document classification with CORRECTED tie-breaking logic.
    Only compares documents that are ACTUALLY TIED (within 5% score).
    """
    ocr_set = set(ocr_tokens)
    ocr_combined = " ".join(ocr_tokens)
    scores = {}

    for doc_type, keywords in DOC_KEYWORDS.items():
        kw_set = set(k.lower() for k in keywords)

        # Primary: exact/fuzzy token matches (weighted 2 for exact, 1.5 for fuzzy)
        primary_matches = sum(2 if kw in ocr_set else 1.5 if fuzzy_match(kw, ocr_set) else 0
                              for kw in kw_set)

        # Secondary: multi-word phrase matches in combined text
        phrase_matches = sum(1 for kw in kw_set if " " in kw and kw in ocr_combined)

        # Tertiary: title keyword bonus (certificate, card, licence, passport)
        title_keywords = [kw for kw in kw_set if any(word in kw for word in ["certificate", "card", "licence", "passport"])]
        title_match = sum(1 for kw in title_keywords if kw in ocr_combined)

        # Calculate weighted score (max possible = len(kw_set) * 2)
        max_possible = len(kw_set) * 2
        weighted_score = ((primary_matches + phrase_matches + title_match) / max_possible) * 100

        scores[doc_type] = weighted_score

        if debug:
            print(f"  {doc_type:<25}: {weighted_score:>6.1f}% ({primary_matches:.1f} + {phrase_matches} + {title_match})")

    # Sort by score descending
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    best_type, best_score = sorted_scores[0]

    # CRITICAL FIX: Only trigger tie-breaking if top TWO scores are close (within 5%)
    if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
        if debug:
            print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")

        # Get ONLY the tied documents (within 5% of top score)
        tied_docs = [(doc_type, score) for doc_type, score in sorted_scores
                     if (best_score - score) < 5]

        if debug:
            print(f"Tied documents: {[f'{doc}({score:.1f}%)' for doc, score in tied_docs]}")

        # Calculate unique keywords ONLY for tied documents
        unique_counts = {}
        for doc_type, _ in tied_docs:
            kw_set = set(k.lower() for k in DOC_KEYWORDS[doc_type])

            # Get keywords from OTHER tied documents only
            other_tied_keywords = set()
            for other_doc, _ in tied_docs:
                if other_doc != doc_type:
                    other_tied_keywords.update(k.lower() for k in DOC_KEYWORDS[other_doc])

            unique_keywords = kw_set - other_tied_keywords
            unique_matches = sum(1 for kw in unique_keywords if fuzzy_match(kw, ocr_set))
            unique_counts[doc_type] = unique_matches

            if debug:
                print(f"  {doc_type:<25}: {unique_matches} unique matches ({len(unique_keywords)} available)")

        # Only use tie-breaker if there's a clear winner
        if unique_counts and max(unique_counts.values()) > 0:
            sorted_unique = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
            if len(sorted_unique) > 1 and sorted_unique[0][1] > sorted_unique[1][1]:
                best_type = sorted_unique[0][0]
                best_score = scores[best_type]

                if debug:
                    print(f"✓ Tie broken: {best_type} wins with {unique_counts[best_type]} unique matches")

    return best_type, best_score

def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
    """
    FIXED: Sequence-aware matching for multi-keyword inputs (names, addresses).
    Checks if keywords appear consecutively in OCR text first.
    """
    ocr_set = set(ocr_tokens)
    ocr_combined = " ".join(ocr_tokens)
    results = []

    # CRITICAL: For multi-keyword inputs, check for a SEQUENCE match first
    if len(user_keywords) > 1:
        # Build the phrase as it should appear in OCR (lowercase ASCII, raw otherwise)
        user_phrase = " ".join([kw.lower() if all(ord(c) < 128 for c in kw) else kw for kw in user_keywords])

        # Check if the entire phrase exists in OCR text
        if user_phrase in ocr_combined:
            if args.debug:
                print(f"\n✓ Sequence match: '{user_phrase}' found in OCR text")
            # All keywords matched in correct order
            for kw in user_keywords:
                results.append({
                    'keyword': kw,
                    'matched': True,
                    'matched_text': kw
                })
            return results

        # Fuzzy phrase matching if enabled
        if use_fuzzy:
            # Create n-grams from OCR tokens matching the user keyword count
            n = len(user_keywords)
            ocr_phrases = [" ".join(ocr_tokens[i:i+n]) for i in range(len(ocr_tokens) - n + 1)]

            phrase_match = fuzzy_match(user_phrase, set(ocr_phrases))
            if phrase_match:
                if args.debug:
                    print(f"\n✓ Fuzzy sequence match: '{user_phrase}' ~ '{phrase_match}'")
                for kw in user_keywords:
                    results.append({
                        'keyword': kw,
                        'matched': True,
                        'matched_text': kw
                    })
                return results

    # Fallback to individual keyword matching
    for kw in user_keywords:
        kw_processed = kw.lower() if all(ord(c) < 128 for c in kw) else kw
        matched = False
        matched_text = None

        if kw_processed in ocr_set:
            matched = True
            matched_text = kw_processed
        elif " " in kw_processed and kw_processed in ocr_combined:
            matched = True
            matched_text = kw_processed
        elif use_fuzzy:
            matched_text = fuzzy_match(kw_processed, ocr_set)
            if matched_text:
                matched = True

        results.append({
            'keyword': kw,
            'matched': matched,
            'matched_text': (matched_text or kw_processed) if matched else None
        })

    return results

def main():
    parser = argparse.ArgumentParser(description='OCR Document Verification with PDF Support')
    parser.add_argument('--file', nargs='+', required=True, help='Paths to image or PDF files')
    parser.add_argument('--inputkeywords', required=True, help='Space-separated keywords to verify')
    parser.add_argument('--required', nargs='+', help='List of required document types (space-separated, e.g., PAN Aadhaar HSC)')
    parser.add_argument('--fuzzy', action='store_true', help='Enable fuzzy matching')
    parser.add_argument('--debug', action='store_true', help='Show detailed OCR and scoring output')
    parser.add_argument('--pages', type=int, default=3, help='Max pages to process for PDFs (default: 3)')
    global args
    args = parser.parse_args()

    # CRITICAL FIX: Clean the required list by stripping commas and whitespace
    required_list = []
    if args.required:
        for item in args.required:
            # Split on commas and strip whitespace from each part
            parts = [part.strip() for part in item.split(',') if part.strip()]
            required_list.extend(parts)

    required_set = set(required_list)

    # Process each file and collect results
    file_results = []
    found_documents = set()
    all_matched_keywords_per_file = []

    print(f"\n{'='*60}")
    print(f"PROCESSING {len(args.file)} FILES")
    print(f"{'='*60}\n")

    for idx, file_path in enumerate(args.file, 1):
        print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")

        # Extract text from file
        ocr_texts = get_ocr_text(file_path, args.pages)

        if not ocr_texts:
            print(f"⚠️ No text extracted from {file_path}\n")
            file_results.append({
                'file': file_path,
                'doc_type': 'Unknown',
                'doc_score': 0,
                'keywords_matched': [],
                'status': 'ERROR'
            })
            continue

        # Debug: show raw OCR
        if args.debug:
            print("\n" + "="*60)
            print("RAW OCR EXTRACTED TEXT:")
            print("="*60)
            for i, text in enumerate(ocr_texts, 1):
                print(f"{i:3d}. {text}")
            print("="*60 + "\n")

        # Normalize tokens
        ocr_tokens = normalize_text(" ".join(ocr_texts))

        # Debug: show normalized tokens
        if args.debug:
            print("="*60)
            print("NORMALIZED TOKENS:")
            print("="*60)
            print(f"Total tokens: {len(ocr_tokens)}")
            print(f"First 50 tokens: {', '.join(ocr_tokens[:50])}{'...' if len(ocr_tokens) > 50 else ''}")
            print("="*60 + "\n")

        # Document classification
        if args.debug:
            print("="*60)
            print("DOCUMENT TYPE SCORING:")
            print("="*60)

        doc_type, doc_score = calculate_doc_type(ocr_tokens, debug=args.debug)
        found_documents.add(doc_type)

        if args.debug:
            print("="*60 + "\n")

        # Keyword verification
        user_keywords = [kw.strip() for kw in args.inputkeywords.split()]
        verification_results = verify_keywords(ocr_tokens, user_keywords, args.fuzzy)

        # Status: ALL keywords must match in this file
        all_matched = all(r['matched'] for r in verification_results)
        status = "VERIFIED" if all_matched else "NOT VERIFIED"

        # Store results for this file
        file_results.append({
            'file': file_path,
            'doc_type': doc_type,
            'doc_score': doc_score,
            'keywords_matched': verification_results,
            'status': status,
            'all_keywords_matched': all_matched
        })

        # Track which keywords were matched in this file
        matched_keywords_in_file = {r['keyword'] for r in verification_results if r['matched']}
        all_matched_keywords_per_file.append(matched_keywords_in_file)

        # Per-file output
        print(f"\n{'='*60}")
        print(f"Document Type: {doc_type} ({doc_score:.1f}% confidence)")
        print(f"{'='*60}")
        print(f"{'Keyword':<25} | {'Status':<10} | {'Matched Text'}")
        print(f"{'-'*60}")

        for r in verification_results:
            status_icon = "✓" if r['matched'] else "✗"
            matched_text = r['matched_text'] if r['matched_text'] else "Not found"
            print(f"{r['keyword']:<25} | {status_icon:<10} | {matched_text}")

        print(f"{'='*60}")
        print(f"File Status: {status}")
        print(f"{'='*60}\n")

    # FINAL SUMMARY
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")

    # Required documents check
    if required_set:
        missing_docs = required_set - found_documents

        print(f"\nRequired Documents: {', '.join(sorted(required_set))}")
        print(f"Found Documents: {', '.join(sorted(found_documents)) if found_documents else 'None'}")

        if missing_docs:
            print(f"❌ Missing Documents: {', '.join(sorted(missing_docs))}")
            docs_status = "NOT VERIFIED"
        else:
            print("✅ All required documents found!")
            docs_status = "VERIFIED"
    else:
        docs_status = "N/A (no required list specified)"
        missing_docs = set()

    # Overall keyword verification across ALL files:
    # check that every keyword appears in at least one file
    all_user_keywords = set(args.inputkeywords.split())
    keywords_found_across_files = set()

    for file_keyword_set in all_matched_keywords_per_file:
        keywords_found_across_files.update(file_keyword_set)

    missing_keywords = all_user_keywords - keywords_found_across_files

    print(f"\nKeywords to Find: {', '.join(sorted(all_user_keywords))}")
    print(f"Keywords Found (across all files): {', '.join(sorted(keywords_found_across_files)) if keywords_found_across_files else 'None'}")

    if missing_keywords:
        print(f"❌ Missing Keywords: {', '.join(sorted(missing_keywords))}")
        keywords_status = "NOT VERIFIED"
    else:
        print("✅ All keywords found across uploaded documents!")
        keywords_status = "VERIFIED"

    # Overall status: BOTH documents and keywords must be verified
    overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"

    print(f"\n{'='*60}")
    print(f"Documents Status: {docs_status}")
    print(f"Keywords Status: {keywords_status}")
    print(f"OVERALL STATUS: {overall_status}")
    print(f"{'='*60}")

if __name__ == "__main__":
    main()
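
The fuzzy_match cascade above (exact hit, then difflib close match, then substring containment, then Devanagari character-level similarity) can be exercised on its own, without running any OCR. A toy sketch with an invented token set; note that merely importing logiccode pulls in PaddleOCR, so that dependency must be installed:

    # Toy demonstration of the fuzzy_match cascade; the OCR token set is
    # made up ("acount" stands in for a typical OCR typo of "account").
    import logiccode

    ocr_set = {"permanent", "acount", "incometaxindia"}

    print(logiccode.fuzzy_match("permanent", ocr_set))  # exact     -> "permanent"
    print(logiccode.fuzzy_match("account", ocr_set))    # difflib   -> "acount"
    print(logiccode.fuzzy_match("tax", ocr_set))        # substring -> "incometaxindia"
    print(logiccode.fuzzy_match("passport", ocr_set))   # no match  -> None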