triflix committed (verified)
Commit a081bdc · 1 Parent(s): bf3efa4

Upload 2 files

Files changed (2):
  1. app.py +294 -0
  2. logiccode.py +549 -0
app.py ADDED
@@ -0,0 +1,294 @@
+import gradio as gr
+import os
+import pandas as pd
+import sys
+
+# ---------------------------------------------------------
+# IMPORT LOGICCODE
+# ---------------------------------------------------------
+# We expect logiccode.py to be in the same directory.
+try:
+    import logiccode
+except ImportError as e:
+    print("CRITICAL ERROR: Could not import 'logiccode.py'.")
+    print(f"Ensure logiccode.py is in the same directory as app.py. Error: {e}")
+    sys.exit(1)
+
+# ---------------------------------------------------------
+# MOCK ARGUMENTS
+# ---------------------------------------------------------
+# This class mimics the argparse namespace that logiccode expects.
+class MockArgs:
+    def __init__(self):
+        self.debug = False
+        self.pages = 3
+        self.file = []
+        self.inputkeywords = ""
+        self.required = []
+        self.fuzzy = True
+        self.visualize = False
+
+# Initialize args in logiccode if not already present
+if not hasattr(logiccode, 'args'):
+    logiccode.args = MockArgs()
+
+# ---------------------------------------------------------
+# CORE PROCESSING FUNCTION
+# ---------------------------------------------------------
+def process_documents(files, keywords_input, required_docs, fuzzy_match_enabled, debug_enabled):
+    """Process uploaded files using the imported logiccode module."""
+    # 1. Update global args in logiccode based on UI inputs
+    logiccode.args.debug = debug_enabled
+    logiccode.args.fuzzy = fuzzy_match_enabled
+
+    # Initialize output containers
+    results = []
+    gallery_images = []
+    logs = []
+
+    # Parse keywords
+    user_keywords = [kw.strip() for kw in keywords_input.split() if kw.strip()]
+
+    # Track found documents for the "Required" check
+    found_documents = set()
+    all_matched_keywords_per_file = []
+
+    if not files:
+        return "<h3>⚠️ No files uploaded</h3>", [], pd.DataFrame(), "Please upload files to begin."
+
+    logs.append(f"Starting processing of {len(files)} files...")
+    logs.append(f"Target Keywords: {user_keywords}")
+    logs.append(f"Required Documents: {required_docs}")
+
+    # 2. Iterate through uploaded files
+    for file_obj in files:
+        file_path = file_obj.name
+        filename = os.path.basename(file_path)
+
+        logs.append(f"\n--- Processing: {filename} ---")
+
+        # --- A. Generate previews for the gallery ---
+        if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
+            gallery_images.append((file_path, filename))
+
+        elif file_path.lower().endswith('.pdf'):
+            try:
+                # Use logiccode's utility to render a preview of the first page
+                preview_pages, _, _ = logiccode.pdf_to_images(file_path, max_pages=1)
+                if preview_pages:
+                    gallery_images.append((preview_pages[0], f"{filename} (PDF Preview)"))
+                    logs.append(f"Generated PDF preview for {filename}")
+            except Exception as e:
+                logs.append(f"⚠️ PDF preview failed for {filename}: {e}")
+
+        # --- B. Text extraction & analysis ---
+        try:
+            # Extract text (logiccode handles PDF vs. image internally)
+            ocr_texts = logiccode.get_ocr_text(file_path, logiccode.args.pages)
+
+            if not ocr_texts:
+                logs.append(f"⚠️ Warning: No text extracted from {filename}")
+                results.append({
+                    "File": filename, "Type": "Unreadable", "Score": 0,
+                    "Status": "FAILED", "Matched Keywords": ""
+                })
+                continue
+
+            # Normalize text
+            full_text = " ".join(ocr_texts)
+            ocr_tokens = logiccode.normalize_text(full_text)
+
+            # Classify the document
+            doc_type, doc_score = logiccode.calculate_doc_type(ocr_tokens, debug=debug_enabled)
+            found_documents.add(doc_type)
+            logs.append(f"Classified as: {doc_type} (Confidence: {doc_score:.1f}%)")
+
+            # Verify keywords.
+            # logiccode.verify_keywords returns [{'keyword': 'x', 'matched': True/False, ...}]
+            verification_results = logiccode.verify_keywords(ocr_tokens, user_keywords, fuzzy_match_enabled)
+
+            matched_kws = [r['keyword'] for r in verification_results if r['matched']]
+            all_matched_keywords_per_file.append(set(matched_kws))
+
+            # Determine the file status.
+            # If keywords were provided, all of them must match for "VERIFIED".
+            if user_keywords:
+                file_status = "VERIFIED" if len(matched_kws) == len(user_keywords) else "PARTIAL"
+                if len(matched_kws) == 0:
+                    file_status = "FAILED"
+            else:
+                file_status = "INFO ONLY"
+
+            logs.append(f"Matched: {matched_kws if matched_kws else 'None'}")
+
+            results.append({
+                "File": filename,
+                "Type": doc_type,
+                "Score": f"{doc_score:.1f}%",
+                "Status": file_status,
+                "Matched Keywords": ", ".join(matched_kws)
+            })
+
+        except Exception as e:
+            error_msg = f"Error processing {filename}: {str(e)}"
+            logs.append(error_msg)
+            if debug_enabled:
+                import traceback
+                logs.append(traceback.format_exc())
+
+            results.append({
+                "File": filename, "Type": "Error", "Score": 0,
+                "Status": "ERROR", "Matched Keywords": str(e)
+            })
+
+    # 3. Calculate summary logic
+    required_set = set(required_docs)
+    missing_docs = required_set - found_documents
+
+    all_user_keywords = set(user_keywords)
+    keywords_found_across_all_files = set()
+    for file_kw_set in all_matched_keywords_per_file:
+        keywords_found_across_all_files.update(file_kw_set)
+
+    missing_keywords = all_user_keywords - keywords_found_across_all_files
+
+    # 4. Build the HTML report
+    return build_html_summary(required_set, missing_docs, missing_keywords), gallery_images, pd.DataFrame(results), "\n".join(logs)
+
+def build_html_summary(required_set, missing_docs, missing_keywords):
+    html = """
+    <div style='padding: 20px; background-color: white; border-radius: 10px; border: 1px solid #e5e7eb;'>
+        <h3 style='margin-top: 0; color: #333;'>Verification Summary</h3>
+    """
+
+    # Document status
+    doc_status_bool = True
+    if required_set:
+        if missing_docs:
+            doc_status_bool = False
+            html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Documents:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_docs))}</span></div>"
+        else:
+            html += "<div style='margin-bottom: 8px;'>✅ <b>Documents:</b> All required types found.</div>"
+    else:
+        html += "<div style='margin-bottom: 8px; color: #666;'>ℹ️ No specific document types required.</div>"
+
+    # Keyword status
+    kw_status_bool = True
+    if missing_keywords:
+        kw_status_bool = False
+        html += f"<div style='margin-bottom: 8px;'>❌ <b>Missing Keywords:</b> <span style='color: #ef4444;'>{', '.join(sorted(missing_keywords))}</span></div>"
+    else:
+        html += "<div style='margin-bottom: 8px;'>✅ <b>Keywords:</b> All keywords found.</div>"
+
+    # Final status
+    overall_color = "#10b981" if (doc_status_bool and kw_status_bool) else "#ef4444"
+    overall_text = "VERIFIED" if (doc_status_bool and kw_status_bool) else "ACTION REQUIRED"
+
+    html += "<hr style='margin: 15px 0; border-color: #eee;'>"
+    html += f"<h2 style='color: {overall_color}; margin: 0; text-align: center;'>{overall_text}</h2>"
+    html += "</div>"
+    return html
+
+# ---------------------------------------------------------
+# GRADIO UI SETUP
+# ---------------------------------------------------------
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    secondary_hue="slate",
+).set(
+    body_background_fill="#f9fafb",
+    block_background_fill="white",
+    block_border_width="1px"
+)
+
+with gr.Blocks(theme=theme, title="DocuVerify Pro") as demo:
+    gr.Markdown(
+        """
+        # 📄 Intelligent Document Verification
+        Upload documents, specify required types, and verify content matches automatically.
+        """
+    )
+
+    with gr.Row():
+        # Left column: inputs
+        with gr.Column(scale=4):
+            files_input = gr.File(
+                file_count="multiple",
+                label="1. Upload Documents",
+                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".bmp"],
+                height=250
+            )
+
+            keywords_input = gr.Textbox(
+                label="2. Keywords to Verify",
+                placeholder="Name, ID Number, Date of Birth...",
+                info="Enter values that MUST appear in the documents (space-separated)",
+                lines=2
+            )
+
+        # Right column: configuration
+        with gr.Column(scale=3):
+            # Fetch document types dynamically from logiccode
+            available_types = sorted(logiccode.DOC_KEYWORDS.keys()) if hasattr(logiccode, 'DOC_KEYWORDS') else []
+
+            required_docs_input = gr.Dropdown(
+                choices=available_types,
+                multiselect=True,
+                label="3. Required Document Types",
+                info="Which documents are mandatory?",
+                value=[]
+            )
+
+            with gr.Group():
+                gr.Markdown("### Settings")
+                fuzzy_checkbox = gr.Checkbox(value=True, label="Enable Fuzzy Matching (approximate spelling)")
+                debug_checkbox = gr.Checkbox(value=False, label="Show Debug Logs")
+
+    verify_btn = gr.Button("🔍 Verify Documents", variant="primary", size="lg")
+
+    gr.Markdown("---")
+
+    # Results area
+    with gr.Row():
+        # Summary box
+        with gr.Column(scale=1):
+            status_output = gr.HTML(label="Overall Status")
+
+        # Detailed tabs
+        with gr.Column(scale=2):
+            with gr.Tabs():
+                with gr.TabItem("📊 Results Table"):
+                    results_df = gr.Dataframe(
+                        headers=["File", "Type", "Score", "Status", "Matched Keywords"],
+                        interactive=False
+                    )
+
+                with gr.TabItem("🖼️ Document Gallery"):
+                    gallery = gr.Gallery(
+                        label="Processed Images",
+                        show_label=False,
+                        columns=[3], rows=[2],
+                        object_fit="contain",
+                        height="auto"
+                    )
+
+                with gr.TabItem("📝 System Logs"):
+                    logs_output = gr.Textbox(
+                        label="Processing Logs",
+                        lines=15,
+                        interactive=False,
+                        show_copy_button=True
+                    )
+
+    # Event trigger
+    verify_btn.click(
+        fn=process_documents,
+        inputs=[files_input, keywords_input, required_docs_input, fuzzy_checkbox, debug_checkbox],
+        outputs=[status_output, gallery, results_df, logs_output]
+    )
+
+if __name__ == "__main__":
+    # server_name="0.0.0.0" binds all interfaces; set share=True for a public link
+    demo.launch(share=False, server_name="0.0.0.0")
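The verification callback can also be exercised without the browser. Below is a minimal smoke-test sketch, assuming both files are saved alongside their dependencies (gradio, pandas, paddleocr, and PyMuPDF for PDFs); samples/pan_card.jpg is a hypothetical path, and SimpleNamespace stands in for the uploaded-file wrapper, of which process_documents only reads the .name attribute.

    # Headless smoke test for process_documents (editor's sketch, not part of the commit).
    from types import SimpleNamespace

    import app  # importing app builds the Blocks UI but does not launch it

    # Hypothetical sample scan; replace with a real file on disk.
    fake_upload = [SimpleNamespace(name="samples/pan_card.jpg")]

    html, gallery, table, logs = app.process_documents(
        files=fake_upload,
        keywords_input="Shaikh Anisa Rahat",
        required_docs=["PAN"],
        fuzzy_match_enabled=True,
        debug_enabled=False,
    )
    print(table)  # one row per file: File / Type / Score / Status / Matched Keywords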
logiccode.py ADDED
@@ -0,0 +1,549 @@
+#!/usr/bin/env python3
+"""
+OCR Document Verification with Batch Processing & Required Document Checklist
+
+Usage:
+    # Single file (backward compatible)
+    python logiccode.py --file image.jpg --inputkeywords "keyword1 keyword2" --fuzzy --debug
+
+    # Multiple files with a required-document checklist
+    python logiccode.py --file doc1.pdf doc2.jpg doc3.png --inputkeywords "Shaikh Anisa Rahat" --required PAN HSC AgeNationalityDomicile --fuzzy --debug
+
+NOTE: Use spaces to separate required document types, not commas
+(stray commas are stripped defensively, but spaces are the expected separator):
+    ✅ --required PAN Aadhaar HSC
+    ❌ --required PAN, Aadhaar, HSC
+"""
+
+import argparse
+import difflib
+import os
+import re
+import shutil
+import tempfile
+
+from paddleocr import PaddleOCR
+
+# Optional PDF support
+try:
+    import fitz  # PyMuPDF
+    PDF_SUPPORT = True
+except ImportError:
+    PDF_SUPPORT = False
+    print("Warning: PyMuPDF not installed. PDF support disabled. Install with: pip install PyMuPDF")
+
+# Document keyword lexicon, one entry per supported document type
+DOC_KEYWORDS = {
+    "Aadhaar": [
+        "uidai", "aadhaar", "aadhar", "government of india", "भारत सरकार",
+        "आधार", "यूआईडीएआई", "प्रधानमंत्री", "जन्म तिथि", "पता", "लिंग",
+        "unique identification authority", "aadhaar number", "enrollment number"
+    ],
+    "PAN": [
+        "permanent account number", "income tax", "incometaxindia", "pan",
+        "income tax department", "आयकर विभाग", "स्थायी खाता संख्या",
+        "taxpayer", "father's name", "पिता का नाम", "signature", "inc"
+    ],
+    "Driving_License": [
+        "driving licence", "motor vehicles act", "rto", "mcwg", "lmv",
+        "transport department", "licence no", "valid till", "date of issue",
+        "ड्राइविंग लाइसेंस", "परिवहन विभाग", "challan", "regional transport office"
+    ],
+    "Passport": [
+        "passport", "republic of india", "ministry of external affairs",
+        "passport number", "date of issue", "date of expiry", "surname",
+        "given names", "nationality indian", "पासपोर्ट", "गणराज्य", "विदेश मंत्रालय",
+        "consular", "visa"
+    ],
+    "SSC": [
+        "secondary school certificate", "statement of marks", "ssc", "10th", "class x",
+        "board of secondary education", "maharashtra state board", "matriculation",
+        "roll number", "seat number", "subject code", "marks obtained", "grade", "pass"
+    ],
+    "HSC": [
+        "higher secondary certificate", "statement of marks", "hsc", "12th", "class xii",
+        "board of higher secondary education", "maharashtra state board", "intermediate",
+        "stream", "science", "commerce", "arts", "marks obtained", "grade", "percentage"
+    ],
+    "AgeNationalityDomicile": [
+        "certificate of age nationality and domicile", "domicile certificate",
+        "age nationality domicile", "tehsildar", "executive magistrate", "collector",
+        "certificate of residence", "domiciled in the state of", "citizen of india",
+        "residence proof", "maharashtra domicile", "satara", "karad", "taluka", "district"
+    ],
+    "Ration_Card": [
+        "ration card", "food and civil supplies", "apl", "bpl", "aay", "antyodaya",
+        "ration card number", "family members", "head of family",
+        "राशन कार्ड", "खाद्य पुरवठा", "नागरी पुरवठा विभाग", "fps", "fair price shop"
+    ],
+    "Cast_Certificate": [
+        "CASTE CERTIFICATE",
+        "FORM - 8",
+        "Rule No. 5(6)",
+        "De-Notified Tribe (Vimukt Jati)",
+        "Nomadic Tribe/Other Backward Class",
+        "Special Backward Category",
+        "recognised as",
+        "Government Resolution",
+        "Sub Divisional Officer",
+        "belonging to the State of Maharashtra"
+    ],
+    "Income_Certificate": [
+        # Marathi match phrases (e.g. "income certificate for 1 year", "annual income")
+        "१ वर्षासाठी उत्पन्नाचे प्रमाणपत्र",
+        "ऑफिस ऑफ नायब तहसीलदार",
+        "वार्षिक उत्पन्न",
+        "मिळालेले १ वर्षाचे उत्पन्न",
+        "कुटुंबातील सर्व सदस्यांचे",
+        "प्रमाणित करण्यात येते की",
+        "वैध राहील",
+        "Signature valid",
+        "Digitally Signed by"
+    ],
+    "PCM_Score_Card": [
+        "MAH-MHT CET (PCM Group)",
+        "State Common Entrance Test Cell",
+        "Score Card",
+        "Physics",
+        "Chemistry",
+        "Mathematics",
+        "Total Percentile",
+        "Normalization document",
+        "Centralized Admission Process (CAP)",
+        "IP address of the Computer"
+    ]
+}
+
+# Validate keyword uniqueness across document types (optional debug output)
+_keyword_sets = {k: set(v) for k, v in DOC_KEYWORDS.items()}
+for doc1 in DOC_KEYWORDS:
+    for doc2 in DOC_KEYWORDS:
+        if doc1 < doc2:
+            overlap = _keyword_sets[doc1].intersection(_keyword_sets[doc2])
+            if overlap:
+                print(f"⚠️ Warning: Overlap between {doc1} and {doc2}: {overlap}")
+
+def normalize_text(text):
+    """Robust multilingual tokenization with noise filtering."""
+    text = text.lower()
+    # Extract Devanagari runs (2+ chars) OR alphanumeric tokens (3+ chars)
+    tokens = re.findall(r'[\u0900-\u097F]{2,}|\w{3,}', text)
+
+    # Remove common English stopwords
+    stopwords = {'the', 'and', 'of', 'in', 'to', 'for', 'is', 'on', 'by', 'with', 'at', 'from', 'a', 'an', 'this'}
+    tokens = [t for t in tokens if t not in stopwords]
+
+    # Remove OCR noise (4+ consecutive consonants is almost certainly garbage)
+    noise_pattern = re.compile(r'^[b-df-hj-np-tv-xz]{4,}$')
+    tokens = [t for t in tokens if not noise_pattern.match(t)]
+
+    return tokens
+
+def pdf_to_images(pdf_path, max_pages=3):
+    """Convert PDF pages to high-resolution temporary images."""
+    if not PDF_SUPPORT:
+        raise ValueError("PDF support not available. Install PyMuPDF")
+
+    doc = fitz.open(pdf_path)
+    total_pages = len(doc)
+    pages_to_process = min(total_pages, max_pages)
+
+    image_paths = []
+    temp_dir = tempfile.mkdtemp(prefix="ocr_pdf_")
+
+    for page_num in range(pages_to_process):
+        page = doc.load_page(page_num)
+        zoom = 2  # 2x resolution for better OCR
+        mat = fitz.Matrix(zoom, zoom)
+        pix = page.get_pixmap(matrix=mat)
+
+        img_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
+        pix.save(img_path)
+        image_paths.append(img_path)
+
+    doc.close()
+    return image_paths, total_pages, temp_dir
+
+def get_ocr_text(file_path, max_pages=3):
+    """Process an image or PDF with OCR, returning all extracted text lines.
+
+    Note: a fresh PaddleOCR model is instantiated on every call; cache the
+    instance if throughput matters.
+    """
+    ocr = PaddleOCR(
+        lang="mr",
+        use_doc_orientation_classify=False,
+        use_doc_unwarping=False,
+        use_textline_orientation=False)
+
+    all_texts = []
+    temp_dir = None
+
+    try:
+        if file_path.lower().endswith('.pdf'):
+            if not PDF_SUPPORT:
+                print("Error: PDF file provided but PyMuPDF not installed")
+                return []
+
+            image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
+            print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")
+
+            for i, img_path in enumerate(image_paths, 1):
+                if args.debug:
+                    print(f"\n--- Processing PDF Page {i} ---")
+                result = ocr.predict(input=img_path)
+                for res in result:
+                    all_texts.extend(res['rec_texts'])
+        else:
+            result = ocr.predict(input=file_path)
+            for res in result:
+                all_texts.extend(res['rec_texts'])
+
+    finally:
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+
+    return all_texts
+
+def fuzzy_match(token, target_set, threshold=0.75):
+    """
+    Multi-level matching for OCR errors:
+    1. Exact match
+    2. Close match by edit distance (difflib)
+    3. Substring containment
+    4. Devanagari character-level similarity
+    """
+    if token in target_set:
+        return token
+
+    # Close-match lookup (difflib ratio against the threshold)
+    matches = difflib.get_close_matches(token, target_set, n=1, cutoff=threshold)
+    if matches:
+        return matches[0]
+
+    # Substring match (handles concatenated words)
+    for ocr_token in target_set:
+        if token in ocr_token or ocr_token in token:
+            return ocr_token
+
+    # Devanagari-specific fuzzy matching (handles OCR errors like सत्पमेव → सत्यमेव)
+    if any('\u0900' <= c <= '\u097F' for c in token):
+        for ocr_token in target_set:
+            if len(ocr_token) > 3:
+                similarity = difflib.SequenceMatcher(None, token, ocr_token).ratio()
+                if similarity > threshold:
+                    return ocr_token
+
+    return None
+
+def calculate_doc_type(ocr_tokens, debug=False):
+    """
+    Weighted document classification. Tie-breaking only compares documents
+    that are actually tied (within 5 points of the top score).
+    """
+    ocr_set = set(ocr_tokens)
+    ocr_combined = " ".join(ocr_tokens)
+    scores = {}
+
+    for doc_type, keywords in DOC_KEYWORDS.items():
+        kw_set = set(k.lower() for k in keywords)
+
+        # Primary: exact/fuzzy token matches (weight 2 for exact, 1.5 for fuzzy)
+        primary_matches = sum(2 if kw in ocr_set else 1.5 if fuzzy_match(kw, ocr_set) else 0
+                              for kw in kw_set)
+
+        # Secondary: multi-word phrase matches in the combined text
+        phrase_matches = sum(1 for kw in kw_set if " " in kw and kw in ocr_combined)
+
+        # Tertiary: title keyword bonus (certificate, card, licence, passport)
+        title_keywords = [kw for kw in kw_set if any(word in kw for word in ["certificate", "card", "licence", "passport"])]
+        title_match = sum(1 for kw in title_keywords if kw in ocr_combined)
+
+        # Weighted score (max possible = len(kw_set) * 2)
+        max_possible = len(kw_set) * 2
+        weighted_score = ((primary_matches + phrase_matches + title_match) / max_possible) * 100
+
+        scores[doc_type] = weighted_score
+
+        if debug:
+            print(f"  {doc_type:<25}: {weighted_score:>6.1f}% ({primary_matches:.1f} + {phrase_matches} + {title_match})")
+
+    # Sort by score, descending
+    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
+    best_type, best_score = sorted_scores[0]
+
+    # Only trigger tie-breaking if the top TWO scores are close (within 5 points)
+    if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
+        if debug:
+            print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")
+
+        # Gather ONLY the tied documents (within 5 points of the top score)
+        tied_docs = [(doc_type, score) for doc_type, score in sorted_scores
+                     if (best_score - score) < 5]
+
+        if debug:
+            print(f"Tied documents: {[f'{doc}({score:.1f}%)' for doc, score in tied_docs]}")
+
+        # Count unique-keyword matches ONLY among the tied documents
+        unique_counts = {}
+        for doc_type, _ in tied_docs:
+            kw_set = set(k.lower() for k in DOC_KEYWORDS[doc_type])
+
+            # Keywords belonging to the OTHER tied documents only
+            other_tied_keywords = set()
+            for other_doc, _ in tied_docs:
+                if other_doc != doc_type:
+                    other_tied_keywords.update(k.lower() for k in DOC_KEYWORDS[other_doc])
+
+            unique_keywords = kw_set - other_tied_keywords
+            unique_matches = sum(1 for kw in unique_keywords if fuzzy_match(kw, ocr_set))
+            unique_counts[doc_type] = unique_matches
+
+            if debug:
+                print(f"  {doc_type:<25}: {unique_matches} unique matches ({len(unique_keywords)} available)")
+
+        # Only use the tie-breaker if there is a clear winner
+        if unique_counts and max(unique_counts.values()) > 0:
+            sorted_unique = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
+            if len(sorted_unique) > 1 and sorted_unique[0][1] > sorted_unique[1][1]:
+                best_type = sorted_unique[0][0]
+                best_score = scores[best_type]
+
+                if debug:
+                    print(f"✓ Tie broken: {best_type} wins with {unique_counts[best_type]} unique matches")
+
+    return best_type, best_score
+
+def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
+    """
+    Sequence-aware matching for multi-keyword inputs (names, addresses).
+    Checks whether the keywords appear consecutively in the OCR text first,
+    then falls back to individual keyword matching.
+    """
+    ocr_set = set(ocr_tokens)
+    ocr_combined = " ".join(ocr_tokens)
+    results = []
+
+    # For multi-keyword inputs, check for a SEQUENCE match first
+    if len(user_keywords) > 1:
+        # Build the phrase as it should appear in the OCR text
+        # (lowercase ASCII keywords; leave Devanagari as-is)
+        user_phrase = " ".join([kw.lower() if all(ord(c) < 128 for c in kw) else kw for kw in user_keywords])
+
+        # Check whether the entire phrase exists in the OCR text
+        if user_phrase in ocr_combined:
+            if args.debug:
+                print(f"\n✓ Sequence match: '{user_phrase}' found in OCR text")
+            # All keywords matched in the correct order
+            for kw in user_keywords:
+                results.append({
+                    'keyword': kw,
+                    'matched': True,
+                    'matched_text': kw
+                })
+            return results
+
+        # Fuzzy phrase matching if enabled
+        if use_fuzzy:
+            # Create n-grams from OCR tokens matching the user keyword count
+            n = len(user_keywords)
+            ocr_phrases = [" ".join(ocr_tokens[i:i+n]) for i in range(len(ocr_tokens) - n + 1)]
+
+            phrase_match = fuzzy_match(user_phrase, set(ocr_phrases))
+            if phrase_match:
+                if args.debug:
+                    print(f"\n✓ Fuzzy sequence match: '{user_phrase}' ~ '{phrase_match}'")
+                for kw in user_keywords:
+                    results.append({
+                        'keyword': kw,
+                        'matched': True,
+                        'matched_text': kw
+                    })
+                return results
+
+    # Fall back to individual keyword matching
+    for kw in user_keywords:
+        kw_processed = kw.lower() if all(ord(c) < 128 for c in kw) else kw
+        matched = False
+        matched_text = None
+
+        if kw_processed in ocr_set:
+            matched = True
+            matched_text = kw_processed
+        elif " " in kw_processed and kw_processed in ocr_combined:
+            matched = True
+            matched_text = kw_processed
+        elif use_fuzzy:
+            matched_text = fuzzy_match(kw_processed, ocr_set)
+            if matched_text:
+                matched = True
+
+        results.append({
+            'keyword': kw,
+            'matched': matched,
+            'matched_text': matched_text if matched else None
+        })
+
+    return results
+
+def main():
+    parser = argparse.ArgumentParser(description='OCR Document Verification with PDF Support')
+    parser.add_argument('--file', nargs='+', required=True, help='Paths to image or PDF files')
+    parser.add_argument('--inputkeywords', required=True, help='Space-separated keywords to verify')
+    parser.add_argument('--required', nargs='+', help='Required document types (space-separated, e.g., PAN Aadhaar HSC)')
+    parser.add_argument('--fuzzy', action='store_true', help='Enable fuzzy matching')
+    parser.add_argument('--debug', action='store_true', help='Show detailed OCR and scoring output')
+    parser.add_argument('--pages', type=int, default=3, help='Max pages to process for PDFs (default: 3)')
+    global args
+    args = parser.parse_args()
+
+    # Clean the required list: tolerate commas by splitting and stripping whitespace
+    required_list = []
+    if args.required:
+        for item in args.required:
+            parts = [part.strip() for part in item.split(',') if part.strip()]
+            required_list.extend(parts)
+
+    required_set = set(required_list)
+
+    # Process each file and collect results
+    file_results = []
+    found_documents = set()
+    all_matched_keywords_per_file = []
+
+    print(f"\n{'='*60}")
+    print(f"PROCESSING {len(args.file)} FILES")
+    print(f"{'='*60}\n")
+
+    for idx, file_path in enumerate(args.file, 1):
+        print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")
+
+        # Extract text from the file
+        ocr_texts = get_ocr_text(file_path, args.pages)
+
+        if not ocr_texts:
+            print(f"⚠️ No text extracted from {file_path}\n")
+            file_results.append({
+                'file': file_path,
+                'doc_type': 'Unknown',
+                'doc_score': 0,
+                'keywords_matched': [],
+                'status': 'ERROR'
+            })
+            continue
+
+        # Debug: show raw OCR output
+        if args.debug:
+            print("\n" + "="*60)
+            print("RAW OCR EXTRACTED TEXT:")
+            print("="*60)
+            for i, text in enumerate(ocr_texts, 1):
+                print(f"{i:3d}. {text}")
+            print("="*60 + "\n")
+
+        # Normalize tokens
+        ocr_tokens = normalize_text(" ".join(ocr_texts))
+
+        # Debug: show normalized tokens
+        if args.debug:
+            print("="*60)
+            print("NORMALIZED TOKENS:")
+            print("="*60)
+            print(f"Total tokens: {len(ocr_tokens)}")
+            print(f"First 50 tokens: {', '.join(ocr_tokens[:50])}{'...' if len(ocr_tokens) > 50 else ''}")
+            print("="*60 + "\n")
+
+        # Document classification
+        if args.debug:
+            print("="*60)
+            print("DOCUMENT TYPE SCORING:")
+            print("="*60)
+
+        doc_type, doc_score = calculate_doc_type(ocr_tokens, debug=args.debug)
+        found_documents.add(doc_type)
+
+        if args.debug:
+            print("="*60 + "\n")
+
+        # Keyword verification
+        user_keywords = [kw.strip() for kw in args.inputkeywords.split()]
+        verification_results = verify_keywords(ocr_tokens, user_keywords, args.fuzzy)
+
+        # Status: ALL keywords must match within this file
+        all_matched = all(r['matched'] for r in verification_results)
+        status = "VERIFIED" if all_matched else "NOT VERIFIED"
+
+        # Store results for this file
+        file_results.append({
+            'file': file_path,
+            'doc_type': doc_type,
+            'doc_score': doc_score,
+            'keywords_matched': verification_results,
+            'status': status,
+            'all_keywords_matched': all_matched
+        })
+
+        # Track which keywords were matched in this file
+        matched_keywords_in_file = {r['keyword'] for r in verification_results if r['matched']}
+        all_matched_keywords_per_file.append(matched_keywords_in_file)
+
+        # Per-file output
+        print(f"\n{'='*60}")
+        print(f"Document Type: {doc_type} ({doc_score:.1f}% confidence)")
+        print(f"{'='*60}")
+        print(f"{'Keyword':<25} | {'Status':<10} | {'Matched Text'}")
+        print(f"{'-'*60}")
+
+        for r in verification_results:
+            status_icon = "✓" if r['matched'] else "✗"
+            matched_text = r['matched_text'] if r['matched_text'] else "Not found"
+            print(f"{r['keyword']:<25} | {status_icon:<10} | {matched_text}")
+
+        print(f"{'='*60}")
+        print(f"File Status: {status}")
+        print(f"{'='*60}\n")
+
+    # FINAL SUMMARY
+    print(f"\n{'='*60}")
+    print("FINAL SUMMARY")
+    print(f"{'='*60}")
+
+    # Required-documents check
+    if required_set:
+        missing_docs = required_set - found_documents
+
+        print(f"\nRequired Documents: {', '.join(sorted(required_set))}")
+        print(f"Found Documents: {', '.join(sorted(found_documents)) if found_documents else 'None'}")
+
+        if missing_docs:
+            print(f"❌ Missing Documents: {', '.join(sorted(missing_docs))}")
+            docs_status = "NOT VERIFIED"
+        else:
+            print("✅ All required documents found!")
+            docs_status = "VERIFIED"
+    else:
+        docs_status = "N/A (no required list specified)"
+        missing_docs = set()
+
+    # Overall keyword verification across ALL files:
+    # every keyword must appear in at least one file
+    all_user_keywords = set(args.inputkeywords.split())
+    keywords_found_across_files = set()
+
+    for file_keyword_set in all_matched_keywords_per_file:
+        keywords_found_across_files.update(file_keyword_set)
+
+    missing_keywords = all_user_keywords - keywords_found_across_files
+
+    print(f"\nKeywords to Find: {', '.join(sorted(all_user_keywords))}")
+    print(f"Keywords Found (across all files): {', '.join(sorted(keywords_found_across_files)) if keywords_found_across_files else 'None'}")
+
+    if missing_keywords:
+        print(f"❌ Missing Keywords: {', '.join(sorted(missing_keywords))}")
+        keywords_status = "NOT VERIFIED"
+    else:
+        print("✅ All keywords found across uploaded documents!")
+        keywords_status = "VERIFIED"
+
+    # Overall status: BOTH documents and keywords must be verified
+    overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"
+
+    print(f"\n{'='*60}")
+    print(f"Documents Status: {docs_status}")
+    print(f"Keywords Status:  {keywords_status}")
+    print(f"OVERALL STATUS:   {overall_status}")
+    print(f"{'='*60}")
+
+if __name__ == "__main__":
+    main()
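Only get_ocr_text touches PaddleOCR, so the classification and verification stages can be tried on plain strings. A minimal sketch, assuming logiccode.py is importable: the "scanned" text below is synthetic, and the module-level args namespace is supplied by hand, mirroring what main() or app.py's MockArgs shim would otherwise provide.

    # Exercising the text pipeline without OCR (editor's sketch, not part of the commit).
    from types import SimpleNamespace

    import logiccode

    # logiccode reads a module-level `args` for its debug flag, normally set by
    # main() or by app.py; provide a stand-in before calling its functions.
    logiccode.args = SimpleNamespace(debug=False)

    fake_scan = "INCOME TAX DEPARTMENT Permanent Account Number Card Shaikh Anisa Rahat"
    tokens = logiccode.normalize_text(fake_scan)

    doc_type, score = logiccode.calculate_doc_type(tokens)
    print(doc_type, f"{score:.1f}%")  # should lean towards "PAN" for this text

    checks = logiccode.verify_keywords(tokens, ["Shaikh", "Anisa", "Rahat"], use_fuzzy=True)
    print([(c['keyword'], c['matched']) for c in checks])  # sequence match: all True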