triflix committed on
Commit 6f9d4a7 · verified · 1 Parent(s): a081bdc

Update logiccode.py

Files changed (1)
  1. logiccode.py +574 -548
logiccode.py CHANGED
@@ -1,549 +1,575 @@
1
-
2
- #!/usr/bin/env python3
3
- """
4
- OCR Document Verification with Batch Processing & Required Document Checklist
5
- Usage:
6
- # Single file (backward compatible)
7
- python ocrupdated2.py --file image.jpg --inputkeywords "keyword1 keyword2" --fuzzy --debug
8
- # Multiple files with required document checklist
9
- python ocrupdated2.py --file doc1.pdf doc2.jpg doc3.png --inputkeywords "Shaikh Anisa Rahat" --required PAN HSC AgeNationalityDomicile --fuzzy --debug
10
- NOTE: Use spaces to separate required document types, NOT commas:
11
- --required PAN Aadhaar HSC
12
- ❌ --required PAN, Aadhaar, HSC
13
- """
14
-
15
- import argparse
16
- import re
17
- import os
18
- import tempfile
19
- from collections import defaultdict
20
- from paddleocr import PaddleOCR
21
- import difflib
22
-
23
- # Optional PDF support
24
- try:
25
- import fitz # PyMuPDF
26
- PDF_SUPPORT = True
27
- except ImportError:
28
- PDF_SUPPORT = False
29
- print("Warning: PyMuPDF not installed. PDF support disabled. Install with: pip install PyMuPDF")
30
-
31
- # Document keywords (kept same as your updated version)
32
- DOC_KEYWORDS = {
33
- "Aadhaar": [
34
- "uidai", "aadhaar", "aadhar", "government of india", "भारत सरकार",
35
- "आधार", "यूआईडीएआई", "प्रधानमंत्री", "जन्म तिथि", "पता", "लिंग",
36
- "unique identification authority", "aadhaar number", "enrollment number"
37
- ],
38
- "PAN": [
39
- "permanent account number", "income tax", "incometaxindia", "pan",
40
- "income tax department", "आयकर विभाग", "स्थायी खाता संख्या",
41
- "taxpayer", "father's name", "पिता का नाम", "signature", "inc"
42
- ],
43
- "Driving_License": [
44
- "driving licence", "motor vehicles act", "rto", "mcwg", "lmv",
45
- "transport department", "licence no", "valid till", "date of issue",
46
- "ड्राइविंग लाइसेंस", "परिवहन विभाग", "challan", "regional transport office"
47
- ],
48
- "Passport": [
49
- "passport", "republic of india", "ministry of external affairs",
50
- "passport number", "date of issue", "date of expiry", "surname",
51
- "given names", "nationality indian", "पासपोर्ट", "गणराज्य", "विदेश मंत्रालय",
52
- "consular", "visa"
53
- ],
54
- "SSC": [
55
- "secondary school certificate", "statement of marks", "ssc", "10th", "class x",
56
- "board of secondary education", "maharashtra state board", "matriculation",
57
- "roll number", "seat number", "subject code", "marks obtained", "grade", "pass"
58
- ],
59
- "HSC": [
60
- "higher secondary certificate", "statement of marks", "hsc", "12th", "class xii",
61
- "board of higher secondary education", "maharashtra state board", "intermediate",
62
- "stream", "science", "commerce", "arts", "marks obtained", "grade", "percentage"
63
- ],
64
- "AgeNationalityDomicile": [
65
- "certificate of age nationality and domicile", "domicile certificate",
66
- "age nationality domicile", "tehsildar", "executive magistrate", "collector",
67
- "certificate of residence", "domiciled in the state of", "citizen of india",
68
- "residence proof", "maharashtra domicile", "satara", "karad", "taluka", "district"
69
- ],
70
- "Ration_Card": [
71
- "ration card", "food and civil supplies", "apl", "bpl", "aay", "antyodaya",
72
- "ration card number", "family members", "head of family",
73
- "राशन कार्ड", "खाद्य पुरवठा", "नागरी पुरवठा विभाग", "fps", "fair price shop"
74
- ],
75
- "Cast_Certificate": [
76
- "CASTE CERTIFICATE",
77
- "FORM - 8",
78
- "Rule No. 5(6)",
79
- "De-Notified Tribe (Vimukt Jati)",
80
- "Nomadic Tribe/Other Backward Class",
81
- "Special Backward Category",
82
- "recognised as",
83
- "Government Resolution",
84
- "Sub Divisional Officer",
85
- "belonging to the State of Maharashtra"
86
- ],
87
- "Income_Certificate": [
88
- " वर्षासाठी उत्पन्नाचे प्रमाणपत्र",
89
- "ऑफिस ऑफ नायब तहसीलदार",
90
- "वार्षिक उत्पन्न",
91
- "मिळालेले १ वर्षाचे उत्पन्न",
92
- "कुटुंबातील सर्व सदस्यांचे",
93
- "प्रमाणित करण्यात येते की",
94
- "वैध राहील",
95
- "Signature valid",
96
- "Digitally Signed by"
97
- ],
98
- "PCM_Score_Card": [
99
- "MAH-MHT CET (PCM Group)",
100
- "State Common Entrance Test Cell",
101
- "Score Card",
102
- "Physics",
103
- "Chemistry",
104
- "Mathematics",
105
- "Total Percentile",
106
- "Normalization document",
107
- "Centralized Admission Process (CAP)",
108
- "IP address of the Computer"
109
- ]
110
- }
111
-
112
- # Validate keyword uniqueness (optional debug output)
113
- _keyword_sets = {k: set(v) for k, v in DOC_KEYWORDS.items()}
114
- for doc1 in DOC_KEYWORDS:
115
- for doc2 in DOC_KEYWORDS:
116
- if doc1 < doc2:
117
- overlap = _keyword_sets[doc1].intersection(_keyword_sets[doc2])
118
- if overlap:
119
- print(f"⚠️ Warning: Overlap between {doc1} and {doc2}: {overlap}")
120
-
121
- def normalize_text(text):
122
- """Robust multilingual tokenization with noise filtering"""
123
- text = text.lower()
124
- # Extract Hindi Devanagari (2+ chars) OR English alphanumeric (3+ chars)
125
- tokens = re.findall(r'[\u0900-\u097F]{2,}|\w{3,}', text)
126
-
127
- # Remove common English stopwords
128
- stopwords = {'the', 'and', 'of', 'in', 'to', 'for', 'is', 'on', 'by', 'with', 'at', 'from', 'a', 'an', 'this'}
129
- tokens = [t for t in tokens if t not in stopwords]
130
-
131
- # Remove OCR noise (4+ consecutive consonants = garbage)
132
- noise_pattern = re.compile(r'^[b-df-hj-np-tv-xz]{4,}$')
133
- tokens = [t for t in tokens if not noise_pattern.match(t)]
134
-
135
- return tokens
136
-
137
- def pdf_to_images(pdf_path, max_pages=3):
138
- """Convert PDF pages to high-resolution temporary images"""
139
- if not PDF_SUPPORT:
140
- raise ValueError("PDF support not available. Install PyMuPDF")
141
-
142
- doc = fitz.open(pdf_path)
143
- total_pages = len(doc)
144
- pages_to_process = min(total_pages, max_pages)
145
-
146
- image_paths = []
147
- temp_dir = tempfile.mkdtemp(prefix="ocr_pdf_")
148
-
149
- for page_num in range(pages_to_process):
150
- page = doc.load_page(page_num)
151
- zoom = 2 # 2x resolution for better OCR
152
- mat = fitz.Matrix(zoom, zoom)
153
- pix = page.get_pixmap(matrix=mat)
154
-
155
- img_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
156
- pix.save(img_path)
157
- image_paths.append(img_path)
158
-
159
- doc.close()
160
- return image_paths, total_pages, temp_dir
161
-
162
- def get_ocr_text(file_path, max_pages=3):
163
- """Process image or PDF with OCR, returning all extracted text lines"""
164
- ocr = PaddleOCR(
165
- lang="mr",
166
- use_doc_orientation_classify=False,
167
- use_doc_unwarping=False,
168
- use_textline_orientation=False)
169
-
170
- all_texts = []
171
- temp_dir = None
172
-
173
- try:
174
- if file_path.lower().endswith('.pdf'):
175
- if not PDF_SUPPORT:
176
- print("Error: PDF file provided but PyMuPDF not installed")
177
- return []
178
-
179
- image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
180
- print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")
181
-
182
- for i, img_path in enumerate(image_paths, 1):
183
- if args.debug:
184
- print(f"\n--- Processing PDF Page {i} ---")
185
- result = ocr.predict(input=img_path)
186
- for res in result:
187
- all_texts.extend(res['rec_texts'])
188
- else:
189
- result = ocr.predict(input=file_path)
190
- for res in result:
191
- all_texts.extend(res['rec_texts'])
192
-
193
- finally:
194
- if temp_dir and os.path.exists(temp_dir):
195
- import shutil
196
- shutil.rmtree(temp_dir)
197
-
198
- return all_texts
199
-
200
- def fuzzy_match(token, target_set, threshold=0.75):
201
- """
202
- Multi-level matching for OCR errors:
203
- 1. Exact match
204
- 2. Levenshtein distance
205
- 3. Substring containment
206
- 4. Hindi character-level similarity
207
- """
208
- if token in target_set:
209
- return token
210
-
211
- # Levenshtein distance match
212
- matches = difflib.get_close_matches(token, target_set, n=1, cutoff=threshold)
213
- if matches:
214
- return matches[0]
215
-
216
- # Substring match (handles concatenated words)
217
- for ocr_token in target_set:
218
- if token in ocr_token or ocr_token in token:
219
- return ocr_token
220
-
221
- # Hindi-specific fuzzy matching (handles OCR errors like सत्पमेव → सत्यमेव)
222
- if any('\u0900' <= c <= '\u097F' for c in token):
223
- for ocr_token in target_set:
224
- if len(ocr_token) > 3:
225
- similarity = difflib.SequenceMatcher(None, token, ocr_token).ratio()
226
- if similarity > threshold:
227
- return ocr_token
228
-
229
- return None
230
-
231
- def calculate_doc_type(ocr_tokens, debug=False):
232
- """
233
- Enhanced document classification with CORRECTED tie-breaking logic.
234
- Only compares documents that are ACTUALLY TIED (within 5% score).
235
- """
236
- ocr_set = set(ocr_tokens)
237
- ocr_combined = " ".join(ocr_tokens)
238
- scores = {}
239
-
240
- for doc_type, keywords in DOC_KEYWORDS.items():
241
- kw_set = set(k.lower() for k in keywords)
242
-
243
- # Primary: exact/fuzzy token matches (weighted 2 for exact, 1.5 for fuzzy)
244
- primary_matches = sum(2 if kw in ocr_set else 1.5 if fuzzy_match(kw, ocr_set) else 0
245
- for kw in kw_set)
246
-
247
- # Secondary: multi-word phrase matches in combined text
248
- phrase_matches = sum(1 for kw in kw_set if " " in kw and kw in ocr_combined)
249
-
250
- # Tertiary: title keyword bonus (certificate, card, licence, passport)
251
- title_keywords = [kw for kw in kw_set if any(word in kw for word in ["certificate", "card", "licence", "passport"])]
252
- title_match = sum(1 for kw in title_keywords if kw in ocr_combined)
253
-
254
- # Calculate weighted score (max possible = len(kw_set) * 2)
255
- max_possible = len(kw_set) * 2
256
- weighted_score = ((primary_matches + phrase_matches + title_match) / max_possible) * 100
257
-
258
- scores[doc_type] = weighted_score
259
-
260
- if debug:
261
- print(f" {doc_type:<25}: {weighted_score:>6.1f}% ({primary_matches:.1f} + {phrase_matches} + {title_match})")
262
-
263
- # Sort by score descending
264
- sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
265
- best_type, best_score = sorted_scores[0]
266
-
267
- # CRITICAL FIX: Only trigger tie-breaking if top TWO scores are close (within 5%)
268
- if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
269
- if debug:
270
- print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")
271
-
272
- # Get ONLY the tied documents (within 5% of top score)
273
- tied_docs = [(doc_type, score) for doc_type, score in sorted_scores
274
- if (best_score - score) < 5]
275
-
276
- if debug:
277
- print(f"Tied documents: {[f'{doc}({score:.1f}%)' for doc, score in tied_docs]}")
278
-
279
- # Calculate unique keywords ONLY for tied documents
280
- unique_counts = {}
281
- for doc_type, _ in tied_docs:
282
- kw_set = set(k.lower() for k in DOC_KEYWORDS[doc_type])
283
-
284
- # Get keywords from OTHER tied documents only
285
- other_tied_keywords = set()
286
- for other_doc, _ in tied_docs:
287
- if other_doc != doc_type:
288
- other_tied_keywords.update(k.lower() for k in DOC_KEYWORDS[other_doc])
289
-
290
- unique_keywords = kw_set - other_tied_keywords
291
- unique_matches = sum(1 for kw in unique_keywords if fuzzy_match(kw, ocr_set))
292
- unique_counts[doc_type] = unique_matches
293
-
294
- if debug:
295
- print(f" {doc_type:<25}: {unique_matches} unique matches ({len(unique_keywords)} available)")
296
-
297
- # Only use tie-breaker if there's a clear winner
298
- if unique_counts and max(unique_counts.values()) > 0:
299
- sorted_unique = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
300
- if len(sorted_unique) > 1 and sorted_unique[0][1] > sorted_unique[1][1]:
301
- best_type = sorted_unique[0][0]
302
- best_score = scores[best_type]
303
-
304
- if debug:
305
- print(f"✓ Tie broken: {best_type} wins with {unique_counts[best_type]} unique matches")
306
-
307
- return best_type, best_score
308
-
309
- def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
310
- """
311
- FIXED: Sequence-aware matching for multi-keyword inputs (names, addresses).
312
- Checks if keywords appear consecutively in OCR text first.
313
- """
314
- ocr_set = set(ocr_tokens)
315
- ocr_combined = " ".join(ocr_tokens)
316
- results = []
317
-
318
- # CRITICAL: For multi-keyword inputs, check for SEQUENCE match first
319
- if len(user_keywords) > 1:
320
- # Build the phrase as it should appear in OCR
321
- user_phrase = " ".join([kw.lower() if all(ord(c) < 128 for c in kw) else kw for kw in user_keywords])
322
-
323
- # Check if entire phrase exists in OCR text
324
- if user_phrase in ocr_combined:
325
- if args.debug:
326
- print(f"\n✓ Sequence match: '{user_phrase}' found in OCR text")
327
- # All keywords matched in correct order
328
- for kw in user_keywords:
329
- results.append({
330
- 'keyword': kw,
331
- 'matched': True,
332
- 'matched_text': kw
333
- })
334
- return results
335
-
336
- # Fuzzy phrase matching if enabled
337
- if use_fuzzy:
338
- # Create n-grams from OCR tokens matching user keyword count
339
- n = len(user_keywords)
340
- ocr_phrases = [" ".join(ocr_tokens[i:i+n]) for i in range(len(ocr_tokens) - n + 1)]
341
-
342
- phrase_match = fuzzy_match(user_phrase, set(ocr_phrases))
343
- if phrase_match:
344
- if args.debug:
345
- print(f"\n✓ Fuzzy sequence match: '{user_phrase}' ~ '{phrase_match}'")
346
- for kw in user_keywords:
347
- results.append({
348
- 'keyword': kw,
349
- 'matched': True,
350
- 'matched_text': kw
351
- })
352
- return results
353
-
354
- # Fallback to individual keyword matching
355
- for kw in user_keywords:
356
- kw_processed = kw.lower() if all(ord(c) < 128 for c in kw) else kw
357
- matched = False
358
- matched_text = None
359
-
360
- if kw_processed in ocr_set:
361
- matched = True
362
- matched_text = kw_processed
363
- elif " " in kw_processed and kw_processed in ocr_combined:
364
- matched = True
365
- matched_text = kw_processed
366
- elif use_fuzzy:
367
- matched_text = fuzzy_match(kw_processed, ocr_set)
368
- if matched_text:
369
- matched = True
370
-
371
- results.append({
372
- 'keyword': kw,
373
- 'matched': matched,
374
- 'matched_text': matched_text or kw_processed if matched else None
375
- })
376
-
377
- return results
378
-
379
- def main():
380
- parser = argparse.ArgumentParser(description='OCR Document Verification with PDF Support')
381
- parser.add_argument('--file', nargs='+', required=True, help='Paths to image or PDF files')
382
- parser.add_argument('--inputkeywords', required=True, help='Space-separated keywords to verify')
383
- parser.add_argument('--required', nargs='+', help='List of required document types (space-separated, e.g., PAN Aadhaar HSC)')
384
- parser.add_argument('--fuzzy', action='store_true', help='Enable fuzzy matching')
385
- parser.add_argument('--debug', action='store_true', help='Show detailed OCR and scoring output')
386
- parser.add_argument('--pages', type=int, default=3, help='Max pages to process for PDFs (default: 3)')
387
- global args
388
- args = parser.parse_args()
389
-
390
- # CRITICAL FIX: Clean the required list by stripping commas and whitespace
391
- required_list = []
392
- if args.required:
393
- for item in args.required:
394
- # Split on commas and strip whitespace from each part
395
- parts = [part.strip() for part in item.split(',') if part.strip()]
396
- required_list.extend(parts)
397
-
398
- required_set = set(required_list)
399
-
400
- # Process each file and collect results
401
- file_results = []
402
- found_documents = set()
403
- all_matched_keywords_per_file = []
404
-
405
- print(f"\n{'='*60}")
406
- print(f"PROCESSING {len(args.file)} FILES")
407
- print(f"{'='*60}\n")
408
-
409
- for idx, file_path in enumerate(args.file, 1):
410
- print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")
411
-
412
- # Extract text from file
413
- ocr_texts = get_ocr_text(file_path, args.pages)
414
-
415
- if not ocr_texts:
416
- print(f"⚠️ No text extracted from {file_path}\n")
417
- file_results.append({
418
- 'file': file_path,
419
- 'doc_type': 'Unknown',
420
- 'doc_score': 0,
421
- 'keywords_matched': [],
422
- 'status': 'ERROR'
423
- })
424
- continue
425
-
426
- # Debug: Show raw OCR
427
- if args.debug:
428
- print("\n" + "="*60)
429
- print("RAW OCR EXTRACTED TEXT:")
430
- print("="*60)
431
- for i, text in enumerate(ocr_texts, 1):
432
- print(f"{i:3d}. {text}")
433
- print("="*60 + "\n")
434
-
435
- # Normalize tokens
436
- ocr_tokens = normalize_text(" ".join(ocr_texts))
437
-
438
- # Debug: Show normalized tokens
439
- if args.debug:
440
- print("="*60)
441
- print("NORMALIZED TOKENS:")
442
- print("="*60)
443
- print(f"Total tokens: {len(ocr_tokens)}")
444
- print(f"First 50 tokens: {', '.join(ocr_tokens[:50])}{'...' if len(ocr_tokens) > 50 else ''}")
445
- print("="*60 + "\n")
446
-
447
- # Document classification
448
- if args.debug:
449
- print("="*60)
450
- print("DOCUMENT TYPE SCORING:")
451
- print("="*60)
452
-
453
- doc_type, doc_score = calculate_doc_type(ocr_tokens, debug=args.debug)
454
- found_documents.add(doc_type)
455
-
456
- if args.debug:
457
- print("="*60 + "\n")
458
-
459
- # Keyword verification
460
- user_keywords = [kw.strip() for kw in args.inputkeywords.split()]
461
- verification_results = verify_keywords(ocr_tokens, user_keywords, args.fuzzy)
462
-
463
- # Status: ALL keywords must match in this file
464
- all_matched = all(r['matched'] for r in verification_results)
465
- status = "VERIFIED" if all_matched else "NOT VERIFIED"
466
-
467
- # Store results for this file
468
- file_results.append({
469
- 'file': file_path,
470
- 'doc_type': doc_type,
471
- 'doc_score': doc_score,
472
- 'keywords_matched': verification_results,
473
- 'status': status,
474
- 'all_keywords_matched': all_matched
475
- })
476
-
477
- # Track which keywords were matched in this file
478
- matched_keywords_in_file = {r['keyword'] for r in verification_results if r['matched']}
479
- all_matched_keywords_per_file.append(matched_keywords_in_file)
480
-
481
- # Per-file output
482
- print(f"\n{'='*60}")
483
- print(f"Document Type: {doc_type} ({doc_score:.1f}% confidence)")
484
- print(f"{'='*60}")
485
- print(f"{'Keyword':<25} | {'Status':<10} | {'Matched Text'}")
486
- print(f"{'-'*60}")
487
-
488
- for r in verification_results:
489
- status_icon = "✓" if r['matched'] else "✗"
490
- matched_text = r['matched_text'] if r['matched_text'] else "Not found"
491
- print(f"{r['keyword']:<25} | {status_icon:<10} | {matched_text}")
492
-
493
- print(f"{'='*60}")
494
- print(f"File Status: {status}")
495
- print(f"{'='*60}\n")
496
-
497
- # FINAL SUMMARY
498
- print(f"\n{'='*60}")
499
- print(f"FINAL SUMMARY")
500
- print(f"{'='*60}")
501
-
502
- # Required documents check
503
- if required_set:
504
- missing_docs = required_set - found_documents
505
-
506
- print(f"\nRequired Documents: {', '.join(sorted(required_set))}")
507
- print(f"Found Documents: {', '.join(sorted(found_documents)) if found_documents else 'None'}")
508
-
509
- if missing_docs:
510
- print(f" Missing Documents: {', '.join(sorted(missing_docs))}")
511
- docs_status = "NOT VERIFIED"
512
- else:
513
- print(f"✅ All required documents found!")
514
- docs_status = "VERIFIED"
515
- else:
516
- docs_status = "N/A (no required list specified)"
517
- missing_docs = set()
518
-
519
- # Overall keyword verification across ALL files
520
- # Check if every keyword appears in at least one file
521
- all_user_keywords = set(args.inputkeywords.split())
522
- keywords_found_across_files = set()
523
-
524
- for file_keyword_set in all_matched_keywords_per_file:
525
- keywords_found_across_files.update(file_keyword_set)
526
-
527
- missing_keywords = all_user_keywords - keywords_found_across_files
528
-
529
- print(f"\nKeywords to Find: {', '.join(sorted(all_user_keywords))}")
530
- print(f"Keywords Found (across all files): {', '.join(sorted(keywords_found_across_files)) if keywords_found_across_files else 'None'}")
531
-
532
- if missing_keywords:
533
- print(f" Missing Keywords: {', '.join(sorted(missing_keywords))}")
534
- keywords_status = "NOT VERIFIED"
535
- else:
536
- print(f"✅ All keywords found across uploaded documents!")
537
- keywords_status = "VERIFIED"
538
-
539
- # Overall status: BOTH documents and keywords must be verified
540
- overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"
541
-
542
- print(f"\n{'='*60}")
543
- print(f"Documents Status: {docs_status}")
544
- print(f"Keywords Status: {keywords_status}")
545
- print(f"OVERALL STATUS: {overall_status}")
546
- print(f"{'='*60}")
547
-
548
- if __name__ == "__main__":
549
-   main()
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ OCR Document Verification with Batch Processing & Required Document Checklist
4
+ Usage:
5
+ # Single file (backward compatible)
6
+ python ocrupdated2.py --file image.jpg --inputkeywords "keyword1 keyword2" --fuzzy --debug
7
+ # Multiple files with required document checklist
8
+ python ocrupdated2.py --file doc1.pdf doc2.jpg doc3.png --inputkeywords "Shaikh Anisa Rahat" --required PAN HSC AgeNationalityDomicile --fuzzy --debug
9
+ NOTE: Use spaces to separate required document types, NOT commas:
10
+ ✅ --required PAN Aadhaar HSC
11
+ ❌ --required PAN, Aadhaar, HSC
12
+ """
13
+
14
+ import argparse
15
+ import re
16
+ import os
17
+ import tempfile
18
+ from collections import defaultdict
19
+ from paddleocr import PaddleOCR
20
+ import difflib
21
+ from concurrent.futures import ThreadPoolExecutor
22
+ import multiprocessing
23
+
24
+ # Optional PDF support
25
+ try:
26
+ import fitz # PyMuPDF
27
+ PDF_SUPPORT = True
28
+ except ImportError:
29
+ PDF_SUPPORT = False
30
+ print("Warning: PyMuPDF not installed. PDF support disabled. Install with: pip install PyMuPDF")
31
+
32
+ # Document keywords (unchanged)
33
+ DOC_KEYWORDS = {
34
+ "Aadhaar": [
35
+ "uidai", "aadhaar", "aadhar", "government of india", "भारत सरकार",
36
+ "आधार", "यूआईडीएआई", "प्रधानमंत्री", "जन्म तिथि", "पता", "लिंग",
37
+ "unique identification authority", "aadhaar number", "enrollment number"
38
+ ],
39
+ "PAN": [
40
+ "permanent account number", "income tax", "incometaxindia", "pan",
41
+ "income tax department", "आयकर विभाग", "स्थायी खाता संख्या",
42
+ "taxpayer", "father's name", "पिता का नाम", "signature", "inc"
43
+ ],
44
+ "Driving_License": [
45
+ "driving licence", "motor vehicles act", "rto", "mcwg", "lmv",
46
+ "transport department", "licence no", "valid till", "date of issue",
47
+ "ड्राइविंग लाइसेंस", "परिवहन विभाग", "challan", "regional transport office"
48
+ ],
49
+ "Passport": [
50
+ "passport", "republic of india", "ministry of external affairs",
51
+ "passport number", "date of issue", "date of expiry", "surname",
52
+ "given names", "nationality indian", "पासपोर्ट", "गणराज्य", "विदेश मंत्रालय",
53
+ "consular", "visa"
54
+ ],
55
+ "SSC": [
56
+ "secondary school certificate", "statement of marks", "ssc", "10th", "class x",
57
+ "board of secondary education", "maharashtra state board", "matriculation",
58
+ "roll number", "seat number", "subject code", "marks obtained", "grade", "pass"
59
+ ],
60
+ "HSC": [
61
+ "higher secondary certificate", "statement of marks", "hsc", "12th", "class xii",
62
+ "board of higher secondary education", "maharashtra state board", "intermediate",
63
+ "stream", "science", "commerce", "arts", "marks obtained", "grade", "percentage"
64
+ ],
65
+ "AgeNationalityDomicile": [
66
+ "certificate of age nationality and domicile", "domicile certificate",
67
+ "age nationality domicile", "tehsildar", "executive magistrate", "collector",
68
+ "certificate of residence", "domiciled in the state of", "citizen of india",
69
+ "residence proof", "maharashtra domicile", "satara", "karad", "taluka", "district"
70
+ ],
71
+ "Ration_Card": [
72
+ "ration card", "food and civil supplies", "apl", "bpl", "aay", "antyodaya",
73
+ "ration card number", "family members", "head of family",
74
+ "राशन कार्ड", "खाद्य पुरवठा", "नागरी पुरवठा विभाग", "fps", "fair price shop"
75
+ ],
76
+ "Cast_Certificate": [
77
+ "CASTE CERTIFICATE",
78
+ "FORM - 8",
79
+ "Rule No. 5(6)",
80
+ "De-Notified Tribe (Vimukt Jati)",
81
+ "Nomadic Tribe/Other Backward Class",
82
+ "Special Backward Category",
83
+ "recognised as",
84
+ "Government Resolution",
85
+ "Sub Divisional Officer",
86
+ "belonging to the State of Maharashtra"
87
+ ],
88
+ "Income_Certificate": [
89
+ " वर्षासाठी उत्पन्नाचे प्रमाणपत्र",
90
+ "ऑफिस ऑफ नायब तहसीलदार",
91
+ "वार्षिक उत्पन्न",
92
+ "मिळालेले वर्षाचे उत्पन्न",
93
+ "कुटुंबातील सर्व सदस्यांचे",
94
+ "प्रमाणित करण्यात येते की",
95
+ "वैध राहील",
96
+ "Signature valid",
97
+ "Digitally Signed by"
98
+ ],
99
+ "PCM_Score_Card": [
100
+ "MAH-MHT CET (PCM Group)",
101
+ "State Common Entrance Test Cell",
102
+ "Score Card",
103
+ "Physics",
104
+ "Chemistry",
105
+ "Mathematics",
106
+ "Total Percentile",
107
+ "Normalization document",
108
+ "Centralized Admission Process (CAP)",
109
+ "IP address of the Computer"
110
+ ]
111
+ }
112
+
113
+ # Validate keyword uniqueness (unchanged)
114
+ _keyword_sets = {k: set(v) for k, v in DOC_KEYWORDS.items()}
115
+ for doc1 in DOC_KEYWORDS:
116
+ for doc2 in DOC_KEYWORDS:
117
+ if doc1 < doc2:
118
+ overlap = _keyword_sets[doc1].intersection(_keyword_sets[doc2])
119
+ if overlap:
120
+ print(f"⚠️ Warning: Overlap between {doc1} and {doc2}: {overlap}")
121
+
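For reference, the pairwise overlap check above can also be phrased with itertools.combinations, which visits each unordered pair of document types exactly once; a behavior-preserving sketch:

    import itertools

    for doc_a, doc_b in itertools.combinations(DOC_KEYWORDS, 2):
        overlap = _keyword_sets[doc_a] & _keyword_sets[doc_b]
        if overlap:
            print(f"⚠️ Warning: Overlap between {doc_a} and {doc_b}: {overlap}")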
122
+ # NEW: Pre-compile regex patterns for performance
123
+ NOISE_PATTERN = re.compile(r'^[b-df-hj-np-tv-xz]{4,}$')
124
+ TOKEN_PATTERN = re.compile(r'[\u0900-\u097F]{2,}|\w{3,}')
125
+ STOPWORDS = {'the', 'and', 'of', 'in', 'to', 'for', 'is', 'on', 'by', 'with', 'at', 'from', 'a', 'an', 'this'}
126
+
127
+ def normalize_text(text):
128
+ """Robust multilingual tokenization with noise filtering"""
129
+ text = text.lower()
130
+ # Extract Hindi Devanagari (2+ chars) OR English alphanumeric (3+ chars)
131
+ tokens = TOKEN_PATTERN.findall(text)
132
+
133
+ # Remove common English stopwords
134
+ tokens = [t for t in tokens if t not in STOPWORDS]
135
+
136
+ # Remove OCR noise (4+ consecutive consonants = garbage)
137
+ tokens = [t for t in tokens if not NOISE_PATTERN.match(t)]
138
+
139
+ return tokens
140
+
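Illustrative behavior of normalize_text (expected values written by hand, not captured from a run): stopwords, sub-3-character tokens, and consonant-run noise are all dropped.

    tokens = normalize_text("The Income Tax Department आयकर विभाग qxzt ab")
    # -> ['income', 'tax', 'department', 'आयकर', 'विभाग']
    # 'the' is a stopword, 'ab' is under 3 chars, 'qxzt' hits NOISE_PATTERN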
141
+ def pdf_to_images(pdf_path, max_pages=3):
142
+ """Convert PDF pages to high-resolution temporary images"""
143
+ if not PDF_SUPPORT:
144
+ raise ValueError("PDF support not available. Install PyMuPDF")
145
+
146
+ doc = fitz.open(pdf_path)
147
+ total_pages = len(doc)
148
+ pages_to_process = min(total_pages, max_pages)
149
+
150
+ image_paths = []
151
+ temp_dir = tempfile.mkdtemp(prefix="ocr_pdf_")
152
+
153
+ for page_num in range(pages_to_process):
154
+ page = doc.load_page(page_num)
155
+ zoom = 2 # 2x resolution for better OCR
156
+ mat = fitz.Matrix(zoom, zoom)
157
+ pix = page.get_pixmap(matrix=mat)
158
+
159
+ img_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
160
+ pix.save(img_path)
161
+ image_paths.append(img_path)
162
+
163
+ doc.close()
164
+ return image_paths, total_pages, temp_dir
165
+
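A usage sketch for the helper (sample.pdf is a hypothetical path). zoom=2 doubles PyMuPDF's default 72 DPI rendering to roughly 144 DPI; the caller owns the returned temp directory and must delete it:

    import shutil

    paths, total, tmp = pdf_to_images("sample.pdf", max_pages=2)
    try:
        print(f"Rendered {len(paths)} of {total} pages")
    finally:
        shutil.rmtree(tmp)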
166
+ def process_page_ocr(img_path, page_num, ocr, debug):
167
+ """Process a single page with OCR (for parallel execution)"""
168
+ if debug:
169
+ print(f"\n--- Processing PDF Page {page_num} ---")
170
+ result = ocr.predict(input=img_path)
171
+ texts = []
172
+ for res in result:
173
+ texts.extend(res['rec_texts'])
174
+ return texts
175
+
176
+ def get_ocr_text(file_path, ocr, max_pages=3, debug=False):
177
+ """Process image or PDF with OCR, returning all extracted text lines"""
178
+ all_texts = []
179
+ temp_dir = None
180
+
181
+ try:
182
+ if file_path.lower().endswith('.pdf'):
183
+ if not PDF_SUPPORT:
184
+ print("Error: PDF file provided but PyMuPDF not installed")
185
+ return []
186
+
187
+ image_paths, total_pages, temp_dir = pdf_to_images(file_path, max_pages)
188
+ print(f"Processing PDF: {total_pages} pages total, processing first {len(image_paths)} pages...")
189
+
190
+ # NEW: Process pages in parallel with ThreadPoolExecutor
191
+ max_workers = min(len(image_paths), 4) # Max 4 parallel pages
192
+ with ThreadPoolExecutor(max_workers=max_workers) as executor:
193
+ # Submit all pages
194
+ future_to_page = {
195
+ executor.submit(process_page_ocr, img_path, i+1, ocr, debug): i
196
+ for i, img_path in enumerate(image_paths)
197
+ }
198
+
199
+ # Collect results in order
200
+ page_results = [None] * len(image_paths)
201
+ for future in future_to_page:
202
+ page_idx = future_to_page[future]
203
+ try:
204
+ page_results[page_idx] = future.result()
205
+ except Exception as e:
206
+ print(f"Error processing page {page_idx+1}: {e}")
207
+ page_results[page_idx] = []
208
+
209
+ # Combine results in correct order
210
+ for texts in page_results:
211
+ all_texts.extend(texts)
212
+ else:
213
+ result = ocr.predict(input=file_path)
214
+ for res in result:
215
+ all_texts.extend(res['rec_texts'])
216
+
217
+ finally:
218
+ if temp_dir and os.path.exists(temp_dir):
219
+ import shutil
220
+ shutil.rmtree(temp_dir)
221
+
222
+ return all_texts
223
+
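The collection loop above walks future_to_page in submission order and blocks on each result in turn; because results land in page_results by index, completion order never matters. An equivalent sketch using concurrent.futures.as_completed, which surfaces the first failure as soon as it happens:

    from concurrent.futures import as_completed

    for future in as_completed(future_to_page):
        page_idx = future_to_page[future]
        try:
            page_results[page_idx] = future.result()
        except Exception as e:
            print(f"Error processing page {page_idx + 1}: {e}")
            page_results[page_idx] = []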
224
+ def fuzzy_match(token, target_set, threshold=0.75):
225
+ """
226
+ Multi-level matching for OCR errors:
227
+ 1. Exact match
228
+ 2. Levenshtein distance
229
+ 3. Substring containment
230
+ 4. Hindi character-level similarity
231
+ """
232
+ if token in target_set:
233
+ return token
234
+
235
+ # Levenshtein distance match
236
+ matches = difflib.get_close_matches(token, target_set, n=1, cutoff=threshold)
237
+ if matches:
238
+ return matches[0]
239
+
240
+ # Substring match (handles concatenated words)
241
+ for ocr_token in target_set:
242
+ if token in ocr_token or ocr_token in token:
243
+ return ocr_token
244
+
245
+ # Hindi-specific fuzzy matching
246
+ if any('\u0900' <= c <= '\u097F' for c in token):
247
+ for ocr_token in target_set:
248
+ if len(ocr_token) > 3:
249
+ similarity = difflib.SequenceMatcher(None, token, ocr_token).ratio()
250
+ if similarity > threshold:
251
+ return ocr_token
252
+
253
+ return None
254
+
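Illustrative calls showing the first three levels (hypothetical target set, expected returns written by hand):

    targets = {"aadhaar", "income", "passport"}
    fuzzy_match("aadhaar", targets)         # level 1, exact match   -> "aadhaar"
    fuzzy_match("aadhar", targets)          # level 2, ratio ≈ 0.92  -> "aadhaar"
    fuzzy_match("incometaxindia", targets)  # level 3, substring hit -> "income"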
255
+ def calculate_doc_type(ocr_tokens, debug=False):
256
+ """
257
+ Enhanced document classification with CORRECTED tie-breaking logic.
258
+ Only compares documents that are ACTUALLY TIED (within 5% score).
259
+ """
260
+ ocr_set = set(ocr_tokens)
261
+ ocr_combined = " ".join(ocr_tokens)
262
+ scores = {}
263
+
264
+ # NEW: Pre-calculate keyword sets once
265
+ doc_keyword_sets = {}
266
+ for doc_type, keywords in DOC_KEYWORDS.items():
267
+ doc_keyword_sets[doc_type] = set(k.lower() for k in keywords)
268
+
269
+ for doc_type, kw_set in doc_keyword_sets.items():
270
+ # Primary: exact/fuzzy token matches (weighted 2 for exact, 1.5 for fuzzy)
271
+ primary_matches = 0
272
+ for kw in kw_set:
273
+ if kw in ocr_set:
274
+ primary_matches += 2
275
+ elif fuzzy_match(kw, ocr_set):
276
+ primary_matches += 1.5
277
+
278
+ # Secondary: multi-word phrase matches in combined text
279
+ phrase_matches = sum(1 for kw in kw_set if " " in kw and kw in ocr_combined)
280
+
281
+ # Tertiary: title keyword bonus
282
+ title_keywords = [kw for kw in kw_set if any(word in kw for word in ["certificate", "card", "licence", "passport"])]
283
+ title_match = sum(1 for kw in title_keywords if kw in ocr_combined)
284
+
285
+ # Calculate weighted score
286
+ max_possible = len(kw_set) * 2
287
+ weighted_score = ((primary_matches + phrase_matches + title_match) / max_possible) * 100
288
+
289
+ scores[doc_type] = weighted_score
290
+
291
+ if debug:
292
+ print(f" {doc_type:<25}: {weighted_score:>6.1f}% ({primary_matches:.1f} + {phrase_matches} + {title_match})")
293
+
294
+ # Sort by score descending
295
+ sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
296
+ best_type, best_score = sorted_scores[0]
297
+
298
+ # Tie-breaking logic (unchanged)
299
+ if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
300
+ if debug:
301
+ print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")
302
+
303
+ tied_docs = [(doc_type, score) for doc_type, score in sorted_scores
304
+ if (best_score - score) < 5]
305
+
306
+ if debug:
307
+ print(f"Tied documents: {[f'{doc}({score:.1f}%)' for doc, score in tied_docs]}")
308
+
309
+ unique_counts = {}
310
+ for doc_type, _ in tied_docs:
311
+ kw_set = doc_keyword_sets[doc_type]
312
+
313
+ other_tied_keywords = set()
314
+ for other_doc, _ in tied_docs:
315
+ if other_doc != doc_type:
316
+ other_tied_keywords.update(doc_keyword_sets[other_doc])
317
+
318
+ unique_keywords = kw_set - other_tied_keywords
319
+ unique_matches = sum(1 for kw in unique_keywords if fuzzy_match(kw, ocr_set))
320
+ unique_counts[doc_type] = unique_matches
321
+
322
+ if debug:
323
+ print(f" {doc_type:<25}: {unique_matches} unique matches ({len(unique_keywords)} available)")
324
+
325
+ if unique_counts and max(unique_counts.values()) > 0:
326
+ sorted_unique = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
327
+ if len(sorted_unique) > 1 and sorted_unique[0][1] > sorted_unique[1][1]:
328
+ best_type = sorted_unique[0][0]
329
+ best_score = scores[best_type]
330
+
331
+ if debug:
332
+ print(f"✓ Tie broken: {best_type} wins with {unique_counts[best_type]} unique matches")
333
+
334
+ return best_type, best_score
335
+
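A worked instance of the weighting with made-up counts: for a type with 14 keywords (max_possible = 28), 5 exact token hits, 2 fuzzy hits, 1 phrase hit, and 1 title hit score as follows.

    primary = 5 * 2 + 2 * 1.5                      # 13.0
    weighted = (primary + 1 + 1) / (14 * 2) * 100  # 53.57..., printed as 53.6%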
336
+ def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
337
+ """
338
+ FIXED: Sequence-aware matching for multi-keyword inputs.
339
+ Checks if keywords appear consecutively in OCR text first.
340
+ """
341
+ ocr_set = set(ocr_tokens)
342
+ ocr_combined = " ".join(ocr_tokens)
343
+ results = []
344
+
345
+ if len(user_keywords) > 1:
346
+ user_phrase = " ".join([kw.lower() if all(ord(c) < 128 for c in kw) else kw for kw in user_keywords])
347
+
348
+ if user_phrase in ocr_combined:
349
+ for kw in user_keywords:
350
+ results.append({
351
+ 'keyword': kw,
352
+ 'matched': True,
353
+ 'matched_text': kw
354
+ })
355
+ return results
356
+
357
+ if use_fuzzy:
358
+ n = len(user_keywords)
359
+ ocr_phrases = [" ".join(ocr_tokens[i:i+n]) for i in range(len(ocr_tokens) - n + 1)]
360
+
361
+ phrase_match = fuzzy_match(user_phrase, set(ocr_phrases))
362
+ if phrase_match:
363
+ for kw in user_keywords:
364
+ results.append({
365
+ 'keyword': kw,
366
+ 'matched': True,
367
+ 'matched_text': kw
368
+ })
369
+ return results
370
+
371
+ # Fallback to individual keyword matching
372
+ for kw in user_keywords:
373
+ kw_processed = kw.lower() if all(ord(c) < 128 for c in kw) else kw
374
+ matched = False
375
+ matched_text = None
376
+
377
+ if kw_processed in ocr_set:
378
+ matched = True
379
+ matched_text = kw_processed
380
+ elif " " in kw_processed and kw_processed in ocr_combined:
381
+ matched = True
382
+ matched_text = kw_processed
383
+ elif use_fuzzy:
384
+ matched_text = fuzzy_match(kw_processed, ocr_set)
385
+ if matched_text:
386
+ matched = True
387
+
388
+ results.append({
389
+ 'keyword': kw,
390
+ 'matched': matched,
391
+ 'matched_text': (matched_text or kw_processed) if matched else None
392
+ })
393
+
394
+ return results
395
+
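Illustrative sequence-match behavior (hand-written tokens): the joined user phrase is a substring of the joined OCR tokens, so all keywords are reported matched in one pass.

    tokens = ["shaikh", "anisa", "rahat", "college"]
    results = verify_keywords(tokens, ["Shaikh", "Anisa", "Rahat"])
    all(r['matched'] for r in results)  # True, via the sequence path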
396
+ def main():
397
+ parser = argparse.ArgumentParser(description='OCR Document Verification with PDF Support')
398
+ parser.add_argument('--file', nargs='+', required=True, help='Paths to image or PDF files')
399
+ parser.add_argument('--inputkeywords', required=True, help='Space-separated keywords to verify')
400
+ parser.add_argument('--required', nargs='+', help='List of required document types')
401
+ parser.add_argument('--fuzzy', action='store_true', help='Enable fuzzy matching')
402
+ parser.add_argument('--debug', action='store_true', help='Show detailed OCR and scoring output')
403
+ parser.add_argument('--pages', type=int, default=3, help='Max pages to process for PDFs')
404
+ global args  # vestigial: helpers now receive debug explicitly
405
+ args = parser.parse_args()
406
+
407
+ # Clean required list
408
+ required_list = []
409
+ if args.required:
410
+ for item in args.required:
411
+ parts = [part.strip() for part in item.split(',') if part.strip()]
412
+ required_list.extend(parts)
413
+
414
+ required_set = set(required_list)
415
+
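The cleanup tolerates users who type commas despite the NOTE in the docstring; illustrative values:

    raw = ["PAN,", "Aadhaar,", "HSC"]  # what argparse yields for: --required PAN, Aadhaar, HSC
    cleaned = [p.strip() for item in raw for p in item.split(',') if p.strip()]
    set(cleaned)                       # {'PAN', 'Aadhaar', 'HSC'}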
416
+ # NEW: Initialize OCR once, reuse for all files
417
+ print("Initializing OCR engine (first run may take a few seconds)...")
418
+ ocr_engine = PaddleOCR(
419
+ lang="mr",
420
+ use_doc_orientation_classify=False,
421
+ use_doc_unwarping=False,
422
+ use_textline_orientation=False,
423
+ max_batch_size=16, # Process multiple images in parallel
424
+ num_workers=min(4, multiprocessing.cpu_count()), # CPU workers for preprocessing
425
+ )
426
+
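Whether PaddleOCR's constructor accepts tuning keywords such as max_batch_size and num_workers varies across releases; a defensive variant, assuming only the four base arguments are always supported:

    base_kwargs = dict(lang="mr", use_doc_orientation_classify=False,
                       use_doc_unwarping=False, use_textline_orientation=False)
    try:
        ocr_engine = PaddleOCR(**base_kwargs, max_batch_size=16,
                               num_workers=min(4, multiprocessing.cpu_count()))
    except TypeError:
        # installed release lacks these knobs; fall back to defaults
        ocr_engine = PaddleOCR(**base_kwargs)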
427
+ # Process each file and collect results
428
+ file_results = []
429
+ found_documents = set()
430
+ all_matched_keywords_per_file = []
431
+
432
+ print(f"\n{'='*60}")
433
+ print(f"PROCESSING {len(args.file)} FILES")
434
+ print(f"{'='*60}\n")
435
+
436
+ for idx, file_path in enumerate(args.file, 1):
437
+ print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")
438
+
439
+ # Extract text from file
440
+ ocr_texts = get_ocr_text(file_path, ocr_engine, args.pages, args.debug)
441
+
442
+ if not ocr_texts:
443
+ print(f"⚠️ No text extracted from {file_path}\n")
444
+ file_results.append({
445
+ 'file': file_path,
446
+ 'doc_type': 'Unknown',
447
+ 'doc_score': 0,
448
+ 'keywords_matched': [],
449
+ 'status': 'ERROR'
450
+ })
451
+ continue
452
+
453
+ # Debug: Show raw OCR
454
+ if args.debug:
455
+ print("\n" + "="*60)
456
+ print("RAW OCR EXTRACTED TEXT:")
457
+ print("="*60)
458
+ for i, text in enumerate(ocr_texts, 1):
459
+ print(f"{i:3d}. {text}")
460
+ print("="*60 + "\n")
461
+
462
+ # Normalize tokens
463
+ ocr_tokens = normalize_text(" ".join(ocr_texts))
464
+
465
+ # Debug: Show normalized tokens
466
+ if args.debug:
467
+ print("="*60)
468
+ print("NORMALIZED TOKENS:")
469
+ print("="*60)
470
+ print(f"Total tokens: {len(ocr_tokens)}")
471
+ print(f"First 50 tokens: {', '.join(ocr_tokens[:50])}{'...' if len(ocr_tokens) > 50 else ''}")
472
+ print("="*60 + "\n")
473
+
474
+ # Document classification
475
+ if args.debug:
476
+ print("="*60)
477
+ print("DOCUMENT TYPE SCORING:")
478
+ print("="*60)
479
+
480
+ doc_type, doc_score = calculate_doc_type(ocr_tokens, debug=args.debug)
481
+ found_documents.add(doc_type)
482
+
483
+ if args.debug:
484
+ print("="*60 + "\n")
485
+
486
+ # Keyword verification
487
+ user_keywords = [kw.strip() for kw in args.inputkeywords.split()]
488
+ verification_results = verify_keywords(ocr_tokens, user_keywords, args.fuzzy)
489
+
490
+ # Status: ALL keywords must match in this file
491
+ all_matched = all(r['matched'] for r in verification_results)
492
+ status = "VERIFIED" if all_matched else "NOT VERIFIED"
493
+
494
+ # Store results for this file
495
+ file_results.append({
496
+ 'file': file_path,
497
+ 'doc_type': doc_type,
498
+ 'doc_score': doc_score,
499
+ 'keywords_matched': verification_results,
500
+ 'status': status,
501
+ 'all_keywords_matched': all_matched
502
+ })
503
+
504
+ # Track which keywords were matched in this file
505
+ matched_keywords_in_file = {r['keyword'] for r in verification_results if r['matched']}
506
+ all_matched_keywords_per_file.append(matched_keywords_in_file)
507
+
508
+ # Per-file output
509
+ print(f"\n{'='*60}")
510
+ print(f"Document Type: {doc_type} ({doc_score:.1f}% confidence)")
511
+ print(f"{'='*60}")
512
+ print(f"{'Keyword':<25} | {'Status':<10} | {'Matched Text'}")
513
+ print(f"{'-'*60}")
514
+
515
+ for r in verification_results:
516
+ status_icon = "✓" if r['matched'] else "✗"
517
+ matched_text = r['matched_text'] if r['matched_text'] else "Not found"
518
+ print(f"{r['keyword']:<25} | {status_icon:<10} | {matched_text}")
519
+
520
+ print(f"{'='*60}")
521
+ print(f"File Status: {status}")
522
+ print(f"{'='*60}\n")
523
+
524
+ # FINAL SUMMARY (unchanged)
525
+ print(f"\n{'='*60}")
526
+ print(f"FINAL SUMMARY")
527
+ print(f"{'='*60}")
528
+
529
+ # Required documents check
530
+ if required_set:
531
+ missing_docs = required_set - found_documents
532
+
533
+ print(f"\nRequired Documents: {', '.join(sorted(required_set))}")
534
+ print(f"Found Documents: {', '.join(sorted(found_documents)) if found_documents else 'None'}")
535
+
536
+ if missing_docs:
537
+ print(f"❌ Missing Documents: {', '.join(sorted(missing_docs))}")
538
+ docs_status = "NOT VERIFIED"
539
+ else:
540
+ print(f" All required documents found!")
541
+ docs_status = "VERIFIED"
542
+ else:
543
+ docs_status = "N/A (no required list specified)"
544
+ missing_docs = set()
545
+
546
+ # Overall keyword verification across ALL files
547
+ all_user_keywords = set(args.inputkeywords.split())
548
+ keywords_found_across_files = set()
549
+
550
+ for file_keyword_set in all_matched_keywords_per_file:
551
+ keywords_found_across_files.update(file_keyword_set)
552
+
553
+ missing_keywords = all_user_keywords - keywords_found_across_files
554
+
555
+ print(f"\nKeywords to Find: {', '.join(sorted(all_user_keywords))}")
556
+ print(f"Keywords Found (across all files): {', '.join(sorted(keywords_found_across_files)) if keywords_found_across_files else 'None'}")
557
+
558
+ if missing_keywords:
559
+ print(f"❌ Missing Keywords: {', '.join(sorted(missing_keywords))}")
560
+ keywords_status = "NOT VERIFIED"
561
+ else:
562
+ print(f"✅ All keywords found across uploaded documents!")
563
+ keywords_status = "VERIFIED"
564
+
565
+ # Overall status: BOTH documents and keywords must be verified
566
+ overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"
567
+
568
+ print(f"\n{'='*60}")
569
+ print(f"Documents Status: {docs_status}")
570
+ print(f"Keywords Status: {keywords_status}")
571
+ print(f"OVERALL STATUS: {overall_status}")
572
+ print(f"{'='*60}")
573
+
574
+ if __name__ == "__main__":
575
+   main()