triflix committed
Commit bf3efa4 · verified · 1 Parent(s): 795714c

Delete logiccode.py

Files changed (1)
  1. logiccode.py +0 -490
logiccode.py DELETED
@@ -1,490 +0,0 @@
#!/usr/bin/env python3
"""
OCR Document Verification with Batch Processing & Required Document Checklist

Usage:
    # Single file (backward compatible)
    python ocrupdated2.py --file image.jpg --inputkeywords "keyword1 keyword2" --fuzzy --debug

    # Multiple files with required document checklist
    python ocrupdated2.py --file doc1.pdf doc2.jpg doc3.png --inputkeywords "Shaikh Anisa Rahat" --required PAN HSC AgeNationalityDomicile --fuzzy --debug

NOTE: Use spaces to separate required document types, NOT commas:
    ✅ --required PAN Aadhaar HSC
    ❌ --required PAN, Aadhaar, HSC
"""

import argparse
import re
import os
import tempfile
from collections import defaultdict
from paddleocr import PaddleOCR
import difflib

# Optional PDF support
try:
    import fitz  # PyMuPDF
    PDF_SUPPORT = True
except ImportError:
    PDF_SUPPORT = False
    print("Warning: PyMuPDF not installed. PDF support disabled. Install with: pip install PyMuPDF")

# Keyword lists used to classify each document type
DOC_KEYWORDS = {
    "Aadhaar": [
        "uidai", "aadhaar", "aadhar", "government of india", "भारत सरकार",
        "आधार", "यूआईडीएआई", "प्रधानमंत्री", "जन्म तिथि", "पता", "लिंग",
        "unique identification authority", "aadhaar number", "enrollment number"
    ],
    "PAN": [
        "permanent account number", "income tax", "incometaxindia", "pan",
        "income tax department", "आयकर विभाग", "स्थायी खाता संख्या",
        "taxpayer", "father's name", "पिता का नाम", "signature", "inc"
    ],
    "Driving_License": [
        "driving licence", "motor vehicles act", "rto", "mcwg", "lmv",
        "transport department", "licence no", "valid till", "date of issue",
        "ड्राइविंग लाइसेंस", "परिवहन विभाग", "challan", "regional transport office"
    ],
    "Passport": [
        "passport", "republic of india", "ministry of external affairs",
        "passport number", "date of issue", "date of expiry", "surname",
        "given names", "nationality indian", "पासपोर्ट", "गणराज्य", "विदेश मंत्रालय",
        "consular", "visa"
    ],
    "SSC": [
        "secondary school certificate", "statement of marks", "ssc", "10th", "class x",
        "board of secondary education", "maharashtra state board", "matriculation",
        "roll number", "seat number", "subject code", "marks obtained", "grade", "pass"
    ],
    "HSC": [
        "higher secondary certificate", "statement of marks", "hsc", "12th", "class xii",
        "board of higher secondary education", "maharashtra state board", "intermediate",
        "stream", "science", "commerce", "arts", "marks obtained", "grade", "percentage"
    ],
    "AgeNationalityDomicile": [
        "certificate of age nationality and domicile", "domicile certificate",
        "age nationality domicile", "tehsildar", "executive magistrate", "collector",
        "certificate of residence", "domiciled in the state of", "citizen of india",
        "residence proof", "maharashtra domicile", "satara", "karad", "taluka", "district"
    ],
    "Ration_Card": [
        "ration card", "food and civil supplies", "apl", "bpl", "aay", "antyodaya",
        "ration card number", "family members", "head of family",
        "राशन कार्ड", "खाद्य पुरवठा", "नागरी पुरवठा विभाग", "fps", "fair price shop"
    ],
    "Cast_Certificate": [
        "CASTE CERTIFICATE",
        "FORM - 8",
        "Rule No. 5(6)",
        "De-Notified Tribe (Vimukt Jati)",
        "Nomadic Tribe/Other Backward Class",
        "Special Backward Category",
        "recognised as",
        "Government Resolution",
        "Sub Divisional Officer",
        "belonging to the State of Maharashtra"
    ],
    "Income_Certificate": [
        "१ वर्षासाठी उत्पन्नाचे प्रमाणपत्र",
        "ऑफिस ऑफ नायब तहसीलदार",
        "वार्षिक उत्पन्न",
        "मिळालेले १ वर्षाचे उत्पन्न",
        "कुटुंबातील सर्व सदस्यांचे",
        "प्रमाणित करण्यात येते की",
        "वैध राहील",
        "Signature valid",
        "Digitally Signed by"
    ],
    "PCM_Score_Card": [
        "MAH-MHT CET (PCM Group)",
        "State Common Entrance Test Cell",
        "Score Card",
        "Physics",
        "Chemistry",
        "Mathematics",
        "Total Percentile",
        "Normalization document",
        "Centralized Admission Process (CAP)",
        "IP address of the Computer"
    ]
}

# Validate keyword uniqueness (optional debug output)
_keyword_sets = {k: set(v) for k, v in DOC_KEYWORDS.items()}
for doc1 in DOC_KEYWORDS:
    for doc2 in DOC_KEYWORDS:
        if doc1 < doc2:
            overlap = _keyword_sets[doc1].intersection(_keyword_sets[doc2])
            if overlap:
                print(f"⚠️ Warning: Overlap between {doc1} and {doc2}: {overlap}")

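# Example (derived from the lists above): SSC and HSC share the keywords
# {"statement of marks", "maharashtra state board", "marks obtained", "grade"},
# so the loop above prints an overlap warning for that pair at import time.
# Overlapping keywords weaken classification, which is why the tie-breaker in
# calculate_doc_type() later scores only the keywords unique to each candidate.
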
def normalize_text(text):
    """Robust multilingual tokenization with noise filtering"""
    text = text.lower()
    # Extract Hindi Devanagari (2+ chars) OR English alphanumeric (3+ chars)
    tokens = re.findall(r'[\u0900-\u097F]{2,}|\w{3,}', text)

    # Remove common English stopwords
    stopwords = {'the', 'and', 'of', 'in', 'to', 'for', 'is', 'on', 'by', 'with', 'at', 'from', 'a', 'an', 'this'}
    tokens = [t for t in tokens if t not in stopwords]

    # Remove OCR noise (4+ consecutive consonants = garbage)
    noise_pattern = re.compile(r'^[b-df-hj-np-tv-xz]{4,}$')
    tokens = [t for t in tokens if not noise_pattern.match(t)]

    return tokens

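# Illustration of the tokenization rules above:
#   normalize_text("The Income Tax Department of India")
#   -> ['income', 'tax', 'department', 'india']
# ('the' is a stopword, 'of' is under three characters, and a junk token such
# as 'xkcdq' would be dropped by the consonant-run noise filter).
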
def pdf_to_images(pdf_path, max_pages=3):
    """Convert PDF pages to high-resolution temporary images"""
    if not PDF_SUPPORT:
        raise ValueError("PDF support not available. Install PyMuPDF")

    doc = fitz.open(pdf_path)
    total_pages = len(doc)
    pages_to_process = min(total_pages, max_pages)

    image_paths = []
    temp_dir = tempfile.mkdtemp(prefix="ocr_pdf_")

    # NOTE: the rendering loop was missing from the committed file; this is a
    # minimal reconstruction using the standard PyMuPDF API (the 2x zoom
    # factor is an assumption).
    for page_num in range(pages_to_process):
        page = doc.load_page(page_num)
        pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
        image_path = os.path.join(temp_dir, f"page_{page_num + 1}.png")
        pix.save(image_path)
        image_paths.append(image_path)

    doc.close()
    return image_paths

def get_ocr_text(file_path, max_pages=3):
    """Run OCR on an image or PDF and return the recognized text lines.

    Reconstructed helper: main() calls it, but its definition was missing
    from the committed file. This sketch assumes the classic PaddleOCR
    result layout of [box, (text, confidence)] per detected line.
    """
    is_pdf = os.path.splitext(file_path)[1].lower() == '.pdf'
    image_paths = pdf_to_images(file_path, max_pages) if is_pdf else [file_path]

    ocr = PaddleOCR(use_angle_cls=True, lang='en')
    texts = []
    for image_path in image_paths:
        result = ocr.ocr(image_path, cls=True)
        for page in result or []:
            for line in page or []:
                texts.append(line[1][0])  # line = [box, (text, confidence)]
    return texts

def fuzzy_match(token, target_set, threshold=0.8):
    """Match a token against a set of OCR tokens: exact match, then difflib
    close match, then substring, then Devanagari similarity.

    (Signature reconstructed from the call sites; the 0.8 threshold default
    is an assumption.)
    """
    # Exact match
    if token in target_set:
        return token

    # Levenshtein-style distance match
    matches = difflib.get_close_matches(token, target_set, n=1, cutoff=threshold)
    if matches:
        return matches[0]

    # Substring match (handles concatenated words)
    for ocr_token in target_set:
        if token in ocr_token or ocr_token in token:
            return ocr_token

    # Hindi-specific fuzzy matching (handles OCR errors like सत्पमेव → सत्यमेव)
    if any('\u0900' <= c <= '\u097F' for c in token):
        for ocr_token in target_set:
            if len(ocr_token) > 3:
                similarity = difflib.SequenceMatcher(None, token, ocr_token).ratio()
                if similarity > threshold:
                    return ocr_token

    return None

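# Example of the matching cascade (assuming the 0.8 default threshold above):
#   fuzzy_match("aadhaar", {"aadhar"})  -> "aadhar"   via the difflib step
#       (similarity 2*6/(7+6) ≈ 0.92 >= 0.8)
#   fuzzy_match("pan", {"pancard"})     -> "pancard"  via the substring step
#       (difflib ratio 2*3/(3+7) = 0.6 fails first, then "pan" in "pancard")
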
def calculate_doc_type(ocr_tokens, debug=False):
    """
    Document classification with tie-breaking that only compares documents
    that are actually tied (within a 5% score margin).
    """
    ocr_set = set(ocr_tokens)
    ocr_combined = " ".join(ocr_tokens)
    scores = {}

    for doc_type, keywords in DOC_KEYWORDS.items():
        kw_set = set(k.lower() for k in keywords)

        # Primary: exact/fuzzy token matches (weighted 2 for exact, 1.5 for fuzzy)
        primary_matches = sum(2 if kw in ocr_set else 1.5 if fuzzy_match(kw, ocr_set) else 0
                              for kw in kw_set)

        # Secondary: multi-word phrase matches in the combined text
        phrase_matches = sum(1 for kw in kw_set if " " in kw and kw in ocr_combined)

        # Tertiary: title keyword bonus (certificate, card, licence, passport)
        title_keywords = [kw for kw in kw_set if any(word in kw for word in ["certificate", "card", "licence", "passport"])]
        title_match = sum(1 for kw in title_keywords if kw in ocr_combined)

        # Weighted score (max possible = len(kw_set) * 2)
        max_possible = len(kw_set) * 2
        weighted_score = ((primary_matches + phrase_matches + title_match) / max_possible) * 100

        scores[doc_type] = weighted_score

        if debug:
            print(f"  {doc_type:<25}: {weighted_score:>6.1f}% ({primary_matches:.1f} + {phrase_matches} + {title_match})")

    # Sort by score descending
    sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    best_type, best_score = sorted_scores[0]

    # Only trigger tie-breaking when the top TWO scores are close (within 5%)
    if len(sorted_scores) > 1 and (sorted_scores[0][1] - sorted_scores[1][1]) < 5:
        if debug:
            print(f"\n⚠️ Tie detected between '{sorted_scores[0][0]}' and '{sorted_scores[1][0]}'!")

        # Get ONLY the tied documents (within 5% of the top score)
        tied_docs = [(doc_type, score) for doc_type, score in sorted_scores
                     if (best_score - score) < 5]

        if debug:
            print(f"Tied documents: {[f'{doc}({score:.1f}%)' for doc, score in tied_docs]}")

        # Count matches on keywords unique to each tied document
        unique_counts = {}
        for doc_type, _ in tied_docs:
            kw_set = set(k.lower() for k in DOC_KEYWORDS[doc_type])

            # Keywords belonging to the OTHER tied documents only
            other_tied_keywords = set()
            for other_doc, _ in tied_docs:
                if other_doc != doc_type:
                    other_tied_keywords.update(k.lower() for k in DOC_KEYWORDS[other_doc])

            unique_keywords = kw_set - other_tied_keywords
            unique_matches = sum(1 for kw in unique_keywords if fuzzy_match(kw, ocr_set))
            unique_counts[doc_type] = unique_matches

            if debug:
                print(f"  {doc_type:<25}: {unique_matches} unique matches ({len(unique_keywords)} available)")

        # Only apply the tie-breaker if there is a clear winner
        if unique_counts and max(unique_counts.values()) > 0:
            sorted_unique = sorted(unique_counts.items(), key=lambda x: x[1], reverse=True)
            if len(sorted_unique) > 1 and sorted_unique[0][1] > sorted_unique[1][1]:
                best_type = sorted_unique[0][0]
                best_score = scores[best_type]

                if debug:
                    print(f"✓ Tie broken: {best_type} wins with {unique_counts[best_type]} unique matches")

    return best_type, best_score

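# Worked example of the weighting above: PAN has 12 keywords, so
# max_possible = 24. A scan matching 5 PAN keywords exactly and 2 more
# fuzzily, with no phrase or title bonuses, scores
# (5*2 + 2*1.5) / 24 * 100 ≈ 54.2%, and tie-breaking runs only if the
# runner-up document lands within 5 points of that.
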
def verify_keywords(ocr_tokens, user_keywords, use_fuzzy=False):
    """
    Sequence-aware matching for multi-keyword inputs (names, addresses).
    Checks whether the keywords appear consecutively in the OCR text first,
    then falls back to individual keyword matching.
    """
    ocr_set = set(ocr_tokens)
    ocr_combined = " ".join(ocr_tokens)
    results = []

    # For multi-keyword inputs, check for a SEQUENCE match first
    if len(user_keywords) > 1:
        # Build the phrase as it should appear in OCR (lowercase ASCII keywords)
        user_phrase = " ".join([kw.lower() if all(ord(c) < 128 for c in kw) else kw for kw in user_keywords])

        # Check whether the entire phrase exists in the OCR text
        if user_phrase in ocr_combined:
            if args.debug:
                print(f"\n✓ Sequence match: '{user_phrase}' found in OCR text")
            # All keywords matched in the correct order
            for kw in user_keywords:
                results.append({
                    'keyword': kw,
                    'matched': True,
                    'matched_text': kw
                })
            return results

        # Fuzzy phrase matching if enabled
        if use_fuzzy:
            # Create n-grams from the OCR tokens matching the keyword count
            n = len(user_keywords)
            ocr_phrases = [" ".join(ocr_tokens[i:i+n]) for i in range(len(ocr_tokens) - n + 1)]

            phrase_match = fuzzy_match(user_phrase, set(ocr_phrases))
            if phrase_match:
                if args.debug:
                    print(f"\n✓ Fuzzy sequence match: '{user_phrase}' ~ '{phrase_match}'")
                for kw in user_keywords:
                    results.append({
                        'keyword': kw,
                        'matched': True,
                        'matched_text': kw
                    })
                return results

    # Fallback to individual keyword matching
    for kw in user_keywords:
        kw_processed = kw.lower() if all(ord(c) < 128 for c in kw) else kw
        matched = False
        matched_text = None

        if kw_processed in ocr_set:
            matched = True
            matched_text = kw_processed
        elif " " in kw_processed and kw_processed in ocr_combined:
            matched = True
            matched_text = kw_processed
        elif use_fuzzy:
            matched_text = fuzzy_match(kw_processed, ocr_set)
            if matched_text:
                matched = True

        results.append({
            'keyword': kw,
            'matched': matched,
            # matched_text is always set whenever matched is True
            'matched_text': matched_text if matched else None
        })

    return results

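# Example of the sequence branch: --inputkeywords "Shaikh Anisa Rahat" yields
# user_keywords ['Shaikh', 'Anisa', 'Rahat']. If the normalized OCR stream
# contains "... shaikh anisa rahat ...", the phrase check marks all three
# keywords matched at once; otherwise each keyword falls back to individual
# (and, with --fuzzy, approximate) matching.
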
def main():
    parser = argparse.ArgumentParser(description='OCR Document Verification with PDF Support')
    parser.add_argument('--file', nargs='+', required=True, help='Paths to image or PDF files')
    parser.add_argument('--inputkeywords', required=True, help='Space-separated keywords to verify')
    parser.add_argument('--required', nargs='+', help='Required document types (space-separated, e.g., PAN Aadhaar HSC)')
    parser.add_argument('--fuzzy', action='store_true', help='Enable fuzzy matching')
    parser.add_argument('--debug', action='store_true', help='Show detailed OCR and scoring output')
    parser.add_argument('--pages', type=int, default=3, help='Max pages to process for PDFs (default: 3)')
    global args  # verify_keywords() reads args.debug through this module-level global
    args = parser.parse_args()

    # Clean the required list by splitting on commas and stripping whitespace,
    # so comma-separated input still works despite the usage note
    required_list = []
    if args.required:
        for item in args.required:
            # Split on commas and strip whitespace from each part
            parts = [part.strip() for part in item.split(',') if part.strip()]
            required_list.extend(parts)

    required_set = set(required_list)

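    # Example: --required "PAN, Aadhaar" HSC ends up as
    # required_set = {'PAN', 'Aadhaar', 'HSC'} after the comma cleanup above.
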
    # Process each file and collect results
    file_results = []
    found_documents = set()
    all_matched_keywords_per_file = []

    print(f"\n{'='*60}")
    print(f"PROCESSING {len(args.file)} FILES")
    print(f"{'='*60}\n")

    for idx, file_path in enumerate(args.file, 1):
        print(f"--- FILE {idx}/{len(args.file)}: {os.path.basename(file_path)} ---")

        # Extract text from the file
        ocr_texts = get_ocr_text(file_path, args.pages)

        if not ocr_texts:
            print(f"⚠️ No text extracted from {file_path}\n")
            file_results.append({
                'file': file_path,
                'doc_type': 'Unknown',
                'doc_score': 0,
                'keywords_matched': [],
                'status': 'ERROR'
            })
            continue

        # Debug: show raw OCR
        if args.debug:
            print("\n" + "="*60)
            print("RAW OCR EXTRACTED TEXT:")
            print("="*60)
            for i, text in enumerate(ocr_texts, 1):
                print(f"{i:3d}. {text}")
            print("="*60 + "\n")

        # Normalize tokens
        ocr_tokens = normalize_text(" ".join(ocr_texts))

        # Debug: show normalized tokens
        if args.debug:
            print("="*60)
            print("NORMALIZED TOKENS:")
            print("="*60)
            print(f"Total tokens: {len(ocr_tokens)}")
            print(f"First 50 tokens: {', '.join(ocr_tokens[:50])}{'...' if len(ocr_tokens) > 50 else ''}")
            print("="*60 + "\n")

        # Document classification
        if args.debug:
            print("="*60)
            print("DOCUMENT TYPE SCORING:")
            print("="*60)

        doc_type, doc_score = calculate_doc_type(ocr_tokens, debug=args.debug)
        found_documents.add(doc_type)

        if args.debug:
            print("="*60 + "\n")

        # Keyword verification
        user_keywords = [kw.strip() for kw in args.inputkeywords.split()]
        verification_results = verify_keywords(ocr_tokens, user_keywords, args.fuzzy)

        # Status: ALL keywords must match in this file
        all_matched = all(r['matched'] for r in verification_results)
        status = "VERIFIED" if all_matched else "NOT VERIFIED"

        # Store results for this file
        file_results.append({
            'file': file_path,
            'doc_type': doc_type,
            'doc_score': doc_score,
            'keywords_matched': verification_results,
            'status': status,
            'all_keywords_matched': all_matched
        })

        # Track which keywords were matched in this file
        matched_keywords_in_file = {r['keyword'] for r in verification_results if r['matched']}
        all_matched_keywords_per_file.append(matched_keywords_in_file)

        # Per-file output
        print(f"\n{'='*60}")
        print(f"Document Type: {doc_type} ({doc_score:.1f}% confidence)")
        print(f"{'='*60}")
        print(f"{'Keyword':<25} | {'Status':<10} | {'Matched Text'}")
        print(f"{'-'*60}")

        for r in verification_results:
            status_icon = "✓" if r['matched'] else "✗"
            matched_text = r['matched_text'] if r['matched_text'] else "Not found"
            print(f"{r['keyword']:<25} | {status_icon:<10} | {matched_text}")

        print(f"{'='*60}")
        print(f"File Status: {status}")
        print(f"{'='*60}\n")

    # FINAL SUMMARY
    print(f"\n{'='*60}")
    print(f"FINAL SUMMARY")
    print(f"{'='*60}")

    # Required documents check
    if required_set:
        missing_docs = required_set - found_documents

        print(f"\nRequired Documents: {', '.join(sorted(required_set))}")
        print(f"Found Documents: {', '.join(sorted(found_documents)) if found_documents else 'None'}")

        if missing_docs:
            print(f"❌ Missing Documents: {', '.join(sorted(missing_docs))}")
            docs_status = "NOT VERIFIED"
        else:
            print(f"✅ All required documents found!")
            docs_status = "VERIFIED"
    else:
        docs_status = "N/A (no required list specified)"
        missing_docs = set()

    # Overall keyword verification across ALL files:
    # every keyword must appear in at least one file
    all_user_keywords = set(args.inputkeywords.split())
    keywords_found_across_files = set()

    for file_keyword_set in all_matched_keywords_per_file:
        keywords_found_across_files.update(file_keyword_set)

    missing_keywords = all_user_keywords - keywords_found_across_files

    print(f"\nKeywords to Find: {', '.join(sorted(all_user_keywords))}")
    print(f"Keywords Found (across all files): {', '.join(sorted(keywords_found_across_files)) if keywords_found_across_files else 'None'}")

    if missing_keywords:
        print(f"❌ Missing Keywords: {', '.join(sorted(missing_keywords))}")
        keywords_status = "NOT VERIFIED"
    else:
        print(f"✅ All keywords found across uploaded documents!")
        keywords_status = "VERIFIED"

    # Overall status: BOTH documents and keywords must be verified
    overall_status = "VERIFIED" if (docs_status == "VERIFIED" and keywords_status == "VERIFIED") else "NOT VERIFIED"

    print(f"\n{'='*60}")
    print(f"Documents Status: {docs_status}")
    print(f"Keywords Status: {keywords_status}")
    print(f"OVERALL STATUS: {overall_status}")
    print(f"{'='*60}")

if __name__ == "__main__":
    main()