Midnightar committed on
Commit
0c8d1e3
·
verified ·
1 Parent(s): 2878711

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +250 -36
app.py CHANGED
@@ -57,22 +57,64 @@ def extract_text(image_path):
57
 
58
  def detect_document(text):
59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  # =========================
61
  # NIN
62
  # =========================
63
 
64
  nin_keywords = [
65
  "national identification number",
66
- "nin"
 
 
67
  ]
68
 
69
  matched_keywords = []
70
 
71
  for keyword in nin_keywords:
72
- if keyword in text:
 
73
  matched_keywords.append(keyword)
74
 
75
  if len(matched_keywords) > 0:
 
76
  return {
77
  "document_type": "nin",
78
  "confidence": 95,
@@ -85,16 +127,19 @@ def detect_document(text):
85
 
86
  passport_keywords = [
87
  "passport",
88
- "federal republic of nigeria"
 
89
  ]
90
 
91
  matched_keywords = []
92
 
93
  for keyword in passport_keywords:
94
- if keyword in text:
 
95
  matched_keywords.append(keyword)
96
 
97
  if len(matched_keywords) > 0:
 
98
  return {
99
  "document_type": "passport",
100
  "confidence": 94,
@@ -107,16 +152,21 @@ def detect_document(text):
107
 
108
  license_keywords = [
109
  "driver",
110
- "license"
 
 
 
111
  ]
112
 
113
  matched_keywords = []
114
 
115
  for keyword in license_keywords:
116
- if keyword in text:
 
117
  matched_keywords.append(keyword)
118
 
119
  if len(matched_keywords) >= 2:
 
120
  return {
121
  "document_type": "drivers_license",
122
  "confidence": 92,
@@ -129,16 +179,20 @@ def detect_document(text):
129
 
130
  voter_keywords = [
131
  "voter",
132
- "inec"
 
 
133
  ]
134
 
135
  matched_keywords = []
136
 
137
  for keyword in voter_keywords:
138
- if keyword in text:
 
139
  matched_keywords.append(keyword)
140
 
141
  if len(matched_keywords) > 0:
 
142
  return {
143
  "document_type": "voters_card",
144
  "confidence": 90,
@@ -146,28 +200,61 @@ def detect_document(text):
146
  }
147
 
148
  # =========================
149
- # UTILITY BILL
150
  # =========================
151
 
152
- utility_keywords = [
 
 
153
  "electricity",
154
- "water bill",
 
 
 
 
 
 
 
155
  "ikeja electric",
 
156
  "eko electric",
 
157
  "abuja electricity",
158
- "aedc"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  ]
160
 
161
  matched_keywords = []
162
 
163
- for keyword in utility_keywords:
164
- if keyword in text:
 
165
  matched_keywords.append(keyword)
166
 
167
  if len(matched_keywords) > 0:
 
168
  return {
169
  "document_type": "utility_bill",
170
- "confidence": 88,
171
  "matched_keywords": matched_keywords
172
  }
173
 
@@ -176,21 +263,39 @@ def detect_document(text):
176
  # =========================
177
 
178
  bank_keywords = [
 
179
  "account statement",
 
180
  "transaction",
181
  "balance",
182
  "account number",
183
  "credit",
184
- "debit"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  ]
186
 
187
  matched_keywords = []
188
 
189
  for keyword in bank_keywords:
190
- if keyword in text:
 
191
  matched_keywords.append(keyword)
192
 
193
  if len(matched_keywords) > 0:
 
194
  return {
195
  "document_type": "bank_statement",
196
  "confidence": 91,
@@ -202,20 +307,25 @@ def detect_document(text):
202
  # =========================
203
 
204
  tenancy_keywords = [
 
205
  "tenancy agreement",
206
  "landlord",
207
  "tenant",
208
  "rent",
209
- "property"
 
 
210
  ]
211
 
212
  matched_keywords = []
213
 
214
  for keyword in tenancy_keywords:
215
- if keyword in text:
 
216
  matched_keywords.append(keyword)
217
 
218
  if len(matched_keywords) > 0:
 
219
  return {
220
  "document_type": "tenancy_agreement",
221
  "confidence": 89,
@@ -223,38 +333,142 @@ def detect_document(text):
223
  }
224
 
225
  # =========================
226
- # VEHICLE PLATE NUMBER
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
227
  # =========================
228
 
229
  plate_patterns = [
230
 
231
- # Standard Nigerian plates
232
- r"[A-Z]{2,3}\s?\d{2,3}[A-Z]{1,2}",
 
 
 
 
233
 
234
- # OCR damaged versions
235
- r"[A-Z]{1,3}\d{2,3}[A-Z]{1,3}",
236
 
237
- # Hyphen formats
238
- r"[A-Z]{2,3}-?\d{2,3}-?[A-Z]{1,2}"
239
- ]
 
240
 
241
- for pattern in plate_patterns:
242
 
243
- plate_match = re.search(
244
- pattern,
245
- text.upper()
246
- )
 
 
 
247
 
248
- if plate_match:
249
 
250
  return {
251
  "document_type": "vehicle_plate",
252
- "confidence": 85,
253
- "matched_keywords": [plate_match.group()]
 
 
254
  }
255
 
256
- return None
 
257
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
  # =========================
260
  # HOME ROUTE
 
57
 
58
  def detect_document(text):
59
 
60
+ # CLEAN TEXT
61
+ text = text.lower().strip()
62
+
63
+ # REMOVE EXTRA SYMBOLS
64
+ cleaned_text = re.sub(
65
+ r'[^a-zA-Z0-9\\s-]',
66
+ ' ',
67
+ text
68
+ )
69
+
70
+ # SPLIT WORDS
71
+ words = cleaned_text.split()
72
+
73
+ # =========================
74
+ # REJECT RANDOM OCR GARBAGE
75
+ # =========================
76
+
77
+ garbage_patterns = [
78
+ r'^[a-z0-9]{4,8}$'
79
+ ]
80
+
81
+ for pattern in garbage_patterns:
82
+
83
+ for word in words:
84
+
85
+ if re.match(pattern, word):
86
+
87
+ if len(words) <= 2:
88
+ return {
89
+ "document_type": "unknown",
90
+ "confidence": 5,
91
+ "matched_keywords": [word],
92
+ "reason": (
93
+ "OCR detected unreadable or "
94
+ "meaningless text."
95
+ )
96
+ }
97
+
98
  # =========================
99
  # NIN
100
  # =========================
101
 
102
  nin_keywords = [
103
  "national identification number",
104
+ "national identity",
105
+ "nin",
106
+ "nimc"
107
  ]
108
 
109
  matched_keywords = []
110
 
111
  for keyword in nin_keywords:
112
+
113
+ if keyword in cleaned_text:
114
  matched_keywords.append(keyword)
115
 
116
  if len(matched_keywords) > 0:
117
+
118
  return {
119
  "document_type": "nin",
120
  "confidence": 95,
 
127
 
128
  passport_keywords = [
129
  "passport",
130
+ "federal republic of nigeria",
131
+ "nigeria passport"
132
  ]
133
 
134
  matched_keywords = []
135
 
136
  for keyword in passport_keywords:
137
+
138
+ if keyword in cleaned_text:
139
  matched_keywords.append(keyword)
140
 
141
  if len(matched_keywords) > 0:
142
+
143
  return {
144
  "document_type": "passport",
145
  "confidence": 94,
 
152
 
153
  license_keywords = [
154
  "driver",
155
+ "license",
156
+ "drivers licence",
157
+ "driver licence",
158
+ "frsc"
159
  ]
160
 
161
  matched_keywords = []
162
 
163
  for keyword in license_keywords:
164
+
165
+ if keyword in cleaned_text:
166
  matched_keywords.append(keyword)
167
 
168
  if len(matched_keywords) >= 2:
169
+
170
  return {
171
  "document_type": "drivers_license",
172
  "confidence": 92,
 
179
 
180
  voter_keywords = [
181
  "voter",
182
+ "inec",
183
+ "permanent voter",
184
+ "polling unit"
185
  ]
186
 
187
  matched_keywords = []
188
 
189
  for keyword in voter_keywords:
190
+
191
+ if keyword in cleaned_text:
192
  matched_keywords.append(keyword)
193
 
194
  if len(matched_keywords) > 0:
195
+
196
  return {
197
  "document_type": "voters_card",
198
  "confidence": 90,
 
200
  }
201
 
202
  # =========================
203
+ # ELECTRICITY COMPANIES
204
  # =========================
205
 
206
+ electricity_keywords = [
207
+
208
+ # General
209
  "electricity",
210
+ "electric bill",
211
+ "power bill",
212
+ "meter number",
213
+
214
+ # Nigerian DISCOs
215
+ "ibedc",
216
+ "ibadan electricity",
217
+ "ikedc",
218
  "ikeja electric",
219
+ "ekedc",
220
  "eko electric",
221
+ "aedc",
222
  "abuja electricity",
223
+ "eedc",
224
+ "enugu electricity",
225
+ "bedc",
226
+ "benin electricity",
227
+ "jed",
228
+ "jos electricity",
229
+ "kedco",
230
+ "kano electricity",
231
+ "kaedco",
232
+ "kaduna electric",
233
+ "phed",
234
+ "port harcourt electricity",
235
+ "yedc",
236
+ "yola electricity",
237
+
238
+ # Common Nigerian utility terms
239
+ "prepaid",
240
+ "postpaid",
241
+ "disco",
242
+ "energy charge",
243
+ "tariff"
244
  ]
245
 
246
  matched_keywords = []
247
 
248
+ for keyword in electricity_keywords:
249
+
250
+ if keyword in cleaned_text:
251
  matched_keywords.append(keyword)
252
 
253
  if len(matched_keywords) > 0:
254
+
255
  return {
256
  "document_type": "utility_bill",
257
+ "confidence": 90,
258
  "matched_keywords": matched_keywords
259
  }
260
 
 
263
  # =========================
264
 
265
  bank_keywords = [
266
+
267
  "account statement",
268
+ "statement of account",
269
  "transaction",
270
  "balance",
271
  "account number",
272
  "credit",
273
+ "debit",
274
+ "withdrawal",
275
+ "deposit",
276
+
277
+ # Nigerian Banks
278
+ "access bank",
279
+ "gtbank",
280
+ "uba",
281
+ "zenith bank",
282
+ "first bank",
283
+ "opay",
284
+ "moniepoint",
285
+ "kuda",
286
+ "fcmb",
287
+ "sterling bank"
288
  ]
289
 
290
  matched_keywords = []
291
 
292
  for keyword in bank_keywords:
293
+
294
+ if keyword in cleaned_text:
295
  matched_keywords.append(keyword)
296
 
297
  if len(matched_keywords) > 0:
298
+
299
  return {
300
  "document_type": "bank_statement",
301
  "confidence": 91,
 
307
  # =========================
308
 
309
  tenancy_keywords = [
310
+
311
  "tenancy agreement",
312
  "landlord",
313
  "tenant",
314
  "rent",
315
+ "property",
316
+ "lease agreement",
317
+ "rental agreement"
318
  ]
319
 
320
  matched_keywords = []
321
 
322
  for keyword in tenancy_keywords:
323
+
324
+ if keyword in cleaned_text:
325
  matched_keywords.append(keyword)
326
 
327
  if len(matched_keywords) > 0:
328
+
329
  return {
330
  "document_type": "tenancy_agreement",
331
  "confidence": 89,
 
333
  }
334
 
335
  # =========================
336
+ # VEHICLE KEYWORDS
337
+ # =========================
338
+
339
+ vehicle_keywords = [
340
+
341
+ "toyota",
342
+ "honda",
343
+ "lexus",
344
+ "benz",
345
+ "mercedes",
346
+ "ford",
347
+ "jeep",
348
+ "hyundai",
349
+ "kia",
350
+ "nissan",
351
+ "camry",
352
+ "corolla",
353
+ "rav4",
354
+ "pilot",
355
+ "highlander",
356
+ "vehicle",
357
+ "plate number"
358
+ ]
359
+
360
+ matched_keywords = []
361
+
362
+ for keyword in vehicle_keywords:
363
+
364
+ if keyword in cleaned_text:
365
+ matched_keywords.append(keyword)
366
+
367
+ # =========================
368
+ # NIGERIAN STATES
369
+ # =========================
370
+
371
+ nigeria_states = [
372
+
373
+ "lagos",
374
+ "abuja",
375
+ "kano",
376
+ "kaduna",
377
+ "oyo",
378
+ "ogun",
379
+ "ondo",
380
+ "osun",
381
+ "kwara",
382
+ "imo",
383
+ "anambra",
384
+ "enugu",
385
+ "rivers",
386
+ "delta",
387
+ "edo",
388
+ "cross river",
389
+ "akwa ibom",
390
+ "bayelsa",
391
+ "plateau",
392
+ "benue",
393
+ "kogi",
394
+ "ekiti",
395
+ "niger",
396
+ "zamfara",
397
+ "sokoto",
398
+ "katsina",
399
+ "borno",
400
+ "yobe",
401
+ "adamawa",
402
+ "taraba",
403
+ "gombe",
404
+ "bauchi",
405
+ "jigawa",
406
+ "nasarawa",
407
+ "kebbi",
408
+ "ebonyi"
409
+ ]
410
+
411
+ state_matches = []
412
+
413
+ for state in nigeria_states:
414
+
415
+ if state in cleaned_text:
416
+ state_matches.append(state)
417
+
418
+ # =========================
419
+ # NIGERIAN PLATE PATTERNS
420
  # =========================
421
 
422
  plate_patterns = [
423
 
424
+ r"[A-Z]{3}-?\\d{3}[A-Z]{2}",
425
+ r"[A-Z]{2}\\d{3}[A-Z]{3}",
426
+ r"[A-Z]{3}\\s\\d{3}\\s[A-Z]{2}"
427
+ ]
428
+
429
+ detected_plate = None
430
 
431
+ for pattern in plate_patterns:
 
432
 
433
+ plate_match = re.search(
434
+ pattern,
435
+ cleaned_text.upper()
436
+ )
437
 
438
+ if plate_match:
439
 
440
+ detected_plate = plate_match.group()
441
+
442
+ break
443
+
444
+ # =========================
445
+ # VEHICLE DETECTION
446
+ # =========================
447
 
448
+ if detected_plate:
449
 
450
  return {
451
  "document_type": "vehicle_plate",
452
+ "confidence": 97,
453
+ "matched_keywords": [
454
+ detected_plate
455
+ ] + state_matches
456
  }
457
 
458
+ # VEHICLE WITHOUT CLEAR PLATE
459
+ if len(matched_keywords) > 0:
460
 
461
+ return {
462
+ "document_type": "vehicle_image",
463
+ "confidence": 75,
464
+ "matched_keywords": matched_keywords
465
+ }
466
+
467
+ # =========================
468
+ # UNKNOWN DOCUMENT
469
+ # =========================
470
+
471
+ return None
472
 
473
  # =========================
474
  # HOME ROUTE