# Source: nuronv2 / app.py -- commit 2741fde ("Update app.py"), author Sukritgoel.
import streamlit as st
import os
import sys
import re
from azure.storage.blob import BlobServiceClient
from PIL import Image
import cv2
import datetime
import clip
import torch
import chromadb
import uuid
import pdf2image
# import pandas as pd
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
# Load the CLIP model once at import time; get_embedding() below uses `model`,
# `preprocess` and `device` to turn page images into embedding vectors.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
# File locations: the ChromaDB store lives in the directory of the launched script
# (main() passes db_path to getChromaClient). An earlier revision pointed this at a
# hard-coded local Windows folder instead.
db_path = os.path.dirname(os.path.abspath(sys.argv[0]))
# Azure Blob Storage details, read from Streamlit secrets ("conn_str", "cont_name").
# The client is created at import time and reused by upload_to_azure().
connect_str = st.secrets["conn_str"]
container_name = st.secrets["cont_name"]
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
def pdf_to_jpeg(pdf_file):
    """Convert the PDF at ``pdf_file`` into page images.

    Returns whatever ``pdf2image.convert_from_path`` produces for the file
    (the caller treats it as a list with one image per page).
    """
    # poppler is expected to be discoverable on PATH; no explicit poppler_path is passed.
    return pdf2image.convert_from_path(pdf_file)
def upload_to_azure(image_path,file_name):
    """Upload the local file ``image_path`` to the configured container as blob ``file_name``.

    Uses the module-level ``blob_service_client`` and ``container_name``.
    An existing blob with the same name is overwritten.
    """
    target_blob = blob_service_client.get_blob_client(container_name, file_name)
    with open(image_path, "rb") as payload:
        target_blob.upload_blob(payload, overwrite=True)
def Azure_ocr_sdk(image_path):
    """Run the Azure Computer Vision Read operation on an image and return its text.

    Args:
        image_path: path of the local image file to send to the service.

    Returns:
        A list with the text of every recognised line, in the order the service
        reports them. The list is empty when the operation does not succeed.
    """
    import time  # local import: only the polling loop below needs it

    # NOTE(review): this credential is hard-coded in source control. It should be
    # rotated and moved next to the other secrets (st.secrets) -- left unchanged
    # here so the deployed behaviour does not depend on a new secret being set.
    subscription_key = "82e0c013e94849f7ab5bfc8c7c5e54c8"
    endpoint = "https://docaicomputervisionocr1.cognitiveservices.azure.com/"
    cv_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

    # Close the file handle as soon as the request has been submitted.
    with open(image_path, 'rb') as image_stream:
        response = cv_client.read_in_stream(image_stream, raw=True, language='en')

    # The operation id is the last path segment of the Operation-Location header.
    operation_id = response.headers['Operation-Location'].split('/')[-1]

    # The read operation is asynchronous: keep polling while it is queued or
    # running, pausing between requests instead of spinning on the API.
    pending = (OperationStatusCodes.not_started, OperationStatusCodes.running)
    result = cv_client.get_read_result(operation_id)
    while result.status in pending:
        time.sleep(1)
        result = cv_client.get_read_result(operation_id)

    text_blob = []
    if result.status == OperationStatusCodes.succeeded:
        for page in result.analyze_result.read_results:
            text_blob.extend(line.text for line in page.lines)
    return text_blob
def Passport_front(textdata):
    """Extract the fields of an Indian passport front page from OCR text lines.

    The parser is positional: it upper-cases the lines, drops printed labels,
    then peels off dates, single-letter fields, the passport number and the MRZ.
    Surname, given name, place of birth and place of issue are taken, in that
    order, from the first four remaining lines after the ``IND`` country-code line.

    Args:
        textdata: list of OCR text lines (strings).

    Returns:
        dict with the keys "Type", "Type of Passport", "Countrycode",
        "Passportno", "Surname", "Firstname", "Nationality", "Sex",
        "Date of Birth", "Place of Birth", "Place of Issue", "Date of Issue",
        "Date of Expiry" and "MRZ". Fields that cannot be found are ``None``
        ("Sex" and "MRZ" are empty strings instead).
    """
    date_re = re.compile(r'\d{2}\s*/\d{2}\s*/\d{4}')
    label_words = ('REPUBLIC', 'TYPE', 'GIVEN', 'COUNTRY', 'PASSPORT', 'NO.',
                   'NAME', 'NATIONALITY', 'SEX', 'DATE', 'PLACE')

    def _letters_only(value):
        # Strip everything except A-Z from each space-separated word.
        return " ".join(re.sub(r'[^A-Z]', '', word) for word in value.split(' '))

    Document_Type = "PASSPORT DOCUMENT"
    DateofBirth = DateofIssue = DateofExpiry = None
    Surname = Firstname = PlaceofBirth = PlaceofIssue = None

    # Upper-case everything and drop the printed field labels.
    lines = [line.upper() for line in textdata]
    lines = [line for line in lines if not any(word in line for word in label_words)]

    # Dates: two dates exactly ten years apart are issue/expiry; otherwise the
    # dates are taken positionally as birth, issue, expiry (missing ones stay None).
    dates = [line for line in lines if date_re.search(line)]
    years = [int(date_re.search(line).group()[-4:]) for line in dates]
    if len(dates) == 2 and years[1] - years[0] == 10:
        DateofIssue, DateofExpiry = dates
    else:
        DateofBirth, DateofIssue, DateofExpiry = (dates + [None, None, None])[:3]
    found_dates = [d for d in (DateofBirth, DateofIssue, DateofExpiry) if d]
    lines = [line for line in lines if not any(d in line for d in found_dates)]

    # Single-letter lines carry the passport type (first one) and the sex (M/F).
    single = [line for line in lines if re.match(r'^[A-Z]{1}$', line)]
    Type = single[0] if single else None
    Sex = ''.join(letter for letter in single if re.match(r'[M,F]$', letter))
    for letter in single:
        lines.remove(letter)

    # Drop bilingual "X / Y" label remnants.
    lines = [line for line in lines if not re.match(r'[A-Z]*\s*/\s*[A-Z]*', line)]

    # Passport number: optional letters followed by exactly seven digits.
    passportno = next(
        (line for line in lines if re.search(r'^[A-Z]*\s*[0-9]{7}$', line)), None)
    if passportno is not None:
        lines.remove(passportno)

    lines = [line for line in lines if 'BIRTH' not in line]

    # Nationality is only reported when some line actually mentions IND/INDIAN.
    if any('IND' in line for line in lines):
        Nationality, Countrycode = 'INDIAN', 'IND'
    else:
        Nationality, Countrycode = None, None

    MRZ = ''.join(line for line in lines if '<<' in line)

    # Name and place lines follow the last line that is exactly "IND".
    anchors = [i for i, line in enumerate(lines) if re.match(r'IND$', line)]
    tail = lines[max(anchors) + 1:] if anchors else []
    tail = [line for line in tail
            if 'INDIAN' not in line and 'OF BIRTH' not in line
            and not re.search(r'\d', line)]

    # All four positional fields are needed; with fewer lines the mapping is unreliable.
    if len(tail) >= 4:
        Surname, Firstname, PlaceofBirth, PlaceofIssue = (
            _letters_only(value) for value in tail[:4])

    data = {"Type": Document_Type, "Type of Passport": Type, "Countrycode": Countrycode,
            "Passportno": passportno, "Surname": Surname, "Firstname": Firstname,
            "Nationality": Nationality, "Sex": Sex, "Date of Birth": DateofBirth,
            "Place of Birth": PlaceofBirth, "Place of Issue": PlaceofIssue,
            "Date of Issue": DateofIssue, "Date of Expiry": DateofExpiry, "MRZ": MRZ}
    return data
def _passport_back_full_page(lines):
    """Parse a passport back page that still carries its printed labels (9+ lines)."""
    father = mother = spouse = ''
    # Each label is followed by its value on the next line.
    for idx, line in enumerate(lines[:-1]):
        following = lines[idx + 1]
        if 'FATHER' in line or 'LEGAL GUARDIAN' in line:
            father = following
        if 'MOTHER' in line or 'NAME OF MOT' in line:
            mother = following
        if 'SPOUSE' in line:
            spouse = following

    # Address: starts two lines after the last NAME label (skipping an ADDRESS
    # label if present) and ends at the old-passport / place-of-issue label.
    address = 'Not Found'
    name_rows = [i for i, s in enumerate(lines) if 'NAME' in s]
    end_rows = [i for i, s in enumerate(lines)
                if 'OLD PASSPORT NO' in s or 'PLACE OF ISSUE' in s]
    if name_rows and end_rows:
        start = max(name_rows) + 2
        if start < len(lines):
            if 'ADDRESS' in lines[start]:
                start += 1
            address = ' '.join(lines[start:min(end_rows)])

    # File number: normally the last (or second to last) line containing a digit;
    # otherwise the line following a FILE NO label.
    file_no = 'Not Found'
    for candidate in (lines[-1], lines[-2]):
        if re.search(r'\d', candidate) and 'FILE NO' not in candidate:
            file_no = candidate
            break
    else:
        for idx, line in enumerate(lines[:-1]):
            if 'FILE NO' in line:
                file_no = lines[idx + 1]
                break

    # Old passport number: first letter+7-digit line after its label.
    old_passno = 'Not Found'
    old_rows = [i for i, s in enumerate(lines) if 'OLD PASSPORT NO' in s]
    if old_rows:
        old_passno = next(
            (s for s in lines[max(old_rows) + 1:-1] if re.match(r'[A-Z]{1}[0-9]{7}', s)),
            'Not Found')

    # Date/place of issue: the lines between the label and the file-number area,
    # minus the lines that hold the old passport number or the file number.
    dateplace_issue = 'Not Found'
    issue_rows = [i for i, s in enumerate(lines) if 'PLACE OF ISSUE' in s]
    if issue_rows:
        first = max(issue_rows) + 1
        if first < len(lines) and 'FILE NO' not in lines[first]:
            stop = -2 if 'FILE NO' in lines[-2] else -1
            dateplace_issue = [s for s in lines[first:stop]
                               if old_passno not in s and file_no not in s]

    return {"Name of Father/Legal Guardian": father, "Name of Mother": mother,
            "Name of Spouse": spouse, "Address": address, "Old Passport No": old_passno,
            "Date and Place of Issue": dateplace_issue, "File No": file_no}


def _passport_back_short_page(lines):
    """Parse a passport back page where OCR returned only values (8 lines or fewer)."""
    fields = {"Name of Father/Legal Guardian": 'Not Found', "Name of Mother": 'Not Found',
              "Name of Spouse": 'Not Found', "Address": 'Not Found',
              "Old Passport No": 'Not Found', "Date and Place of Issue": 'Not Found',
              "File No": 'Not Found'}
    if len(lines) < 3:
        return fields
    # The first three lines are candidate names: letters only, no digits.
    names = [s for s in lines[:3]
             if re.match(r'[A-Z]+\s*[A-Z]*', s) and not re.search(r'\d', s)]
    name_keys = ("Name of Father/Legal Guardian", "Name of Mother", "Name of Spouse")
    for key, value in zip(name_keys, names):
        fields[key] = value
    if len(names) >= 2:
        # Address and file number are only trusted when at least two names were read.
        fields["Address"] = ' '.join(lines[3:5])
        fields["File No"] = lines[-1]
    return fields


def Passport_back(text):
    """Extract the fields of an Indian passport back page from OCR text lines.

    Everything up to and including a line containing ``CAUTION`` is ignored.
    Pages with nine or more remaining lines are parsed using their printed
    labels; shorter pages are parsed positionally.

    Args:
        text: list of OCR text lines (strings); labels are matched in upper case.

    Returns:
        dict with the keys "Type of Document", "Name of Father/Legal Guardian",
        "Name of Mother", "Name of Spouse", "Address", "Old Passport No",
        "Date and Place of Issue" (a list of lines when found) and "File No".
        Values that cannot be determined are ``'Not Found'`` (names on a
        labelled page may be empty strings).
    """
    Document_Type = "PASSPORT BACK"
    lines = list(text)
    for idx, line in enumerate(lines):
        if 'CAUTION' in line:
            lines = lines[idx + 1:]
            break

    try:
        if len(lines) >= 9:
            fields = _passport_back_full_page(lines)
        else:
            fields = _passport_back_short_page(lines)
    except Exception as e:
        # Best effort: a malformed page yields 'Not Found' everywhere, not a crash.
        print(e)
        fields = _passport_back_short_page([])

    data = {'Type of Document': Document_Type}
    data.update(fields)
    return data
def aadhar_front(textdata):
    """Extract name, father's name, date of birth, number and gender from an Aadhaar front.

    Only the lines after the last header line (one mentioning INDIA / GOVERNMENT)
    are considered; if no header line is present the whole text is used.

    Args:
        textdata: list of OCR text lines (strings).

    Returns:
        dict with the keys "Type of Document", "Name", "Father's Name",
        "Date of Birth", "Aadhar No" and "Gender". Missing values are ``'NA'``.
        "Date of Birth" is ``dd/mm/YYYY`` when a full date is printed, otherwise
        the token holding the four-digit year of birth.
    """
    Document_Type = "AADHAR CARD FRONT"
    header_rows = [i for i, s in enumerate(textdata)
                   if 'INDIA' in s or 'GOVERNMENT' in s or 'India' in s]
    start = max(header_rows) + 1 if header_rows else 0
    output = [line.upper() for line in textdata[start:]]

    # The Aadhaar number is a line of exactly three space-separated 4-digit groups.
    aadharno = next(
        (line for line in output if re.search(r'^\d{4}\s\d{4}\s\d{4}$', line)), 'NA')

    # Collect every line containing a digit; the date of birth is searched in there.
    dob = ''.join(' ' + line for line in output if re.match(r'.*[0-9].*', line))
    Date = 'NA'
    match = re.search(r'\d{2}/\d{2}/\d{4}', dob)
    if match:
        try:
            Date = datetime.datetime.strptime(
                match.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
        except ValueError:
            # OCR produced an impossible date; report it verbatim, do not crash.
            Date = match.group()
    else:
        # Year-of-birth cards: take the last 4-digit token printed before the number.
        before_number = dob.split(aadharno)[0] if aadharno != 'NA' else dob
        for token in before_number.split(' '):
            if re.search('[0-9]{4}', token):
                Date = token

    Father_name = 'NA'
    Name = 'NA'
    father_line = next((line for line in output if 'FATHER' in line), None)
    if father_line is not None:
        # "FATHER: <name>" layout: the holder's name is the first line of the body.
        parts = father_line.split(':')
        if len(parts) > 1:
            Father_name = parts[1]
        Name = output[0]
    else:
        # Otherwise the name is the line directly above the date-of-birth line.
        for idx, line in enumerate(output):
            for token in line.split(' '):
                if ((Date != 'NA' and Date in token) or 'BIRTH' in token) and idx > 0:
                    Name = output[idx - 1]

    Gender = 'NA'
    for line in output:
        for token in line.split(' '):
            if 'MALE' in token or 'FEMALE' in token:
                Gender = token

    data = {"Type of Document": Document_Type, "Name": Name, "Father's Name": Father_name,
            "Date of Birth": Date, "Aadhar No": aadharno, "Gender": Gender}
    return data
def aadhar_back(textdata2, polygons_new=None):
    """Extract the address, PIN code and Aadhaar number from an Aadhaar back side.

    Args:
        textdata2: list of OCR text lines (strings); searched for the number and,
            when no polygons are given, for the address as well.
        polygons_new: optional sequence of OCR items where ``item[1]`` is the
            line text and ``item[2]`` its bounding polygon as a flat list of
            coordinates (x at index 0, opposite corner at indices 4 and 5).
            When given, only lines whose left edge is horizontally close to the
            "Address" label are treated as part of the address. When omitted,
            every line from the "Address" label onward is used.

    Returns:
        dict with the keys "Type of Document", "Address", "PIN Code" and
        "Aadhar No". A missing PIN or number is reported as ``'Not Found'``.
    """
    import math  # local import: the module's import block does not provide it

    Document_Type = 'AADHAR CARD BACK'
    pin_pattern = r'\b[1-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\b'

    if polygons_new:
        texts = [item[1] for item in polygons_new]
        boxes = [item[2] for item in polygons_new]
    else:
        texts = list(textdata2)
        boxes = None

    label_rows = [i for i, s in enumerate(texts) if 'Address' in s]
    if not label_rows:
        address_lines = []
    elif boxes is None:
        address_lines = texts[max(label_rows):]
    else:
        first = max(label_rows)
        texts, boxes = texts[first:], boxes[first:]
        label_box = boxes[0]
        # Diagonal of the label's box is the yardstick for "same column".
        dist = math.hypot(label_box[4] - label_box[0], label_box[5] - label_box[1])
        left = label_box[0]
        keep = [0] + [j for j in range(1, len(boxes))
                      if left - (1.3 * dist) <= boxes[j][0] <= left + (3.5 * dist)]
        address_lines = [texts[j] for j in keep]

    add = ' '.join(line.upper() for line in address_lines).strip()

    aadharno = next(
        (line for line in textdata2 if re.search(r'^\d{4}\s\d{4}\s\d{4}$', line)),
        'Not Found')

    # The address ends at the first 6-digit PIN (digits may be space-separated).
    pins = re.findall(pin_pattern, add)
    if pins:
        pinno = re.sub(r'\s{1,}', '', str(pins[0]))
        address = re.split(pin_pattern, add)[0] + ' ' + pinno
    else:
        pinno = 'Not Found'
        address = add
    address = re.sub(r'ADDRESS:*', ' ', address)
    address = re.sub(r'\s{2,}', ' ', address)
    address = address.lstrip()

    data = {"Type of Document": Document_Type, "Address": address,
            "PIN Code": pinno, "Aadhar No": aadharno}
    return data
def pan_card(textdata):
    """Extract name, father's name, date of birth and PAN from PAN card OCR lines.

    Only lines after the last header line (INCOME / Card / INDIA / DEPARTMENT)
    are used; without a header the whole text is used. After labels and the
    PAN itself are removed, the first remaining line is taken as the name and
    the second as the father's name.

    Args:
        textdata: list of OCR text lines (strings).

    Returns:
        dict with the keys "Type of Document", "Name", "Father's Name",
        "Date of Birth" and "PAN Number". Missing values are ``'NA'``.
    """
    Document_Type = "PAN CARD"
    header_rows = [i for i, s in enumerate(textdata)
                   if 'INCOME' in s or 'Card' in s or 'INDIA' in s or 'DEPARTMENT' in s]
    start = max(header_rows) + 1 if header_rows else 0
    output = [line.upper() for line in textdata[start:]]

    # Lines starting with a digit are candidates for the date of birth.
    dob = 'NA' + ''.join(line for line in output if re.match(r"^\d+", line))
    match = re.search(r'\d{2}/\d{2}/\d{4}', dob)
    if match:
        try:
            date = datetime.datetime.strptime(
                match.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
        except ValueError:
            # Impossible calendar date from OCR: keep the raw text.
            date = match.group()
    else:
        date = dob

    label_words = ('INCOME', 'INDIA', 'NAME', 'BIRTH', 'PERMANENT', 'CARD', 'TAX')
    output1 = [line for line in output if not any(word in line for word in label_words)]

    # Prefer the official PAN shape (5 letters, 4 digits, 1 letter); fall back to
    # any 10-character alphanumeric line to tolerate OCR confusions.
    strict = [line for line in output1 if re.search(r'^[A-Z]{5}[0-9]{4}[A-Z]$', line)]
    loose = [line for line in output1 if re.search(r'^[A-Z0-9]{10}$', line)]
    panno = (strict or loose or ['NA'])[0]
    if panno != 'NA':
        output1.remove(panno)

    Name = output1[0] if output1 else 'NA'
    Father_name = output1[1] if len(output1) > 1 else 'NA'
    data = {"Type of Document": Document_Type, "Name": Name, "Father's Name": Father_name,
            "Date of Birth": date, "PAN Number": panno}
    return data
def DL(textdata):
    """Extract the holder details from driving licence OCR lines.

    Lines after the last header line (DRIVING / UNION / INDIAN) form the body;
    its first line is the issuing state. Names are read between the line that
    mentions "Number" and the last "Address"/"Blood" line; the address runs
    from the "Address" label up to the PIN code.

    Only the returned fields are computed. Issue date, validity, blood group
    and vehicle class were never part of the result and are not parsed.

    Args:
        textdata: list of OCR text lines (strings).

    Returns:
        dict with the keys "Type of Document", "Number", "Name", "S/D/W of",
        "Date of Birth", "Address", "State" and "Pinno". Missing values are ``'NA'``.
    """
    Document_Type = "DRIVING LICENCE"
    number_re = re.compile(r'^[\s]*[A-Z]{2}[-]*[\s]*[0-9]{2}[\s]*[0-9]{4}[0-9]{7}$')
    date_line_re = re.compile(r'\d{2}\s*-\d{2}\s*-\d{4}')
    date_token_re = re.compile(r'[A-Za-z0-9]*\d{2}\s*-\d{2}\s*-\d{4}')

    header_rows = [i for i, s in enumerate(textdata)
                   if 'DRIVING' in s or 'UNION' in s or 'INDIAN' in s]
    body = list(textdata[max(header_rows) + 1:]) if header_rows else list(textdata)
    State = body[0].replace("STATE", "").strip() if body else 'NA'
    # Punctuation used as label separators is noise for every later match.
    body = [re.sub(r'[:;.]+', '', line).strip(' ') for line in body]

    # Licence number: two letters then 13 digits, optionally dash/space separated.
    Number = 'NA'
    for line in body:
        for part in line.split(','):
            if number_re.match(part):
                Number = part
    Number = Number.lstrip()

    number_rows = [i for i, s in enumerate(body) if "Number" in s]
    after_number = body[max(number_rows) + 1:] if number_rows else body

    # Name block: between the "Number" label and the last Address/Blood line.
    # NOTE(review): the last such line is used as the boundary, so a blood-group
    # line printed after the address would pull address lines into the name.
    Name = 'NA'
    SName = 'NA'
    stop_rows = [i for i, s in enumerate(after_number) if "Address" in s or "Blood" in s]
    if number_rows and stop_rows:
        name_lines = after_number[:max(stop_rows)]
        for unwanted in ('Name', Number):
            if unwanted in name_lines:
                name_lines.remove(unwanted)
        own_name = []
        for line in name_lines:
            if "S/D/W" in line:
                relation = line.split("S/D/W of")
                if len(relation) > 1:
                    SName = relation[1].strip()
            else:
                own_name.append(line)
        if own_name:
            Name = ' '.join(own_name)

    address_rows = [i for i, s in enumerate(after_number) if "Address" in s]
    tail = after_number[max(address_rows):] if address_rows else []

    # PIN: the last token that ends in six digits not starting with zero.
    pin = 'NA'
    for line in tail:
        for token in line.split(' '):
            if re.match(r'.*[1-9]{1}[0-9]{5}$', token):
                pin = token

    joined = ', '.join(tail)
    if pin != 'NA':
        Address = joined.split(pin)[0] + pin
    elif 'Issued' in joined:
        Address = joined[:joined.find('Issued')]
    else:
        Address = joined
    Address = Address.replace("'", "").replace("[", "").replace(",,", ",")
    Address = Address.replace("Address,", "").replace("Address", "").strip(' ,') or 'NA'

    # Dates appear in the order issued, date of birth, valid till.
    date_tokens = [token
                   for line in tail if date_line_re.search(line) or "Issued" in line
                   for token in line.split(' ') if date_token_re.match(token)]
    Date_of_birth = date_tokens[1] if len(date_tokens) >= 2 else 'NA'

    data = {"Type of Document": Document_Type, "Number": Number, "Name": Name,
            "S/D/W of": SName, "Date of Birth": Date_of_birth, "Address": Address,
            "State": State, "Pinno": pin}
    return data
def _voter_value_after_label(lines, labels, split_token, stop_word=None):
    """Return the value printed after one of ``labels``, or '' when there is none.

    The value is the text after ':' (or after ``split_token``) on the label
    line; when that is blank, the following line is used instead, unless it
    contains ``stop_word`` (i.e. it is already the next label).
    """
    for idx, line in enumerate(lines):
        if not any(label in line for label in labels):
            continue
        if ':' in line:
            value = line.split(':')[1]
        elif split_token in line:
            value = line.split(split_token)[1]
        else:
            value = ''
        value = value.strip()
        if value:
            return value
        if idx + 1 >= len(lines):
            return ''
        following = lines[idx + 1]
        if stop_word is not None and stop_word in following:
            continue
        if ':' in following:
            following = following.split(':')[1]
        return following.strip()
    return ''


def _voter_sex(lines):
    """Return 'MALE', 'FEMALE' or 'Not Found' from the SEX label or a bare gender word."""
    def gender_in(text):
        if 'FEMALE' in text:
            return 'FEMALE'
        if 'MALE' in text:
            return 'MALE'
        return ''

    for idx, line in enumerate(lines):
        if 'SEX' in line:
            following = lines[idx + 1] if idx + 1 < len(lines) else ''
            found = gender_in(line) or gender_in(following)
            if not found and following in ('M', 'F'):
                found = 'MALE' if following == 'M' else 'FEMALE'
        else:
            found = gender_in(line)
        if found:
            return found
    return 'Not Found'


def _voter_date(lines, date_re):
    """Return the first date on, or right after, a line containing DATE."""
    for idx, line in enumerate(lines):
        if 'DATE' not in line:
            continue
        for candidate in lines[idx:idx + 2]:
            found = date_re.search(candidate)
            if found:
                return found[0]
    return 'Not found'


def _voter_pin(lines):
    """Return the PIN printed after a PIN label (same line or the next one)."""
    pin = ''
    for idx, line in enumerate(lines):
        if 'PIN' not in line:
            continue
        value = (line.split(':')[1] if ':' in line else line.split('PIN')[1]).strip()
        if value:
            return value
        if idx + 1 < len(lines):
            found = re.search(r'[0-9\s*]{6,}', lines[idx + 1])
            if found:
                pin = found[0].strip()
    return pin


def _voter_address(lines):
    """Assemble the address from its labelled parts, else from the ADDRESS block."""
    parts = (
        _voter_value_after_label(lines, ('H.NO.',), 'H.NO.', 'MOHALLA'),
        _voter_value_after_label(lines, ('MOHALLA',), 'MOHALLA', 'TOWN'),
        _voter_value_after_label(lines, ('TOWN',), 'TOWN', 'POLICESTN.'),
        _voter_value_after_label(lines, ('POLICE',), 'POLICESTN.', 'DISTT'),
        _voter_value_after_label(lines, ('DISTT',), 'DISTT.', 'PIN'),
        _voter_pin(lines),
    )
    combined = ' '.join(parts)
    # Too short means the labelled parts were mostly missing.
    if len(combined) > 15:
        return ' '.join(combined.split())
    starts = [i for i, s in enumerate(lines) if 'ADDRESS' in s]
    ends = [i for i, s in enumerate(lines) if 'ELECTION' in s]
    if starts and ends:
        return ' '.join(lines[min(starts):min(ends)]) or 'Not Found'
    return 'Not Found'


def _voter_age(lines, date_re):
    """Return the 'AGE AS ON' text or the date of birth, else 'Not Found'."""
    for idx, line in enumerate(lines):
        neighbours = [lines[idx + 1] if idx + 1 < len(lines) else '',
                      lines[idx - 1] if idx > 0 else '']
        if 'AGE AS ON' in line:
            if ':' not in line:
                continue
            if re.search(r'[0-9]{2,3}', line):
                return line
            for other in neighbours:
                if re.search(r'[0-9]{2,3}', other):
                    return line + ' ' + other
        elif 'DATE OF BIRTH' in line:
            for candidate in [line] + neighbours:
                found = date_re.search(candidate)
                if found:
                    return found[0]
    return 'Not Found'


def VoterId(textdata):
    """Extract the fields of a voter ID card from OCR text lines.

    All lines are upper-cased first, so every returned value is upper case.

    Args:
        textdata: list of OCR text lines (strings).

    Returns:
        dict with the keys "Type of Document", "Voter ID Number",
        "Elector's Name", "Father's/Husband's Name", "Sex", "Address",
        "Date of Birth or Age", "Date" and "Place". Missing values are
        ``'Not Found'`` (``'Not found'`` for "Date").
    """
    Document_Type = "Voter ID Card"
    date_re = re.compile(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}')
    lines = [x.upper() for x in textdata]

    # Voter ID number: three letters followed by seven digits at the end of a line.
    id_matches = (re.search(r'[A-Z]{3}[0-9]{7}$', line) for line in lines)
    id_no = next((found[0] for found in id_matches if found), 'Not Found')

    elector_name = _voter_value_after_label(lines, ("ELECTOR'S NAME",), 'NAME') or 'Not Found'
    father_name = _voter_value_after_label(
        lines, ("FATHER'S NAME", "HUSBAND'S NAME"), 'NAME') or 'Not Found'
    place = _voter_value_after_label(lines, ('PLACE',), 'PLACE') or 'Not Found'

    data = {}
    data['Type of Document'] = Document_Type
    data['Voter ID Number'] = id_no
    data["Elector's Name"] = elector_name
    data["Father's/Husband's Name"] = father_name
    data["Sex"] = _voter_sex(lines)
    data["Address"] = _voter_address(lines)
    data["Date of Birth or Age"] = _voter_age(lines, date_re)
    data['Date'] = _voter_date(lines, date_re)
    data["Place"] = place
    return data
def getChromaClient(dbPath):
    """Open (or create) the persistent ChromaDB store located at ``dbPath``.

    Returns the ``chromadb.PersistentClient`` bound to that path.
    """
    return chromadb.PersistentClient(path=dbPath)
def getCollection(collectionName, client):
    """Fetch the collection ``collectionName`` from ``client``, creating it if needed.

    Args:
        collectionName: name of the Chroma collection.
        client: object exposing ``get_or_create_collection(name=..., metadata=...)``.

    Returns:
        The collection object returned by the client.
    """
    # The distance-metric key is "hnsw:space"; the previous spelling ("hsnw:space")
    # was stored as an ordinary metadata entry and never selected cosine distance.
    # NOTE(review): presumably the metric of an already-persisted collection is
    # fixed at creation time, so this only affects newly created collections.
    return client.get_or_create_collection(
        name=collectionName, metadata={"hnsw:space": "cosine"})
def get_embedding(img):
    """Compute the CLIP image embedding for ``img``.

    ``img`` is converted with ``Image.fromarray``, run through the module-level
    ``preprocess`` transform, and encoded by ``model`` on ``device`` with
    gradients disabled. Returns the tensor produced by ``model.encode_image``
    for a batch containing this single image.
    """
    # NOTE(review): main() passes arrays from cv2.imread, which are presumably in
    # BGR channel order while PIL assumes RGB -- confirm before changing, since
    # stored embeddings were produced the same way.
    pil_image = Image.fromarray(img)
    batch = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        return model.encode_image(batch)
def store_embeddings(collection, emb,cat,id):
    """Add one embedding to ``collection`` under ``id``, tagged with category ``cat``.

    Storage is best effort: any exception raised by ``collection.add`` is
    printed and swallowed so a failed insert does not abort the app.

    Args:
        collection: object exposing ``add(embeddings=..., metadatas=..., ids=...)``.
        emb: the embedding(s) to store, passed through unchanged.
        cat: category label stored in the metadata.
        id: unique identifier for the record.
    """
    try:
        collection.add(
            embeddings=emb,
            metadatas=[{"category": cat}],
            ids=[id],
        )
    except Exception as e:
        # The former bare ``exit`` expression here did nothing and was removed.
        print(e)
def get_category(collection,emb):
    """Return the category of the stored embedding nearest to ``emb``.

    Args:
        collection: object exposing ``query(query_embeddings=..., n_results=...)``
            and returning a mapping with a "metadatas" entry.
        emb: query embedding(s), passed through unchanged.

    Returns:
        The "category" metadata of the single nearest neighbour, or ``None``
        when the collection returns no neighbour (e.g. nothing trained yet).
    """
    nearest = collection.query(query_embeddings=emb, n_results=1)
    metadatas = nearest["metadatas"]
    # An empty collection yields no hits; report "unknown", do not crash.
    if not metadatas or not metadatas[0]:
        return None
    return metadatas[0][0]["category"]
def train_classifier(coll,emb,cat):
    """Store ``emb`` in ``coll`` as a new example of category ``cat``.

    A random UUID4 string is generated as the record id and the insert is
    delegated to ``store_embeddings``.
    """
    record_id = str(uuid.uuid4())
    store_embeddings(coll, emb, cat, record_id)
def save_uploadedfile(uploadedfile):
    """Write an uploaded file into the current working directory.

    Args:
        uploadedfile: object with a ``name`` attribute and a ``getbuffer()``
            method returning the file's bytes.

    The file is stored under the base name of ``uploadedfile.name`` only: the
    name comes from the client, so any directory components are discarded to
    keep the write inside the working directory.
    """
    target = os.path.basename(uploadedfile.name)
    with open(target, "wb") as f:
        f.write(uploadedfile.getbuffer())
def main():
    """Streamlit entry point: upload a document, then classify/extract it or train on it.

    Flow:
      1. The uploaded image/PDF is saved locally and mirrored to Azure Blob Storage.
      2. For a PDF, every page is rendered and classified, and the user picks
         the page to process.
      3. In "Test" mode the page is classified (the user may override the
         category), OCR is run, and the matching extractor's fields are shown.
      4. In "Train" mode the page embedding is stored under the chosen category
         once the user confirms with the button.
    """
    categories = ('passport_front', 'passport_back', 'aadhar_front', 'aadhar_back',
                  'PAN_front', 'DL', 'voterid')
    extractors = {
        'passport_front': Passport_front,
        'passport_back': Passport_back,
        'aadhar_front': aadhar_front,
        'aadhar_back': aadhar_back,
        'PAN_front': pan_card,
        'DL': DL,
        'voterid': VoterId,
    }

    # Delete all the items in session state. Iterate over a snapshot of the keys:
    # deleting while iterating the live view raises RuntimeError.
    for key in list(st.session_state.keys()):
        del st.session_state[key]

    client = getChromaClient(db_path)
    coll = getCollection('visaDocImgs', client)
    st.sidebar.title("Choose Mode")
    mode = st.sidebar.selectbox("", ("Train", "Test"))
    st.write("Upload an image or pdf file")
    uploaded_file = st.file_uploader("File upload", type=['png', 'jpg', 'jpeg', 'pdf'])
    if uploaded_file is None:
        return

    save_uploadedfile(uploaded_file)
    # save_uploadedfile() writes into the working directory, so read from there too
    # (the file was previously looked up under "/", where it was never written).
    image_name = os.path.basename(uploaded_file.name)
    img_path = image_name
    upload_to_azure(img_path, image_name)

    # PDF file handling: render each page, classify it, let the user choose one.
    if image_name.lower().endswith('.pdf'):
        images = pdf_to_jpeg(img_path)
        st.write("You have uploaded a pdf file with " + str(len(images)) + " pages.")
        stem = image_name.split(".")[0]
        pages = []
        for page_no, page in enumerate(images):
            page_name = stem + "_page_" + str(page_no) + ".jpeg"
            page.save(page_name)
            page_emb = get_embedding(cv2.imread(page_name)).tolist()
            page_cat = get_category(coll, page_emb)
            st.image(page, caption=page_cat)
            pages.append((page_name, page_cat))
        if not pages:
            st.write("The pdf file has no pages to process.")
            st.stop()
        # A list keeps the pages in document order (a set does not).
        pdf_pg_sel = st.selectbox('which page to process?', [name for name, _ in pages])
        new_cat = dict(pages)[pdf_pg_sel]
        if new_cat not in categories:
            st.write("This category is not configured for extraction yet!")
            st.stop()
        img_path = pdf_pg_sel

    # Process single image file
    im = Image.open(img_path)
    st.image(im, caption="Image being processed")
    emb = get_embedding(cv2.imread(img_path)).tolist()

    if mode == 'Test':
        option = get_category(coll, emb)
        st.write("Category -- >", option)
        cor_cat = st.selectbox('is this correct classification?', ('y', 'n'))
        st.write('You selected:', cor_cat)
        if cor_cat == 'n':
            option = st.selectbox('What is the correct classification?', categories)
            st.write('You selected:', option)

        extractor = extractors.get(option)
        if extractor is None:
            # Nothing to show: previously this fell through to st.table(res) and
            # failed because no result had been produced.
            st.write("Couldn't classify")
            return

        x = Azure_ocr_sdk(img_path)
        try:
            res = extractor(x)
        except Exception as exc:
            # The extractors are heuristic; surface the failure in the UI so one
            # unexpected OCR layout does not crash the whole app.
            st.error("Extraction failed for " + str(option) + ": " + str(exc))
            return
        st.header("Extracted text")
        st.table(res)
    else:
        new_cat = st.selectbox('What is the correct classification?', categories)
        st.write('You selected:', new_cat)
        # Streamlit reruns the script on every interaction; without an explicit
        # confirmation an embedding was stored on each rerun, including one for
        # the default category before the user had chosen anything.
        if st.button('Add to training set'):
            train_classifier(coll, emb, new_cat)
            st.write('Stored a training example for:', new_cat)
# Run the app only when this file is the launched script, not when it is imported.
if __name__ == "__main__":
    main()