Spaces:

Sukritgoel
/

nuronv2

Sleeping

App Files Files Community

nuronv2 / app.py

Sukritgoel

Update app.py

2741fde over 2 years ago

raw

history blame contribute delete

43.1 kB


	import streamlit as st
	import os
	import sys
	import re
	from azure.storage.blob import BlobServiceClient
	from PIL import Image
	import cv2
	import datetime
	import clip
	import torch
	import chromadb
	import uuid
	import pdf2image
	# import pandas as pd

	from azure.cognitiveservices.vision.computervision import ComputerVisionClient
	from msrest.authentication import CognitiveServicesCredentials
	from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes

	# ---- Module-level setup (runs once per script execution) ----
	#For generating CLIP embeddings
	# Use the GPU when torch reports one, otherwise fall back to CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	# clip.load returns the model plus the matching image preprocessing transform;
	# both are used by get_embedding().
	model, preprocess = clip.load("ViT-B/32", device=device)

	#File locations
	# db_path = r'C:\Users\sukri\OneDrive\FantaClaus\NuronAI\experiments\indpass'
	# Directory of the launched script (sys.argv[0]); main() passes it to
	# getChromaClient() as the persistent ChromaDB location.
	db_path = os.path.dirname(os.path.abspath(sys.argv[0]))

	# Azure storage details
	# Connection string and container name come from Streamlit secrets
	# ("conn_str" / "cont_name"); a missing secret fails here at import time.
	connect_str = st.secrets["conn_str"]
	container_name = st.secrets["cont_name"]
	# Shared client used by upload_to_azure().
	blob_service_client = BlobServiceClient.from_connection_string(connect_str)

	def pdf_to_jpeg(pdf_file):
	    """Render the PDF at ``pdf_file`` into one image per page.

	    The work is delegated to ``pdf2image.convert_from_path`` and its result
	    is returned unchanged (a list with one entry per page).
	    """
	    page_images = pdf2image.convert_from_path(pdf_file)
	    return page_images

	def upload_to_azure(image_path,file_name):
	    """Upload the local file ``image_path`` to the configured container as ``file_name``.

	    Uses the module-level ``blob_service_client`` / ``container_name`` and
	    overwrites any existing blob with the same name.
	    """
	    target_blob = blob_service_client.get_blob_client(container_name, file_name)
	    with open(image_path, "rb") as payload:
	        target_blob.upload_blob(payload, overwrite=True)

	def Azure_ocr_sdk(image_path):
	    """Run the Azure Computer Vision Read API on an image and return its text lines.

	    Args:
	        image_path: Path of a local image file to send to the service.

	    Returns:
	        A list with the text of every recognised line, in the order the
	        service reports them. The list is empty when the operation does not
	        finish with status ``succeeded``.
	    """
	    import time

	    # NOTE(security): this credential is committed in source. It is kept only
	    # as a fallback so existing deployments keep working; prefer setting the
	    # AZURE_CV_KEY environment variable, then rotate and remove the literal.
	    subscription_key = os.environ.get("AZURE_CV_KEY") or "82e0c013e94849f7ab5bfc8c7c5e54c8"
	    endpoint = "https://docaicomputervisionocr1.cognitiveservices.azure.com/"
	    cv_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

	    # Submit the image; the context manager closes the file handle, which the
	    # previous inline open() never did.
	    with open(image_path, 'rb') as image_stream:
	        response = cv_client.read_in_stream(image_stream, raw=True, language='en')

	    # The operation id is the last path segment of the Operation-Location header.
	    operation_id = response.headers['Operation-Location'].split('/')[-1]

	    # Poll until the operation leaves both pending states. Treating only
	    # "running" as pending returned an empty result whenever the first poll
	    # still said "notStarted"; sleeping avoids hammering the service.
	    pending = (OperationStatusCodes.not_started, OperationStatusCodes.running)
	    result = cv_client.get_read_result(operation_id)
	    while result.status in pending:
	        time.sleep(1)
	        result = cv_client.get_read_result(operation_id)

	    text_blob = []
	    if result.status == OperationStatusCodes.succeeded:
	        for page in result.analyze_result.read_results:
	            text_blob.extend(line.text for line in page.lines)
	    return text_blob

	def Passport_front(textdata):
	# Heuristically parse the OCR lines of a passport front page into a dict of fields.
	# textdata: iterable of text lines (each must support .upper()).
	# Returns the dict built at the end of the function; fields that could not be
	# determined are left as None (MRZ is '' when no line contains '<<').
	# NOTE(review): the nesting of this function was lost in this copy of the file
	# (every statement sits at the same column); the comments below describe the
	# statements as written and hedge wherever the pairing of else/except is unclear.
	Document_Type="PASSPORT DOCUMENT"
	DateofBirth = None
	Type = None
	Countrycode = None
	passportno = None
	Surname = None
	Firstname = None
	Nationality = None
	Sex = None
	PlaceofBirth = None
	PlaceofIssue = None
	DateofIssue = None
	DateofExpiry = None
	MRZ = None

	# Upper-case every OCR line.
	output2=[]
	for w in textdata:
	output2.append(w.upper())

	# Drop lines that contain any of the printed field labels.
	output3=[]
	for x in output2:
	if 'REPUBLIC' not in x and 'TYPE' not in x and 'GIVEN' not in x and 'COUNTRY' not in x and 'PASSPORT' not in x and 'NO.' not in x and 'NAME' not in x and 'NATIONALITY' not in x and 'SEX' not in x and 'DATE' not in x and 'PLACE' not in x:
	output3.append(x)
	# Collect lines that look like dates.
	# NOTE(review): as written the pattern requires a whitespace character before
	# each '/', so a plain '01/02/2000' does not match — confirm the intended pattern.
	dates=[]
	for w in output3:
	match = re.search('\d{2}\s/\d{2}\s/\d{4}', w)
	if match:
	dates.append(w)
	# With exactly two dates whose years differ by 10 they are taken as
	# issue/expiry; otherwise the dates are read as birth, issue, expiry in order.
	# NOTE(review): which `if` the `else` below belongs to cannot be told from this copy.
	if len(dates)==2:
	caldate=int(dates[1].split('/')[-1])-int(dates[0].split('/')[-1])
	if caldate==10:
	DateofBirth=None
	try:
	DateofIssue=dates[0]
	except:
	DateofIssue=None
	try:
	DateofExpiry=dates[1]
	except:
	DateofExpiry=None
	else:
	try:
	DateofBirth=dates[0]
	except:
	DateofBirth=None

	try:
	DateofIssue=dates[1]
	except:
	DateofIssue=None

	try:
	DateofExpiry=dates[2]
	except:
	DateofExpiry=None


	# Remove the date lines from the working list.
	# NOTE(review): `None in x` raises TypeError when a date is still None, which the
	# bare except turns into print(None); removing while iterating also skips items.
	try:
	for x in output3:
	if DateofBirth in x or DateofIssue in x or DateofExpiry in x:
	output3.remove(x)

	except:
	print(None)



	# Single-letter lines: the first is used as the passport type, M/F as the sex.
	single=[]
	for w in output3:
	if re.match(r'^[A-Z]{1}$',w):
	single+=w



	try:
	Type=single[0]
	except:
	Type=None

	# NOTE(review): the character class [M,F] also accepts a comma.
	try:
	Sex=''.join([i for i in single if re.match(r'[M,F]$',i)])

	except:
	Sex=None


	for w in single:
	output3.remove(w)
	# Drop 'X / Y' style lines (pattern needs whitespace on both sides of '/').
	for w in output3:
	if re.match('[A-Z]\s/\s[A-Z]',w):
	output3.remove(w)
	# Passport number: one letter, one whitespace character, seven digits.
	passportno=None
	for w in output3:
	if re.search(r'^[A-Z]\s[0-9]{7}$', w):
	passportno=w
	break

	else:
	passportno=None

	try:
	output3.remove(passportno)
	except:
	print('None')

	# NOTE(review): `'IND' or 'INDIAN' in x` is always truthy (the non-empty string
	# 'IND' is evaluated on its own), so the else branch can never run.
	for x in output3:
	if 'BIRTH' in x:
	output3.remove(x)
	if 'IND' or 'INDIAN' in x:
	Nationality='INDIAN'
	Countrycode='IND'
	else:
	Nationality=None
	Countrycode=None
	# Concatenate every line containing '<<' as the machine-readable zone.
	MRZ=''
	for w in output3:
	if '<<' in w:
	MRZ+=w
	# Keep only the lines after the last line starting with 'IND' (see patterns below).
	try:
	indices=[i for i,s in enumerate(output3) if re.match(r'IND$',s)]
	if indices:
	z=max(indices)
	output4=output3[z+1:]
	else:
	output4=[]
	except:
	indices=[i for i,s in enumerate(output3) if re.match(r'IND',s)]
	if indices:
	z=max(indices)
	output4=output3[z+1:]
	else:
	output4=[]
	# NOTE(review): removes from output4 while iterating it, and may call remove()
	# for the same item more than once (ValueError) when several conditions match.
	for i in output4:
	if 'INDIAN' in i:
	output4.remove(i)
	if 'OF BIRTH' in i:
	output4.remove(i)
	if re.search(r'\d',i):
	output4.remove(i)

	# The first four remaining lines are taken, in order, as surname, given name,
	# place of birth and place of issue; fewer than four lines resets all to None.
	try:

	Sname=output4[0]
	Name=output4[1]
	Pob=output4[2]
	Poi=output4[3]
	except:

	Sname=None
	Name=None
	Pob=None
	Poi=None


	strings={ "Surname": Sname,
	"FirstName": Name,
	"PlaceofBirth": Pob,
	"PlaceofIssue": Poi}

	values=list(strings.values())
	strings2=[]

	# Strip every non A-Z character from each space-separated word.
	# NOTE(review): r'^[0-9]*' matches the empty prefix of any string, so the
	# condition is always true; a None value raises and empties strings2.
	try:
	for w in values:
	string1=[]
	for w1 in w.split(' '):
	if re.match(r'^[0-9]*',w1):
	string1.append(re.sub(r'[^A-Z]','',w1))
	strings2.append(string1)
	except:
	strings2=[]

	# Re-join multi-word values with spaces; single-word values stay as lists.
	strings3=[]
	for w in strings2:
	if len(w)>=2:
	strings3.append(" ".join(map(str,w)))
	else:
	strings3.append(w)


	try:
	str1 = ''.join(str(e) for e in strings3[0])
	# print(str1)
	Surname=str1
	str2 = ''.join(str(e) for e in strings3[1])
	# print(str2)
	Firstname=str2
	str3 = ''.join(str(e) for e in strings3[2])
	# print(str3)
	PlaceofBirth=str3
	str4 = ''.join(str(e) for e in strings3[3])
	# print(str4)
	PlaceofIssue=str4

	except:

	Surname=None
	Firstname=None
	PlaceofBirth=None
	PlaceofIssue=None




	# Assemble the result returned to the caller.
	data={"Type": Document_Type,"Type of Passport":Type,"Countrycode": Countrycode,"Passportno": passportno,"Surname": Surname,"Firstname": Firstname,"Nationality": Nationality,"Sex": Sex,"Date of Birth": DateofBirth,"Place of Birth": PlaceofBirth,"Place of Issue": PlaceofIssue,"Date of Issue": DateofIssue,"Date of Expiry": DateofExpiry,"MRZ": MRZ}
	return data

	def Passport_back(text):
	# Heuristically parse the OCR lines of a passport back page into a dict with
	# parent/spouse names, address, old passport number, date/place of issue and
	# file number. Values that cannot be found are set to 'Not Found'.
	# text: list of OCR text lines (list methods such as .index() are used on it).
	# NOTE(review): the nesting of this function was lost in this copy of the file;
	# comments describe the statements as written and hedge where pairing is unclear.
	Document_Type="PASSPORT BACK"
	data={}
	old_passno=''
	dateplace_issue=''
	Father_name=''
	mother_name=''
	spouse=''
	# Work only on the lines after the first one containing 'CAUTION', when present.
	# NOTE(review): the `else` presumably belongs to the `for` (no CAUTION line
	# found -> use every line) — confirm against the original indentation.
	for x in text:
	if 'CAUTION' in x:
	index=text.index(x)
	textdata=text[index+1:]
	break
	else:
	textdata=text
	# Debug output of the working lines.
	print(len(textdata),textdata)
	try:
	# Long pages (9+ lines): locate values relative to their printed labels.
	if len(textdata)>=9:
	for x in textdata:
	# The value is taken from the line following the label line.
	if 'FATHER' in x or 'LEGAL GUARDIAN' in x:
	index1=textdata.index(x)
	Father_name=textdata[index1+1]
	if 'MOTHER' in x or 'NAME OF MOT' in x:
	index2=textdata.index(x)
	mother_name=textdata[index2+1]
	if 'SPOUSE' in x:
	index3=textdata.index(x)
	spouse=textdata[index3+1]
	# Address: the lines between the last 'NAME' label (plus its value line) and
	# the first 'OLD PASSPORT NO' / 'PLACE OF ISSUE' label, joined without separator.
	try:
	indices1=[i for i, s in enumerate(textdata) if 'NAME' in s]
	z1=max(indices1)
	indices2=[i for i, s in enumerate(textdata) if 'OLD PASSPORT NO' in s or 'PLACE OF ISSUE' in s]
	z2=min(indices2)
	if 'ADDRESS' in textdata[z1+2]:
	address=''.join(textdata[z1+3:z2])
	else:
	address=''.join(textdata[z1+2:z2])

	except Exception as e:
	print(e)
	address='Not Found'
	# File number: the line after a 'FILE NO' label, else one of the last two
	# lines if it contains a digit.
	# NOTE(review): `x` here is whatever the loop variable last held; if none of
	# the branches assigns File_no it stays undefined for the code further down.
	try:
	if 'FILE NO' in x:
	index4=textdata.index(x)
	File_no=textdata[index4+1]
	else:
	if re.search(r'\d',textdata[-1]):
	File_no=textdata[-1]
	else:
	if re.search(r'\d',textdata[-2]):
	File_no=textdata[-2]
	except:
	File_no='Not Found'
	# Old passport number: first line after the last 'OLD PASSPORT NO' label that
	# starts with a letter followed by seven digits (the final line is excluded).
	try:
	indices3=[i for i, s in enumerate(textdata) if 'OLD PASSPORT NO' in s]
	z3=max(indices3)
	textdata2=textdata[z3+1:-1]
	old_passno=[w for w in textdata2 if re.match(r'[A-Z]{1}[0-9]{7}',w)][0]

	except:
	old_passno='Not Found'
	# Date/place of issue: the lines after the last 'PLACE OF ISSUE' label, minus
	# the old passport number and the file number.
	# NOTE(review): when the result is the string 'Not Found' the loop below
	# iterates its characters, and str has no .remove(); list items are also
	# removed while iterating.
	try:
	indices4=[i for i, s in enumerate(textdata) if 'PLACE OF ISSUE' in s]
	z4=max(indices4)
	if 'FILE NO' not in textdata[z4+1]:
	if 'FILE NO' in textdata[-2]:
	dateplace_issue=textdata[z4+1:-2]
	else:
	dateplace_issue=textdata[z4+1:-1]
	else:
	dateplace_issue='Not Found'
	for w in dateplace_issue:
	if old_passno in w or File_no in w:
	dateplace_issue.remove(w)
	except:
	dateplace_issue='Not Found'
	else:
	# Short pages (8 lines or fewer): no labels are expected; the first three
	# lines that look like multi-word names without digits are taken positionally.
	if len(textdata)<=8:
	textdata1_1=[]
	print(textdata)
	for i in range(0,3):
	if re.match(r'[A-Z]+\s[A-Z]',textdata[i]) and not re.search(r'\d',textdata[i]):
	textdata1_1.append(textdata[i])
	# Three names -> father, mother, spouse.
	if len(textdata1_1)==3:
	Father_name=textdata1_1[0]
	mother_name=textdata1_1[1]
	spouse=textdata1_1[2]
	address=' '.join(textdata[3:5])
	File_no=textdata[-1]
	else:
	# Two names -> father and mother only.
	if len(textdata1_1)==2:
	Father_name=textdata1_1[0]
	mother_name=textdata1_1[1]
	spouse='Not Found'
	File_no='Not Found'
	dateplace_issue='Not Found'
	old_passno='Not Found'
	address=' '.join(textdata[3:5])
	File_no=textdata[-1]
	else:
	# One name -> father only (mother/spouse are assigned twice below).
	if len(textdata1_1)==1:
	Father_name=textdata1_1[0]
	mother_name='Not Found'
	spouse='Not Found'
	mother_name='Not Found'
	spouse='Not Found'
	address='Not Found'
	File_no='Not Found'
	dateplace_issue='Not Found'
	old_passno='Not Found'
	# No usable name lines at all.
	if textdata1_1==[]:
	Father_name='Not Found'
	mother_name='Not Found'
	spouse='Not Found'
	address='Not Found'
	File_no='Not Found'
	dateplace_issue='Not Found'
	old_passno='Not Found'

	# Any unexpected error resets every field to 'Not Found'.
	except Exception as e:
	print(e)
	data['Type of Document']=Document_Type
	Father_name='Not Found'
	mother_name='Not Found'
	spouse='Not Found'
	address='Not Found'
	File_no='Not Found'
	dateplace_issue='Not Found'
	old_passno='Not Found'

	# Assemble the result returned to the caller.
	data['Type of Document']=Document_Type
	data["Name of Father/Legal Guardian"]=Father_name
	data["Name of Mother"]=mother_name
	data["Name of Spouse"]=spouse
	data["Address"]=address
	data["Old Passport No"]=old_passno
	data['Date and Place of Issue']=dateplace_issue
	data["File No"]=File_no

	return data

	def aadhar_front(textdata):
	    """Extract the holder details from the OCR lines of an Aadhaar card front.

	    Args:
	        textdata: List of OCR text lines, top to bottom.

	    Returns:
	        dict with the keys "Type of Document", "Name", "Father's Name",
	        "Date of Birth", "Aadhar No" and "Gender". Values that cannot be
	        determined are the string 'NA'.
	    """
	    Document_Type = "AADHAR CARD FRONT"

	    # Everything up to and including the last header line is ignored. When no
	    # header is recognised the whole text is used instead of raising.
	    header_rows = [i for i, s in enumerate(textdata) if 'INDIA' in s or 'GOVERNMENT' in s or 'India' in s]
	    first_row = max(header_rows) + 1 if header_rows else 0
	    output = [line.upper() for line in textdata[first_row:]]

	    # Aadhaar number: a line made of three 4-digit groups (last match wins).
	    aadharno = 'NA'
	    for line in output:
	        if re.search(r'^\d{4}\s\d{4}\s\d{4}$', line):
	            aadharno = line

	    # Gather every line containing a digit; the birth date/year is searched in it.
	    # (The previous pattern only accepted a digit in the second position, so
	    # lines such as "DOB: 01/01/1990" were never considered.)
	    dob = ''.join(' ' + line for line in output if re.search(r'[0-9]', line))

	    Date = 'NA'
	    match = re.search(r'\d{2}/\d{2}/\d{4}', dob)
	    if match:
	        try:
	            Date = datetime.datetime.strptime(match.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
	        except ValueError:
	            # Looks like a date but is not a valid calendar day (OCR noise):
	            # keep the raw text rather than crash.
	            Date = match.group()
	    else:
	        # Year-of-birth cards: take the last 4-digit token that appears before
	        # the Aadhaar number.
	        before_number = dob.split(aadharno)[0] if aadharno != 'NA' else dob
	        for token in before_number.split(' '):
	            if re.search(r'[0-9]{4}', token):
	                Date = token

	    Father_name = 'NA'
	    Name = 'NA'
	    father_line = next((line for line in output if 'FATHER' in line), None)
	    if father_line is not None:
	        # "FATHER: <name>" layout: the holder's name is the first line.
	        if ':' in father_line:
	            Father_name = father_line.split(':')[1].strip()
	        Name = output[0]
	    else:
	        # Otherwise the name is the line printed directly above the birth line.
	        # The 'NA' placeholder is never used as a search key: it would match
	        # inside ordinary words such as "KRISHNA".
	        for idx, line in enumerate(output):
	            tokens = line.split(' ')
	            if any('BIRTH' in t or (Date != 'NA' and Date in t) for t in tokens):
	                if idx > 0:
	                    Name = output[idx - 1]
	                break

	    # 'MALE' is a substring of 'FEMALE', so one test covers both; last match wins.
	    Gender = 'NA'
	    for line in output:
	        for token in line.split(' '):
	            if 'MALE' in token:
	                Gender = token

	    data = {"Type of Document": Document_Type,"Name":Name,"Father's Name":Father_name,"Date of Birth": Date,"Aadhar No": aadharno,"Gender":Gender}
	    return data

	def aadhar_back(textdata2, polygons_new=None):
	    """Extract the address, PIN code and Aadhaar number from an Aadhaar card back.

	    Args:
	        textdata2: List of OCR text lines; used to find the Aadhaar number and,
	            when no polygons are supplied, the address text as well.
	        polygons_new: Optional sequence of OCR items where ``item[1]`` is the
	            line text and ``item[2]`` its bounding polygon as a flat sequence
	            (indices 0/1 and 4/5 are read as opposite corners). When given,
	            only lines horizontally aligned with the 'Address' label are kept.
	            Defaults to None so callers that only have text can still call this.

	    Returns:
	        dict with the keys "Type of Document", "Address", "PIN Code" and
	        "Aadhar No"; missing values are the string 'Not Found'.
	    """
	    import math

	    Document_Type = 'AADHAR CARD BACK'

	    # Aadhaar number: first line made of three 4-digit groups.
	    aadharno = 'Not Found'
	    for line in textdata2:
	        if re.search(r'^\d{4}\s\d{4}\s\d{4}$', line):
	            aadharno = line
	            break

	    if polygons_new:
	        texts = [item[1] for item in polygons_new]
	        boxes = [item[2] for item in polygons_new]
	    else:
	        texts = list(textdata2)
	        boxes = None

	    # The address block starts at the last line containing the 'Address' label.
	    anchor_rows = [i for i, s in enumerate(texts) if 'Address' in s]
	    if not anchor_rows:
	        return {"Type of Document": Document_Type,"Address":'Not Found',"PIN Code":'Not Found', "Aadhar No": aadharno}
	    start = max(anchor_rows)
	    text1 = texts[start:]

	    if boxes is not None:
	        boxes1 = boxes[start:]
	        anchor = boxes1[0]
	        # Diagonal of the label's box is the unit for the alignment window.
	        dist = math.hypot(anchor[4] - anchor[0], anchor[5] - anchor[1])
	        left = anchor[0]
	        # Keep the label line plus every later line whose left edge lies within
	        # [left - 1.3*dist, left + 3.5*dist].
	        keep = [0] + [
	            j for j in range(1, len(boxes1))
	            if left - (1.3 * dist) <= boxes1[j][0] <= left + (3.5 * dist)
	        ]
	        text1 = [text1[j] for j in keep]

	    add = ' '.join(t.upper() for t in text1).strip()

	    # Six-digit PIN, optionally with spaces between the digits. The earlier
	    # pattern demanded whitespace between most digits, so "110001" never matched.
	    found = re.search(r'\b[1-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\b', add)
	    if found:
	        pinno = re.sub(r'\s{1,}', '', found.group())
	        # The address is everything before the PIN, with the compact PIN appended.
	        address = add[:found.start()] + ' ' + pinno
	    else:
	        pinno = 'Not Found'
	        address = add

	    address=re.sub(r'ADDRESS:*',' ',address)
	    address=re.sub(r'\s{2,}',' ',address)
	    address=address.lstrip()

	    data = {"Type of Document": Document_Type,"Address":address,"PIN Code":pinno, "Aadhar No": aadharno}

	    return data

	def pan_card(textdata):
	    """Extract name, father's name, date of birth and PAN from PAN card OCR lines.

	    Args:
	        textdata: List of OCR text lines, top to bottom.

	    Returns:
	        dict with the keys "Type of Document", "Name", "Father's Name",
	        "Date of Birth" and "PAN Number". Values that cannot be determined
	        are the string 'NA'.
	    """
	    Document_Type = "PAN CARD"

	    # Skip everything up to the last header line; with no header use all lines
	    # instead of raising on max([]).
	    header_rows = [i for i, s in enumerate(textdata) if 'INCOME' in s or 'Card' in s or 'INDIA' in s or 'DEPARTMENT' in s]
	    first_row = max(header_rows) + 1 if header_rows else 0
	    output = [line.upper() for line in textdata[first_row:]]

	    # Lines that start with a digit are date candidates. The accumulator starts
	    # empty so the raw fallback is no longer prefixed with 'NA'.
	    dob = ''.join(line for line in output if re.match(r'\d', line))
	    match = re.search(r'\d{2}/\d{2}/\d{4}', dob)
	    if match:
	        try:
	            date = datetime.datetime.strptime(match.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
	        except ValueError:
	            # Not a valid calendar day (OCR noise): keep the raw text.
	            date = match.group()
	    else:
	        date = dob or 'NA'

	    # Remove the printed labels.
	    label_words = ('INCOME', 'INDIA', 'NAME', 'BIRTH', 'PERMANENT', 'CARD', 'TAX')
	    output1 = [line for line in output if not any(word in line for word in label_words)]

	    # PAN: a line of exactly ten upper-case letters/digits (last match wins).
	    panno = 'NA'
	    for line in output1:
	        if re.search(r'^[A-Z0-9]{10}$', line):
	            panno = line

	    # What is left, minus the PAN and the date lines, is read as holder name
	    # followed by father's name. Built as a new list rather than removing
	    # from output1 while iterating it.
	    people = [line for line in output1 if line != panno and not re.match(r'\d', line)]
	    Name = people[0] if people else 'NA'
	    Father_name = people[1] if len(people) > 1 else 'NA'

	    data = { "Type of Document": Document_Type, "Name":Name,"Father's Name": Father_name,"Date of Birth": date,"PAN Number":panno}
	    return data

	def DL(textdata):
	# Heuristically parse the OCR lines of a driving licence into a dict with
	# licence number, name, relative's name, date of birth, address, state and PIN.
	# textdata: list of OCR text lines.
	# NOTE(review): the nesting of this function was lost in this copy of the file;
	# comments describe the statements as written. Every `max(indices)` below
	# raises ValueError when the searched label is absent.

	Document_Type="DRIVING LICENCE"
	# Work on the lines after the last header line; the first of them is the state.
	indices=[i for i, s in enumerate(textdata) if 'DRIVING' in s or 'UNION' in s or 'INDIAN' in s]
	z=max(indices)
	text=textdata[z+1:]
	State=text[0]
	State=State.replace("STATE","")
	# Strip ':', ';' and '.' from every line, then surrounding spaces.
	text=[re.sub('[:;.]+', '', _) for _ in text]
	text=[x.strip(' ') for x in text]
	# Licence number: searched in the comma-separated str() rendering of the list.
	Number='NA'
	Number_text=str(text)
	Number_text=Number_text.replace("'","")
	#print(Number_text)
	for x in Number_text.split(','):
	#print(x)
	if re.match(r'^[\s][A-Z]{2}[-][\s][0-9]{2}[\s][0-9]{4}[0-9]{7}$',x):
	Number=x

	# Continue with the lines after the last one containing "Number".
	indices=[i for i, s in enumerate(text) if "Number" in s ]
	z=max(indices)
	text1=text[z+1:]

	AName=[]
	Name='NA'
	SName='NA'
	# Name lines are those before the last "Address"/"Blood" line.
	# NOTE(review): Name2.remove('Name') raises ValueError if no line equals 'Name'.
	indices=[i for i, s in enumerate(text1) if "Address" in s or "Blood" in s]
	z=max(indices)
	Name2=text1[:z]
	Name2.remove('Name')
	for w in Name2:
	AName.append(w)

	Number=Number.lstrip()
	for x in AName:
	# print(x)
	if Number==x:
	AName.remove(x)
	break

	# NOTE(review): Name starts as 'NA' and is appended to, so any extracted name
	# is prefixed with 'NA' (e.g. 'NAJOHN DOE ').
	for x in AName:
	if "S/D/W" in x:
	SName=x.split("S/D/W of")[1]
	else:
	Name+= x +' '
	indices=[i for i, s in enumerate(text1) if "Address" in s]
	z=max(indices)
	text2=text1[z:]

	# PIN: last space-separated token ending in six digits (first digit 1-9).
	pin='NA'
	for x in text2:
	for y in x.split(' '):
	if re.match(r'.*[1-9]{1}[0-9]{5}$',y):
	pin=y

	# Address: the str() rendering of the address lines up to the PIN.
	# NOTE(review): pin defaults to 'NA', so `pin!=''` is always true and the
	# else branch below is unreachable.
	text3=str(text2)
	if pin!='':
	pinno=str(pin)
	Address="" .join(text3.split(pinno,2)[:1])
	Address=Address+pinno
	Address=Address.replace("'", "")
	Address=Address.replace("[", "")
	Address=Address.replace(",,", ",")
	Address=Address.replace("Address,","")
	else:
	Address=text3[:text3.find("Issued")]
	Address=Address.replace("[", "")
	Address=Address.replace(",,", ",")
	Address=Address.replace("Address","")

	# Collect lines holding dates or the word "Issued".
	# NOTE(review): as written the pattern requires a whitespace character before
	# each '-', so '01-02-2000' does not match — confirm the intended pattern.
	dates=[]
	for w in text2:
	#print(w)
	match = re.search('\d{2}\s-\d{2}\s-\d{4}', w)
	if match or "Issued" in w:
	dates.append(w)

	# NOTE(review): tokens come from split(' ') and therefore contain no spaces,
	# yet the pattern below requires one before the first '-'.
	Final_dates=[]
	for w in dates:
	for w1 in w.split(' '):
	if re.match('[A-Za-z0-9]\d{2}\s-\d{2}\s*-\d{4}', w1):
	Final_dates.append(w1)

	Dob='NA'
	Valid_till='NA'
	Date_of_birth='NA'
	Issued_date='NA'
	# Three or more dates are read as issued, birth, valid-till in that order.
	if len(Final_dates)>=3:
	Issued_date=Final_dates[0]
	Dob=Final_dates[1]
	Dob=Dob.split(' ')
	Date_of_birth=Dob[0]
	Valid_till=Final_dates[2]

	else:
	# NOTE(review): `Final_dates==2` compares a list with an int and is always
	# False; the following else indexes Final_dates[0], which raises IndexError
	# when no date was found.
	if Final_dates==2:
	Issued_date=dates[0]
	Dob=dates[1]
	else:
	Issued_date=Final_dates[0]


	# Remove every letter from the issued date.
	for w in Issued_date:
	if re.match(r'[a-zA-Z]',w):
	Issued_date=Issued_date.replace(w,'')

	# Blood group: the token containing '+' on the last "BG"/"Blood" line.
	Bgroup='NA'
	for x in text2:
	for y in x.split(' '):
	if "BG" in y or "Blood" in y:
	Bgroup=x

	BG='NA'
	for x in Bgroup.split(' '):
	if '+' in x:
	BG=x


	# Vehicle classes: upper-case alphanumeric lines after the last Vehicle/Class label.
	indices=[i for i, s in enumerate(text2) if 'Vehicle' in s or 'Class' in s]
	z=max(indices)
	text4=text2[z+1:]
	Vehicle_class='NA'
	for w in text4:
	if re.match(r'^[A-Z0-9]+$',w):
	Vehicle_class+=w+' '

	# NOTE(review): Issued_date, Valid_till, BG and Vehicle_class are computed
	# above but are not part of the returned dict.
	data = {"Type of Document": Document_Type,"Number":Number,"Name":Name,"S/D/W of":SName,"Date of Birth":Date_of_birth,"Address":Address,"State":State,"Pinno":pin}
	return data

	def _voter_labelled_value(lines, markers, split_word, stop_word=None):
	    """Return the value printed next to (or on the line after) a field label.

	    For the first line containing any of ``markers`` the value is the text
	    after the first ':' (or after ``split_word`` when there is no colon). If
	    that is blank, the following line is used, unless it contains
	    ``stop_word`` (i.e. it is already the next label), in which case the
	    search continues. Returns '' when nothing is found.
	    """
	    for idx, line in enumerate(lines):
	        if not any(marker in line for marker in markers):
	            continue
	        if ':' in line:
	            value = line.split(':')[1]
	        else:
	            value = line.partition(split_word)[2]
	        value = value.strip()
	        if value:
	            return value
	        if idx + 1 >= len(lines):
	            continue
	        following = lines[idx + 1]
	        if stop_word is not None and stop_word in following:
	            continue
	        if ':' in following:
	            following = following.split(':')[1]
	        return following.strip()
	    return ''


	def _voter_sex(lines):
	    """Return 'MALE'/'FEMALE' from a SEX label (or any line), '' if unknown."""
	    def spelled_out(text):
	        # 'MALE' is a substring of 'FEMALE', so FEMALE must be tested first.
	        if 'FEMALE' in text:
	            return 'FEMALE'
	        if 'MALE' in text:
	            return 'MALE'
	        return ''

	    initials = {'M': 'MALE', 'F': 'FEMALE'}
	    for idx, line in enumerate(lines):
	        if 'SEX' not in line:
	            continue
	        following = lines[idx + 1] if idx + 1 < len(lines) else ''
	        inline = line.split(':')[1].strip() if ':' in line else ''
	        found = (spelled_out(line) or spelled_out(following)
	                 or initials.get(inline, '') or initials.get(following.strip(), ''))
	        if found:
	            return found
	    # No usable SEX label: accept the word anywhere on the card.
	    for line in lines:
	        found = spelled_out(line)
	        if found:
	            return found
	    return ''


	def _voter_pin(lines):
	    """Return the PIN value printed after a 'PIN' label, '' if not found."""
	    for idx, line in enumerate(lines):
	        if 'PIN' not in line:
	            continue
	        if ':' in line:
	            value = line.split(':')[1]
	        else:
	            value = line.partition('PIN')[2]
	        value = value.strip()
	        if value:
	            return value
	        if idx + 1 < len(lines):
	            found = re.search(r'[0-9\s*]{6,}', lines[idx + 1])
	            if found:
	                return found[0].strip()
	    return ''


	def _voter_neighbours(lines, idx):
	    """Return [line, next line, previous line] for ``idx``, skipping missing ones."""
	    nearby = [lines[idx]]
	    if idx + 1 < len(lines):
	        nearby.append(lines[idx + 1])
	    if idx > 0:
	        nearby.append(lines[idx - 1])
	    return nearby


	def VoterId(textdata):
	    """Extract the printed fields of a voter ID card from its OCR lines.

	    Args:
	        textdata: List of OCR text lines; matching is done on upper-cased text.

	    Returns:
	        dict with the keys 'Type of Document', 'Voter ID Number',
	        "Elector's Name", "Father's/Husband's Name", 'Sex', 'Address',
	        'Date of Birth or Age', 'Date' and 'Place'. Fields that cannot be
	        found are 'Not Found' ('Not found' for 'Date').
	    """
	    Document_Type = "Voter ID Card"
	    data = {}
	    textdata = [x.upper() for x in textdata]
	    date_pattern = r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}'

	    # Voter id: three letters followed by seven digits at the end of a line.
	    id_no = 'Not Found'
	    for line in textdata:
	        found = re.search(r'[A-Z]{3}[0-9]{7}$', line)
	        if found:
	            id_no = found[0]
	            break

	    elector_name = _voter_labelled_value(textdata, ("ELECTOR'S NAME",), 'NAME') or 'Not Found'
	    father_name = _voter_labelled_value(textdata, ("FATHER'S NAME", "HUSBAND'S NAME"), 'NAME') or 'Not Found'
	    sex = _voter_sex(textdata) or 'Not Found'
	    place = _voter_labelled_value(textdata, ('PLACE',), 'PLACE') or 'Not Found'

	    # Date: first date found on a line containing 'DATE' or on the line after it.
	    date = 'Not found'
	    for idx, line in enumerate(textdata):
	        if 'DATE' not in line:
	            continue
	        hit = None
	        for candidate in textdata[idx:idx + 2]:
	            hit = re.search(date_pattern, candidate)
	            if hit:
	                break
	        if hit:
	            date = hit[0]
	            break

	    # Address parts. Each is looked up independently so one missing label no
	    # longer discards the whole address (the district lookup used to test an
	    # unrelated variable and the error handler never assigned a fallback).
	    hno = _voter_labelled_value(textdata, ('H.NO.',), 'H.NO.', 'MOHALLA')
	    mohalla = _voter_labelled_value(textdata, ('MOHALLA',), 'MOHALLA', 'TOWN')
	    town = _voter_labelled_value(textdata, ('TOWN',), 'TOWN', 'POLICESTN.')
	    police = _voter_labelled_value(textdata, ('POLICE',), 'POLICESTN.', 'DISTT')
	    distt = _voter_labelled_value(textdata, ('DISTT',), 'DISTT.', 'PIN')
	    pin = _voter_pin(textdata)
	    add = ' '.join((hno, mohalla, town, police, distt, pin))
	    if len(add) > 15:
	        # Collapse the gaps left by parts that were not found.
	        address = ' '.join(add.split())
	    else:
	        # Too little was recognised: fall back to the raw lines between the
	        # first 'ADDRESS' line and the first 'ELECTION' line.
	        starts = [i for i, s in enumerate(textdata) if 'ADDRESS' in s]
	        ends = [i for i, s in enumerate(textdata) if 'ELECTION' in s]
	        address = ' '.join(textdata[min(starts):min(ends)]) if starts and ends else ''
	        address = address or 'Not Found'

	    # Age ("AGE AS ON ...") or date of birth, whichever label comes first.
	    age = 'Not Found'
	    for idx, line in enumerate(textdata):
	        nearby = _voter_neighbours(textdata, idx)
	        if 'AGE AS ON' in line:
	            holder = next((c for c in nearby if re.search(r'[0-9]{2,3}', c)), None)
	            if holder is not None:
	                age = line if holder is line else line + ' ' + holder
	                break
	        elif 'DATE OF BIRTH' in line:
	            # Report the date from whichever nearby line actually holds it.
	            hit = next((m for m in (re.search(date_pattern, c) for c in nearby) if m), None)
	            if hit is not None:
	                age = hit[0]
	                break

	    data['Type of Document']=Document_Type
	    data['Voter ID Number']=id_no
	    data["Elector's Name"]=elector_name
	    data["Father's/Husband's Name"]=father_name
	    data["Sex"]=sex
	    data["Address"]=address
	    data["Date of Birth or Age"]=age
	    data['Date']=date
	    data["Place"]=place
	    return data

	# This function creates a chromaDB client that connects to existing or newly created db
	def getChromaClient(dbPath):
	    """Return a persistent ChromaDB client whose storage path is ``dbPath``."""
	    return chromadb.PersistentClient(path=dbPath)

	# This function gets an existing collection by name, or creates it, using the given chromaClient object
	def getCollection(collectionName, client):
	    """Return the collection ``collectionName`` from ``client``, creating it if needed.

	    The collection is requested with cosine distance. The metadata key is
	    ``hnsw:space``; the earlier spelling ``hsnw:space`` is not a key Chroma
	    recognises, so the cosine setting never took effect.
	    NOTE(review): the distance metric of an already existing collection is
	    presumably unaffected by this — confirm against the Chroma version in use.
	    """
	    collection = client.get_or_create_collection(name=collectionName, metadata={"hnsw:space": "cosine"})
	    return collection

	# Function to generate an embedding for an image
	def get_embedding(img):
	    """Return the CLIP image embedding of ``img`` as a tensor with a leading batch axis of 1.

	    ``img`` is converted with ``Image.fromarray``, run through the module-level
	    CLIP ``preprocess`` transform and encoded by ``model`` without gradients.
	    NOTE(review): main() passes arrays read with cv2.imread, which are
	    presumably in BGR channel order while PIL assumes RGB; training and
	    querying share this path so results stay mutually consistent — confirm
	    before changing it.
	    """
	    pil_image = Image.fromarray(img)
	    batch = preprocess(pil_image).unsqueeze(0)
	    batch = batch.to(device)
	    with torch.no_grad():
	        return model.encode_image(batch)

	def store_embeddings(collection, emb,cat,id):
	    """Add one embedding with its category label to ``collection``.

	    Args:
	        collection: Object exposing ``add(embeddings=..., metadatas=..., ids=...)``.
	        emb: Embedding(s) passed through unchanged as ``embeddings``.
	        cat: Category label stored as ``{"category": cat}`` metadata.
	        id: Identifier for the stored entry.

	    Storing is best-effort: a failure is reported on stdout and swallowed so
	    the calling page keeps running. (The bare ``exit`` that used to follow the
	    print was an expression statement that did nothing and has been dropped.)
	    """
	    try:
	        collection.add(
	            embeddings=emb,
	            metadatas=[{"category": cat}],
	            ids=[id],
	        )
	    except Exception as e:
	        print(e)

	def get_category(collection,emb):
	    """Return the category of the stored embedding closest to ``emb``.

	    Queries ``collection`` for the single nearest neighbour and returns the
	    "category" value of its metadata. Returns None when the query yields no
	    neighbour (e.g. nothing has been stored yet) or the neighbour carries no
	    category, instead of raising IndexError/TypeError.
	    """
	    similar_img = collection.query(query_embeddings = emb, n_results = 1)
	    matches = similar_img["metadatas"]
	    if not matches or not matches[0]:
	        return None
	    nearest = matches[0][0] or {}
	    return nearest.get("category")

	def train_classifier(coll,emb,cat):
	    """Store ``emb`` in ``coll`` under category ``cat`` with a fresh random UUID as its id."""
	    store_embeddings(coll, emb, cat, str(uuid.uuid4()))

	def save_uploadedfile(uploadedfile):
	    """Write an uploaded file into the current working directory.

	    Args:
	        uploadedfile: Object with a ``name`` attribute and a ``getbuffer()``
	            method returning the file's bytes.

	    Returns:
	        The relative path the file was written to.

	    Only the final path component of ``uploadedfile.name`` is used, so a
	    client-supplied name containing directory parts (e.g. ``../x``) cannot
	    place the file outside the working directory.
	    """
	    target = os.path.basename(uploadedfile.name)
	    with open(target, "wb") as f:
	        f.write(uploadedfile.getbuffer())
	    return target

	def main():
	    """Streamlit page: upload a document image/PDF, then classify + extract or train.

	    Flow:
	      1. The uploaded file is saved locally and copied to Azure blob storage.
	      2. For a PDF every page is rendered, classified and shown; the user picks
	         the page to continue with.
	      3. "Test" mode classifies the image, lets the user correct the category,
	         runs OCR and shows the fields parsed by the matching extractor.
	         "Train" mode stores the image embedding under the chosen category.
	    """
	    # Delete all the items in Session state (iterate over a copy of the keys so
	    # the mapping is not modified while it is being iterated).
	    for key in list(st.session_state.keys()):
	        del st.session_state[key]

	    client = getChromaClient(db_path)
	    coll = getCollection('visaDocImgs',client)

	    categories = ('passport_front', 'passport_back', 'aadhar_front','aadhar_back','PAN_front','DL','voterid')
	    # Category -> parser for the OCR text of that document type.
	    extractors = {
	        'passport_front': Passport_front,
	        'passport_back': Passport_back,
	        'aadhar_front': aadhar_front,
	        'aadhar_back': aadhar_back,
	        'PAN_front': pan_card,
	        'DL': DL,
	        'voterid': VoterId,
	    }

	    st.sidebar.title("Choose Mode")
	    mode = st.sidebar.selectbox("", ("Train", "Test"))

	    st.write("Upload an image or pdf file")
	    uploaded_file = st.file_uploader("File upload", type=['png','jpg','jpeg','pdf'])
	    if uploaded_file is None:
	        return

	    save_uploadedfile(uploaded_file)
	    # save_uploadedfile() writes relative to the working directory, so read the
	    # file back from there. The path used to be built as "/" + name, which
	    # only pointed at the saved file when the process happened to run in "/".
	    folder_path = os.getcwd()
	    image_name = os.path.basename(uploaded_file.name)
	    img_path = os.path.join(folder_path, image_name)
	    upload_to_azure(img_path, image_name)

	    #PDF file handling
	    if image_name.lower().endswith('.pdf'):
	        images = pdf_to_jpeg(img_path)
	        st.write("You have uploaded a pdf file with "+str(len(images))+" pages.")
	        img_class_arr = []
	        stem = image_name.split(".")[0]
	        for i, page in enumerate(images):
	            new_image_name = stem+"_page_"+str(i)+".jpeg"
	            new_img_path = os.path.join(folder_path,new_image_name)
	            page.save(new_img_path)
	            new_emb = get_embedding(cv2.imread(new_img_path)).tolist()
	            new_cat = get_category(coll,new_emb)
	            st.image(page,caption=new_cat)
	            img_class_arr.append([new_image_name,new_cat,new_img_path])

	        if not img_class_arr:
	            st.write("The pdf file has no pages to process.")
	            st.stop()

	        # A list keeps the pages in document order (a set has no defined order).
	        pdf_pg_sel = st.selectbox('which page to process?', [row[0] for row in img_class_arr])
	        pass_name = None
	        new_cat = None
	        for row in img_class_arr:
	            if row[0]==pdf_pg_sel:
	                pass_name = row[2]
	                new_cat = row[1]

	        if new_cat not in categories:
	            st.write("This category is not configured for extraction yet!")
	            st.stop()
	        else:
	            img_path = pass_name

	    #Process single image file
	    im = Image.open(img_path)
	    st.image(im, caption = "Image being processed")

	    # Get the image embedding
	    emb = get_embedding(cv2.imread(img_path)).tolist()

	    if mode == 'Test':
	        option = get_category(coll,emb)
	        st.write("Category -- >",option)
	        cor_cat = st.selectbox(
	            'is this correct classification?',('y','n'))
	        st.write('You selected:', cor_cat)

	        if cor_cat == 'n':
	            option = st.selectbox('What is the correct classification?', categories)
	            st.write('You selected:', option)

	        extractor = extractors.get(option)
	        if extractor is None:
	            # Nothing to show; previously the undefined result was still
	            # passed to st.table and raised NameError.
	            st.write("Couldn't classify")
	            return

	        x = Azure_ocr_sdk(img_path)
	        # The extractors are heuristic parsers that raise on layouts they do
	        # not expect; report that on the page instead of crashing the app.
	        try:
	            res = extractor(x)
	        except Exception as exc:
	            st.error("Could not extract fields for category '"+str(option)+"': "+str(exc))
	            return

	        st.header("Extracted text")
	        st.table(res)
	    else:
	        new_cat = st.selectbox('What is the correct classification?', categories)
	        st.write('You selected:', new_cat)

	        # Streamlit re-runs the whole script on every widget interaction.
	        # Without an explicit confirmation an embedding was stored on each
	        # re-run, starting with the default category before the user had
	        # chosen one.
	        if st.button('Add to classifier'):
	            train_classifier(coll,emb,new_cat)
	            st.write('Stored embedding as:', new_cat)

	# Run the Streamlit page only when this file is executed as the main script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	main()