"""Streamlit app that classifies identity-document images and extracts their fields.

Flow implemented in this file: an uploaded image (or one page of an uploaded
PDF) is embedded with CLIP, matched against a Chroma collection to guess its
category, OCR'd with the Azure Computer Vision Read API and parsed by a
per-document heuristic. In "Train" mode the embedding is stored with a
user-chosen label instead.
"""
import datetime
import math
import os
import re
import sys
import time
import uuid

import chromadb
import clip
import cv2
import pdf2image
import streamlit as st
import torch
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
from azure.storage.blob import BlobServiceClient
from msrest.authentication import CognitiveServicesCredentials
from PIL import Image

# For generating CLIP embeddings
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# File locations: the Chroma database lives next to the launched script.
db_path = os.path.dirname(os.path.abspath(sys.argv[0]))

# Azure storage details
connect_str = st.secrets["conn_str"]
container_name = st.secrets["cont_name"]
blob_service_client = BlobServiceClient.from_connection_string(connect_str)

# SECURITY: these values were hard-coded in the source and must be treated as
# leaked. Rotate the key and provide "ocr_key" / "ocr_endpoint" through
# Streamlit secrets; the literals remain only as a backward-compatible fallback.
_DEFAULT_OCR_KEY = "82e0c013e94849f7ab5bfc8c7c5e54c8"
_DEFAULT_OCR_ENDPOINT = "https://docaicomputervisionocr1.cognitiveservices.azure.com/"

# Categories that have an extraction routine.
_CATEGORIES = (
    'passport_front', 'passport_back', 'aadhar_front', 'aadhar_back',
    'PAN_front', 'DL', 'voterid',
)

_NOT_FOUND = 'Not Found'

_SLASH_DATE_RE = re.compile(r'\d{2}/\d{2}/\d{4}')
_PASSPORT_DATE_RE = re.compile(r'\d{2}\s*/\d{2}\s*/(\d{4})')
_AADHAAR_RE = re.compile(r'^\d{4}\s\d{4}\s\d{4}$')
_PIN_RE = re.compile(r'\b[1-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\b')
_DL_NUMBER_RE = re.compile(r'^[\s]*[A-Z]{2}[-]*[\s]*[0-9]{2}[\s]*[0-9]{4}[0-9]{7}$')
_DL_DATE_TOKEN_RE = re.compile(r'[A-Za-z0-9]*\d{2}\s*-\d{2}\s*-\d{4}')
_VOTER_DATE_RE = re.compile(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}')

# Lines containing any of these words are treated as printed labels on the
# passport front page and dropped before parsing.
_PASSPORT_LABEL_WORDS = (
    'REPUBLIC', 'TYPE', 'GIVEN', 'COUNTRY', 'PASSPORT', 'NO.', 'NAME',
    'NATIONALITY', 'SEX', 'DATE', 'PLACE',
)


def pdf_to_jpeg(pdf_file):
    """Render every page of the PDF at ``pdf_file`` and return the page images as a list."""
    return pdf2image.convert_from_path(pdf_file)


def upload_to_azure(image_path, file_name):
    """Upload the local file ``image_path`` to the configured container as blob ``file_name``.

    An existing blob with the same name is overwritten.
    """
    blob_client = blob_service_client.get_blob_client(container_name, file_name)
    with open(image_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=True)


def Azure_ocr_sdk(image_path):
    """OCR the image at ``image_path`` with the Azure Read API.

    Returns:
        list[str]: the recognised text lines in the order the service reports
        them; an empty list when the operation does not succeed.
    """
    subscription_key = st.secrets.get("ocr_key", _DEFAULT_OCR_KEY)
    endpoint = st.secrets.get("ocr_endpoint", _DEFAULT_OCR_ENDPOINT)
    cv_client = ComputerVisionClient(endpoint, CognitiveServicesCredentials(subscription_key))

    # Close the image file as soon as the request has been submitted.
    with open(image_path, 'rb') as image_stream:
        response = cv_client.read_in_stream(image_stream, raw=True, language='en')
    operation_id = response.headers['Operation-Location'].split('/')[-1]

    # The operation can still be queued ("notStarted") on the first poll, so
    # both non-terminal states are waited on, with a pause between requests.
    pending = (OperationStatusCodes.not_started, OperationStatusCodes.running)
    result = cv_client.get_read_result(operation_id)
    while result.status in pending:
        time.sleep(1)
        result = cv_client.get_read_result(operation_id)

    text_blob = []
    if result.status == OperationStatusCodes.succeeded:
        for page in result.analyze_result.read_results:
            for line in page.lines:
                text_blob.append(line.text)
    return text_blob


def _normalise_date(raw):
    """Return ``raw`` (dd/mm/YYYY) re-formatted through datetime, or unchanged if it is not a valid date."""
    try:
        return datetime.datetime.strptime(raw, "%d/%m/%Y").date().strftime('%d/%m/%Y')
    except ValueError:
        # OCR can yield impossible dates (e.g. 45/13/2000); keep the raw text.
        return raw


def _letters_only(value):
    """Remove every character other than A-Z from each space-separated word of ``value``."""
    return ' '.join(re.sub(r'[^A-Z]', '', word) for word in value.split(' '))


def _passport_dates(lines):
    """Pick the (birth, issue, expiry) lines out of the date-bearing ``lines``.

    When exactly two dates are present and their years are ten apart they are
    taken as issue and expiry; otherwise the dates are assigned in reading
    order and missing ones are ``None``.
    """
    dated = [line for line in lines if _PASSPORT_DATE_RE.search(line)]
    if len(dated) == 2:
        years = [int(_PASSPORT_DATE_RE.search(line).group(1)) for line in dated]
        if years[1] - years[0] == 10:
            return None, dated[0], dated[1]
    padded = dated[:3] + [None] * (3 - len(dated))
    return padded[0], padded[1], padded[2]


def Passport_front(textdata):
    """Parse OCR lines of a passport front page.

    Args:
        textdata: iterable of OCR text lines.

    Returns:
        dict with the keys "Type", "Type of Passport", "Countrycode",
        "Passportno", "Surname", "Firstname", "Nationality", "Sex",
        "Date of Birth", "Place of Birth", "Place of Issue", "Date of Issue",
        "Date of Expiry" and "MRZ". Fields that cannot be located are ``None``
        (``""`` for "Sex" and "MRZ").
    """
    Document_Type = "PASSPORT DOCUMENT"

    lines = [
        line for line in (raw.upper() for raw in textdata)
        if not any(word in line for word in _PASSPORT_LABEL_WORDS)
    ]

    DateofBirth, DateofIssue, DateofExpiry = _passport_dates(lines)
    found_dates = [d for d in (DateofBirth, DateofIssue, DateofExpiry) if d]
    lines = [line for line in lines if not any(d in line for d in found_dates)]

    # Single capital letters are the passport type and the sex marker.
    single = [line for line in lines if re.match(r'^[A-Z]{1}$', line)]
    Type = single[0] if single else None
    Sex = ''.join(letter for letter in single if letter in ('M', 'F'))
    lines = [line for line in lines if line not in single]

    # Drop lines that start like a "WORD / WORD" pair.
    lines = [line for line in lines if not re.match(r'[A-Z]*\s*/\s*[A-Z]*', line)]

    passportno = next(
        (line for line in lines if re.search(r'^[A-Z]*\s*[0-9]{7}$', line)), None)
    if passportno is not None:
        lines.remove(passportno)

    is_indian = any('IND' in line for line in lines)
    Nationality = 'INDIAN' if is_indian else None
    Countrycode = 'IND' if is_indian else None
    lines = [line for line in lines if 'BIRTH' not in line]

    MRZ = ''.join(line for line in lines if '<<' in line)

    # The name/place block is read from the lines after the last bare "IND" line.
    anchors = [i for i, line in enumerate(lines) if re.match(r'IND$', line)]
    tail = lines[max(anchors) + 1:] if anchors else []
    tail = [
        line for line in tail
        if 'INDIAN' not in line and 'OF BIRTH' not in line and not re.search(r'\d', line)
    ]

    Surname = Firstname = PlaceofBirth = PlaceofIssue = None
    if len(tail) >= 4:
        # All four are needed; with fewer lines the positions are unreliable.
        Surname, Firstname, PlaceofBirth, PlaceofIssue = (
            _letters_only(value) for value in tail[:4])

    return {
        "Type": Document_Type,
        "Type of Passport": Type,
        "Countrycode": Countrycode,
        "Passportno": passportno,
        "Surname": Surname,
        "Firstname": Firstname,
        "Nationality": Nationality,
        "Sex": Sex,
        "Date of Birth": DateofBirth,
        "Place of Birth": PlaceofBirth,
        "Place of Issue": PlaceofIssue,
        "Date of Issue": DateofIssue,
        "Date of Expiry": DateofExpiry,
        "MRZ": MRZ,
    }


def _passport_back_address(lines):
    """Join the lines between the last "NAME" label block and the old-passport/place-of-issue label."""
    name_rows = [i for i, s in enumerate(lines) if 'NAME' in s]
    end_rows = [i for i, s in enumerate(lines)
                if 'OLD PASSPORT NO' in s or 'PLACE OF ISSUE' in s]
    if not name_rows or not end_rows:
        return _NOT_FOUND
    start = max(name_rows) + 2
    if start >= len(lines):
        return _NOT_FOUND
    if 'ADDRESS' in lines[start]:
        start += 1  # skip the printed "ADDRESS" label itself
    return ' '.join(lines[start:min(end_rows)])


def _passport_back_file_no(lines):
    """Return the file number from ``lines`` (at least two lines expected)."""
    last = lines[-1]
    if 'FILE NO' in last:
        return _NOT_FOUND  # label present but nothing printed after it
    if re.search(r'\d', last):
        return last
    if re.search(r'\d', lines[-2]):
        return lines[-2]
    for idx, line in enumerate(lines[:-1]):
        if 'FILE NO' in line:
            return lines[idx + 1]
    return _NOT_FOUND


def _old_passport_no(lines):
    """Return the first passport-number-like line after the "OLD PASSPORT NO" label."""
    rows = [i for i, s in enumerate(lines) if 'OLD PASSPORT NO' in s]
    if not rows:
        return _NOT_FOUND
    for line in lines[max(rows) + 1:-1]:
        if re.match(r'[A-Z]{1}[0-9]{7}', line):
            return line
    return _NOT_FOUND


def _date_place_of_issue(lines, old_passno, file_no):
    """Return the lines after "PLACE OF ISSUE" that are not the old passport or file number."""
    rows = [i for i, s in enumerate(lines) if 'PLACE OF ISSUE' in s]
    if not rows:
        return _NOT_FOUND
    start = max(rows) + 1
    if start >= len(lines) or 'FILE NO' in lines[start]:
        return _NOT_FOUND
    stop = -2 if 'FILE NO' in lines[-2] else -1
    known = [value for value in (old_passno, file_no) if value]
    return [line for line in lines[start:stop]
            if not any(value in line for value in known)]


def Passport_back(text):
    """Parse OCR lines of a passport back page.

    Args:
        text: list of OCR text lines; everything up to and including the first
            line containing "CAUTION" is ignored.

    Returns:
        dict with the keys "Type of Document", "Name of Father/Legal Guardian",
        "Name of Mother", "Name of Spouse", "Address", "Old Passport No",
        "Date and Place of Issue" (a list of lines, or 'Not Found') and
        "File No". Parsing is best-effort: on any error every field is
        'Not Found'.
    """
    Document_Type = "PASSPORT BACK"
    Father_name = mother_name = spouse = ''
    address = File_no = old_passno = dateplace_issue = _NOT_FOUND

    textdata = text
    for idx, line in enumerate(text):
        if 'CAUTION' in line:
            textdata = text[idx + 1:]
            break

    try:
        if len(textdata) >= 9:
            # Labelled layout: each value is on the line after its label.
            for idx, line in enumerate(textdata):
                if 'FATHER' in line or 'LEGAL GUARDIAN' in line:
                    Father_name = textdata[idx + 1]
                if 'MOTHER' in line or 'NAME OF MOT' in line:
                    mother_name = textdata[idx + 1]
                if 'SPOUSE' in line:
                    spouse = textdata[idx + 1]
            address = _passport_back_address(textdata)
            File_no = _passport_back_file_no(textdata)
            old_passno = _old_passport_no(textdata)
            dateplace_issue = _date_place_of_issue(textdata, old_passno, File_no)
        elif len(textdata) >= 3:
            # Label-free layout: the first three lines are candidate names.
            names = [
                line for line in textdata[:3]
                if re.match(r'[A-Z]+\s*[A-Z]*', line) and not re.search(r'\d', line)
            ]
            padded = names + [_NOT_FOUND] * (3 - len(names))
            Father_name, mother_name, spouse = padded
            if len(names) >= 2:
                address = ' '.join(textdata[3:5])
                File_no = textdata[-1]
        else:
            Father_name = mother_name = spouse = _NOT_FOUND
    except Exception as e:  # deliberate best-effort: never fail the page on a parse error
        print(e)
        Father_name = mother_name = spouse = _NOT_FOUND
        address = File_no = old_passno = dateplace_issue = _NOT_FOUND

    data = {}
    data['Type of Document'] = Document_Type
    data["Name of Father/Legal Guardian"] = Father_name
    data["Name of Mother"] = mother_name
    data["Name of Spouse"] = spouse
    data["Address"] = address
    data["Old Passport No"] = old_passno
    data['Date and Place of Issue'] = dateplace_issue
    data["File No"] = File_no
    return data


def aadhar_front(textdata):
    """Parse OCR lines of an Aadhaar card front side.

    Args:
        textdata: list of OCR text lines. Lines up to the last one containing
            "INDIA", "India" or "GOVERNMENT" are skipped; if no such line
            exists the whole list is parsed.

    Returns:
        dict with the keys "Type of Document", "Name", "Father's Name",
        "Date of Birth", "Aadhar No" and "Gender"; missing values are 'NA'.
    """
    Document_Type = "AADHAR CARD FRONT"
    header_rows = [i for i, s in enumerate(textdata)
                   if 'INDIA' in s or 'GOVERNMENT' in s or 'India' in s]
    output = [s.upper() for s in textdata[max(header_rows, default=-1) + 1:]]

    aadharno = next((s for s in output if _AADHAAR_RE.search(s)), 'NA')

    # All digit-bearing lines, concatenated, are searched for the birth date.
    digit_text = ''.join(' ' + s for s in output if re.search(r'[0-9]', s))
    Date = 'NA'
    match = _SLASH_DATE_RE.search(digit_text)
    if match:
        Date = _normalise_date(match.group())
    else:
        # No full date: take the last 4-digit token printed before the
        # Aadhaar number (used when only a year of birth is present).
        before_number = digit_text.split(aadharno)[0] if aadharno != 'NA' else digit_text
        for token in before_number.split(' '):
            if re.search('[0-9]{4}', token):
                Date = token

    Father_name = 'NA'
    Name = 'NA'
    father_line = next((s for s in output if 'FATHER' in s), None)
    if father_line is not None:
        parts = father_line.split(':')
        if len(parts) > 1:
            Father_name = parts[1]
        Name = output[0]
    else:
        # The name is taken from the line just above the birth-date line.
        for idx, line in enumerate(output):
            mentions_birth = any(
                (Date != 'NA' and Date in token) or 'BIRTH' in token
                for token in line.split(' '))
            if mentions_birth and idx > 0:
                Name = output[idx - 1]

    Gender = 'NA'
    for line in output:
        for token in line.split(' '):
            if 'MALE' in token:  # also matches "FEMALE"
                Gender = token

    return {
        "Type of Document": Document_Type,
        "Name": Name,
        "Father's Name": Father_name,
        "Date of Birth": Date,
        "Aadhar No": aadharno,
        "Gender": Gender,
    }


def _aadhar_address_lines(polygons_new):
    """Select the text rows that are horizontally aligned with the "Address" row.

    Each row is indexed as ``row[1]`` (text) and ``row[2]`` (box coordinates);
    box indices 0, 1, 4 and 5 are used -- presumably x1, y1, x3, y3 of a
    four-point polygon, TODO confirm against the producer of ``polygons_new``.
    """
    texts = [row[1] for row in polygons_new]
    boxes = [row[2] for row in polygons_new]
    rows = [i for i, s in enumerate(texts) if 'Address' in s]
    if not rows:
        return []
    start = max(rows)
    texts, boxes = texts[start:], boxes[start:]

    anchor = boxes[0]
    span = math.hypot(anchor[4] - anchor[0], anchor[5] - anchor[1])
    left = anchor[0]
    keep = [0]
    for j in range(1, len(boxes)):
        if boxes[j] and left - (1.3 * span) <= boxes[j][0] <= left + (3.5 * span):
            keep.append(j)
    # ``keep`` is built in ascending order, so the lines stay in reading order.
    return [texts[j] for j in keep]


def aadhar_back(textdata2, polygons_new=None):
    """Parse an Aadhaar card back side.

    Args:
        textdata2: list of OCR text lines; used to find the Aadhaar number and,
            when ``polygons_new`` is not given, the address.
        polygons_new: optional rows carrying text and bounding boxes (see
            ``_aadhar_address_lines``). When given, only rows aligned with the
            "Address" row are used for the address.

    Returns:
        dict with the keys "Type of Document", "Address", "PIN Code" and
        "Aadhar No"; values that cannot be located are 'Not Found'.
    """
    Document_Type = 'AADHAR CARD BACK'
    aadharno = next((s for s in textdata2 if _AADHAAR_RE.search(s)), _NOT_FOUND)

    if polygons_new is None:
        rows = [i for i, s in enumerate(textdata2) if 'Address' in s]
        address_lines = textdata2[max(rows):] if rows else []
    else:
        address_lines = _aadhar_address_lines(polygons_new)
    add = ' '.join(s.upper() for s in address_lines).strip()

    pins = _PIN_RE.findall(add)
    if pins:
        pinno = re.sub(r'\s{1,}', '', str(pins[0]))
        # The address ends at the first PIN code; anything after it is dropped.
        address = _PIN_RE.split(add)[0] + ' ' + pinno
    else:
        pinno = _NOT_FOUND
        address = add
    address = re.sub(r'ADDRESS:*', ' ', address)
    address = re.sub(r'\s{2,}', ' ', address).lstrip()
    if not address:
        address = _NOT_FOUND

    return {
        "Type of Document": Document_Type,
        "Address": address,
        "PIN Code": pinno,
        "Aadhar No": aadharno,
    }


def pan_card(textdata):
    """Parse OCR lines of a PAN card.

    Args:
        textdata: list of OCR text lines. Lines up to the last header line
            (containing "INCOME", "Card", "INDIA" or "DEPARTMENT") are skipped.

    Returns:
        dict with the keys "Type of Document", "Name", "Father's Name",
        "Date of Birth" and "PAN Number"; missing values are 'NA'.
    """
    Document_Type = "PAN CARD"
    header_rows = [i for i, s in enumerate(textdata)
                   if 'INCOME' in s or 'Card' in s or 'INDIA' in s or 'DEPARTMENT' in s]
    output = [s.upper() for s in textdata[max(header_rows, default=-1) + 1:]]

    digit_text = ''.join(s for s in output if re.match(r"^\d+", s))
    match = _SLASH_DATE_RE.search(digit_text)
    if match:
        date = _normalise_date(match.group())
    else:
        date = digit_text or 'NA'

    label_words = ('INCOME', 'INDIA', 'NAME', 'BIRTH', 'PERMANENT', 'CARD', 'TAX')
    output1 = [s for s in output if not any(word in s for word in label_words)]

    # Prefer the 5-letter/4-digit/1-letter shape so that a 10-letter name is
    # not mistaken for the number; fall back to any 10-character token.
    strict = [s for s in output1 if re.search(r'^[A-Z]{5}[0-9]{4}[A-Z]$', s)]
    loose = [s for s in output1 if re.search(r'^[A-Z0-9]{10}$', s)]
    candidates = strict or loose
    panno = candidates[-1] if candidates else 'NA'
    if candidates:
        output1.remove(panno)

    Name = output1[0] if len(output1) > 0 else 'NA'
    Father_name = output1[1] if len(output1) > 1 else 'NA'

    return {
        "Type of Document": Document_Type,
        "Name": Name,
        "Father's Name": Father_name,
        "Date of Birth": date,
        "PAN Number": panno,
    }


def _dl_address(lines, pin):
    """Build the address string from the lines starting at the "Address" label."""
    if not lines:
        return 'NA'
    joined = ', '.join(lines)
    if pin != 'NA':
        # Keep everything up to and including the PIN code.
        address = joined.split(pin)[0] + pin
        address = address.replace(",,", ",").replace("Address,", "")
    else:
        cut = joined.find("Issued")
        address = joined if cut == -1 else joined[:cut]
        address = address.replace(",,", ",").replace("Address", "")
    return address.strip()


def DL(textdata):
    """Parse OCR lines of a driving licence.

    Args:
        textdata: list of OCR text lines. Lines up to the last header line
            (containing "DRIVING", "UNION" or "INDIAN") are skipped.

    Returns:
        dict with the keys "Type of Document", "Number", "Name", "S/D/W of",
        "Date of Birth", "Address", "State" and "Pinno"; missing values are 'NA'.
    """
    Document_Type = "DRIVING LICENCE"
    header_rows = [i for i, s in enumerate(textdata)
                   if 'DRIVING' in s or 'UNION' in s or 'INDIAN' in s]
    text = textdata[max(header_rows, default=-1) + 1:]
    State = text[0].replace("STATE", "") if text else 'NA'
    text = [re.sub('[:;.]+', '', s).strip(' ') for s in text]

    Number = 'NA'
    for line in text:
        for piece in line.split(','):
            if _DL_NUMBER_RE.match(piece):
                Number = piece
    Number = Number.lstrip()

    number_rows = [i for i, s in enumerate(text) if "Number" in s]
    text1 = text[max(number_rows, default=-1) + 1:]

    # NOTE(review): the name block ends at the LAST "Address"/"Blood" line, as
    # in the original; if "Blood" follows "Address" this includes address
    # lines -- confirm against real licence layouts.
    section_rows = [i for i, s in enumerate(text1) if "Address" in s or "Blood" in s]
    name_lines = text1[:max(section_rows)] if section_rows else list(text1)
    name_lines = [s for s in name_lines if s != 'Name']
    if Number in name_lines:
        name_lines.remove(Number)

    SName = 'NA'
    name_parts = []
    for line in name_lines:
        if "S/D/W" in line:
            pieces = line.split("S/D/W of")
            if len(pieces) > 1:
                SName = pieces[1].strip()
        else:
            name_parts.append(line)
    Name = ' '.join(name_parts) if name_parts else 'NA'

    address_rows = [i for i, s in enumerate(text1) if "Address" in s]
    text2 = text1[max(address_rows):] if address_rows else []

    pin = 'NA'
    for line in text2:
        for token in line.split(' '):
            if re.match(r'.*[1-9]{1}[0-9]{5}$', token):
                pin = token
    Address = _dl_address(text2, pin)

    # Date tokens appear in the order issue date, date of birth, validity.
    date_tokens = [token for line in text2 for token in line.split(' ')
                   if _DL_DATE_TOKEN_RE.match(token)]
    Date_of_birth = date_tokens[1] if len(date_tokens) >= 2 else 'NA'

    return {
        "Type of Document": Document_Type,
        "Number": Number,
        "Name": Name,
        "S/D/W of": SName,
        "Date of Birth": Date_of_birth,
        "Address": Address,
        "State": State,
        "Pinno": pin,
    }


def _labelled_value(lines, labels, split_token, stop_word=None):
    """Return the value printed for the first line containing any of ``labels``.

    The value is the text after the first ":" of the label line (or after
    ``split_token`` when there is no colon). If that is blank, the following
    line is used instead (again after its first ":" when it has one), unless
    it contains ``stop_word`` -- i.e. it is already the next label -- in which
    case the search continues. Returns ``""`` when nothing is found.
    """
    for idx, line in enumerate(lines):
        if not any(label in line for label in labels):
            continue
        pieces = line.split(':') if ':' in line else line.split(split_token)
        value = pieces[1] if len(pieces) > 1 else ''
        if value not in ('', ' '):
            return value
        if idx + 1 >= len(lines):
            return ''
        follow = lines[idx + 1]
        if stop_word is not None and stop_word in follow:
            continue
        return follow.split(':')[1] if ':' in follow else follow
    return ''


def _gender_word(text):
    """Return 'FEMALE' or 'MALE' if ``text`` contains it, else ``""``."""
    if 'FEMALE' in text:
        return 'FEMALE'
    if 'MALE' in text:
        return 'MALE'
    return ''


def _voter_sex(lines):
    """Find the sex on a voter card: on/after the "SEX" label, or on any line spelling it out."""
    for idx, line in enumerate(lines):
        if 'SEX' in line:
            follow = lines[idx + 1] if idx + 1 < len(lines) else ''
            found = _gender_word(line) or _gender_word(follow)
            if found:
                return found
            if follow == 'M':
                return 'MALE'
            if follow == 'F':
                return 'FEMALE'
        else:
            found = _gender_word(line)
            if found:
                return found
    return ''


def _voter_date(lines):
    """Return the first date found, also checking the line after a "DATE" label."""
    for idx, line in enumerate(lines):
        candidates = [line]
        if 'DATE' in line and idx + 1 < len(lines):
            candidates.append(lines[idx + 1])
        for candidate in candidates:
            match = _VOTER_DATE_RE.search(candidate)
            if match:
                return match[0]
    return ''


def _voter_pin(lines):
    """Return the PIN value from the first line containing "PIN" (or the line after it)."""
    for idx, line in enumerate(lines):
        if 'PIN' not in line:
            continue
        pieces = line.split(':') if ':' in line else line.split('PIN')
        value = pieces[1] if len(pieces) > 1 else ''
        if value not in ('', ' '):
            return value
        if idx + 1 < len(lines):
            match = re.search(r'[0-9\s*]{6,}', lines[idx + 1])
            if match:
                return match[0]
        return ''
    return ''


def _voter_address(lines):
    """Assemble the address from its labelled parts, else from the ADDRESS..ELECTION span."""
    hno = _labelled_value(lines, ('H.NO.',), 'H.NO.', 'MOHALLA')
    mohalla = _labelled_value(lines, ('MOHALLA',), 'MOHALLA', 'TOWN')
    town = _labelled_value(lines, ('TOWN',), 'TOWN', 'POLICESTN.')
    police = _labelled_value(lines, ('POLICE',), 'POLICESTN.', 'DISTT')
    distt = _labelled_value(lines, ('DISTT',), 'DISTT.', 'PIN')
    pin = _voter_pin(lines)

    add = hno + ' ' + mohalla + ' ' + town + ' ' + police + ' ' + distt + ' ' + pin
    if len(add) > 15:
        return add

    starts = [i for i, s in enumerate(lines) if 'ADDRESS' in s]
    ends = [i for i, s in enumerate(lines) if 'ELECTION' in s]
    if starts and ends:
        joined = ' '.join(lines[min(starts):min(ends)])
        if joined:
            return joined
    return _NOT_FOUND


def _neighbours(lines, idx):
    """Return the line after ``idx`` and then the line before it, skipping those that do not exist."""
    near = []
    if idx + 1 < len(lines):
        near.append(lines[idx + 1])
    if idx > 0:
        near.append(lines[idx - 1])
    return near


def _voter_age(lines):
    """Return the "AGE AS ON" text or the date of birth, looking at adjacent lines when needed."""
    for idx, line in enumerate(lines):
        if 'AGE AS ON' in line:
            if ':' not in line:
                continue
            if re.search(r'[0-9]{2,3}', line):
                return line
            for near in _neighbours(lines, idx):
                if re.search(r'[0-9]{2,3}', near):
                    return line + ' ' + near
        elif 'DATE OF BIRTH' in line:
            for candidate in [line] + _neighbours(lines, idx):
                match = _VOTER_DATE_RE.search(candidate)
                if match:
                    return match[0]
    return ''


def VoterId(textdata):
    """Parse OCR lines of a voter ID card (matching is done on upper-cased text).

    Args:
        textdata: list of OCR text lines.

    Returns:
        dict with the keys "Type of Document", "Voter ID Number",
        "Elector's Name", "Father's/Husband's Name", "Sex", "Address",
        "Date of Birth or Age", "Date" and "Place". Missing values are
        'Not Found' ('Not found' for "Date").
    """
    Document_Type = "Voter ID Card"
    textdata = [x.upper() for x in textdata]

    id_matches = (re.search(r'[A-Z]{3}[0-9]{7}$', line) for line in textdata)
    id_no = next((match[0] for match in id_matches if match), _NOT_FOUND)

    elector_name = _labelled_value(textdata, ("ELECTOR'S NAME",), 'NAME') or _NOT_FOUND
    father_name = _labelled_value(
        textdata, ("FATHER'S NAME", "HUSBAND'S NAME"), 'NAME') or _NOT_FOUND
    sex = _voter_sex(textdata) or _NOT_FOUND
    place = _labelled_value(textdata, ('PLACE',), 'PLACE') or _NOT_FOUND
    date = _voter_date(textdata) or 'Not found'
    address = _voter_address(textdata)
    age = _voter_age(textdata) or _NOT_FOUND

    data = {}
    data['Type of Document'] = Document_Type
    data['Voter ID Number'] = id_no
    data["Elector's Name"] = elector_name
    data["Father's/Husband's Name"] = father_name
    data["Sex"] = sex
    data["Address"] = address
    data["Date of Birth or Age"] = age
    data['Date'] = date
    data["Place"] = place
    return data


def getChromaClient(dbPath):
    """Return a persistent Chroma client that stores its data under ``dbPath``."""
    return chromadb.PersistentClient(path=dbPath)


def getCollection(collectionName, client):
    """Get or create the collection ``collectionName`` on ``client``.

    The distance metric is requested through the "hnsw:space" metadata key
    (the original spelling "hsnw:space" was not a recognised key).
    NOTE(review): collections created before this fix keep whatever metric
    they were built with -- confirm whether they need to be rebuilt.
    """
    return client.get_or_create_collection(
        name=collectionName, metadata={"hnsw:space": "cosine"})


def get_embedding(img):
    """Return the CLIP image embedding (a tensor) for the image array ``img``.

    NOTE(review): callers pass arrays from ``cv2.imread`` (BGR channel order)
    straight to ``Image.fromarray``; left unchanged so new embeddings stay
    comparable with those already stored -- confirm before converting to RGB.
    """
    query_image = preprocess(Image.fromarray(img)).unsqueeze(0).to(device)
    with torch.no_grad():
        query_embeddings = model.encode_image(query_image)
    return query_embeddings


def store_embeddings(collection, emb, cat, id):
    """Add embedding ``emb`` to ``collection`` under ``id`` with category ``cat``.

    Failures are printed and otherwise ignored (best-effort).
    """
    try:
        collection.add(
            embeddings=emb,
            metadatas=[{"category": cat}],
            ids=[id],
        )
    except Exception as e:
        print(e)


def get_category(collection, emb):
    """Return the category of the stored embedding nearest to ``emb``.

    Returns ``None`` when the query yields no match (e.g. an empty collection).
    """
    similar_img = collection.query(query_embeddings=emb, n_results=1)
    metadatas = similar_img.get("metadatas")
    if not metadatas or not metadatas[0]:
        return None
    nearest = metadatas[0][0] or {}
    return nearest.get("category")


def train_classifier(coll, emb, cat):
    """Store ``emb`` in ``coll`` labelled ``cat`` under a freshly generated UUID."""
    id = str(uuid.uuid4())
    store_embeddings(coll, emb, cat, id)


def save_uploadedfile(uploadedfile):
    """Write the uploaded file into the current working directory under its own name."""
    with open(os.path.join(uploadedfile.name), "wb") as f:
        f.write(uploadedfile.getbuffer())


def main():
    """Render the Streamlit page: upload, classify, then extract (Test) or label (Train)."""
    # Delete all the items in Session state (iterate over a copy of the keys).
    for key in list(st.session_state.keys()):
        del st.session_state[key]

    client = getChromaClient(db_path)
    coll = getCollection('visaDocImgs', client)

    st.sidebar.title("Choose Mode")
    mode = st.sidebar.selectbox("", ("Train", "Test"))
    st.write("Upload an image or pdf file")
    uploaded_file = st.file_uploader("File upload", type=['png', 'jpg', 'jpeg', 'pdf'])
    if uploaded_file is None:
        return

    save_uploadedfile(uploaded_file)
    # save_uploadedfile writes relative to the working directory, so read
    # the file back from there.
    folder_path = os.getcwd()
    image_name = uploaded_file.name
    img_path = os.path.join(folder_path, image_name)
    upload_to_azure(img_path, image_name)

    # PDF file handling: classify every page and let the user pick one.
    if image_name.lower().endswith('.pdf'):
        images = pdf_to_jpeg(img_path)
        st.write("You have uploaded a pdf file with " + str(len(images)) + " pages.")
        stem = os.path.splitext(image_name)[0]
        pages = {}
        for i, page in enumerate(images):
            new_image_name = stem + "_page_" + str(i) + ".jpeg"
            new_img_path = os.path.join(folder_path, new_image_name)
            page.save(new_img_path)
            new_emb = get_embedding(cv2.imread(new_img_path)).tolist()
            new_cat = get_category(coll, new_emb)
            st.image(page, caption=new_cat)
            pages[new_image_name] = (new_cat, new_img_path)
        if not pages:
            st.write("The uploaded pdf file has no pages to process.")
            st.stop()
        pdf_pg_sel = st.selectbox('which page to process?', list(pages))
        new_cat, img_path = pages[pdf_pg_sel]
        # Only extraction needs a configured category; in Train mode the
        # user supplies the label, so an unknown guess must not block it.
        if mode == 'Test' and new_cat not in _CATEGORIES:
            st.write("This category is not configured for extraction yet!")
            st.stop()

    # Process single image file
    im = Image.open(img_path)
    st.image(im, caption="Image being processed")
    emb = get_embedding(cv2.imread(img_path)).tolist()

    if mode == 'Test':
        option = get_category(coll, emb)
        st.write("Category -- >", option)
        cor_cat = st.selectbox('is this correct classification?', ('y', 'n'))
        st.write('You selected:', cor_cat)
        if cor_cat == 'n':
            option = st.selectbox('What is the correct classification?', _CATEGORIES)
            st.write('You selected:', option)

        # Calling function based on category input
        parsers = {
            'passport_front': Passport_front,
            'passport_back': Passport_back,
            'aadhar_front': aadhar_front,
            'aadhar_back': aadhar_back,
            'PAN_front': pan_card,
            'DL': DL,
            'voterid': VoterId,
        }
        parser = parsers.get(option)
        if parser is None:
            st.write("Couldn't classify")
            return
        res = parser(Azure_ocr_sdk(img_path))
        st.header("Extracted text")
        st.table(res)
    else:
        new_cat = st.selectbox('What is the correct classification?', _CATEGORIES)
        st.write('You selected:', new_cat)
        # Streamlit re-runs the script on every interaction; store the label
        # only on an explicit click so the default choice is not saved.
        if st.button('Add to training set'):
            train_classifier(coll, emb, new_cat)
            st.write('Saved as:', new_cat)


if __name__ == "__main__":
    main()