Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import os | |
| import sys | |
| import re | |
| from azure.storage.blob import BlobServiceClient | |
| from PIL import Image | |
| import cv2 | |
| import datetime | |
| import clip | |
| import torch | |
| import chromadb | |
| import uuid | |
| import pdf2image | |
| # import pandas as pd | |
| from azure.cognitiveservices.vision.computervision import ComputerVisionClient | |
| from msrest.authentication import CognitiveServicesCredentials | |
| from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes | |
# CLIP model used to generate image embeddings; uses CUDA when torch reports
# it as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# File locations: the ChromaDB store lives in the directory of the launched
# script (resolved from sys.argv[0]).
db_path = os.path.dirname(os.path.abspath(sys.argv[0]))

# Azure storage details, read from Streamlit secrets ("conn_str", "cont_name").
connect_str = st.secrets["conn_str"]
container_name = st.secrets["cont_name"]
blob_service_client = BlobServiceClient.from_connection_string(connect_str)
def pdf_to_jpeg(pdf_file):
    """Convert a PDF on disk into page images.

    Args:
        pdf_file: Path of the PDF file to convert.

    Returns:
        Whatever ``pdf2image.convert_from_path`` returns for the file
        (a list with one image per page).
    """
    return pdf2image.convert_from_path(pdf_file)
def upload_to_azure(image_path, file_name):
    """Upload a local file to the configured Azure blob container.

    Args:
        image_path: Path of the local file to read.
        file_name: Blob name to store the file under; an existing blob with
            the same name is overwritten.
    """
    target_blob = blob_service_client.get_blob_client(container_name, file_name)
    with open(image_path, "rb") as stream:
        target_blob.upload_blob(stream, overwrite=True)
def Azure_ocr_sdk(image_path, poll_interval=1.0, timeout=60.0):
    """Run the Azure Computer Vision Read (OCR) operation on an image file.

    Args:
        image_path: Path of the image to send to the Read API.
        poll_interval: Seconds to wait between status polls.
        timeout: Maximum number of seconds to wait for the operation.

    Returns:
        A list with the text of every recognised line, in the order the
        service reports them. Empty when the operation does not succeed
        within ``timeout``.
    """
    import time

    # SECURITY(review): this key is committed in source control. It should be
    # rotated and supplied only through the environment / secrets store; the
    # literal is kept solely as a backward-compatible fallback.
    subscription_key = os.environ.get(
        "AZURE_CV_KEY", "82e0c013e94849f7ab5bfc8c7c5e54c8"
    )
    endpoint = os.environ.get(
        "AZURE_CV_ENDPOINT",
        "https://docaicomputervisionocr1.cognitiveservices.azure.com/",
    )
    credentials = CognitiveServicesCredentials(subscription_key)
    cv_client = ComputerVisionClient(endpoint, credentials)

    # Close the image handle as soon as the request has been submitted.
    with open(image_path, 'rb') as image_stream:
        response = cv_client.read_in_stream(image_stream, raw=True, language='en')

    # The operation id is the last path segment of the Operation-Location URL.
    operation_id = response.headers['Operation-Location'].split('/')[-1]

    # A freshly submitted operation reports "notStarted" before "running";
    # both mean "keep waiting". Sleep between polls instead of spinning.
    pending = (OperationStatusCodes.not_started, OperationStatusCodes.running)
    deadline = time.monotonic() + timeout
    result = cv_client.get_read_result(operation_id)
    while result.status in pending and time.monotonic() < deadline:
        time.sleep(poll_interval)
        result = cv_client.get_read_result(operation_id)

    text_blob = []
    if result.status == OperationStatusCodes.succeeded:
        for analyzed_result in result.analyze_result.read_results:
            text_blob.extend(line.text for line in analyzed_result.lines)
    return text_blob
def Passport_front(textdata):
    """Extract the fields of an Indian passport front page from OCR lines.

    The parser is positional: label lines are dropped, dates / single-letter
    codes / the passport number are peeled off, and the four lines following a
    stand-alone ``IND`` line are read as surname, given name, place of birth
    and place of issue.

    Args:
        textdata: List of OCR text lines (any case).

    Returns:
        dict with the keys "Type", "Type of Passport", "Countrycode",
        "Passportno", "Surname", "Firstname", "Nationality", "Sex",
        "Date of Birth", "Place of Birth", "Place of Issue", "Date of Issue",
        "Date of Expiry" and "MRZ". Fields that cannot be found are ``None``
        ("Sex" and "MRZ" are empty strings).
    """
    Document_Type = "PASSPORT DOCUMENT"
    label_words = ('REPUBLIC', 'TYPE', 'GIVEN', 'COUNTRY', 'PASSPORT', 'NO.',
                   'NAME', 'NATIONALITY', 'SEX', 'DATE', 'PLACE')
    date_pattern = re.compile(r'\d{2}\s*/\d{2}\s*/(\d{4})')

    upper_lines = [line.upper() for line in textdata]
    # Drop printed field labels; only value lines are of interest.
    lines = [line for line in upper_lines
             if not any(word in line for word in label_words)]

    # --- dates -----------------------------------------------------------
    date_lines = []
    years = []
    for line in lines:
        found = date_pattern.search(line)
        if found:
            date_lines.append(line)
            years.append(int(found.group(1)))

    DateofBirth = DateofIssue = DateofExpiry = None
    if len(date_lines) == 2 and years[1] - years[0] == 10:
        # Two dates ten years apart are issue/expiry; birth date was not read.
        DateofIssue, DateofExpiry = date_lines
    else:
        padded = (date_lines + [None, None, None])[:3]
        DateofBirth, DateofIssue, DateofExpiry = padded

    assigned_dates = [d for d in (DateofBirth, DateofIssue, DateofExpiry) if d]
    lines = [line for line in lines
             if not any(d in line for d in assigned_dates)]

    # --- single-letter codes: passport type and sex -------------------------
    single = [line for line in lines if re.fullmatch(r'[A-Z]', line)]
    Type = single[0] if single else None
    Sex = ''.join(ch for ch in single if ch in ('M', 'F'))
    lines = [line for line in lines if not re.fullmatch(r'[A-Z]', line)]

    # Bilingual label remnants such as "XYZ / ABC".
    lines = [line for line in lines
             if not re.match(r'[A-Z]*\s*/\s*[A-Z]*', line)]

    # --- passport number ---------------------------------------------------
    passportno = next(
        (line for line in lines if re.search(r'^[A-Z]*\s*[0-9]{7}$', line)),
        None,
    )
    if passportno is not None:
        lines.remove(passportno)

    lines = [line for line in lines if 'BIRTH' not in line]

    # --- nationality ---------------------------------------------------------
    # Only report INDIAN when a stand-alone IND / INDIAN token was actually read.
    if any(re.search(r'\bIND(IAN)?\b', line) for line in upper_lines):
        Nationality = 'INDIAN'
        Countrycode = 'IND'
    else:
        Nationality = None
        Countrycode = None

    MRZ = ''.join(line for line in lines if '<<' in line)

    # --- names and places: the lines after the last stand-alone "IND" -------
    anchors = [i for i, line in enumerate(lines) if re.match(r'IND$', line)]
    candidates = lines[max(anchors) + 1:] if anchors else []
    candidates = [line for line in candidates
                  if 'INDIAN' not in line
                  and 'OF BIRTH' not in line
                  and not re.search(r'\d', line)]

    def _letters_only(value):
        # Keep A-Z inside each space-separated word, dropping OCR noise.
        return " ".join(re.sub(r'[^A-Z]', '', word) for word in value.split(' '))

    Surname = Firstname = PlaceofBirth = PlaceofIssue = None
    if len(candidates) >= 4:
        Surname, Firstname, PlaceofBirth, PlaceofIssue = (
            _letters_only(value) for value in candidates[:4]
        )

    data = {"Type": Document_Type, "Type of Passport": Type,
            "Countrycode": Countrycode, "Passportno": passportno,
            "Surname": Surname, "Firstname": Firstname,
            "Nationality": Nationality, "Sex": Sex,
            "Date of Birth": DateofBirth, "Place of Birth": PlaceofBirth,
            "Place of Issue": PlaceofIssue, "Date of Issue": DateofIssue,
            "Date of Expiry": DateofExpiry, "MRZ": MRZ}
    return data
def _passport_back_labelled(lines):
    """Parse a passport back page whose printed field labels were OCR'd."""
    not_found = 'Not Found'
    father = mother = spouse = not_found
    # Each value is the line that follows its label.
    for idx, line in enumerate(lines[:-1]):
        following = lines[idx + 1]
        if 'FATHER' in line or 'LEGAL GUARDIAN' in line:
            father = following
        if 'MOTHER' in line or 'NAME OF MOT' in line:
            mother = following
        if 'SPOUSE' in line:
            spouse = following

    # Address: between the value of the last "NAME" label and the old-passport label.
    address = not_found
    name_idx = [i for i, s in enumerate(lines) if 'NAME' in s]
    end_idx = [i for i, s in enumerate(lines)
               if 'OLD PASSPORT NO' in s or 'PLACE OF ISSUE' in s]
    if name_idx and end_idx:
        start = max(name_idx) + 2
        if start < len(lines):
            if 'ADDRESS' in lines[start]:
                start += 1
            address = ' '.join(lines[start:min(end_idx)]) or not_found

    # File number: the line after a FILE NO label, else a trailing line with digits.
    file_no = not_found
    label_idx = [i for i, s in enumerate(lines) if 'FILE NO' in s]
    if label_idx and label_idx[-1] + 1 < len(lines):
        file_no = lines[label_idx[-1] + 1]
    else:
        for candidate in (lines[-1], lines[-2]):
            if re.search(r'\d', candidate):
                file_no = candidate
                break

    old_passno = not_found
    old_idx = [i for i, s in enumerate(lines) if 'OLD PASSPORT NO' in s]
    if old_idx:
        old_passno = next(
            (s for s in lines[max(old_idx) + 1:-1]
             if re.match(r'[A-Z]{1}[0-9]{7}', s)),
            not_found,
        )

    dateplace_issue = not_found
    issue_idx = [i for i, s in enumerate(lines) if 'PLACE OF ISSUE' in s]
    if issue_idx:
        first = max(issue_idx) + 1
        if first < len(lines) and 'FILE NO' not in lines[first]:
            stop = -2 if 'FILE NO' in lines[-2] else -1
            dateplace_issue = [s for s in lines[first:stop]
                               if old_passno not in s and file_no not in s]

    return father, mother, spouse, address, file_no, old_passno, dateplace_issue


def _passport_back_unlabelled(lines):
    """Parse a short passport back page where only the values were OCR'd."""
    not_found = 'Not Found'
    father = mother = spouse = address = file_no = not_found
    if len(lines) >= 3:
        names = [s for s in lines[:3]
                 if re.match(r'[A-Z]+\s*[A-Z]*', s) and not re.search(r'\d', s)]
        if names:
            father = names[0]
        if len(names) >= 2:
            mother = names[1]
            address = ' '.join(lines[3:5])
            file_no = lines[-1]
        if len(names) == 3:
            spouse = names[2]
    return father, mother, spouse, address, file_no, not_found, not_found


def Passport_back(text):
    """Extract the fields of an Indian passport back page from OCR lines.

    Lines are upper-cased first so label matching is case-insensitive. Text
    up to and including a line containing ``CAUTION`` is ignored. Pages with
    nine or more remaining lines are parsed by label; shorter pages are
    parsed positionally (father, mother, spouse, address, file number).

    Args:
        text: List of OCR text lines.

    Returns:
        dict with the keys 'Type of Document', "Name of Father/Legal
        Guardian", "Name of Mother", "Name of Spouse", "Address",
        "Old Passport No", 'Date and Place of Issue' (a list of lines when
        found) and "File No". Missing values are the string 'Not Found'.
    """
    Document_Type = "PASSPORT BACK"
    lines = [str(line).upper() for line in text]
    for pos, line in enumerate(lines):
        if 'CAUTION' in line:
            lines = lines[pos + 1:]
            break

    if len(lines) >= 9:
        parsed = _passport_back_labelled(lines)
    else:
        parsed = _passport_back_unlabelled(lines)
    (Father_name, mother_name, spouse, address,
     File_no, old_passno, dateplace_issue) = parsed

    data = {}
    data['Type of Document'] = Document_Type
    data["Name of Father/Legal Guardian"] = Father_name
    data["Name of Mother"] = mother_name
    data["Name of Spouse"] = spouse
    data["Address"] = address
    data["Old Passport No"] = old_passno
    data['Date and Place of Issue'] = dateplace_issue
    data["File No"] = File_no
    return data
def aadhar_front(textdata):
    """Extract name, father's name, date of birth, Aadhaar number and gender
    from the OCR lines of an Aadhaar card front.

    Only the lines after the last header line (one containing 'INDIA',
    'India' or 'GOVERNMENT') are parsed; when no header is present every line
    is parsed.

    Args:
        textdata: List of OCR text lines.

    Returns:
        dict with the keys "Type of Document", "Name", "Father's Name",
        "Date of Birth", "Aadhar No" and "Gender"; values that cannot be
        found are the string 'NA'.
    """
    Document_Type = "AADHAR CARD FRONT"
    header_idx = [i for i, s in enumerate(textdata)
                  if 'INDIA' in s or 'GOVERNMENT' in s or 'India' in s]
    body = textdata[max(header_idx) + 1:] if header_idx else list(textdata)
    output = [line.upper() for line in body]

    # Aadhaar number: a line of exactly three groups of four digits.
    aadharno = 'NA'
    for line in output:
        if re.search(r'^\d{4}\s\d{4}\s\d{4}$', line):
            aadharno = line

    # All digit-bearing lines concatenated; the birth date is searched in it.
    digit_text = ''.join(' ' + line for line in output
                         if re.match(r'.*[0-9].*', line))
    Date = 'NA'
    found = re.search(r'\d{2}/\d{2}/\d{4}', digit_text)
    if found:
        try:
            Date = datetime.datetime.strptime(
                found.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
        except ValueError:
            # Not a valid calendar date (OCR error): report the raw text.
            Date = found.group()
    else:
        # Year-only cards: take the last 4-digit token before the Aadhaar number.
        before_number = digit_text.split(aadharno)[0]
        for token in before_number.split(' '):
            if re.search('[0-9]{4}', token):
                Date = token

    Father_name = 'NA'
    Name = 'NA'
    father_line = next((line for line in output if 'FATHER' in line), None)
    if father_line is not None:
        parts = father_line.split(':')
        if len(parts) > 1 and parts[1].strip():
            Father_name = parts[1].strip()
        Name = output[0]
    else:
        # The holder's name is the line just above the date-of-birth line.
        for idx, line in enumerate(output):
            for token in line.split(' '):
                is_dob_token = 'BIRTH' in token or (Date != 'NA' and Date in token)
                if is_dob_token and idx > 0:
                    Name = output[idx - 1]

    Gender = 'NA'
    for line in output:
        for token in line.split(' '):
            # 'FEMALE' contains 'MALE', so one test covers both.
            if 'MALE' in token:
                Gender = token

    data = {"Type of Document": Document_Type, "Name": Name,
            "Father's Name": Father_name, "Date of Birth": Date,
            "Aadhar No": aadharno, "Gender": Gender}
    return data
def aadhar_back(textdata2, polygons_new=None):
    """Extract address, PIN code and Aadhaar number from an Aadhaar card back.

    Args:
        textdata2: List of OCR text lines; searched for the Aadhaar number
            and, when ``polygons_new`` is not given, for the address.
        polygons_new: Optional sequence of OCR entries where ``entry[1]`` is
            the line text and ``entry[2]`` its bounding polygon as a flat
            ``[x0, y0, x1, y1, x2, y2, ...]`` list. When given, only lines
            whose left edge is horizontally close to the "Address" label are
            treated as part of the address.

    Returns:
        dict with the keys "Type of Document", "Address", "PIN Code" and
        "Aadhar No"; values that cannot be found are 'Not Found'.
    """
    import math  # local import: the module header does not import math

    Document_Type = 'AADHAR CARD BACK'
    pin_pattern = r'\b[1-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\b'

    if polygons_new:
        texts = [entry[1] for entry in polygons_new]
        boxes = [entry[2] for entry in polygons_new]
    else:
        texts = list(textdata2)
        boxes = None

    address_lines = []
    anchors = [i for i, s in enumerate(texts) if 'Address' in s]
    if anchors:
        start = max(anchors)
        tail_text = texts[start:]
        if boxes is None:
            address_lines = tail_text
        else:
            tail_boxes = boxes[start:]
            label_box = tail_boxes[0]
            # Diagonal of the "Address" label box sets the horizontal tolerance.
            dist = math.hypot(label_box[4] - label_box[0],
                              label_box[5] - label_box[1])
            left = label_box[0]
            keep = [0] + [
                j for j in range(1, len(tail_boxes))
                if left - (1.3 * dist) <= tail_boxes[j][0] <= left + (3.5 * dist)
            ]
            address_lines = [tail_text[j] for j in keep]

    add = ' '.join(line.upper() for line in address_lines).strip()

    aadharno = next(
        (line for line in textdata2
         if re.search(r'^\d{4}\s\d{4}\s\d{4}$', line)),
        'Not Found',
    )

    pins = re.findall(pin_pattern, add)
    if pins:
        pinno = re.sub(r'\s{1,}', '', str(pins[0]))
        # Keep the text before the first PIN, then re-append the PIN without spaces.
        address = re.split(pin_pattern, add)[0] + ' ' + pinno
    else:
        pinno = 'Not Found'
        address = add
    address = re.sub(r'ADDRESS:*', ' ', address)
    address = re.sub(r'\s{2,}', ' ', address)
    address = address.lstrip() or 'Not Found'

    data = {"Type of Document": Document_Type, "Address": address,
            "PIN Code": pinno, "Aadhar No": aadharno}
    return data
def pan_card(textdata):
    """Extract name, father's name, date of birth and PAN from the OCR lines
    of a PAN card.

    Only the lines after the last header line (one containing 'INCOME',
    'Card', 'INDIA' or 'DEPARTMENT') are parsed; when no header is present
    every line is parsed. The first two remaining non-label lines are read as
    the holder's name and the father's name.

    Args:
        textdata: List of OCR text lines.

    Returns:
        dict with the keys "Type of Document", "Name", "Father's Name",
        "Date of Birth" and "PAN Number"; values that cannot be found are 'NA'.
    """
    Document_Type = "PAN CARD"
    header_idx = [i for i, s in enumerate(textdata)
                  if 'INCOME' in s or 'Card' in s or 'INDIA' in s
                  or 'DEPARTMENT' in s]
    body = textdata[max(header_idx) + 1:] if header_idx else list(textdata)
    output = [line.upper() for line in body]

    # Date of birth: searched in the lines that start with a digit.
    digit_text = ''.join(line for line in output if re.match(r"^\d+", line))
    found = re.search(r'\d{2}/\d{2}/\d{4}', digit_text)
    if found:
        try:
            date = datetime.datetime.strptime(
                found.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
        except ValueError:
            # Not a valid calendar date (OCR error): report the raw text.
            date = found.group()
    else:
        date = digit_text or 'NA'

    label_words = ('INCOME', 'INDIA', 'NAME', 'BIRTH', 'PERMANENT', 'CARD', 'TAX')
    candidates = [line for line in output
                  if not any(word in line for word in label_words)]

    # Separate the PAN (a 10-character alphanumeric line) from the name lines.
    panno = 'NA'
    remaining = []
    for line in candidates:
        if re.search(r'^[A-Z0-9]{10}$', line):
            panno = line
        else:
            remaining.append(line)

    Name = remaining[0] if remaining else 'NA'
    Father_name = remaining[1] if len(remaining) > 1 else 'NA'
    data = {"Type of Document": Document_Type, "Name": Name,
            "Father's Name": Father_name, "Date of Birth": date,
            "PAN Number": panno}
    return data
def DL(textdata):
    """Extract the main fields of an Indian driving licence from OCR lines.

    Parsing is anchored on label lines: the header (a line containing
    'DRIVING', 'UNION' or 'INDIAN'), a line containing "Number", and lines
    containing "Address" / "Blood". Missing anchors yield 'NA' values instead
    of an exception.

    Args:
        textdata: List of OCR text lines.

    Returns:
        dict with the keys "Type of Document", "Number", "Name", "S/D/W of",
        "Date of Birth", "Address", "State" and "Pinno"; values that cannot
        be found are 'NA'.
    """
    Document_Type = "DRIVING LICENCE"
    header_idx = [i for i, s in enumerate(textdata)
                  if 'DRIVING' in s or 'UNION' in s or 'INDIAN' in s]
    text = list(textdata[max(header_idx) + 1:]) if header_idx else list(textdata)

    # The first line after the header carries the state name.
    State = (text[0].replace("STATE", "").strip() or 'NA') if text else 'NA'

    # Strip label punctuation and surrounding blanks from every line.
    text = [re.sub(r'[:;.]+', '', line).strip(' ') for line in text]

    number_re = re.compile(
        r'^[\s]*[A-Z]{2}[-]*[\s]*[0-9]{2}[\s]*[0-9]{4}[0-9]{7}$')
    Number = 'NA'
    for line in text:
        for piece in line.split(','):
            if number_re.match(piece):
                Number = piece.lstrip()

    number_idx = [i for i, s in enumerate(text) if "Number" in s]
    text1 = text[max(number_idx) + 1:] if number_idx else text

    # Name block: everything before the last "Address"/"Blood" line.
    name_end = [i for i, s in enumerate(text1) if "Address" in s or "Blood" in s]
    name_lines = list(text1[:max(name_end)]) if name_end else []
    if 'Name' in name_lines:
        name_lines.remove('Name')
    if Number in name_lines:
        name_lines.remove(Number)

    SName = 'NA'
    name_parts = []
    for line in name_lines:
        if "S/D/W" in line:
            SName = line.partition("S/D/W of")[2].strip() or 'NA'
        else:
            name_parts.append(line)
    Name = ' '.join(name_parts) or 'NA'

    address_idx = [i for i, s in enumerate(text1) if "Address" in s]
    text2 = text1[max(address_idx):] if address_idx else []

    # PIN code: the last token that ends in a six-digit number.
    pin = 'NA'
    for line in text2:
        for token in line.split(' '):
            if re.match(r'.*[1-9]{1}[0-9]{5}$', token):
                pin = token

    joined = ', '.join(text2)
    if pin != 'NA':
        # The address runs from the label up to and including the PIN code.
        Address = joined.split(pin, 1)[0] + pin
        Address = Address.replace(",,", ",").replace("Address,", "")
    else:
        # Without a PIN, cut at the "Issued ..." line when there is one.
        cut = joined.find("Issued")
        Address = joined if cut == -1 else joined[:cut]
        Address = Address.replace(",,", ",").replace("Address", "")
    Address = Address.strip(' ,') or 'NA'

    # Date of birth: the second of at least three date tokens
    # (issue date, birth date, validity).
    date_line_re = re.compile(r'\d{2}\s*-\d{2}\s*-\d{4}')
    date_token_re = re.compile(r'[A-Za-z0-9]*\d{2}\s*-\d{2}\s*-\d{4}')
    date_tokens = [
        token
        for line in text2
        if date_line_re.search(line) or "Issued" in line
        for token in line.split(' ')
        if date_token_re.match(token)
    ]
    Date_of_birth = date_tokens[1] if len(date_tokens) >= 3 else 'NA'

    data = {"Type of Document": Document_Type, "Number": Number, "Name": Name,
            "S/D/W of": SName, "Date of Birth": Date_of_birth,
            "Address": Address, "State": State, "Pinno": pin}
    return data
def _voter_labelled_value(lines, labels, split_on, stop_word=None):
    """Return the value printed after the first line containing one of ``labels``.

    The value is the text after ':' (or after ``split_on`` when the line has
    no colon). When the label line carries no value the next line is used,
    unless it contains ``stop_word`` (the next field's label), in which case
    later label lines are tried. Returns '' when nothing is found.
    """
    for idx, line in enumerate(lines):
        if not any(label in line for label in labels):
            continue
        if ':' in line:
            value = line.split(':')[1]
        else:
            value = line.partition(split_on)[2]
        value = value.strip()
        if value:
            return value
        if idx + 1 >= len(lines):
            return ''
        following = lines[idx + 1]
        if stop_word is not None and stop_word in following:
            continue
        if ':' in following:
            following = following.split(':')[1]
        return following.strip()
    return ''


def _voter_pin(lines):
    """Return the PIN code from the first line containing 'PIN', or ''."""
    for idx, line in enumerate(lines):
        if 'PIN' not in line:
            continue
        value = line.split(':')[1] if ':' in line else line.partition('PIN')[2]
        if value.strip():
            return value.strip()
        following = lines[idx + 1] if idx + 1 < len(lines) else ''
        found = re.search(r'[0-9\s*]{6,}', following)
        return found[0].strip() if found else ''
    return ''


def _voter_sex(lines):
    """Return 'MALE' / 'FEMALE' from the first line that reveals it, or ''."""
    def classify(value):
        if 'FEMALE' in value:
            return 'FEMALE'
        if 'MALE' in value:
            return 'MALE'
        return ''

    for idx, line in enumerate(lines):
        found = classify(line)
        if not found and 'SEX' in line:
            # Value may sit on the next line, spelled out or as a bare M / F.
            following = lines[idx + 1] if idx + 1 < len(lines) else ''
            found = classify(following) or {'M': 'MALE', 'F': 'FEMALE'}.get(
                following.strip(), '')
        if found:
            return found
    return ''


def VoterId(textdata):
    """Extract the fields of an Indian voter ID card from OCR lines.

    Lines are upper-cased and each field is located by its printed label.

    Args:
        textdata: List of OCR text lines.

    Returns:
        dict with the keys 'Type of Document', 'Voter ID Number',
        "Elector's Name", "Father's/Husband's Name", "Sex", "Address",
        "Date of Birth or Age", 'Date' and "Place". Missing values are
        'Not Found' ('Not found' for 'Date').
    """
    Document_Type = "Voter ID Card"
    not_found = 'Not Found'
    lines = [x.upper() for x in textdata]
    date_re = re.compile(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}')

    def neighbour(idx, offset):
        # Line next to ``idx``, or '' when that would fall outside the list.
        pos = idx + offset
        return lines[pos] if 0 <= pos < len(lines) else ''

    # Voter ID number: three letters followed by seven digits at a line end.
    id_no = not_found
    for line in lines:
        found = re.search(r'[A-Z]{3}[0-9]{7}$', line)
        if found:
            id_no = found[0]
            break

    elector_name = _voter_labelled_value(
        lines, ("ELECTOR'S NAME",), 'NAME') or not_found
    father_name = _voter_labelled_value(
        lines, ("FATHER'S NAME", "HUSBAND'S NAME"), 'NAME') or not_found
    sex = _voter_sex(lines) or not_found
    place = _voter_labelled_value(lines, ('PLACE',), 'PLACE') or not_found

    # Date: first date found on a DATE line or on the line after it.
    date = 'Not found'
    for idx, line in enumerate(lines):
        if 'DATE' not in line:
            continue
        found = date_re.search(line) or date_re.search(neighbour(idx, 1))
        if found:
            date = found[0]
            break

    # Address: assembled from its labelled parts, else the ADDRESS..ELECTION span.
    parts = [
        _voter_labelled_value(lines, ('H.NO.',), 'H.NO.', 'MOHALLA'),
        _voter_labelled_value(lines, ('MOHALLA',), 'MOHALLA', 'TOWN'),
        _voter_labelled_value(lines, ('TOWN',), 'TOWN', 'POLICESTN.'),
        _voter_labelled_value(lines, ('POLICE',), 'POLICESTN.', 'DISTT'),
        _voter_labelled_value(lines, ('DISTT',), 'DISTT.', 'PIN'),
        _voter_pin(lines),
    ]
    add = ' '.join(part for part in parts if part)
    if len(add) > 15:
        address = add
    else:
        starts = [i for i, s in enumerate(lines) if 'ADDRESS' in s]
        ends = [i for i, s in enumerate(lines) if 'ELECTION' in s]
        address = ''
        if starts and ends:
            address = ' '.join(lines[min(starts):min(ends)])
        address = address or not_found

    # Age ("AGE AS ON ...") or date of birth.
    age = not_found
    for idx, line in enumerate(lines):
        if 'AGE AS ON' in line:
            if ':' not in line:
                continue
            if re.search(r'[0-9]{2,3}', line):
                age = line
                break
            extra = next(
                (n for n in (neighbour(idx, 1), neighbour(idx, -1))
                 if re.search(r'[0-9]{2,3}', n)),
                None,
            )
            if extra is not None:
                age = line + ' ' + extra
                break
        elif 'DATE OF BIRTH' in line:
            found = (date_re.search(line)
                     or date_re.search(neighbour(idx, 1))
                     or date_re.search(neighbour(idx, -1)))
            if found:
                age = found[0]
                break

    data = {}
    data['Type of Document'] = Document_Type
    data['Voter ID Number'] = id_no
    data["Elector's Name"] = elector_name
    data["Father's/Husband's Name"] = father_name
    data["Sex"] = sex
    data["Address"] = address
    data["Date of Birth or Age"] = age
    data['Date'] = date
    data["Place"] = place
    return data
def getChromaClient(dbPath):
    """Open a persistent ChromaDB client rooted at ``dbPath``.

    Args:
        dbPath: Directory passed to ``chromadb.PersistentClient`` as ``path``.

    Returns:
        The ``chromadb.PersistentClient`` instance.
    """
    return chromadb.PersistentClient(path=dbPath)
def getCollection(collectionName, client):
    """Get (or create) the named ChromaDB collection, configured for cosine distance.

    Args:
        collectionName: Name of the collection.
        client: ChromaDB client exposing ``get_or_create_collection``.

    Returns:
        The collection object returned by the client.
    """
    # The distance-metric metadata key is "hnsw:space"; the previous spelling
    # "hsnw:space" was not recognised, so cosine distance was never applied.
    # NOTE(review): a collection that already exists on disk keeps the metric
    # it was created with — confirm whether it needs to be rebuilt.
    return client.get_or_create_collection(
        name=collectionName, metadata={"hnsw:space": "cosine"}
    )
def get_embedding(img):
    """Compute the CLIP image embedding for an image array.

    Args:
        img: Image array accepted by ``PIL.Image.fromarray``.
            NOTE(review): callers in this file pass ``cv2.imread`` output,
            which is in BGR channel order — confirm that is intended.

    Returns:
        The tensor produced by ``model.encode_image`` for a batch of one image.
    """
    pil_image = Image.fromarray(img)
    # Preprocess, add a batch dimension and move to the model's device.
    batch = preprocess(pil_image).unsqueeze(0).to(device)
    with torch.no_grad():
        return model.encode_image(batch)
def store_embeddings(collection, emb, cat, id):
    """Add one embedding with its category label to a collection.

    Storing is best-effort: a failure is reported on stdout and does not
    propagate to the caller.

    Args:
        collection: Object exposing ``add(embeddings=, metadatas=, ids=)``.
        emb: Embedding(s) in the format accepted by ``collection.add``.
        cat: Category label stored as ``{"category": cat}`` metadata.
        id: Unique identifier for the stored record.

    Returns:
        ``True`` when the record was added, ``False`` when adding failed.
    """
    try:
        collection.add(
            embeddings=emb,
            metadatas=[{"category": cat}],
            ids=[id],
        )
    except Exception as e:
        # The original ended with a bare `exit`, which is a no-op expression;
        # the failure is reported and signalled through the return value.
        print("Failed to store embedding:", e)
        return False
    return True
def get_category(collection, emb):
    """Return the category of the stored embedding nearest to ``emb``.

    Args:
        collection: Object exposing ``query(query_embeddings=, n_results=)``
            and returning a mapping with a "metadatas" entry.
        emb: Query embedding(s) in the format accepted by ``collection.query``.

    Returns:
        The "category" metadata value of the nearest neighbour, or ``None``
        when the query returns no neighbour (e.g. an empty collection).
    """
    similar_img = collection.query(query_embeddings=emb, n_results=1)
    metadatas = similar_img.get("metadatas") or []
    # An empty collection yields no neighbours; report "unknown" instead of
    # failing on the index lookup.
    if not metadatas or not metadatas[0]:
        return None
    return (metadatas[0][0] or {}).get("category")
def train_classifier(coll, emb, cat):
    """Store a labelled embedding under a freshly generated UUID.

    Args:
        coll: Collection passed through to ``store_embeddings``.
        emb: Embedding to store.
        cat: Category label for the embedding.
    """
    record_id = str(uuid.uuid4())
    store_embeddings(coll, emb, cat, record_id)
def save_uploadedfile(uploadedfile):
    """Write an uploaded file into the current working directory.

    Args:
        uploadedfile: Object with a ``name`` attribute and a ``getbuffer()``
            method returning the file's bytes.

    Returns:
        Absolute path of the file that was written.
    """
    # The name comes from the client: keep only its final component so the
    # file cannot be written outside the working directory.
    target = os.path.basename(uploadedfile.name)
    with open(target, "wb") as f:
        f.write(uploadedfile.getbuffer())
    return os.path.abspath(target)
def main():
    """Streamlit page: upload a document, then train the classifier or test it.

    In "Train" mode the uploaded image's CLIP embedding is stored with the
    category chosen by the user. In "Test" mode the nearest stored category is
    looked up, the user may correct it, the image is OCR'd and the matching
    field extractor's result is shown as a table. For PDFs every page is
    rendered and classified, and the user picks the page to process.
    """
    categories = ('passport_front', 'passport_back', 'aadhar_front',
                  'aadhar_back', 'PAN_front', 'DL', 'voterid')
    extractors = {
        'passport_front': Passport_front,
        'passport_back': Passport_back,
        'aadhar_front': aadhar_front,
        'aadhar_back': aadhar_back,
        'PAN_front': pan_card,
        'DL': DL,
        'voterid': VoterId,
    }

    # Delete all the items in Session state (iterate over a copy of the keys:
    # deleting while iterating the live view is not safe).
    for key in list(st.session_state.keys()):
        del st.session_state[key]

    client = getChromaClient(db_path)
    coll = getCollection('visaDocImgs', client)

    def classify(embedding):
        # A lookup that finds no neighbour is treated as "unknown category".
        try:
            return get_category(coll, embedding)
        except (IndexError, KeyError, TypeError):
            return None

    st.sidebar.title("Choose Mode")
    mode = st.sidebar.selectbox("", ("Train", "Test"))
    st.write("Upload an image or pdf file")
    uploaded_file = st.file_uploader("File upload", type=['png', 'jpg', 'jpeg', 'pdf'])
    if uploaded_file is None:
        return

    save_uploadedfile(uploaded_file)
    # save_uploadedfile writes into the working directory, so read from there
    # (the previous "/" prefix pointed at the filesystem root).
    folder_path = os.getcwd()
    image_name = os.path.basename(uploaded_file.name)
    img_path = os.path.join(folder_path, image_name)
    upload_to_azure(img_path, image_name)

    # PDF file handling
    if image_name.lower().endswith('.pdf'):
        images = pdf_to_jpeg(img_path)
        st.write("You have uploaded a pdf file with " + str(len(images)) + " pages.")
        pages = []
        stem = image_name.split(".")[0]
        for i, page in enumerate(images):
            new_image_name = stem + "_page_" + str(i) + ".jpeg"
            new_img_path = os.path.join(folder_path, new_image_name)
            page.save(new_img_path)
            new_emb = get_embedding(cv2.imread(new_img_path)).tolist()
            new_cat = classify(new_emb)
            st.image(page, caption=new_cat)
            pages.append((new_image_name, new_cat, new_img_path))
        if not pages:
            st.write("The pdf file has no pages to process.")
            st.stop()

        # Offer the pages in document order.
        pdf_pg_sel = st.selectbox('which page to process?',
                                  [name for name, _, _ in pages])
        new_cat = None
        for name, category, path in pages:
            if name == pdf_pg_sel:
                img_path = path
                new_cat = category
        # An unknown category only blocks extraction; training may proceed.
        if mode == 'Test' and new_cat not in categories:
            st.write("This category is not configured for extraction yet!")
            st.stop()

    # Process single image file
    im = Image.open(img_path)
    st.image(im, caption="Image being processed")
    # Get the image embedding
    emb = get_embedding(cv2.imread(img_path)).tolist()

    if mode == 'Test':
        option = classify(emb)
        st.write("Category -- >", option)
        cor_cat = st.selectbox('is this correct classification?', ('y', 'n'))
        st.write('You selected:', cor_cat)
        if cor_cat == 'n':
            option = st.selectbox('What is the correct classification?', categories)
            st.write('You selected:', option)

        # Calling function based on category input
        extractor = extractors.get(option)
        if extractor is None:
            st.write("Couldn't classify")
            return
        try:
            res = extractor(Azure_ocr_sdk(img_path))
        except Exception as exc:
            # Page-level boundary: report extractor/OCR failures (including a
            # call-signature mismatch) instead of crashing the app.
            st.error("Could not extract fields for '" + str(option) + "': " + str(exc))
            return
        st.header("Extracted text")
        st.table(res)
    else:
        # NOTE(review): the embedding is stored as soon as this code runs,
        # i.e. with the default selection — confirm whether an explicit
        # confirmation step is wanted.
        new_cat = st.selectbox('What is the correct classification?', categories)
        st.write('You selected:', new_cat)
        train_classifier(coll, emb, new_cat)
# Run the Streamlit page when this file is executed as a script.
if __name__ == "__main__":
    main()