Spaces:

Sukritgoel
/

nuronv2

Sleeping

File size: 43,144 Bytes


import streamlit as st
import os
import sys
import re
from azure.storage.blob import BlobServiceClient
from PIL import Image
import cv2
import datetime
import clip
import torch
import chromadb
import uuid
import pdf2image
# import pandas as pd

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes

# CLIP model used for generating image embeddings.
# Runs on the GPU when torch reports CUDA as available, otherwise on the CPU.
# NOTE: clip.load is executed at import time.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# File locations: db_path is the directory containing the launched script
# (derived from sys.argv[0]); the line below is an earlier hard-coded path.
# db_path = r'C:\Users\sukri\OneDrive\FantaClaus\NuronAI\experiments\indpass'
db_path = os.path.dirname(os.path.abspath(sys.argv[0]))

# Azure Blob Storage details, read from Streamlit secrets
# ("conn_str" = connection string, "cont_name" = container name).
connect_str = st.secrets["conn_str"]
container_name = st.secrets["cont_name"]
# Shared client used by upload_to_azure().
blob_service_client = BlobServiceClient.from_connection_string(connect_str)

def pdf_to_jpeg(pdf_file):
    """Convert the PDF at ``pdf_file`` into page images.

    Delegates to ``pdf2image.convert_from_path`` with default options and
    returns whatever it produces (a list with one image per page).
    """
    # No explicit poppler_path is passed; presumably poppler is expected to be
    # discoverable on the system PATH -- verify for the deployment target.
    return pdf2image.convert_from_path(pdf_file)
        
def upload_to_azure(image_path,file_name):
    """Upload a local file to the configured Azure blob container.

    image_path: path of the local file to read (opened in binary mode).
    file_name:  name of the destination blob inside ``container_name``.

    An existing blob with the same name is overwritten.
    """
    target_blob = blob_service_client.get_blob_client(container_name, file_name)
    with open(image_path, "rb") as payload:
        target_blob.upload_blob(payload, overwrite=True)

def Azure_ocr_sdk(image_path):
    """Run the Azure Computer Vision Read (OCR) operation on a local image.

    image_path: path of the image file to submit.

    Returns a list with the text of every recognised line, in the order the
    service reports them. An empty list is returned when the operation does
    not finish with status ``succeeded``.
    """
    import time  # local import: only needed for the polling delay below

    # NOTE(security): the subscription key is hard-coded in source. It should
    # be moved to st.secrets (like the storage connection string) and rotated.
    subscription_key = "82e0c013e94849f7ab5bfc8c7c5e54c8"
    endpoint = "https://docaicomputervisionocr1.cognitiveservices.azure.com/"
    credentials = CognitiveServicesCredentials(subscription_key)
    cv_client = ComputerVisionClient(endpoint, credentials)

    # Use a context manager so the image file handle is always closed.
    with open(image_path, 'rb') as image_stream:
        response = cv_client.read_in_stream(image_stream, raw=True, language='en')

    # The operation id is the last path segment of the Operation-Location header.
    operation_id = response.headers['Operation-Location'].split('/')[-1]

    # Poll until the operation leaves the pending states. "not_started" must be
    # waited on as well, otherwise a freshly queued job looks like a failure.
    # A short sleep avoids hammering the service in a busy loop.
    pending = (OperationStatusCodes.not_started, OperationStatusCodes.running)
    result = cv_client.get_read_result(operation_id)
    while result.status in pending:
        time.sleep(0.5)
        result = cv_client.get_read_result(operation_id)

    if result.status != OperationStatusCodes.succeeded:
        return []

    return [
        line.text
        for analyzed_result in result.analyze_result.read_results
        for line in analyzed_result.lines
    ]

def Passport_front(textdata):
        """Heuristically extract fields from OCR lines of a passport front page.

        textdata: iterable of strings (one OCR line each); every line is
        upper-cased before processing.

        Returns a dict with the keys "Type", "Type of Passport", "Countrycode",
        "Passportno", "Surname", "Firstname", "Nationality", "Sex",
        "Date of Birth", "Place of Birth", "Place of Issue", "Date of Issue",
        "Date of Expiry" and "MRZ". Fields that could not be determined are
        None (MRZ is an empty string when no '<<' line is present).

        The parsing is positional and order dependent; several loops remove
        items from the list they iterate over, so changes here can shift which
        lines are seen by later steps.
        """
        Document_Type="PASSPORT DOCUMENT"
        DateofBirth = None
        Type = None
        Countrycode = None
        passportno = None
        Surname = None
        Firstname = None
        Nationality = None
        Sex = None
        PlaceofBirth = None
        PlaceofIssue = None
        DateofIssue = None
        DateofExpiry = None
        MRZ = None
        
        # Upper-case every OCR line.
        output2=[]
        for w in textdata:
            output2.append(w.upper())

        # Drop lines that contain any of the printed field labels.
        output3=[]
        for x in output2:
            if 'REPUBLIC' not in x and 'TYPE' not in x and 'GIVEN' not in x and 'COUNTRY' not in x and 'PASSPORT' not in x and 'NO.' not in x and 'NAME' not in x and 'NATIONALITY' not in x and 'SEX' not in x and 'DATE' not in x and 'PLACE' not in x:
                output3.append(x)
        # Collect the lines that contain a dd/mm/yyyy date (the whole line is kept).
        dates=[]
        for w in output3:
            match = re.search('\d{2}\s*/\d{2}\s*/\d{4}', w)
            if match:
                dates.append(w)
        if len(dates)==2:
            # Exactly two dates: if their years differ by 10 they are taken as
            # issue/expiry and the date of birth is left unknown.
            # NOTE(review): when the difference is not 10, no date is assigned at all.
            caldate=int(dates[1].split('/')[-1])-int(dates[0].split('/')[-1])
            if caldate==10:
                DateofBirth=None
                try:
                    DateofIssue=dates[0]
                except:
                    DateofIssue=None
                try:
                    DateofExpiry=dates[1]
                except:
                    DateofExpiry=None
        else:
            # Otherwise the dates are assigned by position: birth, issue, expiry.
            # A missing position raises IndexError and leaves that field None.
            try:
                DateofBirth=dates[0]
            except:
                DateofBirth=None

            try:
                DateofIssue=dates[1]
            except:
                DateofIssue=None

            try:
                DateofExpiry=dates[2]
            except:
                DateofExpiry=None


        # Remove the date lines from output3.
        # NOTE(review): if any of the three dates is None, `None in x` raises
        # TypeError, the bare except prints None and nothing is removed. The
        # list is also mutated while being iterated, so the element following
        # a removed one is skipped.
        try:
            for x in output3:
                if DateofBirth in x or DateofIssue in x or DateofExpiry in x:
                    output3.remove(x)

        except:
                print(None)



        # Lines consisting of exactly one capital letter (passport type / sex).
        single=[]
        for w in output3:
            if re.match(r'^[A-Z]{1}$',w):
                single+=w



        # The first single-letter line is taken as the passport type.
        try:
                    Type=single[0]
        except:
                    Type=None

        # Sex is the concatenation of the single-letter lines matching [M,F]
        # (the character class also contains a literal comma).
        try:
                Sex=''.join([i for i in single if re.match(r'[M,F]$',i)])

        except:
                Sex=None


        for w in single:
                output3.remove(w)
        # Drop lines that look like "WORD / WORD".
        # NOTE(review): the pattern can match the empty string at position 0 only
        # when followed by '/', but the list is again mutated during iteration.
        for w in output3:
                if re.match('[A-Z]*\s*/\s*[A-Z]*',w):
                    output3.remove(w)
        # Passport number: optional letters, optional whitespace, exactly 7 digits.
        passportno=None
        for w in output3:
            if re.search(r'^[A-Z]*\s*[0-9]{7}$', w):
                passportno=w
                break

            else:
                passportno=None

        # Remove the passport number line; when none was found, remove(None)
        # raises ValueError and 'None' is printed.
        try:
                output3.remove(passportno)
        except:
                print('None')

        for x in output3:
                if 'BIRTH' in x:
                    output3.remove(x)
                # NOTE(review): `'IND' or 'INDIAN' in x` is always truthy because the
                # non-empty literal 'IND' is evaluated on its own, so the else
                # branch is never taken and the values are set whenever the loop runs.
                if 'IND' or 'INDIAN' in x:
                    Nationality='INDIAN'
                    Countrycode='IND'
                else:
                    Nationality=None
                    Countrycode=None
        # MRZ: concatenation of every remaining line that contains '<<'.
        MRZ=''
        for w in output3:
            if '<<' in w:
                MRZ+=w
        # output4 = the lines after the last line that is exactly 'IND'
        # (country code); empty when no such line exists.
        try:
            indices=[i for i,s in enumerate(output3) if re.match(r'IND$',s)]
            if indices:
                z=max(indices)
                output4=output3[z+1:]
            else:
                output4=[]
        except:
            # Fallback with a looser pattern (lines starting with 'IND').
            indices=[i for i,s in enumerate(output3) if re.match(r'IND',s)]
            if indices:
                z=max(indices)
                output4=output3[z+1:]
            else:
                output4=[]
        # Drop nationality / birth-label / digit-bearing lines.
        # NOTE(review): a line matching more than one condition is removed more
        # than once; the second remove raises ValueError (uncaught) unless a
        # duplicate line exists. The list is also mutated during iteration.
        for i in output4:
            if 'INDIAN' in i:
                output4.remove(i)
            if 'OF BIRTH' in i:
                output4.remove(i)
            if re.search(r'\d',i):
                output4.remove(i)

        # The next four lines are taken positionally; if fewer than four
        # remain, all four are set to None.
        try:

                Sname=output4[0]
                Name=output4[1]
                Pob=output4[2]
                Poi=output4[3]
        except:

                Sname=None
                Name=None
                Pob=None
                Poi=None


        strings={ "Surname": Sname,
                "FirstName": Name,
                "PlaceofBirth": Pob,
                "PlaceofIssue": Poi}

        values=list(strings.values())
        strings2=[]

        # Split each value into words and strip every character that is not A-Z.
        # r'^[0-9]*' matches any string (possibly zero-length), so every word passes.
        # A None value raises AttributeError, which resets strings2 to [].
        try:
                    for w in values:
                        string1=[]
                        for w1 in w.split(' '):
                            if re.match(r'^[0-9]*',w1):
                                string1.append(re.sub(r'[^A-Z]','',w1))
                        strings2.append(string1)
        except:
                    strings2=[]

        # Multi-word values are re-joined with spaces; shorter lists are kept as lists.
        strings3=[]
        for w in strings2:
            if len(w)>=2:
                strings3.append(" ".join(map(str,w)))
            else:
                strings3.append(w)


        # Flatten each entry to a string; if fewer than four entries exist,
        # all four name/place fields are None.
        try:
                str1 = ''.join(str(e) for e in strings3[0])
                # print(str1)
                Surname=str1
                str2 = ''.join(str(e) for e in strings3[1])
                # print(str2)
                Firstname=str2
                str3 = ''.join(str(e) for e in strings3[2])
                # print(str3)
                PlaceofBirth=str3
                str4 = ''.join(str(e) for e in strings3[3])
                # print(str4)
                PlaceofIssue=str4

        except:

                Surname=None
                Firstname=None
                PlaceofBirth=None
                PlaceofIssue=None




        data={"Type": Document_Type,"Type of Passport":Type,"Countrycode": Countrycode,"Passportno": passportno,"Surname": Surname,"Firstname": Firstname,"Nationality": Nationality,"Sex": Sex,"Date of Birth": DateofBirth,"Place of Birth": PlaceofBirth,"Place of Issue": PlaceofIssue,"Date of Issue": DateofIssue,"Date of Expiry": DateofExpiry,"MRZ": MRZ}
        return data

def Passport_back(text):
    """Heuristically extract fields from OCR lines of a passport back page.

    text: list of strings, one OCR line each. The code matches upper-case
    labels, so lines are presumably already upper-cased by the caller.

    Returns a dict with the keys 'Type of Document',
    "Name of Father/Legal Guardian", "Name of Mother", "Name of Spouse",
    "Address", "Old Passport No", 'Date and Place of Issue' and "File No".
    Values that cannot be determined are 'Not Found' (or an empty string for
    fields the short-layout branch never touches).

    Two layouts are handled:
      * 9 or more lines: printed labels are present and used as anchors;
      * 8 or fewer lines: only values were OCR'd and are taken by position.
    """
    Document_Type="PASSPORT BACK"
    data={}
    old_passno=''
    dateplace_issue=''
    Father_name=''
    mother_name=''
    spouse=''
    # Fix: these were unbound on some paths (e.g. 9+ lines with no digits in
    # the last two lines), which raised NameError when building the result.
    # 'Not Found' (not '') is used so the `File_no in w` test below never
    # matches every line.
    address='Not Found'
    File_no='Not Found'

    # Everything up to and including the first 'CAUTION' line is discarded.
    # Fix: default to the whole input so an empty `text` no longer leaves
    # `textdata` unbound.
    textdata=text
    for x in text:
        if 'CAUTION' in x:
            textdata=text[text.index(x)+1:]
            break
    print(len(textdata),textdata)
    try:
        if len(textdata)>=9:
            for x in textdata:
                # The value is expected on the line following its label.
                if 'FATHER' in x or 'LEGAL GUARDIAN' in x:
                    index1=textdata.index(x)
                    Father_name=textdata[index1+1]
                if 'MOTHER' in x or 'NAME OF MOT' in x:
                    index2=textdata.index(x)
                    mother_name=textdata[index2+1]
                if 'SPOUSE' in x:
                    index3=textdata.index(x)
                    spouse=textdata[index3+1]
                # Address: lines between the value after the last 'NAME' label
                # and the first old-passport / place-of-issue label.
                try:
                    indices1=[i for i, s in enumerate(textdata) if 'NAME' in s]
                    z1=max(indices1)
                    indices2=[i for i, s in enumerate(textdata) if 'OLD PASSPORT NO' in s or 'PLACE OF ISSUE' in s]
                    z2=min(indices2)
                    if 'ADDRESS' in textdata[z1+2]:
                        address=''.join(textdata[z1+3:z2])
                    else:
                        address=''.join(textdata[z1+2:z2])

                except Exception as e:
                    print(e)
                    address='Not Found'
                # File number: the line after a 'FILE NO' label, otherwise the
                # last (or second to last) line if it contains a digit.
                try:
                    if 'FILE NO' in x:
                        index4=textdata.index(x)
                        File_no=textdata[index4+1]
                    else:
                        if re.search(r'\d',textdata[-1]):
                            File_no=textdata[-1]
                        else:
                            if re.search(r'\d',textdata[-2]):
                                File_no=textdata[-2]
                except:
                    File_no='Not Found'
                # Old passport number: first letter+7-digit line after the label.
                try:
                    indices3=[i for i, s in enumerate(textdata) if 'OLD PASSPORT NO' in s]
                    z3=max(indices3)
                    textdata2=textdata[z3+1:-1]
                    old_passno=[w for w in textdata2 if re.match(r'[A-Z]{1}[0-9]{7}',w)][0]

                except:
                    old_passno='Not Found'
                # Date/place of issue: lines after the 'PLACE OF ISSUE' label,
                # excluding the trailing file-number lines and any line that
                # holds the old passport number or the file number.
                try:
                    indices4=[i for i, s in enumerate(textdata) if 'PLACE OF ISSUE' in s]
                    z4=max(indices4)
                    if 'FILE NO' not in textdata[z4+1]:
                        if 'FILE NO' in textdata[-2]:
                            dateplace_issue=textdata[z4+1:-2]
                        else:
                            dateplace_issue=textdata[z4+1:-1]
                    else:
                        dateplace_issue='Not Found'
                    for w in dateplace_issue:
                        if old_passno in w or File_no in w:
                            dateplace_issue.remove(w)
                except:
                    dateplace_issue='Not Found'
        else:
            # Short layout: the first three lines are candidate names when they
            # start with capital letters and contain no digit.
            textdata1_1=[]
            print(textdata)
            for i in range(0,3):
                if re.match(r'[A-Z]+\s*[A-Z]*',textdata[i]) and not re.search(r'\d',textdata[i]):
                    textdata1_1.append(textdata[i])
            if len(textdata1_1)==3:
                Father_name=textdata1_1[0]
                mother_name=textdata1_1[1]
                spouse=textdata1_1[2]
                address=' '.join(textdata[3:5])
                File_no=textdata[-1]
            elif len(textdata1_1)==2:
                Father_name=textdata1_1[0]
                mother_name=textdata1_1[1]
                spouse='Not Found'
                dateplace_issue='Not Found'
                old_passno='Not Found'
                address=' '.join(textdata[3:5])
                File_no=textdata[-1]
            elif len(textdata1_1)==1:
                Father_name=textdata1_1[0]
                mother_name='Not Found'
                spouse='Not Found'
                address='Not Found'
                File_no='Not Found'
                dateplace_issue='Not Found'
                old_passno='Not Found'
            else:
                Father_name='Not Found'
                mother_name='Not Found'
                spouse='Not Found'
                address='Not Found'
                File_no='Not Found'
                dateplace_issue='Not Found'
                old_passno='Not Found'

    except Exception as e:
        # Any unexpected layout (e.g. a label on the last line, or fewer than
        # three lines in the short layout) degrades to 'Not Found' everywhere.
        print(e)
        Father_name='Not Found'
        mother_name='Not Found'
        spouse='Not Found'
        address='Not Found'
        File_no='Not Found'
        dateplace_issue='Not Found'
        old_passno='Not Found'

    data['Type of Document']=Document_Type
    data["Name of Father/Legal Guardian"]=Father_name
    data["Name of Mother"]=mother_name
    data["Name of Spouse"]=spouse
    data["Address"]=address
    data["Old Passport No"]=old_passno
    data['Date and Place of Issue']=dateplace_issue
    data["File No"]=File_no

    return data

def aadhar_front(textdata):
    """Heuristically extract fields from OCR lines of an Aadhaar card front.

    textdata: list of strings, one OCR line each. Only the lines after the
    last header line (containing 'INDIA', 'GOVERNMENT' or 'India') are used,
    and they are upper-cased before parsing.

    Returns a dict with the keys "Type of Document", "Name", "Father's Name",
    "Date of Birth", "Aadhar No" and "Gender"; values that cannot be
    determined are 'NA'.

    Raises ValueError when no header line is present.
    """
    Document_Type = "AADHAR CARD FRONT"
    indices = [i for i, s in enumerate(textdata)
               if 'INDIA' in s or 'GOVERNMENT' in s or 'India' in s]
    if not indices:
        # Same exception type max([]) used to raise, with a usable message.
        raise ValueError("aadhar_front: no 'INDIA'/'GOVERNMENT' header line found")
    output = [w.upper() for w in textdata[max(indices) + 1:]]

    # The original wrapped everything below in `for w in textdata:` guarded by
    # `if 'FEMALE' or 'MALE' in output` (always true); each pass recomputed the
    # same result, so the work is done exactly once here.

    # Aadhaar number: a line of the form "dddd dddd dddd" (last one wins).
    aadharno = 'NA'
    for line in output:
        if re.search(r'^\d{4}\s\d{4}\s\d{4}$', line):
            aadharno = line

    # All digit-bearing lines, space separated, are searched for the date.
    dob = ''.join(' ' + line for line in output if re.match(r'.*[0-9].*', line))

    Date = 'NA'
    match = re.search(r'\d{2}/\d{2}/\d{4}', dob)
    if match:
        try:
            Date = datetime.datetime.strptime(match.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
        except ValueError:
            # OCR produced an impossible date (e.g. 45/13/2000): keep raw text.
            Date = match.group()
    else:
        # Year-of-birth cards: take the last token with four consecutive digits
        # that appears before the Aadhaar number. Previously a missing token
        # left `date2` unbound and raised NameError; now Date stays 'NA'.
        for token in dob.split(aadharno)[0].split(' '):
            if re.search(r'[0-9]{4}', token):
                Date = token

    Father_name = 'NA'
    Name = 'NA'
    father_line = next((line for line in output if 'FATHER' in line), None)
    if father_line is not None:
        # "FATHER: <name>" layout; the holder's name is then the first line.
        parts = father_line.split(':')
        if len(parts) > 1:
            Father_name = parts[1]
        Name = output[0]
    else:
        # Otherwise the name is the line just above the date-of-birth line.
        for idx, line in enumerate(output):
            for token in line.split(' '):
                # Skip the Date test while it is still the 'NA' placeholder,
                # which would otherwise match words such as 'NAME'.
                if (Date != 'NA' and Date in token) or 'BIRTH' in token:
                    if idx > 0:  # idx 0 used to wrap around to the last line
                        Name = output[idx - 1]

    # Gender: last token containing 'MALE' ('FEMALE' contains 'MALE' as well).
    Gender = 'NA'
    for line in output:
        for token in line.split(' '):
            if 'MALE' in token or 'FEMALE' in token:
                Gender = token

    data = {"Type of Document": Document_Type,"Name":Name,"Father's Name":Father_name,"Date of Birth": Date,"Aadhar No": aadharno,"Gender":Gender}
    return data

def aadhar_back(textdata2,polygons_new):
    """Heuristically extract the address block from an Aadhaar card back.

    textdata2:    list of OCR line strings, used only to find the Aadhaar
                  number ("dddd dddd dddd").
    polygons_new: sequence of entries where item [1] is the line text and
                  item [2] is its bounding box; the code reads box[0], box[1],
                  box[4] and box[5] (presumably x/y of two opposite corners of
                  a flat 8-value polygon -- confirm against the caller).

    Returns a dict with the keys "Type of Document", "Address", "PIN Code" and
    "Aadhar No"; the PIN code and Aadhaar number are 'Not Found' when absent.

    Raises ValueError when no line contains 'Address'.
    """
    import math  # local import: `math` is not imported at the top of the file

    Document_Type='AADHAR CARD BACK'

    textdata=[entry[1] for entry in polygons_new]
    boxes=[entry[2] for entry in polygons_new]

    indices=[i for i, s in enumerate(textdata) if 'Address' in s]
    if not indices:
        # Same exception type max([]) used to raise, with a usable message.
        raise ValueError("aadhar_back: no 'Address' line found in OCR text")
    z=max(indices)
    text1=textdata[z:]
    boxes1=boxes[z:]
    print(text1,boxes1)

    # The 'Address' label box is the anchor: `dist` is the distance between its
    # corner (box[0], box[1]) and corner (box[4], box[5]).
    anchor=boxes1[0]
    dist=math.hypot(anchor[4]-anchor[0], anchor[5]-anchor[1])
    left=anchor[0]
    lower=left-(1.3*dist)
    upper=left+(3.5*dist)

    # Keep the label line plus every later line whose first coordinate lies in
    # the horizontal band around the label. Indices are kept in reading order;
    # the original iterated a set, whose order is not guaranteed.
    keep=[0]+[j for j in range(1,len(boxes1))
              if boxes1[j] and lower <= boxes1[j][0] <= upper]

    add=' '.join(text1[i].upper() for i in keep).strip()

    print(textdata2)
    # First line that is exactly an Aadhaar number; 'Not Found' also covers an
    # empty textdata2 (previously an unbound variable).
    aadharno=next(
        (a1 for a1 in textdata2 if re.search(r'^\d{4}\s\d{4}\s\d{4}$', a1)),
        'Not Found',
    )

    # PIN code: six digits (first one non-zero), possibly separated by spaces.
    pin=re.search(r'\b[1-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\s*[0-9]\b',add)
    if pin:
        pinno=re.sub(r'\s{1,}','',pin.group())
        # The address ends at the PIN; anything after it is dropped.
        address=add[:pin.start()]+' '+pinno
    else:
        # Previously pin[0] raised IndexError here.
        pinno='Not Found'
        address=add

    address=re.sub(r'ADDRESS:*',' ',address)
    address=re.sub(r'\s{2,}',' ',address)
    address=address.lstrip()

    data = {"Type of Document": Document_Type,"Address":address,"PIN Code":pinno, "Aadhar No": aadharno}

    return data

def pan_card(textdata):
    """Heuristically extract fields from OCR lines of a PAN card.

    textdata: list of strings, one OCR line each. Only the lines after the
    last header line (containing 'INCOME', 'Card', 'INDIA' or 'DEPARTMENT')
    are used, and they are upper-cased before parsing.

    Returns a dict with the keys "Type of Document", "Name", "Father's Name",
    "Date of Birth" and "PAN Number"; values that cannot be determined are
    'NA'.

    Raises ValueError when no header line is present.
    """
    Document_Type = "PAN CARD"
    indices = [i for i, s in enumerate(textdata)
               if 'INCOME' in s or 'Card' in s or 'INDIA' in s or 'DEPARTMENT' in s]
    if not indices:
        # Same exception type max([]) used to raise, with a usable message.
        raise ValueError("pan_card: no header line found in OCR text")
    output = [w.upper() for w in textdata[max(indices) + 1:]]

    # Date of birth: searched in the lines that start with a digit. The
    # original seeded the accumulator with 'NA', so the fallback value came
    # back as 'NA<digits>'; the placeholder is now only used when nothing
    # digit-like was found.
    digit_lines = ''.join(w for w in output if re.match(r"^\d+", w))
    match = re.search(r'\d{2}/\d{2}/\d{4}', digit_lines)
    if match:
        try:
            date = datetime.datetime.strptime(match.group(), "%d/%m/%Y").date().strftime('%d/%m/%Y')
        except ValueError:
            # OCR produced an impossible date: keep the raw text.
            date = match.group()
    else:
        date = digit_lines or 'NA'

    # Drop lines carrying printed labels.
    output1 = [
        x for x in output
        if 'INCOME' not in x and 'INDIA' not in x and 'NAME' not in x
        and 'BIRTH' not in x and 'PERMANENT' not in x and 'CARD' not in x
        and 'TAX' not in x
    ]

    # PAN: a line of exactly ten upper-case letters/digits (last one wins).
    # Matching lines are separated out into a new list instead of being
    # removed from the list under iteration, which skipped the following line.
    panno = 'NA'
    remaining = []
    for line in output1:
        if re.search(r'^[A-Z0-9]{10}$', line):
            panno = line
        else:
            remaining.append(line)

    # The first two remaining lines are the holder's and the father's name;
    # missing lines no longer raise IndexError.
    Name = remaining[0] if remaining else 'NA'
    Father_name = remaining[1] if len(remaining) > 1 else 'NA'

    data = { "Type of Document": Document_Type, "Name":Name,"Father's Name": Father_name,"Date of Birth": date,"PAN Number":panno}
    return data

def DL(textdata):
                """Heuristically extract fields from OCR lines of a driving licence.

                textdata: list of strings, one OCR line each. Matching is case
                sensitive and mixes upper-case header words ('DRIVING', 'UNION',
                'INDIAN') with mixed-case labels ('Number', 'Name', 'Address',
                'Blood', 'Issued', 'Vehicle', 'Class').

                Returns a dict with the keys "Type of Document", "Number", "Name",
                "S/D/W of", "Date of Birth", "Address", "State" and "Pinno".

                Several intermediate values (issue date, validity, blood group,
                vehicle class) are computed but not returned.  Every
                `max(indices)` below raises ValueError when its anchor label is
                missing; none of these are caught here.
                """
                
                Document_Type="DRIVING LICENCE"
                # Keep only the lines after the last header line.
                indices=[i for i, s in enumerate(textdata) if 'DRIVING' in s or 'UNION' in s or 'INDIAN' in s]
                z=max(indices)
                text=textdata[z+1:]
                # The first line after the header is taken as the state name.
                State=text[0]
                State=State.replace("STATE","")
                # Strip ':', ';' and '.' characters and surrounding spaces from every line.
                text=[re.sub('[:;.]+', '', _) for _ in text]
                text=[x.strip(' ') for x in text]
                Number='NA'
                # The licence number is searched in the str() form of the list,
                # split on commas, so each candidate may carry a leading space.
                Number_text=str(text)
                Number_text=Number_text.replace("'","")
                #print(Number_text)
                for x in Number_text.split(','):
                    #print(x)
                    # Two letters, optional '-', then 2 + 4 + 7 digits.
                    if re.match(r'^[\s]*[A-Z]{2}[-]*[\s]*[0-9]{2}[\s]*[0-9]{4}[0-9]{7}$',x):
                         Number=x

                # text1 = lines after the last line containing "Number".
                indices=[i for i, s in enumerate(text) if "Number" in s ]
                z=max(indices)
                text1=text[z+1:]

                AName=[]
                # NOTE(review): Name starts as 'NA' and is later extended with +=,
                # so the returned name is prefixed with 'NA'.
                Name='NA'
                SName='NA'
                # Name candidates: lines before the last "Address"/"Blood" line.
                indices=[i for i, s in enumerate(text1) if "Address" in s or "Blood" in s]
                z=max(indices)
                Name2=text1[:z]
                # Raises ValueError when there is no line that is exactly 'Name'.
                Name2.remove('Name')
                for w in Name2:
                    AName.append(w)

                Number=Number.lstrip()
                # Drop the licence number line from the name candidates.
                for x in AName:
                   # print(x)
                    if Number==x:
                        AName.remove(x)
                        break

                for x in AName:
                    if "S/D/W" in x:
                          # Raises IndexError when the line lacks the exact text "S/D/W of".
                          SName=x.split("S/D/W of")[1]
                    else:
                        Name+= x +' '
                # text2 = lines from the last "Address" line onwards.
                indices=[i for i, s in enumerate(text1) if "Address" in s]
                z=max(indices)
                text2=text1[z:]

                # PIN: last token ending in six digits whose first digit is non-zero.
                pin='NA'
                for x in text2:
                        for y in x.split(' '):
                            if re.match(r'.*[1-9]{1}[0-9]{5}$',y):
                                pin=y

                text3=str(text2)
                # NOTE(review): pin is initialised to 'NA', so `pin!=''` is always
                # true and the else branch is unreachable; when no PIN was found
                # the address is cut at the first occurrence of 'NA'.
                if pin!='':
                    pinno=str(pin)
                    Address="" .join(text3.split(pinno,2)[:1])
                    Address=Address+pinno
                    Address=Address.replace("'", "") 
                    Address=Address.replace("[", "")
                    Address=Address.replace(",,", ",")
                    Address=Address.replace("Address,","")
                else:
                    Address=text3[:text3.find("Issued")]
                    Address=Address.replace("[", "")
                    Address=Address.replace(",,", ",")
                    Address=Address.replace("Address","")

                # Lines that contain a dd-mm-yyyy date or the word "Issued".
                dates=[]
                for w in text2:
                    #print(w)
                    match = re.search('\d{2}\s*-\d{2}\s*-\d{4}', w)
                    if match or "Issued" in w:
                            dates.append(w)

                # Individual tokens of those lines that contain a date.
                Final_dates=[]
                for w in dates:
                    for w1 in w.split(' '):
                        if re.match('[A-Za-z0-9]*\d{2}\s*-\d{2}\s*-\d{4}', w1):
                            Final_dates.append(w1)

                Dob='NA'
                Valid_till='NA'
                Date_of_birth='NA'
                Issued_date='NA'
                # With three or more dates they are read as issued / birth / valid-till.
                if len(Final_dates)>=3:
                    Issued_date=Final_dates[0]
                    Dob=Final_dates[1]
                    Dob=Dob.split(' ')
                    Date_of_birth=Dob[0]
                    Valid_till=Final_dates[2]
                        
                else:
                    # NOTE(review): this compares the list itself with the int 2,
                    # which is always False (len(Final_dates)==2 was presumably
                    # intended), so the else branch always runs and raises
                    # IndexError when Final_dates is empty.
                    if  Final_dates==2:
                        Issued_date=dates[0]
                        Dob=dates[1]
                    else:
                        Issued_date=Final_dates[0]


                # Strip ASCII letters from the issue date (e.g. a fused label).
                for w in Issued_date:
                    if re.match(r'[a-zA-Z]',w):
                        Issued_date=Issued_date.replace(w,'')

                # Blood group: last line with a token containing "BG" or "Blood" ...
                Bgroup='NA'
                for x in text2:
                    for y in x.split(' '):
                        if "BG" in y or "Blood" in y:
                            Bgroup=x

                # ... and within it the last token containing '+'.
                BG='NA'
                for x in Bgroup.split(' '):
                    if '+' in x:
                        BG=x


                # Vehicle classes: upper-case alphanumeric lines after the last
                # "Vehicle"/"Class" line (accumulated onto the initial 'NA').
                indices=[i for i, s in enumerate(text2) if 'Vehicle' in s or 'Class' in s]
                z=max(indices)
                text4=text2[z+1:]
                Vehicle_class='NA'
                for w in text4:
                    if re.match(r'^[A-Z0-9]+$',w):
                        Vehicle_class+=w+' '

                # Issued_date, Valid_till, BG and Vehicle_class are not part of the result.
                data = {"Type of Document": Document_Type,"Number":Number,"Name":Name,"S/D/W of":SName,"Date of Birth":Date_of_birth,"Address":Address,"State":State,"Pinno":pin}
                return data

def VoterId(textdata):
    Document_Type="Voter ID Card"   
    data={}
    textdata=[x.upper() for x in textdata] 
    id_no=''
    elector_name=''
    father_name=''
    sex=''
    place=''
    date=''
    address=''
    hno=''
    mohalla=''
    town=''
    police=''
    distt=''
    pin=''
    add=''
    age=''
    try:
        for k in textdata:
            index=textdata.index(k)
            if re.search(r'[A-Z]{3}[0-9]{7}$',textdata[index]):
                id_no=re.search(r'[A-Z]{3}[0-9]{7}$',textdata[index])[0]
                break
        if id_no=='':
            id_no='Not Found'
    except:
        id_no='Not Found'
    try:    
        ### elector's name
        for key in textdata:
            if "ELECTOR'S NAME" in key:
                index=textdata.index(key)
                x1=textdata[index]
                if ':' in x1:
                    x2=x1.split(':')[1]
                else:
                    x2=x1.split('NAME')[1]
                if x2=='' or x2==' ':
                    x3=textdata[index+1]
                    if ':' in x3:
                        x3=x3.split(':')[1]
                        elector_name=x3
                        break
                    else:
                        elector_name=x3
                        break
                else:
                    elector_name=x2
                    break
        if elector_name=='':
            elector_name='Not Found'
    except:
        elector_name='Not Found'
    try:
        ### Father's Name
        for k in textdata:
            if "FATHER'S NAME" in k or "HUSBAND'S NAME" in k:
                index=textdata.index(k)
                y1=textdata[index]
                if ':' in y1:
                    y2=y1.split(':')[1]
                else:
                    y2=y1.split('NAME')[1]
                if y2=='' or y2==' ':
                    y3=textdata[index+1]
                    if ':' in y3:
                        y3=y3.split(':')[1]
                        father_name=y3
                        break
                    else:
                        father_name=y3
                        break
                else:
                    father_name=y2
                    break
        if father_name=='':
            father_name='Not Found'
    except:
        father_name='Not Found'
    try:
        ### Sex
        foundflag=False
        for key in textdata:
            if 'SEX' in key:
                index=textdata.index(key)
                if 'MALE' in textdata[index]:
                    if 'FEMALE' in textdata[index]:
                        sex='FEMALE'
                        foundflag=True
                    else:
                        sex='MALE'
                        foundflag=True
                else:
                    if 'MALE' in textdata[index+1]:
                        if 'FEMALE' in textdata[index+1]:
                            sex='FEMALE'
                            foundflag=True
                        else:
                            sex='MALE'
                            foundflag=True
            if foundflag==False:
                if 'MALE' in key:
                    if 'FEMALE' in key:
                        sex='FEMALE'
                        foundflag=True
                    else:
                        sex='MALE'
                        foundflag=True
                    
            if foundflag==False:
                if 'SEX' in key:
                    index=textdata.index(key)
                    sex1=''.join(textdata[index+1])
                    if sex1=='M':
                        sex='MALE'
                        foundflag=True
                    else:
                        if sex1=='F':
                            sex='FEMALE'
                            foundflag=True
        if sex=='':
            sex='Not Found'
    except:
        sex='Not Found'
    try:
        ### Place
        for k in textdata:
            if 'PLACE' in k:
                index=textdata.index(k)
                z1=textdata[index]
                if ':' in z1:
                    z2=z1.split(':')[1]
                else:
                    z2=z1.split('PLACE')[1]
                if z2=='' or z2==' ':
                    z3=textdata[index+1]
                    if ':' in z3:
                        z3=z3.split(':')[1]
                        place=z3
                        break
                    else:
                        place=z3
                        break
                else:
                    place=z2
                    break
        if place=='':
            place='Not Found'
    except:
        place='Not Found'
    try:
        ### Date
        for key in textdata:
            if 'DATE' in key:
                index=textdata.index(key)
                a1=textdata[index]
                a2=textdata[index+1]
                if re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',a1):
                    date=re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',a1)[0]
                    break
                else:
                    if re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',a2):
                        date=re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',a2)[0]
                        break
            else:
                if re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',key):
                    date=re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',key)[0]
                    break
                else:
                    date='Not found'
    except:
        date='Not found'
    try:
        ### Add
        ### House no
        for h in textdata:
            if 'H.NO.' in h:
                index=textdata.index(h)
                h1=textdata[index]
                if ':' in h1:
                    h2=h1.split(':')[1]
                else:
                    h2=h1.split('H.NO.')[1]
                if h2=='' or h2==' ':
                    h3=textdata[index+1]
                    if not 'MOHALLA' in h3:
                        if ':' in h3:
                            h3=h3.split(':')[1]
                            hno=h3
                            break
                        else:
                            hno=h3
                            break
                else:
                    hno=h2
                    break
        ### Mohalla
        for m in textdata:
            if 'MOHALLA' in m:
                index=textdata.index(m)
                m1=textdata[index]
                if ':' in m1:
                    m2=m1.split(':')[1]
                else:
                    m2=m1.split('MOHALLA')[1]
                if m2=='' or m2==' ':
                    m3=textdata[index+1]
                    if not 'TOWN' in m3:
                        if ':' in m3:
                            m3=m3.split(':')[1]
                            mohalla=m3
                            break
                        else:
                            mohalla=m3
                            break
                else:
                    mohalla=m2
                    break
        ### Town
        for t in textdata:
            if 'TOWN' in t:
                index=textdata.index(t)
                t1=textdata[index]
                if ':' in t1:
                    t2=t1.split(':')[1]
                else:
                    t2=t1.split('TOWN')[1]
                if t2=='' or t2==' ':
                    t3=textdata[index+1]
                    if not 'POLICESTN.' in t3:
                        if ':' in t3:
                            t3=t3.split(':')[1]
                            town=t3
                            break
                        else:
                            town=t3
                            break
                else:
                    town=t2
                    break
        ###Police
        for p in textdata:
            if 'POLICE' in p:
                index=textdata.index(p)  
                p1=textdata[index]
                if ':' in p1:
                    p2=p1.split(':')[1]
                else:
                    p2=p1.split('POLICESTN.')[1]
                if p2=='' or p2==' ':
                    p3=textdata[index+1]
                    if not 'DISTT' in p3: 
                        if ':' in p3:
                            p3=p3.split(':')[1]
                            police=p3
                            break
                        else:
                            police=p3
                            break
                else:
                    police=p2
                    break
        ### District
        for d in textdata:
            if 'DISTT' in d:
                index=textdata.index(d)  
                d1=textdata[index]
                if ':' in y1:
                    d2=d1.split(':')[1]
                else:
                    d2=d1.split('DISTT.')[1]
                if d2=='' or d2==' ':
                    d3=textdata[index+1]
                    if not 'PIN' in d3: 
                        if ':' in d3:
                            d3=d3.split(':')[1]
                            distt=d3
                            break
                        else:
                            distt=d3
                            break
                else:
                    distt=d2
                    break
        ### Pin
        for pp in textdata:
            if 'PIN' in pp:
                index=textdata.index(pp)  
                pp1=textdata[index]
                if ':' in pp1:
                    pp2=pp1.split(':')[1]
                else:
                    pp2=pp1.split('PIN')[1]
                if pp2=='' or pp2==' ':
                    pp3=textdata[index+1]
                    if re.search(r'[0-9\s*]{6,}',pp3): 
                        pin=re.search(r'[0-9\s*]{6,}',pp3)[0]
                else:
                    pin=pp2
                    break
        add=hno+' '+mohalla+' '+town+' '+police+' '+distt+' '+pin   
        if len(add)>15:
            address=add
        else:
            try:
                add1=[i for i,s in enumerate(textdata) if 'ADDRESS' in s] 
                add2=min(add1)
                add3=[i for i,s in enumerate(textdata) if 'ELECTION' in s] 
                add4=min(add3)
                address=' '.join(textdata[add2:add4])
            except:
                address='Not Found'
    except:
        address:'Not Found'
    try:
        ### Age
        for k in textdata:
            if 'AGE AS ON' in k:
                index=textdata.index(k)
                i1=textdata[index]
                if ':' in i1:
                    if re.search(r'[0-9]{2,3}',i1):
                        age=i1
                        break
                else:
                    i3=textdata[index+1]
                    if re.search(r'[0-9]{2,3}',i3):
                        age=i1+' '+i3
                        break
                    else:
                        i4=textdata[index-1]
                        if re.search(r'[0-9]{2,3}',i4):
                            age=i1+' '+i4 
                            break
                        else:
                            age='Not Found'
            else:
                if 'DATE OF BIRTH' in k:
                    index=textdata.index(k)
                    i1=textdata[index]
                    if ':' in i1 or 'BIRTH' in i1:
                        if re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',i1):
                            age=re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',i1)[0]
                            break
                    else:
                        i3=textdata[index+1]
                        if re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',i3):
                            age=re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',i1)[0]
                            break
                        else:
                            i4=textdata[index-1]
                            if re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',i4):
                                age=re.search(r'[0-9]{1,2}[-/][0-9]{2}[-/][0-9]{2,4}',i1)[0]
                                break
                            else:
                                age='Not Found'
    except:
        age='Not Found'

    data['Type of Document']=Document_Type
    data['Voter ID Number']=id_no
    data["Elector's Name"]=elector_name
    data["Father's/Husband's Name"]=father_name
    data["Sex"]=sex
    data["Address"]=address
    data["Date of Birth or Age"]=age
    data['Date']=date
    data["Place"]=place
    return data

# Open the on-disk ChromaDB store at dbPath and hand back the client for it
def getChromaClient(dbPath):
    """Return a ``chromadb.PersistentClient`` whose storage path is *dbPath*."""
    return chromadb.PersistentClient(path=dbPath)

# Get the named collection from the Chroma client, creating it if it does not exist yet
def getCollection(collectionName, client):
    """Return the collection called *collectionName*, creating it on first use.

    Args:
        collectionName: Name of the collection to fetch or create.
        client: Object exposing ``get_or_create_collection(name=..., metadata=...)``
            (the value returned by ``getChromaClient``).

    Returns:
        Whatever ``client.get_or_create_collection`` returns.
    """
    # The metadata key has to be spelled "hnsw:space"; the earlier "hsnw:space"
    # spelling is not a key Chroma recognises, so the cosine metric was never
    # actually requested.
    # NOTE(review): collections created before this fix presumably keep the
    # metric they were built with -- confirm and re-create them if cosine
    # distance is required.
    collection = client.get_or_create_collection(
        name=collectionName,
        metadata={"hnsw:space": "cosine"},
    )
    return collection

# Turn an image array into its CLIP image embedding
def get_embedding(img):
    """Encode *img* with the module-level CLIP model.

    *img* is wrapped with ``Image.fromarray``, run through the module-level
    ``preprocess`` transform, given a leading batch axis via ``unsqueeze(0)``
    and moved to ``device`` before being passed to ``model.encode_image``.

    Returns:
        The tensor produced by ``model.encode_image`` (computed without
        gradient tracking).
    """
    pil_image = Image.fromarray(img)
    batch = preprocess(pil_image)
    batch = batch.unsqueeze(0)
    batch = batch.to(device)
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        return model.encode_image(batch)

def store_embeddings(collection, emb,cat,id):
    """Add one labelled embedding to *collection* on a best-effort basis.

    Args:
        collection: Object exposing ``add(embeddings=..., metadatas=..., ids=...)``.
        emb: Embedding(s) passed through unchanged as ``embeddings``.
        cat: Category label stored as ``{"category": cat}`` metadata.
        id: Identifier stored for the record (passed as a one-element list).

    Returns:
        None. Any exception raised by ``collection.add`` is printed and
        swallowed so the caller keeps running.
    """
    try:
        collection.add(
            embeddings=emb,
            metadatas=[{"category": cat}],
            ids=[id],
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and carry on.  The
        # original handler also contained a bare ``exit`` expression, which
        # never called anything and therefore never stopped the program; it
        # has been removed to avoid suggesting otherwise.
        print(e)
    
def get_category(collection,emb):
    """Return the category label of the single nearest stored embedding.

    Queries *collection* with *emb* for exactly one result and reads the
    ``"category"`` entry from the first metadata record of the first query.
    """
    nearest = collection.query(query_embeddings=emb, n_results=1)
    first_query_hits = nearest["metadatas"][0]
    best_hit = first_query_hits[0]
    return best_hit["category"]

def train_classifier(coll,emb,cat):
    """Store *emb* in *coll* under label *cat* with a fresh random UUID4 id."""
    record_id = str(uuid.uuid4())
    store_embeddings(coll, emb, cat, record_id)

def save_uploadedfile(uploadedfile):
    """Write an uploaded file into the current working directory.

    Args:
        uploadedfile: Object with a ``name`` attribute and a ``getbuffer()``
            method returning the file's bytes (e.g. the value returned by
            ``st.file_uploader``).

    Returns:
        The relative path (file name only) the data was written to.
    """
    # The name comes from the client, so keep only its final component: a name
    # such as "../../x" must not be able to write outside the working directory.
    target_name = os.path.basename(uploadedfile.name)
    with open(target_name, "wb") as f:
        f.write(uploadedfile.getbuffer())
    return target_name

def main():
    """Render the Streamlit page: upload, classify, extract and store a document.

    Flow, top to bottom:
      1. Clear ``st.session_state`` and open the ``visaDocImgs`` collection.
      2. Let the user pick "Train" or "Test" mode and upload an image or PDF.
      3. Save the upload locally and copy it to Azure blob storage.
      4. For a PDF, save and classify every page, then let the user choose
         which page to process.
      5. Compute the CLIP embedding of the chosen image.
      6. "Test" mode: show the predicted category, let the user correct it,
         run OCR plus the matching extractor and display the result table.
         "Train" mode: ask the user for the correct category.
      7. Store the embedding under the final category label.
    """
    categories = ('passport_front', 'passport_back', 'aadhar_front',
                  'aadhar_back', 'PAN_front', 'DL', 'voterid')

    # Delete all the items in Session state.  Iterate over a snapshot of the
    # keys: deleting entries while walking the live key view is unsafe.
    for key in list(st.session_state.keys()):
        del st.session_state[key]

    client = getChromaClient(db_path)
    coll = getCollection('visaDocImgs',client)

    st.sidebar.title("Choose Mode")
    mode = st.sidebar.selectbox("", ("Train", "Test"))

    st.write("Upload an image or pdf file")
    uploaded_file = st.file_uploader("File upload", type=['png','jpg','jpeg','pdf'])
    if uploaded_file is None:
        return

    save_uploadedfile(uploaded_file)
    # save_uploadedfile writes relative to the working directory, so read the
    # file back from there (the previous hard-coded "/" only matched when the
    # app happened to be started from the filesystem root).
    folder_path = os.getcwd()
    image_name = os.path.basename(uploaded_file.name)
    img_path = os.path.join(folder_path, image_name)
    upload_to_azure(img_path, image_name)

    # PDF file handling: classify each page and let the user pick one.
    if image_name.lower().endswith('.pdf'):
        images = pdf_to_jpeg(img_path)
        st.write("You have uploaded a pdf file with "+str(len(images))+" pages.")
        if not images:
            # Nothing to select or classify; stop instead of failing later
            # on an unbound category.
            st.write("The uploaded pdf file has no pages to process.")
            st.stop()

        base_name = image_name.split(".")[0]
        page_records = []  # (page file name, predicted category, page path)
        for page_no, page in enumerate(images):
            page_name = base_name+"_page_"+str(page_no)+".jpeg"
            page_path = os.path.join(folder_path, page_name)
            page.save(page_path)
            page_emb = get_embedding(cv2.imread(page_path)).tolist()
            page_cat = get_category(coll, page_emb)
            st.image(page, caption=page_cat)
            page_records.append((page_name, page_cat, page_path))

        # An ordered list keeps the pages in document order in the dropdown
        # (a set gave an arbitrary order and an arbitrary default page).
        pdf_pg_sel = st.selectbox('which page to process?',
                                  [record[0] for record in page_records])
        chosen = next((record for record in page_records
                       if record[0] == pdf_pg_sel), page_records[0])
        new_cat = chosen[1]

        if new_cat not in categories:
            st.write("This category is not configured for extraction yet!")
            st.stop()
        img_path = chosen[2]

    # Process single image file
    im = Image.open(img_path)
    st.image(im, caption = "Image being processed")

    img_array = cv2.imread(img_path)
    # Get the image embedding
    emb = get_embedding(img_array).tolist()

    if mode == 'Test':
        option = get_category(coll,emb)
        st.write("Category -- >",option)
        cor_cat = st.selectbox('is this correct classification?',('y','n'))
        st.write('You selected:', cor_cat)

        if cor_cat == 'n':
            option = st.selectbox('What is the correct classification?', categories)
            st.write('You selected:', option)
        # Train with the label the user ended up with.  Previously a corrected
        # label was never copied here, which left new_cat unbound for images
        # and stored the rejected prediction for PDF pages.
        new_cat = option

        x = Azure_ocr_sdk(img_path)
        # Calling function based on category input
        if option == 'passport_front':
            res = Passport_front(x)
        elif option == 'passport_back':
            res = Passport_back(x)
        elif option == 'aadhar_front':
            res = aadhar_front(x)
        elif option == 'aadhar_back':
            res = aadhar_back(x)
        elif option == 'PAN_front':
            res = pan_card(x)
        elif option == 'DL':
            res = DL(x)
        elif option == 'voterid':
            res = VoterId(x)
        else:
            # No extractor for this label: there is nothing to tabulate, so
            # end this run here rather than hitting an unbound `res` below.
            st.write("Couldn't classify")
            st.stop()

        st.header("Extracted text")
        st.table(res)
    else:
        new_cat = st.selectbox('What is the correct classification?', categories)
        st.write('You selected:', new_cat)

    # NOTE(review): Streamlit re-executes the script on each widget
    # interaction, so this presumably stores one more embedding per rerun
    # (including with the dropdown's default label) -- confirm whether an
    # explicit "save" action is wanted.
    train_classifier(coll,emb,new_cat)

# Build the page only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()