Spaces:

Prathamesh1420
/

E_KYC

Sleeping

File size: 4,829 Bytes

e84d7bb


import pandas as pd
from datetime import datetime
import json

def filter_lines(lines):
    """Return the stripped lines of the PAN-card section of *lines*.

    The section ends at the first line containing "Signature" and starts
    at the last line containing "INCOME TAX DEPARTMENT" seen up to and
    including that line. Both boundary lines are part of the section.
    Entries whose stripped text is two characters or shorter are dropped.

    Returns an empty list when either marker is not found.
    """
    header_at = None
    footer_at = None

    # Scanning stops at the first "Signature"; a later header is never seen.
    for position, text in enumerate(lines):
        if "INCOME TAX DEPARTMENT" in text:
            header_at = position
        if "Signature" in text:
            footer_at = position
            break

    if header_at is None or footer_at is None:
        return []

    section = lines[header_at:footer_at + 1]
    stripped_section = (entry.strip() for entry in section)
    return [entry for entry in stripped_section if len(entry) > 2]

def create_dataframe(texts):
    """Build a one-row DataFrame of PAN details from OCR text lines.

    *texts* is passed through ``filter_lines``; the entries at positions
    2, 3 and 4 of the filtered result are taken as the name, father's name
    and date of birth (presumably positions 0 and 1 hold the header lines;
    verify against real OCR output). The PAN is the entry that follows a
    line containing "Permanent Account Number"; if several such lines
    exist, the last one with a following entry wins.

    Any field that cannot be located is returned as an empty string
    instead of raising, so the result always has exactly one row with the
    columns "ID", "Name", "Father's Name", "DOB" and "ID Type".
    """
    lines = filter_lines(texts)
    # NOTE(review): this prints the extracted personal data to stdout;
    # kept so existing output stays unchanged.
    print("="*20)
    print(lines)
    print("="*20)

    def field_at(index):
        # A short or empty filtered list yields "" rather than IndexError.
        return lines[index].strip() if index < len(lines) else ""

    name = field_at(2)
    father_name = field_at(3)
    dob = field_at(4)

    # Default keeps `pan` bound when the label is absent or is the last line.
    pan = ""
    for i, line in enumerate(lines):
        if "Permanent Account Number" in line and i + 1 < len(lines):
            pan = lines[i + 1].strip()

    data = [{"ID": pan, "Name": name, "Father's Name": father_name, "DOB": dob, "ID Type": "PAN"}]
    return pd.DataFrame(data)

# def extract_information(data_string):
#     # Split the data string into a list of words based on "|"
#     updated_data_string =  data_string.replace(".", "")
#     words = [word.strip() for word in updated_data_string.split("|") if len(word.strip()) > 2]

#     # Extract the required information based on the specified positions
#     name = ""
#     fathers_name = ""
#     id_number = ""
#     dob = ""
#     data = []
#     try:
#         name_index = words.index("GOVT OF INDIA") + 1
#         name = words[name_index]

#         fathers_name_index = name_index + 1
#         fathers_name = words[fathers_name_index]

#         id_number_index = words.index("Permanent Account Number") + 1
#         id_number = words[id_number_index]

#         dob_index = None
#         for i, word in enumerate(words):
#             try:
#                 datetime.strptime(word, "%d/%m/%Y")
#                 dob_index = i
#                 break
#             except ValueError:
#                 pass

#         if dob_index is not None:
#             dob = words[dob_index]
#         else:
#             print("Error: Date of birth not found.")
#     except ValueError:
#         print("Error: Some required information is missing or incorrectly formatted.")

#     data.append({"ID": id_number, "Name": name, "Father's Name": fathers_name, "DOB": dob, "ID Type": "PAN"})
#     df = pd.DataFrame(data)
#     return df


def extract_information(data_string):
    """Extract PAN-card fields from a "|"-separated OCR string.

    All "." characters are removed from *data_string*, the string is split
    on "|", each piece is stripped, and pieces of two characters or fewer
    are discarded. From the remaining tokens:

    * "Name" is the token after "GOVT OF INDIA",
    * "Father's Name" is the token after the name,
    * "ID" is the token after "Permanent Account Number",
    * "DOB" is the first token that parses as ``%d/%m/%Y``, stored as a
      ``datetime`` object.

    Returns a dict with the keys "ID", "Name", "Father's Name", "DOB" and
    "ID Type" (always "PAN"). Fields that could not be extracted stay as
    empty strings; a message is printed instead of raising. Extraction
    stops at the first missing marker or token, so later fields are left
    empty as well.
    """
    # Split the data string into a list of tokens based on "|"
    updated_data_string = data_string.replace(".", "")
    stripped_parts = (part.strip() for part in updated_data_string.split("|"))
    words = [part for part in stripped_parts if len(part) > 2]

    # Initialize the dictionary to store the extracted information
    extracted_info = {
        "ID": "",
        "Name": "",
        "Father's Name": "",
        "DOB": "",
        "ID Type": "PAN"
    }

    try:
        name_index = words.index("GOVT OF INDIA") + 1
        extracted_info["Name"] = words[name_index]
        extracted_info["Father's Name"] = words[name_index + 1]

        id_number_index = words.index("Permanent Account Number") + 1
        extracted_info["ID"] = words[id_number_index]

        # Keep the parsed value from the first token that is a valid date.
        dob = None
        for word in words:
            try:
                dob = datetime.strptime(word, "%d/%m/%Y")
                break
            except ValueError:
                continue

        if dob is not None:
            extracted_info["DOB"] = dob
        else:
            print("Error: Date of birth not found.")
    except (ValueError, IndexError):
        # ValueError: a marker is absent (list.index).
        # IndexError: a marker is present but nothing follows it.
        print("Error: Some required information is missing or incorrectly formatted.")

    return extracted_info
# if __name__ == '__main__':
#     # Example usage
#     lines = [
#         "48",
#         "8",
#         "8",
#         "3",
#         "fett",
#         "HRT",
#         "INCOME TAX DEPARTMENT",
#         "GOVT OF INDIA",
#         "SUMIT",
#         "RAM SWARUP",
#         "04/03/1992",
#         "Permanent Account Number",
#         "J",
#         "FZKPS9811P",
#         "1",
#         "2",
#         "Signature",
#         "1",
#         "1",
#         "2",
#         "1",
#         "1",
#         "8",
#         "1"
#     ]

#     filtered_lines = filter_lines(lines)
#     for line in filtered_lines:
#         print(line)

#     df = create_dataframe(filtered_lines)
#     print(df.melt(var_name='columns', value_name=''))