# E_KYC / postprocess.py
# Prathamesh1420's picture
# Upload 17 files
# e84d7bb verified
import pandas as pd
from datetime import datetime
import json
def filter_lines(lines, start_marker="INCOME TAX DEPARTMENT", end_marker="Signature"):
    """Return the meaningful lines between the start and end markers.

    The input is scanned for a line containing ``start_marker`` and a later
    (or same) line containing ``end_marker``. The lines in that inclusive
    range are stripped of surrounding whitespace, and those whose stripped
    length is 2 characters or fewer are dropped.

    Args:
        lines: Sequence of text lines (e.g. raw OCR output).
        start_marker: Substring identifying the first line to keep.
        end_marker: Substring identifying the last line to keep.

    Returns:
        A list of stripped lines, or an empty list when either marker is
        missing.
    """
    start_index = None
    end_index = None

    for i, line in enumerate(lines):
        if start_marker in line:
            start_index = i
        # An end marker seen before any start marker is noise: ignoring it
        # keeps the scan going instead of discarding the whole document.
        if start_index is not None and end_marker in line:
            end_index = i
            break

    if start_index is None or end_index is None:
        return []

    stripped = (line.strip() for line in lines[start_index:end_index + 1])
    return [text for text in stripped if len(text) > 2]
def create_dataframe(texts):
    """Build a one-row DataFrame of PAN details from OCR text lines.

    The lines are first narrowed with ``filter_lines``. Entries 2, 3 and 4 of
    the filtered list are used as name, father's name and date of birth
    (presumably the first two entries are the card header lines — confirm
    against real OCR output). The PAN is the line following the one that
    contains "Permanent Account Number".

    Any field that cannot be located is left as an empty string rather than
    raising, so incomplete OCR output still yields a row.

    Args:
        texts: Sequence of raw text lines.

    Returns:
        A pandas DataFrame with columns "ID", "Name", "Father's Name", "DOB"
        and "ID Type".
    """
    lines = filter_lines(texts)
    print("=" * 20)
    print(lines)
    print("=" * 20)

    def field_at(index):
        # Short or empty OCR output must not raise IndexError.
        return lines[index].strip() if index < len(lines) else ""

    name = field_at(2)
    father_name = field_at(3)
    dob = field_at(4)

    # Default to blank so a missing label no longer leaves `pan` unbound.
    pan = ""
    for i, line in enumerate(lines):
        # As before, a later occurrence of the label overrides an earlier one.
        if "Permanent Account Number" in line and i + 1 < len(lines):
            pan = lines[i + 1].strip()

    record = {
        "ID": pan,
        "Name": name,
        "Father's Name": father_name,
        "DOB": dob,
        "ID Type": "PAN",
    }
    return pd.DataFrame([record])
# def extract_information(data_string):
# # Split the data string into a list of words based on "|"
# updated_data_string = data_string.replace(".", "")
# words = [word.strip() for word in updated_data_string.split("|") if len(word.strip()) > 2]
# # Extract the required information based on the specified positions
# name = ""
# fathers_name = ""
# id_number = ""
# dob = ""
# data = []
# try:
# name_index = words.index("GOVT OF INDIA") + 1
# name = words[name_index]
# fathers_name_index = name_index + 1
# fathers_name = words[fathers_name_index]
# id_number_index = words.index("Permanent Account Number") + 1
# id_number = words[id_number_index]
# dob_index = None
# for i, word in enumerate(words):
# try:
# datetime.strptime(word, "%d/%m/%Y")
# dob_index = i
# break
# except ValueError:
# pass
# if dob_index is not None:
# dob = words[dob_index]
# else:
# print("Error: Date of birth not found.")
# except ValueError:
# print("Error: Some required information is missing or incorrectly formatted.")
# data.append({"ID": id_number, "Name": name, "Father's Name": fathers_name, "DOB": dob, "ID Type": "PAN"})
# df = pd.DataFrame(data)
# return df
def _first_date(words, date_format="%d/%m/%Y"):
    """Return the first entry of *words* parsed as a date, or None if none parse."""
    for word in words:
        try:
            return datetime.strptime(word, date_format)
        except ValueError:
            continue
    return None


def extract_information(data_string):
    """Extract PAN card fields from a "|"-separated text string.

    Periods are removed from the input, which is then split on "|". Tokens
    whose stripped length is 2 characters or fewer are discarded. The name
    and father's name are the two tokens after "GOVT OF INDIA", the ID is the
    token after "Permanent Account Number", and the date of birth is the
    first token that parses as ``dd/mm/YYYY``.

    Args:
        data_string: Text with fields separated by "|".

    Returns:
        A dict with keys "ID", "Name", "Father's Name", "DOB" and "ID Type".
        Fields that could not be extracted stay as empty strings. "DOB" is a
        ``datetime`` object when found. "ID Type" is always "PAN".
    """
    updated_data_string = data_string.replace(".", "")
    words = [
        stripped
        for stripped in (word.strip() for word in updated_data_string.split("|"))
        if len(stripped) > 2
    ]

    extracted_info = {
        "ID": "",
        "Name": "",
        "Father's Name": "",
        "DOB": "",
        "ID Type": "PAN",
    }

    try:
        name_index = words.index("GOVT OF INDIA") + 1
        extracted_info["Name"] = words[name_index]
        extracted_info["Father's Name"] = words[name_index + 1]

        id_number_index = words.index("Permanent Account Number") + 1
        extracted_info["ID"] = words[id_number_index]

        dob = _first_date(words)
        if dob is not None:
            extracted_info["DOB"] = dob
        else:
            print("Error: Date of birth not found.")
    except (ValueError, IndexError):
        # ValueError: a label is absent. IndexError: a label is the last
        # token, so the value expected after it does not exist.
        print("Error: Some required information is missing or incorrectly formatted.")

    return extracted_info
# if __name__ == '__main__':
# # Example usage
# lines = [
# "48",
# "8",
# "8",
# "3",
# "fett",
# "HRT",
# "INCOME TAX DEPARTMENT",
# "GOVT OF INDIA",
# "SUMIT",
# "RAM SWARUP",
# "04/03/1992",
# "Permanent Account Number",
# "J",
# "FZKPS9811P",
# "1",
# "2",
# "Signature",
# "1",
# "1",
# "2",
# "1",
# "1",
# "8",
# "1"
# ]
# filtered_lines = filter_lines(lines)
# for line in filtered_lines:
# print(line)
# df = create_dataframe(filtered_lines)
# print(df.melt(var_name='columns', value_name=''))