Spaces:
Sleeping
Sleeping
File size: 4,829 Bytes
e84d7bb | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import pandas as pd
from datetime import datetime
import json
def filter_lines(lines, *, start_marker="INCOME TAX DEPARTMENT",
                 end_marker="Signature", min_length=3):
    """Return the stripped lines between the start and end markers.

    The section runs from the last line containing ``start_marker`` that
    occurs at or before the first line containing ``end_marker``, through
    that end-marker line (both inclusive). Lines whose stripped length is
    below ``min_length`` are dropped.

    Args:
        lines: Iterable of text lines; any iterable works, not only a list.
        start_marker: Substring that opens the section.
        end_marker: Substring that closes the section.
        min_length: Minimum stripped length a line needs to be kept. The
            default of 3 discards lines of one or two characters.

    Returns:
        A list of stripped lines, or an empty list when the start marker
        is not seen before the end marker or the end marker never appears.
    """
    section = None  # stays None until the start marker has been seen
    for line in lines:
        if start_marker in line:
            # A repeated start marker restarts the section, so the last
            # one before the end marker wins.
            section = []
        if section is not None:
            text = line.strip()
            if len(text) >= min_length:
                section.append(text)
        if end_marker in line:
            # Stop at the first end marker; without a preceding start
            # marker there is nothing to return.
            return section if section is not None else []
    # The end marker never appeared, so the section is incomplete.
    return []
def create_dataframe(texts):
    """Build a one-row DataFrame of PAN card fields from text lines.

    The lines are first narrowed with ``filter_lines``. Name, father's
    name and date of birth are then read by position (indices 2, 3 and 4
    of the filtered lines), and the PAN is taken from the line following
    the one containing "Permanent Account Number".
    NOTE(review): the positional reads assume two header lines precede the
    name in the filtered output - confirm against real input.

    Any field that cannot be located is left as an empty string instead of
    raising, so incomplete input still yields a well-formed row.

    Args:
        texts: Text lines passed straight to ``filter_lines``.

    Returns:
        A pandas DataFrame with a single row and the columns "ID", "Name",
        "Father's Name", "DOB" and "ID Type" (always "PAN").
    """
    lines = filter_lines(texts)
    print("=" * 20)
    print(lines)
    print("=" * 20)

    def line_at(index):
        # Positional fields may be absent when the input is incomplete.
        return lines[index].strip() if index < len(lines) else ""

    name = line_at(2)
    father_name = line_at(3)
    dob = line_at(4)

    pan = ""
    for i, line in enumerate(lines):
        # Only accept the label when a following line exists; when the
        # label appears more than once, the last occurrence wins.
        if "Permanent Account Number" in line and i + 1 < len(lines):
            pan = lines[i + 1].strip()

    row = {
        "ID": pan,
        "Name": name,
        "Father's Name": father_name,
        "DOB": dob,
        "ID Type": "PAN",
    }
    return pd.DataFrame([row])
# def extract_information(data_string):
# # Split the data string into a list of words based on "|"
# updated_data_string = data_string.replace(".", "")
# words = [word.strip() for word in updated_data_string.split("|") if len(word.strip()) > 2]
# # Extract the required information based on the specified positions
# name = ""
# fathers_name = ""
# id_number = ""
# dob = ""
# data = []
# try:
# name_index = words.index("GOVT OF INDIA") + 1
# name = words[name_index]
# fathers_name_index = name_index + 1
# fathers_name = words[fathers_name_index]
# id_number_index = words.index("Permanent Account Number") + 1
# id_number = words[id_number_index]
# dob_index = None
# for i, word in enumerate(words):
# try:
# datetime.strptime(word, "%d/%m/%Y")
# dob_index = i
# break
# except ValueError:
# pass
# if dob_index is not None:
# dob = words[dob_index]
# else:
# print("Error: Date of birth not found.")
# except ValueError:
# print("Error: Some required information is missing or incorrectly formatted.")
# data.append({"ID": id_number, "Name": name, "Father's Name": fathers_name, "DOB": dob, "ID Type": "PAN"})
# df = pd.DataFrame(data)
# return df
def extract_information(data_string):
    """Extract PAN card fields from a "|"-separated string.

    Periods are removed from the input, which is then split on "|"; each
    piece is stripped and pieces of two characters or fewer are discarded.
    From the remaining words:

    * "Name" is the word right after "GOVT OF INDIA".
    * "Father's Name" is the word after the name.
    * "ID" is the word right after "Permanent Account Number".
    * "DOB" is the first word that parses as ``%d/%m/%Y``.

    Each field is looked up independently, so one missing label does not
    prevent the others from being extracted, and a label that is the last
    word no longer raises ``IndexError``.

    Args:
        data_string: The "|"-separated text to parse.

    Returns:
        A dict with the keys "ID", "Name", "Father's Name", "DOB" and
        "ID Type". Fields that cannot be found stay as empty strings;
        "DOB" is a ``datetime`` object when found; "ID Type" is always
        "PAN".
    """
    cleaned = data_string.replace(".", "")
    stripped_parts = (part.strip() for part in cleaned.split("|"))
    words = [word for word in stripped_parts if len(word) > 2]

    extracted_info = {
        "ID": "",
        "Name": "",
        "Father's Name": "",
        "DOB": "",
        "ID Type": "PAN",
    }

    def word_after(label, offset):
        # Return the word `offset` places after `label`, or None when the
        # label is absent or the position falls past the end of the list.
        try:
            position = words.index(label) + offset
        except ValueError:
            return None
        return words[position] if position < len(words) else None

    located = {
        "Name": word_after("GOVT OF INDIA", 1),
        "Father's Name": word_after("GOVT OF INDIA", 2),
        "ID": word_after("Permanent Account Number", 1),
    }
    for field, value in located.items():
        if value is not None:
            extracted_info[field] = value
    if any(value is None for value in located.values()):
        print("Error: Some required information is missing or incorrectly formatted.")

    # Parse each candidate once and keep the first valid date.
    dob = None
    for word in words:
        try:
            dob = datetime.strptime(word, "%d/%m/%Y")
        except ValueError:
            continue
        break
    if dob is not None:
        extracted_info["DOB"] = dob
    else:
        print("Error: Date of birth not found.")

    return extracted_info
# if __name__ == '__main__':
# # Example usage
# lines = [
# "48",
# "8",
# "8",
# "3",
# "fett",
# "HRT",
# "INCOME TAX DEPARTMENT",
# "GOVT OF INDIA",
# "SUMIT",
# "RAM SWARUP",
# "04/03/1992",
# "Permanent Account Number",
# "J",
# "FZKPS9811P",
# "1",
# "2",
# "Signature",
# "1",
# "1",
# "2",
# "1",
# "1",
# "8",
# "1"
# ]
# filtered_lines = filter_lines(lines)
# for line in filtered_lines:
# print(line)
# df = create_dataframe(filtered_lines)
# print(df.melt(var_name='columns', value_name=''))
|