File size: 4,829 Bytes
e84d7bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

import pandas as pd
from datetime import datetime
import json

def filter_lines(lines):
    """Return the stripped lines between the card's header and signature markers.

    The input is scanned from the top until the first line containing
    "Signature". The window starts at the last line seen up to that point
    that contains "INCOME TAX DEPARTMENT" and ends at the signature line
    (both inclusive). Lines whose stripped length is 2 or less are dropped.

    Args:
        lines: Sequence of text lines (strings).

    Returns:
        A list of stripped lines from the window, or an empty list when
        either marker was not found.
    """
    first = None
    last = None

    for position, text in enumerate(lines):
        if "INCOME TAX DEPARTMENT" in text:
            # Keep overwriting so the banner closest to the signature wins.
            first = position
        if "Signature" in text:
            last = position
            break

    # Without both markers there is no window to extract.
    if first is None or last is None:
        return []

    stripped_window = (raw.strip() for raw in lines[first:last + 1])
    return [entry for entry in stripped_window if len(entry) > 2]

def create_dataframe(texts):
    """Build a one-row DataFrame of PAN card fields from text lines.

    The lines are first narrowed with ``filter_lines``. Within the filtered
    list the name, father's name and date of birth are read from positions
    2, 3 and 4, and the PAN is the line following the last line that
    contains "Permanent Account Number".

    Args:
        texts: Sequence of text lines (strings).

    Returns:
        A ``pandas.DataFrame`` with a single row and the columns "ID",
        "Name", "Father's Name", "DOB" and "ID Type" (always "PAN"). A
        field that cannot be located is returned as an empty string instead
        of raising.
    """
    lines = filter_lines(texts)
    # NOTE(review): debugging output; it echoes the extracted card text to
    # stdout. Kept for backward compatibility - confirm whether it is needed.
    print("="*20)
    print(lines)
    print("="*20)

    def field_at(position):
        # The filtered list can be shorter than expected (or empty when the
        # markers are missing), so fall back to "" rather than IndexError.
        return lines[position].strip() if position < len(lines) else ""

    name = field_at(2)
    father_name = field_at(3)
    dob = field_at(4)

    # Default so a card without the label does not leave `pan` unbound.
    pan = ""
    for label_pos, line in enumerate(lines):
        # Only accept the label when a following line actually exists.
        if "Permanent Account Number" in line and label_pos + 1 < len(lines):
            pan = lines[label_pos + 1].strip()

    data = [{"ID": pan, "Name": name, "Father's Name": father_name, "DOB": dob, "ID Type": "PAN"}]
    return pd.DataFrame(data)

# def extract_information(data_string):
#     # Split the data string into a list of words based on "|"
#     updated_data_string =  data_string.replace(".", "")
#     words = [word.strip() for word in updated_data_string.split("|") if len(word.strip()) > 2]

#     # Extract the required information based on the specified positions
#     name = ""
#     fathers_name = ""
#     id_number = ""
#     dob = ""
#     data = []
#     try:
#         name_index = words.index("GOVT OF INDIA") + 1
#         name = words[name_index]

#         fathers_name_index = name_index + 1
#         fathers_name = words[fathers_name_index]

#         id_number_index = words.index("Permanent Account Number") + 1
#         id_number = words[id_number_index]

#         dob_index = None
#         for i, word in enumerate(words):
#             try:
#                 datetime.strptime(word, "%d/%m/%Y")
#                 dob_index = i
#                 break
#             except ValueError:
#                 pass

#         if dob_index is not None:
#             dob = words[dob_index]
#         else:
#             print("Error: Date of birth not found.")
#     except ValueError:
#         print("Error: Some required information is missing or incorrectly formatted.")

#     data.append({"ID": id_number, "Name": name, "Father's Name": fathers_name, "DOB": dob, "ID Type": "PAN"})
#     df = pd.DataFrame(data)
#     return df


def extract_information(data_string):
    """Extract PAN card fields from a "|"-separated text string.

    All "." characters are removed, the string is split on "|", and tokens
    whose stripped length is 2 or less are discarded. The name and father's
    name are the two tokens after "GOVT OF INDIA", the ID is the token after
    "Permanent Account Number", and the date of birth is the first token
    that parses as ``%d/%m/%Y``.

    Args:
        data_string: The "|"-separated text to parse.

    Returns:
        A dict with the keys "ID", "Name", "Father's Name", "DOB" and
        "ID Type" (always "PAN"). "DOB" is a ``datetime`` when a date was
        found; fields that could not be extracted remain empty strings.
        Problems are reported with ``print`` rather than raised.
    """
    # Split the data string into a list of words based on "|"
    updated_data_string = data_string.replace(".", "")
    words = [word.strip() for word in updated_data_string.split("|") if len(word.strip()) > 2]

    # Initialize the dictionary to store the extracted information
    extracted_info = {
        "ID": "",
        "Name": "",
        "Father's Name": "",
        "DOB": "",
        "ID Type": "PAN"
    }

    try:
        name_index = words.index("GOVT OF INDIA") + 1
        extracted_info["Name"] = words[name_index]
        extracted_info["Father's Name"] = words[name_index + 1]

        id_number_index = words.index("Permanent Account Number") + 1
        extracted_info["ID"] = words[id_number_index]
    except (ValueError, IndexError):
        # ValueError: a label is absent. IndexError: a label is present but
        # the token(s) expected after it are missing (label at the end).
        print("Error: Some required information is missing or incorrectly formatted.")
        return extracted_info

    # The first token that parses as a date is taken as the date of birth.
    for word in words:
        try:
            parsed_dob = datetime.strptime(word, "%d/%m/%Y")
        except ValueError:
            continue
        extracted_info["DOB"] = parsed_dob
        break
    else:
        print("Error: Date of birth not found.")

    return extracted_info
# if __name__ == '__main__':
#     # Example usage
#     lines = [
#         "48",
#         "8",
#         "8",
#         "3",
#         "fett",
#         "HRT",
#         "INCOME TAX DEPARTMENT",
#         "GOVT OF INDIA",
#         "SUMIT",
#         "RAM SWARUP",
#         "04/03/1992",
#         "Permanent Account Number",
#         "J",
#         "FZKPS9811P",
#         "1",
#         "2",
#         "Signature",
#         "1",
#         "1",
#         "2",
#         "1",
#         "1",
#         "8",
#         "1"
#     ]

#     filtered_lines = filter_lines(lines)
#     for line in filtered_lines:
#         print(line)

#     df = create_dataframe(filtered_lines)
#     print(df.melt(var_name='columns', value_name=''))