Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from datetime import datetime | |
| import json | |
def filter_lines(lines):
    """Return the text lines lying between the card's header and signature.

    The input is scanned up to the first line containing "Signature". The
    window starts at the most recent line containing "INCOME TAX DEPARTMENT"
    seen during that scan and ends at the "Signature" line (inclusive).

    Args:
        lines: Sequence of strings to scan.

    Returns:
        A list of whitespace-stripped lines from the window, keeping only
        those whose stripped length is greater than 2. An empty list is
        returned when either marker was not found.
    """
    first = None
    last = None
    for position, text in enumerate(lines):
        if "INCOME TAX DEPARTMENT" in text:
            first = position
        if "Signature" in text:
            last = position
            # Stop at the first signature line; anything after it is ignored.
            break

    if first is None or last is None:
        return []

    window = (text.strip() for text in lines[first:last + 1])
    # Entries of one or two characters are discarded.
    return [text for text in window if len(text) > 2]
def create_dataframe(texts):
    """Build a DataFrame of PAN card fields from a list of text lines.

    The lines are first narrowed with ``filter_lines``. Name, father's name
    and date of birth are then read positionally from indices 2, 3 and 4 of
    the filtered list, and one row is produced for every line containing
    "Permanent Account Number", using the line that follows it as the ID.

    Args:
        texts: Sequence of strings passed to ``filter_lines``.

    Returns:
        A ``pandas.DataFrame`` with columns "ID", "Name", "Father's Name",
        "DOB" and "ID Type" (always "PAN"), one row per PAN label found.
        The frame is empty when no label with a following line is present.
        Positional fields that are missing from a short input are returned
        as empty strings instead of raising ``IndexError``.
    """
    lines = filter_lines(texts)
    print("=" * 20)
    print(lines)
    print("=" * 20)

    def field_at(position):
        """Return the stripped line at ``position``, or "" if out of range."""
        return lines[position].strip() if position < len(lines) else ""

    name = field_at(2)
    father_name = field_at(3)
    dob = field_at(4)

    # Pairing each line with its successor means a label on the very last
    # line has no partner and is skipped rather than indexing past the end.
    data = [
        {
            "ID": following.strip(),
            "Name": name,
            "Father's Name": father_name,
            "DOB": dob,
            "ID Type": "PAN",
        }
        for label, following in zip(lines, lines[1:])
        if "Permanent Account Number" in label
    ]
    return pd.DataFrame(data)
| # def extract_information(data_string): | |
| # # Split the data string into a list of words based on "|" | |
| # updated_data_string = data_string.replace(".", "") | |
| # words = [word.strip() for word in updated_data_string.split("|") if len(word.strip()) > 2] | |
| # # Extract the required information based on the specified positions | |
| # name = "" | |
| # fathers_name = "" | |
| # id_number = "" | |
| # dob = "" | |
| # data = [] | |
| # try: | |
| # name_index = words.index("GOVT OF INDIA") + 1 | |
| # name = words[name_index] | |
| # fathers_name_index = name_index + 1 | |
| # fathers_name = words[fathers_name_index] | |
| # id_number_index = words.index("Permanent Account Number") + 1 | |
| # id_number = words[id_number_index] | |
| # dob_index = None | |
| # for i, word in enumerate(words): | |
| # try: | |
| # datetime.strptime(word, "%d/%m/%Y") | |
| # dob_index = i | |
| # break | |
| # except ValueError: | |
| # pass | |
| # if dob_index is not None: | |
| # dob = words[dob_index] | |
| # else: | |
| # print("Error: Date of birth not found.") | |
| # except ValueError: | |
| # print("Error: Some required information is missing or incorrectly formatted.") | |
| # data.append({"ID": id_number, "Name": name, "Father's Name": fathers_name, "DOB": dob, "ID Type": "PAN"}) | |
| # df = pd.DataFrame(data) | |
| # return df | |
def extract_information(data_string):
    """Extract PAN card fields from a "|"-separated string.

    All "." characters are removed, the string is split on "|", and tokens
    whose stripped length is 2 or less are discarded. Fields are then looked
    up independently of each other:

    * "Name" is the token after "GOVT OF INDIA".
    * "Father's Name" is the second token after "GOVT OF INDIA".
    * "ID" is the token after "Permanent Account Number".
    * "DOB" is the first token that parses as ``%d/%m/%Y``.

    A missing label, or a label with too few tokens after it, leaves the
    affected field as "" and prints an error message; it no longer raises
    ``IndexError`` nor prevents the remaining fields from being extracted.

    Args:
        data_string: The "|"-delimited text to parse.

    Returns:
        A dict with keys "ID", "Name", "Father's Name", "DOB" and "ID Type".
        "DOB" is a ``datetime`` when a date was found and "" otherwise; the
        other values are strings. "ID Type" is always "PAN".
    """
    words = [
        word.strip()
        for word in data_string.replace(".", "").split("|")
        if len(word.strip()) > 2
    ]

    extracted_info = {
        "ID": "",
        "Name": "",
        "Father's Name": "",
        "DOB": "",
        "ID Type": "PAN",
    }

    def value_after(label, offset):
        """Return the token ``offset`` places after ``label``, else None."""
        try:
            return words[words.index(label) + offset]
        except (ValueError, IndexError):
            # ValueError: label absent; IndexError: input ends too early.
            return None

    lookups = (
        ("Name", "GOVT OF INDIA", 1),
        ("Father's Name", "GOVT OF INDIA", 2),
        ("ID", "Permanent Account Number", 1),
    )
    incomplete = False
    for key, label, offset in lookups:
        value = value_after(label, offset)
        if value is None:
            incomplete = True
        else:
            extracted_info[key] = value
    if incomplete:
        print("Error: Some required information is missing or incorrectly formatted.")

    # Keep the parsed value from the first matching token so the date is
    # not parsed a second time.
    for word in words:
        try:
            extracted_info["DOB"] = datetime.strptime(word, "%d/%m/%Y")
        except ValueError:
            continue
        break
    else:
        print("Error: Date of birth not found.")

    return extracted_info
| # if __name__ == '__main__': | |
| # # Example usage | |
| # lines = [ | |
| # "48", | |
| # "8", | |
| # "8", | |
| # "3", | |
| # "fett", | |
| # "HRT", | |
| # "INCOME TAX DEPARTMENT", | |
| # "GOVT OF INDIA", | |
| # "SUMIT", | |
| # "RAM SWARUP", | |
| # "04/03/1992", | |
| # "Permanent Account Number", | |
| # "J", | |
| # "FZKPS9811P", | |
| # "1", | |
| # "2", | |
| # "Signature", | |
| # "1", | |
| # "1", | |
| # "2", | |
| # "1", | |
| # "1", | |
| # "8", | |
| # "1" | |
| # ] | |
| # filtered_lines = filter_lines(lines) | |
| # for line in filtered_lines: | |
| # print(line) | |
| # df = create_dataframe(filtered_lines) | |
| # print(df.melt(var_name='columns', value_name='')) | |