File size: 3,748 Bytes
3efe7a4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pdfplumber
import os
import multiprocessing
from tqdm import tqdm

from Logger import GetLogger

class GetDataCleaning():
    """Convert the PDFs found under a root folder into raw and cleaned text.

    For every selected sub-folder ``<name>`` of ``root_folder``, each
    ``*.pdf`` inside it is written twice:

    * ``<name>_txt/<file>.txt``          -- text as extracted by pdfplumber
    * ``<name>_cleaned_txt/<file>.txt``  -- the same text after
      :meth:`clean_txt`

    Attributes:
        logger: Logger used for progress and error messages.
        root_folder: Directory whose sub-folders hold the PDFs.
        excluding_folder: Sub-folder names that are skipped.
        folder_list: Sub-folder names that :meth:`run` will process.
    """

    # A line that consists solely of one of these strings is dropped by
    # clean_txt (compared after stripping surrounding whitespace).
    _BOILERPLATE_LINES = ("Infosys", "ICICI Bank")

    def __init__(self, root_folder, excluding_folder=None, logger=None):
        """Collect the sub-folders of ``root_folder`` that should be processed.

        Args:
            root_folder: Directory that contains one sub-folder per PDF set.
            excluding_folder: Names of sub-folders to skip. ``None`` (the
                default) means nothing is excluded.
            logger: Logger to use. When falsy, one is obtained from
                ``GetLogger().get_logger()``.
        """
        if not logger:
            logger = GetLogger().get_logger()
        self.logger = logger

        self.root_folder = root_folder
        # A fresh list per instance: a literal ``[]`` default would be one
        # object shared (and mutable) across every instance.
        self.excluding_folder = [] if excluding_folder is None else excluding_folder

        self.folder_list = [
            item
            for item in os.listdir(self.root_folder)
            # Skip output folders produced by earlier runs ("<name>_txt",
            # "<name>_cleaned_txt"): they contain a "txt" token.
            if "txt" not in item.split("_")
            and item not in self.excluding_folder
            # Plain files in the root would make run() fail on os.listdir().
            and os.path.isdir(os.path.join(self.root_folder, item))
        ]
        self.logger.info("all the folder list is generated sucessfully")

    @staticmethod
    def _txt_name(file):
        """Return ``file`` with its ``.pdf`` extension swapped for ``.txt``."""
        if file.endswith(".pdf"):
            # Only touch the extension, never a ".pdf" inside the stem.
            return file[: -len(".pdf")] + ".txt"
        # Names without the suffix keep the historical substitution.
        return file.replace(".pdf", ".txt")

    def pdf_to_txt(self, pdf_path, txt_path):
        """Extract the text of every page of ``pdf_path`` into ``txt_path``.

        Pages for which ``extract_text()`` returns nothing are skipped; each
        remaining page is followed by a newline. The output is UTF-8.
        """
        with pdfplumber.open(pdf_path) as pdf:
            page_texts = (page.extract_text() for page in pdf.pages)
            text = "".join(page_text + "\n" for page_text in page_texts if page_text)

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(text)

    def clean_txt(self, text):
        """Return ``text`` collapsed to one line with noise lines removed.

        Each line is stripped; empty lines, lines made only of digits and
        lines equal to an entry of ``_BOILERPLATE_LINES`` are dropped. The
        remaining lines are joined with single spaces.
        """
        kept = []
        for raw_line in text.split("\n"):
            line = raw_line.strip()
            if not line or line.isdigit() or line in self._BOILERPLATE_LINES:
                continue
            kept.append(line)
        return " ".join(kept)

    def process_file(self, folder, file, logger):
        """Run the single-file pipeline: PDF -> raw text -> cleaned text.

        Args:
            folder: Sub-folder of ``root_folder`` that holds ``file``.
            file: File name of the PDF inside ``folder``.
            logger: Logger that receives the per-file "Processed" message.
        """
        txt_name = self._txt_name(file)
        input_pdf = os.path.join(self.root_folder, folder, file)
        output_txt = os.path.join(self.root_folder, folder + "_txt", txt_name)
        output_cleaned = os.path.join(self.root_folder, folder + "_cleaned_txt", txt_name)

        # run() creates these already; doing it here as well lets the method
        # be called on its own. exist_ok makes concurrent workers harmless.
        os.makedirs(os.path.dirname(output_txt), exist_ok=True)
        os.makedirs(os.path.dirname(output_cleaned), exist_ok=True)

        # Convert PDF -> TXT
        self.pdf_to_txt(input_pdf, output_txt)

        # Clean text (read back through a context manager so the handle is
        # closed deterministically)
        with open(output_txt, encoding="utf-8") as f:
            raw_text = f.read()
        cleaned_text = self.clean_txt(raw_text)

        with open(output_cleaned, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        logger.info(f"✅ Processed: {folder}/{file}")

    def run(self, workers=4):
        """Process every folder in ``folder_list`` with a pool of workers.

        Args:
            workers: Number of worker processes in the pool.

        Any exception is logged through ``self.logger.error`` and not
        re-raised; folders after the failing one are not processed.
        """
        try:
            self.logger.info("🚀 Starting Cleaning Process")
            # One pool for the whole run instead of respawning the worker
            # processes for every folder.
            with multiprocessing.Pool(processes=workers) as pool:
                for folder in self.folder_list:
                    os.makedirs(os.path.join(self.root_folder, folder + "_txt"), exist_ok=True)
                    os.makedirs(os.path.join(self.root_folder, folder + "_cleaned_txt"), exist_ok=True)

                    pdf_files = [
                        f for f in os.listdir(os.path.join(self.root_folder, folder))
                        if f.endswith(".pdf")
                    ]

                    # Run parallel processing; starmap blocks until every
                    # file of this folder is done.
                    pool.starmap(self.process_file, [(folder, f, self.logger) for f in pdf_files])

                    self.logger.info(f"Data Cleaning completed for folder:{folder}")
                pool.close()
                pool.join()
        except Exception as e:
            self.logger.error(f"Got Error: {e}")
            


# if __name__ == "__main__":
#     obj = GetDataCleaning(root_folder="financial_reports", excluding_folder=["ICICI"])
#     obj.run()
#     obj.process_file("ICICI", "icici-bank-23.pdf", obj.logger)  # for experiment only