File size: 5,421 Bytes
4c01182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Fetch the NLTK resources the preprocessing helpers below depend on:
# the English stopword list, the WordNet lexicon (lemmatizer), and the
# English averaged-perceptron POS tagger. nltk.download is a no-op when
# the data is already cached locally.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

import sys
import string
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from src.core.logger import logging
from src.core.exception import AppException
from src.core.configuration import AppConfiguration
import gc


class HelperFunctions:
    """Text preprocessing helpers: lowercasing, punctuation removal,
    stopword removal, and POS-aware lemmatization.

    The stopword set, the WordNet lemmatizer, the punctuation translation
    table, and the Treebank-to-WordNet POS map are built once per instance
    instead of once per call — these methods are applied row-by-row over
    whole dataframes, so per-call construction was pure overhead.
    """

    def __init__(self):
        """Build the reusable preprocessing resources."""
        self._stop_words = set(stopwords.words('english'))
        self._lemmatizer = WordNetLemmatizer()
        self._punct_table = str.maketrans("", "", string.punctuation)
        # First letter of a Penn Treebank tag -> WordNet POS constant.
        self._wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

    def lower_case(self, text) -> str:
        """Converts the given text to lowercase."""
        return text.lower()

    def remove_punctuations(self, text) -> str:
        """Removes all punctuation marks from the given text."""
        return text.translate(self._punct_table)

    def remove_stopwords(self, text) -> str:
        """Removes all English stopwords from the given text."""
        return " ".join(word for word in text.split() if word not in self._stop_words)

    def lemmatization(self, text) -> str:
        """POS-tags the text, then reduces each word to its base form
        with WordNetLemmatizer.

        Tags outside the N/V/J/R families fall back to the noun POS,
        which matches WordNetLemmatizer's own default.
        """
        pos_text = pos_tag(text.split())
        return " ".join(
            self._lemmatizer.lemmatize(word, self._wordnet_map.get(tag[0], wordnet.NOUN))
            for word, tag in pos_text
        )


class DataPreprocessing:
    def __init__(self, config=None):
        """
        Initializes the DataPreprocessing object by creating the data
        preprocessing configuration.

        Args:
            config (AppConfiguration | None): The application configuration.
                When None, an AppConfiguration is constructed here — not as
                a default argument, which would be evaluated once at import
                time and outside this try/except.

        Raises:
            AppException: If the configuration cannot be created.
        """
        try:
            if config is None:
                config = AppConfiguration()
            self.data_preprocessing_config = config.data_preprocessing_config()

        except Exception as e:
            logging.error(f"Failed to create data preprocessing configuration: {e}", exc_info=True)
            raise AppException(e, sys)


    def preprocess(self, df: pd.DataFrame, filename: str) -> pd.DataFrame:
        """
        Preprocesses the given dataframe: lowercasing, punctuation removal,
        POS-tagged lemmatization, then stopword removal.

        Note: the dataframe is modified in place (rows with any NaN are
        dropped and the 'Content' column is rewritten); the same object is
        returned and also saved as a feather file in the configured
        preprocessed-data directory.

        Args:
            df (pd.DataFrame): The dataframe to be preprocessed. Must have
                a 'Content' column of strings.
            filename (str): The filename to save the preprocessed dataframe.

        Returns:
            pd.DataFrame: The preprocessed dataframe.

        Raises:
            AppException: If any preprocessing or saving step fails.
        """
        fn = HelperFunctions()
        # Preprocessing steps
        try:
            # Rows with missing values cannot go through the text pipeline.
            df.dropna(how='any', inplace=True)
            tqdm.pandas()  # enables progress_apply with progress bars

            logging.info("Performing lowercasing")
            df['Content'] = df['Content'].progress_apply(fn.lower_case)

            logging.info("Removing punctuations")
            df['Content'] = df['Content'].progress_apply(fn.remove_punctuations)

            # NOTE(review): lemmatization runs before stopword removal —
            # presumably so the POS tagger sees ungapped sentences; keep
            # this order.
            logging.info("Performing pos tagging and lemmatization")
            df['Content'] = df['Content'].progress_apply(fn.lemmatization)

            logging.info("Removing stopwords")
            df['Content'] = df['Content'].progress_apply(fn.remove_stopwords)

            logging.info("Finished preprocessing operations successfully")

            preprocessed_data_dir = self.data_preprocessing_config.preprocessed_data_dir
            df.to_feather(Path(preprocessed_data_dir, filename))

            logging.info(f"Data successfully saved at {preprocessed_data_dir}")
            return df

        except Exception as e:
            logging.error(f"Data preprocessing failed: {e}", exc_info=True)
            raise AppException(e, sys)
    
def initiate_data_preprocessing():
    """
    Main function to initiate the data preprocessing workflow.

    Reads the ingested dataset from the configured parquet path, runs the
    preprocessing pipeline on it, and saves the preprocessed data (as a
    feather file — see DataPreprocessing.preprocess) under the filename
    given by the configuration.

    Raises:
        AppException: If an error occurs during data preprocessing.
    """
    obj = DataPreprocessing()
    try:
        logging.info(f"{'='*20}Data Preprocessing{'='*20}")
        data_path = obj.data_preprocessing_config.data_path
        if not data_path:
            raise ValueError("No data path found")
        
        df = pd.read_parquet(data_path)
        preprocessed_dataset_name = obj.data_preprocessing_config.preprocessed_data_filename
        obj.preprocess(df, preprocessed_dataset_name)
        # Release the (potentially large) dataframe eagerly to cut peak memory.
        del df
        gc.collect()
        logging.info(f"{'='*20}Data Preprocessing Completed Successfully{'='*20} \n\n")

    except Exception as e:
        logging.error(f"Error in Data Preprocessing process: {e}", exc_info=True)
        raise AppException(e, sys)
    

# Script entry point: run the full preprocessing workflow end to end.
if __name__ == "__main__":
    initiate_data_preprocessing()