File size: 5,421 Bytes
4c01182
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Fetch the NLTK resources the preprocessing helpers below depend on:
# the English stopword list, the WordNet lexicon (lemmatizer), and the
# English averaged-perceptron POS tagger. nltk.download is a no-op when
# the data is already cached locally.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

import sys
import string
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from src.core.logger import logging
from src.core.exception import AppException
from src.core.configuration import AppConfiguration
import gc


class HelperFunctions:
    """Text preprocessing helpers: lowercasing, punctuation removal,
    stopword removal, and POS-aware lemmatization.

    The stopword set, the WordNet lemmatizer, the punctuation translation
    table, and the Treebank-to-WordNet POS map are built once per instance
    instead of once per call — these methods are applied row-by-row over
    whole dataframes, so per-call construction was pure overhead.
    """

    def __init__(self):
        """Build the reusable preprocessing resources."""
        self._stop_words = set(stopwords.words('english'))
        self._lemmatizer = WordNetLemmatizer()
        self._punct_table = str.maketrans("", "", string.punctuation)
        # First letter of a Penn Treebank tag -> WordNet POS constant.
        self._wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

    def lower_case(self, text) -> str:
        """Converts the given text to lowercase."""
        return text.lower()

    def remove_punctuations(self, text) -> str:
        """Removes all punctuation marks from the given text."""
        return text.translate(self._punct_table)

    def remove_stopwords(self, text) -> str:
        """Removes all English stopwords from the given text."""
        return " ".join(word for word in text.split() if word not in self._stop_words)

    def lemmatization(self, text) -> str:
        """POS-tags the text, then reduces each word to its base form
        with WordNetLemmatizer.

        Tags outside the N/V/J/R families fall back to the noun POS,
        which matches WordNetLemmatizer's own default.
        """
        pos_text = pos_tag(text.split())
        return " ".join(
            self._lemmatizer.lemmatize(word, self._wordnet_map.get(tag[0], wordnet.NOUN))
            for word, tag in pos_text
        )


class DataPreprocessing:
    def __init__(self, config=None):
        """
        Initializes the DataPreprocessing object by creating the data
        preprocessing configuration.

        Args:
            config (AppConfiguration | None): The application configuration.
                When None, an AppConfiguration is constructed here — not as
                a default argument, which would be evaluated once at import
                time and outside this try/except.

        Raises:
            AppException: If the configuration cannot be created.
        """
        try:
            if config is None:
                config = AppConfiguration()
            self.data_preprocessing_config = config.data_preprocessing_config()

        except Exception as e:
            logging.error(f"Failed to create data preprocessing configuration: {e}", exc_info=True)
            raise AppException(e, sys)


    def preprocess(self, df: pd.DataFrame, filename: str) -> pd.DataFrame:
        """
        Preprocesses the given dataframe: lowercasing, punctuation removal,
        POS-tagged lemmatization, then stopword removal.

        Note: the dataframe is modified in place (rows with any NaN are
        dropped and the 'Content' column is rewritten); the same object is
        returned and also saved as a feather file in the configured
        preprocessed-data directory.

        Args:
            df (pd.DataFrame): The dataframe to be preprocessed. Must have
                a 'Content' column of strings.
            filename (str): The filename to save the preprocessed dataframe.

        Returns:
            pd.DataFrame: The preprocessed dataframe.

        Raises:
            AppException: If any preprocessing or saving step fails.
        """
        fn = HelperFunctions()
        # Preprocessing steps
        try:
            # Rows with missing values cannot go through the text pipeline.
            df.dropna(how='any', inplace=True)
            tqdm.pandas()  # enables progress_apply with progress bars

            logging.info("Performing lowercasing")
            df['Content'] = df['Content'].progress_apply(fn.lower_case)

            logging.info("Removing punctuations")
            df['Content'] = df['Content'].progress_apply(fn.remove_punctuations)

            # NOTE(review): lemmatization runs before stopword removal —
            # presumably so the POS tagger sees ungapped sentences; keep
            # this order.
            logging.info("Performing pos tagging and lemmatization")
            df['Content'] = df['Content'].progress_apply(fn.lemmatization)

            logging.info("Removing stopwords")
            df['Content'] = df['Content'].progress_apply(fn.remove_stopwords)

            logging.info("Finished preprocessing operations successfully")

            preprocessed_data_dir = self.data_preprocessing_config.preprocessed_data_dir
            df.to_feather(Path(preprocessed_data_dir, filename))

            logging.info(f"Data successfully saved at {preprocessed_data_dir}")
            return df

        except Exception as e:
            logging.error(f"Data preprocessing failed: {e}", exc_info=True)
            raise AppException(e, sys)
    
def initiate_data_preprocessing():
    """
    Main function to initiate the data preprocessing workflow.

    Reads the ingested dataset from the configured parquet path, runs the
    preprocessing pipeline on it, and saves the preprocessed data (as a
    feather file — see DataPreprocessing.preprocess) under the filename
    given by the configuration.

    Raises:
        AppException: If an error occurs during data preprocessing.
    """
    obj = DataPreprocessing()
    try:
        logging.info(f"{'='*20}Data Preprocessing{'='*20}")
        data_path = obj.data_preprocessing_config.data_path
        if not data_path:
            raise ValueError("No data path found")
        
        df = pd.read_parquet(data_path)
        preprocessed_dataset_name = obj.data_preprocessing_config.preprocessed_data_filename
        obj.preprocess(df, preprocessed_dataset_name)
        # Release the (potentially large) dataframe eagerly to cut peak memory.
        del df
        gc.collect()
        logging.info(f"{'='*20}Data Preprocessing Completed Successfully{'='*20} \n\n")

    except Exception as e:
        logging.error(f"Error in Data Preprocessing process: {e}", exc_info=True)
        raise AppException(e, sys)
    

# Script entry point: run the full preprocessing workflow end to end.
if __name__ == "__main__":
    initiate_data_preprocessing()