import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
import sys
import string
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from src.core.logger import logging
from src.core.exception import AppException
from src.core.configuration import AppConfiguration
import gc
class HelperFunctions:
    """Text-preprocessing helpers.

    Provides lowercasing, punctuation stripping, English-stopword removal,
    and POS-tag-aware lemmatization for whitespace-tokenized text.

    The expensive NLTK resources (the WordNet lemmatizer, the POS->WordNet
    tag map, and the stopword set) are created lazily on first use and
    cached at class level, instead of being rebuilt on every call as
    before. Merely importing or instantiating this class touches no NLTK
    data.
    """

    # Lazily-initialized shared resources; populated on first use.
    _lemmatizer = None
    _wordnet_map = None
    _stop_words = None

    def __init__(self):
        """No per-instance state; all helpers are effectively stateless."""
        pass

    def lower_case(self, text: str) -> str:
        """Return ``text`` converted to lowercase."""
        return text.lower()

    def remove_punctuations(self, text: str) -> str:
        """Return ``text`` with every ASCII punctuation character removed."""
        # str.translate does the removal in a single C-level pass.
        return text.translate(str.maketrans("", "", string.punctuation))

    def remove_stopwords(self, text: str) -> str:
        """Return ``text`` with English stopwords dropped.

        Tokenization is plain whitespace splitting; remaining tokens are
        re-joined with single spaces.
        """
        cls = HelperFunctions
        if cls._stop_words is None:
            # Built once; frozenset gives O(1) membership tests.
            cls._stop_words = frozenset(stopwords.words('english'))
        return " ".join(w for w in text.split() if w not in cls._stop_words)

    def lemmatization(self, text: str) -> str:
        """POS-tag ``text`` and reduce each word to its WordNet base form.

        The first letter of the Penn Treebank tag selects the WordNet POS
        (noun/verb/adjective/adverb); anything else defaults to noun,
        matching WordNetLemmatizer's own default.
        """
        cls = HelperFunctions
        if cls._lemmatizer is None:
            # Constructed once instead of per call.
            cls._lemmatizer = WordNetLemmatizer()
            cls._wordnet_map = {
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "J": wordnet.ADJ,
                "R": wordnet.ADV,
            }
        tagged = pos_tag(text.split())
        return " ".join(
            cls._lemmatizer.lemmatize(word, cls._wordnet_map.get(tag[0], wordnet.NOUN))
            for word, tag in tagged
        )
class DataPreprocessing:
    """Runs the text-preprocessing pipeline over a dataframe's ``Content``
    column and persists the result as a Feather file."""

    def __init__(self, config=None):
        """Create the data-preprocessing configuration.

        Args:
            config (AppConfiguration | None): The application configuration.
                When omitted, a fresh ``AppConfiguration`` is built on each
                call. (The previous ``config=AppConfiguration()`` default was
                evaluated once at function-definition time — i.e. at module
                import — and then shared by every call; the ``None`` sentinel
                avoids that pitfall while remaining backward-compatible.)

        Raises:
            AppException: If building the configuration fails.
        """
        try:
            if config is None:
                config = AppConfiguration()
            self.data_preprocessing_config = config.data_preprocessing_config()
        except Exception as e:
            logging.error(f"Failed to create data preprocessing configuration: {e}", exc_info=True)
            raise AppException(e, sys)

    def preprocess(self, df: pd.DataFrame, filename: str) -> pd.DataFrame:
        """Preprocess ``df['Content']`` and save the result as Feather.

        Steps, in order: drop rows with any NA, lowercase, strip
        punctuation, POS-tag + lemmatize, then remove stopwords.
        Lemmatization runs before stopword removal so tagging sees the
        full sentence context.

        NOTE: the caller's dataframe is mutated in place (``dropna`` and
        the column reassignments) and the same object is returned.

        Args:
            df (pd.DataFrame): The dataframe to preprocess; must contain a
                ``Content`` column.
            filename (str): File name for the saved Feather output.

        Returns:
            pd.DataFrame: The preprocessed dataframe.

        Raises:
            AppException: If any preprocessing or save step fails.
        """
        fn = HelperFunctions()
        try:
            df.dropna(how='any', inplace=True)
            tqdm.pandas()  # enables progress_apply with a progress bar
            logging.info("Performing lowercasing")
            df['Content'] = df['Content'].progress_apply(fn.lower_case)
            logging.info("Removing punctuations")
            df['Content'] = df['Content'].progress_apply(fn.remove_punctuations)
            logging.info("Performing pos tagging and lemmatization")
            df['Content'] = df['Content'].progress_apply(fn.lemmatization)
            logging.info("Removing stopwords")
            df['Content'] = df['Content'].progress_apply(fn.remove_stopwords)
            logging.info("Finished preprocessing operations successfully")
            preprocessed_data_dir = self.data_preprocessing_config.preprocessed_data_dir
            df.to_feather(Path(preprocessed_data_dir, filename))
            logging.info(f"Data successfully saved at {preprocessed_data_dir}")
            return df
        except Exception as e:
            logging.error(f"Data preprocessing failed: {e}", exc_info=True)
            raise AppException(e, sys)
def initiate_data_preprocessing():
    """Entry point for the data-preprocessing workflow.

    Reads the ingested dataset (Parquet) from the configured data path,
    runs the preprocessing pipeline, and saves the result as a Feather
    file — not CSV, as the previous docstring incorrectly claimed — under
    the configured output directory.

    Raises:
        AppException: If any step of the workflow fails (including a
            missing/empty data path).
    """
    obj = DataPreprocessing()
    try:
        logging.info(f"{'='*20}Data Preprocessing{'='*20}")
        data_path = obj.data_preprocessing_config.data_path
        if not data_path:
            raise ValueError("No data path found")
        df = pd.read_parquet(data_path)
        preprocessed_dataset_name = obj.data_preprocessing_config.preprocessed_data_filename
        obj.preprocess(df, preprocessed_dataset_name)
        # Free the (potentially large) dataframe eagerly to keep peak RSS down.
        del df
        gc.collect()
        logging.info(f"{'='*20}Data Preprocessing Completed Successfully{'='*20} \n\n")
    except Exception as e:
        logging.error(f"Error in Data Preprocessing process: {e}", exc_info=True)
        raise AppException(e, sys)
# Run the full preprocessing pipeline when executed as a script.
if __name__ == "__main__":
    initiate_data_preprocessing()