import sys
from src.core.constants import PARAMS_FILE
from src.core.configuration import AppConfiguration
from src.core.logger import logging
from src.core.exception import AppException
from src.utils import read_yaml, save_obj
import gc
import pandas as pd
from scipy.sparse import csr_matrix
from pathlib import Path
from typing import Optional
from sklearn.feature_extraction.text import TfidfVectorizer


class FeatureEngineering:
    """TF-IDF feature-engineering stage: fits a vectorizer and persists its outputs."""

    def __init__(self, config: Optional[AppConfiguration] = None):
        """
        Initializes the FeatureEngineering object by creating a feature engineering configuration.

        Args:
            config (AppConfiguration | None): The application configuration object.
                When omitted, a fresh ``AppConfiguration()`` is built at call time
                (not at import time, and not shared between instances).

        Raises:
            AppException: If the configuration cannot be created.
        """
        try:
            # Build the default lazily: a default of ``AppConfiguration()`` in the
            # signature would be evaluated once, when the class body is executed.
            if config is None:
                config = AppConfiguration()
            self.eng_config = config.feature_engineering_config()
        except Exception as e:
            logging.error(f"Failed to create feature engineering Configuration: {e}", exc_info=True)
            raise AppException(e, sys) from e

    def perform_feature_engineering(self, df: pd.DataFrame):
        """
        Performs feature engineering on the given dataframe with TF-IDF vectorization.

        Fits a ``TfidfVectorizer`` (parameters read from ``PARAMS_FILE``) on
        ``df['Content']``, builds a dense training table from the TF-IDF matrix with
        ``df['Label']`` appended as the ``'Label'`` column, then saves:

        * the fitted vectorizer as ``vectorizer.joblib`` in ``models_dir``,
        * a ``vectorizer_meta.txt`` description next to it,
        * the training table as a Feather file at ``training_data_path``.

        Args:
            df (pd.DataFrame): Data with a ``'Content'`` text column and a ``'Label'`` column.

        Raises:
            AppException: If any step of the operation fails.
        """
        try:
            config_params = read_yaml(PARAMS_FILE)
            params = config_params.feature_engineering
            vectorizer_name = params.vectorizer

            vectorizer = TfidfVectorizer(
                max_features=params.max_features,
                min_df=params.min_df,
                ngram_range=(params.ngrams.min, params.ngrams.max),
            )

            logging.info("Performing TF-IDF vectorization")
            X_tfidf = vectorizer.fit_transform(df['Content'])
            X_tfidf = csr_matrix(X_tfidf)

            # NOTE(review): toarray() materialises the full dense matrix, and the
            # resulting frame mixes integer column labels with the 'Label' string
            # column - confirm both are acceptable for the Feather writer in use.
            training_data = pd.DataFrame(X_tfidf.toarray())
            # .values assigns positionally, so df's index does not need to be 0..n-1.
            training_data['Label'] = df['Label'].values

            save_model_path = self.eng_config.models_dir
            save_obj(location_path=save_model_path, obj_name="vectorizer.joblib", obj=vectorizer)

            # Explicit encoding so the metadata file does not depend on the platform default.
            with open(Path(save_model_path, "vectorizer_meta.txt"), 'w', encoding="utf-8") as f:
                f.write(f"{vectorizer_name} has been created and fitted on the training data\n\n {params}")

            logging.info("Saving training dataset")
            training_data.to_feather(self.eng_config.training_data_path)
            logging.info("Feature engineering operation done")
        except Exception as e:
            logging.error(f"Error - feature engineering operation terminated: {e}", exc_info=True)
            raise AppException(e, sys) from e


def initiate_feature_engineering():
    """
    Main function to initiate the feature engineering workflow.

    Reads the preprocessed dataset, drops every row containing a missing value,
    and runs ``FeatureEngineering.perform_feature_engineering`` on the result.

    Raises:
        AppException: If an error occurs during feature engineering, including a
            missing preprocessed-data path.
    """
    obj = FeatureEngineering()
    try:
        logging.info(f"{'='*20}Feature Engineering{'='*20}")

        data_path = obj.eng_config.preprocessed_data_path
        if not data_path:
            # Stop here instead of handing an empty path to pd.read_feather.
            raise FileNotFoundError("Dataset path after preprocessing stage not found")

        df = pd.read_feather(data_path)
        df.dropna(how='any', inplace=True)
        obj.perform_feature_engineering(df)

        # Release the dataframe and stage object before the next pipeline stage.
        del df, obj
        gc.collect()
        logging.info(f"{'='*20}Feature Engineering Completed Successfully{'='*20} \n\n")
    except AppException:
        # Already logged and wrapped by the stage itself; do not wrap it again.
        raise
    except Exception as e:
        logging.error(f"Error during Feature Engineering: {e}", exc_info=True)
        raise AppException(e, sys) from e


if __name__ == "__main__":
    initiate_feature_engineering()