# Source-viewer residue (not program code): Spaces status "Sleeping"; file size: 3,820 bytes; revision 4c01182.
import sys
from src.core.constants import PARAMS_FILE
from src.core.configuration import AppConfiguration
from src.core.logger import logging
from src.core.exception import AppException
from src.utils import read_yaml, save_obj
import gc
import pandas as pd
from scipy.sparse import csr_matrix
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
class FeatureEngineering:
    """
    TF-IDF feature extraction stage.

    Fits a ``TfidfVectorizer`` on the ``Content`` column of a dataframe, stores the
    fitted vectorizer plus a small metadata text file in the configured models
    directory, and writes the resulting feature table (with a ``Label`` column)
    to the configured training data path in Feather format.

    Attributes:
        eng_config: Object returned by ``config.feature_engineering_config()``.
            This class reads ``models_dir`` and ``training_data_path`` from it.
    """

    def __init__(self, config=None):
        """
        Initializes the FeatureEngineering object by creating a feature engineering configuration.

        Args:
            config (AppConfiguration | None): The configuration object containing the
                application configuration. When omitted, a new ``AppConfiguration()``
                is created for this instance.

        Raises:
            AppException: If the configuration cannot be created or queried.
        """
        try:
            # Build the default lazily: a default written in the signature is
            # evaluated once, when the class is defined (i.e. at import time),
            # outside this try block and shared by every instance.
            if config is None:
                config = AppConfiguration()
            self.eng_config = config.feature_engineering_config()
        except Exception as e:
            logging.error(f"Failed to create feature engineering Configuration: {e}", exc_info=True)
            raise AppException(e, sys) from e

    def perform_feature_engineering(self, df: pd.DataFrame):
        """
        Performs feature engineering on the given dataframe by extracting features
        with TF-IDF vectorization.

        The vectorizer settings (``max_features``, ``min_df``, ``ngrams.min`` /
        ``ngrams.max``) are read from the ``feature_engineering`` section of
        ``PARAMS_FILE``. No train/test split is performed here: the whole
        dataframe is vectorized and saved as the training dataset.

        Side effects:
            - Passes the fitted vectorizer to ``save_obj`` under the name
              ``vectorizer.joblib`` with ``eng_config.models_dir`` as location.
            - Writes ``vectorizer_meta.txt`` into ``eng_config.models_dir``.
            - Writes the feature table to ``eng_config.training_data_path``
              via ``DataFrame.to_feather``.

        Args:
            df (pd.DataFrame): Input data; must provide a ``Content`` column
                (text to vectorize) and a ``Label`` column.

        Raises:
            AppException: If any step of the operation fails.
        """
        try:
            config_params = read_yaml(PARAMS_FILE)
            params = config_params.feature_engineering
            vectorizer_name = params.vectorizer
            vectorizer = TfidfVectorizer(
                max_features=params.max_features,
                min_df=params.min_df,
                ngram_range=(params.ngrams.min, params.ngrams.max),
            )

            logging.info("Performing TF-IDF vectorization")
            # fit_transform already returns a SciPy sparse matrix, so no extra
            # csr_matrix(...) wrapping is needed before densifying.
            X_tfidf = vectorizer.fit_transform(df['Content'])

            # The table is materialized densely: one column per TF-IDF feature.
            # NOTE(review): feature columns keep pandas' default integer labels;
            # some pandas versions reject non-string column names in to_feather -
            # confirm against the pinned pandas version.
            training_data = pd.DataFrame(X_tfidf.toarray())
            # .values attaches labels by position, ignoring df's index.
            training_data['Label'] = df['Label'].values

            save_model_path = self.eng_config.models_dir
            save_obj(location_path=save_model_path, obj_name="vectorizer.joblib", obj=vectorizer)
            # Explicit encoding keeps the metadata file independent of the
            # platform's default locale.
            with open(Path(save_model_path, "vectorizer_meta.txt"), 'w', encoding="utf-8") as f:
                f.write(f"{vectorizer_name} has been created and fitted on the training data\n\n {params}")

            logging.info("Saving training dataset")
            training_data.to_feather(self.eng_config.training_data_path)
            logging.info("Feature engineering operation done")
        except Exception as e:
            logging.error(f"Error - feature engineering operation terminated: {e}", exc_info=True)
            raise AppException(e, sys) from e
def initiate_feature_engineering():
    """
    Main function to initiate the feature engineering workflow.

    Creates a ``FeatureEngineering`` object, reads the preprocessed dataset
    (Feather) from ``eng_config.preprocessed_data_path``, drops every row that
    contains a missing value, and hands the result to
    ``perform_feature_engineering``. No train/test split happens in this function.

    Raises:
        AppException: If an error occurs during feature engineering, including
            when no preprocessed data path is configured.
    """
    obj = FeatureEngineering()
    try:
        logging.info(f"{'='*20}Feature Engineering{'='*20}")
        data_path = obj.eng_config.preprocessed_data_path
        if not data_path:
            logging.error("Dataset path after preprocessing stage not found")
            # Stop here instead of passing an empty path to read_feather,
            # which would only fail later with a less specific error.
            raise ValueError("Dataset path after preprocessing stage not found")

        df = pd.read_feather(data_path)
        df = df.dropna(how='any')
        obj.perform_feature_engineering(df)

        # Drop the references and collect so the data can be released
        # before the caller continues.
        del df, obj
        gc.collect()
        logging.info(f"{'='*20}Feature Engineering Completed Successfully{'='*20} \n\n")
    except Exception as e:
        logging.error(f"Error during Feature Engineering: {e}", exc_info=True)
        raise AppException(e, sys) from e
# Run the feature engineering stage when this file is executed as a script.
if __name__ == "__main__":
    initiate_feature_engineering()