import pickle

import pandas as pd
from sklearn.preprocessing import LabelEncoder

from config import TEXT_COLUMN, LABEL_COLUMNS, METADATA_COLUMNS, LABEL_ENCODERS_PATH


def load_and_preprocess_data(data_path):
    """
    Load the dataset, fill missing values, and encode the target labels.

    Steps, in order:
      1. Read the CSV at ``data_path``.
      2. Replace every missing value with the string "Unknown".
      3. For each column in METADATA_COLUMNS that is present, convert it to
         numeric; values that cannot be parsed (including the "Unknown"
         placeholder from step 2) become 0.
      4. Fit one LabelEncoder per column in LABEL_COLUMNS and replace the
         column with its integer codes.

    Args:
        data_path (str): Path to the CSV file.

    Returns:
        data (pd.DataFrame): Preprocessed data with encoded label columns.
        label_encoders (dict): Fitted LabelEncoder for each label column,
            keyed by column name.

    Raises:
        KeyError: If any column listed in LABEL_COLUMNS is absent from the CSV.
    """
    data = pd.read_csv(data_path)

    # Fail early with an explicit message instead of a bare KeyError raised
    # halfway through the encoding loop.
    missing_labels = [col for col in LABEL_COLUMNS if col not in data.columns]
    if missing_labels:
        raise KeyError(
            f"Label column(s) missing from {data_path!r}: {missing_labels}"
        )

    # Rebind rather than using inplace=True: filling numeric columns with a
    # string changes their dtype, which an in-place update should not be
    # relied on to do.
    data = data.fillna("Unknown")

    # Optional: convert numeric metadata. Unparseable entries (including the
    # "Unknown" placeholder inserted above) are coerced to NaN and then to 0.
    for col in METADATA_COLUMNS:
        if col in data.columns:
            data[col] = pd.to_numeric(data[col], errors="coerce").fillna(0)

    # Encode target labels.
    label_encoders = {}
    for col in LABEL_COLUMNS:
        values = data[col]
        # A numeric/boolean label column that had missing values now mixes
        # numbers with the "Unknown" string; LabelEncoder rejects such mixed
        # input, so encode object columns uniformly as strings. Columns that
        # already contain only strings are unaffected by the cast.
        if values.dtype == object:
            values = values.astype(str)
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(values)
        label_encoders[col] = encoder

    return data, label_encoders


def save_label_encoders(label_encoders):
    """
    Save fitted label encoders as a pickle file at LABEL_ENCODERS_PATH.

    Args:
        label_encoders (dict): Mapping of label column name to its fitted
            LabelEncoder, as returned by ``load_and_preprocess_data``.
    """
    with open(LABEL_ENCODERS_PATH, "wb") as f:
        pickle.dump(label_encoders, f)


def load_label_encoders():
    """
    Load previously saved label encoders from LABEL_ENCODERS_PATH.

    Returns:
        The object that was pickled by ``save_label_encoders``.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code while
    # deserializing. Only load files written by save_label_encoders from a
    # trusted location.
    with open(LABEL_ENCODERS_PATH, "rb") as f:
        return pickle.load(f)