Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import pickle | |
| from sklearn.preprocessing import LabelEncoder | |
| from config import TEXT_COLUMN, LABEL_COLUMNS, METADATA_COLUMNS, LABEL_ENCODERS_PATH | |
def load_and_preprocess_data(data_path):
    """
    Load the dataset, fill missing values, coerce metadata columns to numeric,
    and encode each target label column with its own LabelEncoder.

    Args:
        data_path (str): Path to the CSV file.

    Returns:
        data (pd.DataFrame): Preprocessed data; every column in LABEL_COLUMNS
            is replaced by the integer codes produced by its encoder.
        label_encoders (dict): Fitted LabelEncoder per label column, keyed by
            column name.

    Raises:
        KeyError: If any column listed in LABEL_COLUMNS is absent from the CSV.
    """
    data = pd.read_csv(data_path)

    # Fail early with a single message naming every missing label column,
    # instead of a bare KeyError raised midway through the encoding loop.
    missing_labels = [col for col in LABEL_COLUMNS if col not in data.columns]
    if missing_labels:
        raise KeyError(
            f"Label columns missing from {data_path!r}: {missing_labels}"
        )

    data = data.fillna("Unknown")

    # Optional: convert numeric metadata. Non-numeric entries (including the
    # "Unknown" placeholder inserted above) are coerced to NaN and then to 0.
    for col in METADATA_COLUMNS:
        if col in data.columns:
            data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)

    # Encode target labels
    label_encoders = {}
    for col in LABEL_COLUMNS:
        labels = data[col]
        if labels.dtype == object:
            # A numeric label column that had gaps now holds numbers mixed
            # with the "Unknown" string; LabelEncoder rejects mixed types, so
            # normalise object columns to plain strings first. Columns that
            # already hold only strings are unaffected by the cast.
            labels = labels.astype(str)
        le = LabelEncoder()
        data[col] = le.fit_transform(labels)
        label_encoders[col] = le

    return data, label_encoders
def save_label_encoders(label_encoders):
    """
    Pickle the fitted label encoders to LABEL_ENCODERS_PATH.

    Args:
        label_encoders: Object to serialize (the encoders to persist).
    """
    # Binary write mode: pickle emits bytes, not text.
    with open(LABEL_ENCODERS_PATH, "wb") as out_file:
        pickle.dump(label_encoders, out_file)
def load_label_encoders():
    """
    Read back the label encoders pickled at LABEL_ENCODERS_PATH.

    Returns:
        The object stored in the pickle file.
    """
    # NOTE(review): unpickling can execute arbitrary code; this presumably
    # reads only files this project wrote itself -- confirm the file's origin.
    with open(LABEL_ENCODERS_PATH, "rb") as in_file:
        encoders = pickle.load(in_file)
    return encoders