Xgboost_hm / dataset_utils.py
ganeshkonapalli's picture
Update dataset_utils.py
cc93d14 verified
raw
history blame contribute delete
956 Bytes
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from config import TEXT_COLUMN, LABEL_COLUMNS, METADATA_COLUMNS, LABEL_ENCODERS_PATH
def load_and_preprocess_data(data_path):
    """
    Load a CSV file, fill missing values, and label-encode the label columns.

    Processing steps, in order:

    1. Read ``data_path`` with ``pandas.read_csv``.
    2. Replace every missing value in the frame with the string ``"Unknown"``.
    3. For each name in ``METADATA_COLUMNS`` that exists in the frame, convert
       the column to numeric; values that cannot be parsed (including the
       ``"Unknown"`` placeholder) become ``0``.
    4. For each name in ``LABEL_COLUMNS``, fit a fresh ``LabelEncoder`` on the
       column and replace the column with the encoded integers.

    Args:
        data_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A ``(data, label_encoders)`` tuple: the processed DataFrame and a dict
        mapping each label column name to its fitted ``LabelEncoder``.

    Raises:
        KeyError: If any column listed in ``LABEL_COLUMNS`` is absent from the
            CSV.
    """
    data = pd.read_csv(data_path)

    # Fail early with a readable message instead of a bare KeyError halfway
    # through the encoding loop.
    missing_labels = [col for col in LABEL_COLUMNS if col not in data.columns]
    if missing_labels:
        raise KeyError(
            f"Label columns missing from {data_path!r}: {missing_labels}"
        )

    data = data.fillna("Unknown")

    for col in METADATA_COLUMNS:
        if col in data.columns:
            data[col] = pd.to_numeric(data[col], errors="coerce").fillna(0)

    label_encoders = {}
    for col in LABEL_COLUMNS:
        # A numeric label column that had missing values now holds numbers
        # mixed with the "Unknown" string; LabelEncoder rejects mixed types,
        # so normalise object columns to plain strings first. Purely numeric
        # columns are left alone so their encoder classes stay numeric.
        if data[col].dtype == object:
            data[col] = data[col].astype(str)
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])
        label_encoders[col] = encoder

    return data, label_encoders
def save_label_encoders(label_encoders):
    """
    Pickle ``label_encoders`` to the file at ``LABEL_ENCODERS_PATH``.

    The file is opened in binary write mode, so any existing content at that
    path is overwritten.
    """
    serialized = pickle.dumps(label_encoders)
    with open(LABEL_ENCODERS_PATH, "wb") as out_file:
        out_file.write(serialized)
def load_label_encoders():
    """
    Read and unpickle the object stored at ``LABEL_ENCODERS_PATH``.

    Returns whatever object was pickled into that file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(LABEL_ENCODERS_PATH, "rb") as in_file:
        payload = in_file.read()
    return pickle.loads(payload)