logreg__ / dataset_utils.py
ganeshkonapalli's picture
Create dataset_utils.py
c61689a verified
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder
from config import TEXT_COLUMN, LABEL_COLUMNS, METADATA_COLUMNS, LABEL_ENCODERS_PATH
def load_and_preprocess_data(data_path):
    """
    Load a CSV dataset and prepare it for training.

    Processing steps, in order:
      1. Read the CSV at ``data_path``.
      2. Replace every missing value with the string "Unknown".
      3. For each column named in METADATA_COLUMNS that is present in the
         file, coerce it to numeric; anything that cannot be parsed
         (including the "Unknown" placeholder) becomes 0.
      4. Fit one LabelEncoder per column in LABEL_COLUMNS and replace the
         column with its integer codes.

    Args:
        data_path (str): Path to the CSV file.

    Returns:
        data (pd.DataFrame): Preprocessed data.
        label_encoders (dict): Fitted LabelEncoder for each label column,
            keyed by column name.

    Raises:
        KeyError: If any column listed in LABEL_COLUMNS is absent from the CSV.
    """
    data = pd.read_csv(data_path)

    # Fail early with one message naming every missing label column instead
    # of a bare KeyError raised halfway through encoding.
    missing_labels = [col for col in LABEL_COLUMNS if col not in data.columns]
    if missing_labels:
        raise KeyError(
            f"Label column(s) missing from {data_path!r}: {missing_labels}"
        )

    data = data.fillna("Unknown")

    # Optional: convert numeric metadata (columns absent from the file are skipped).
    for col in METADATA_COLUMNS:
        if col in data.columns:
            data[col] = pd.to_numeric(data[col], errors="coerce").fillna(0)

    # Encode target labels.
    label_encoders = {}
    for col in LABEL_COLUMNS:
        labels = data[col]
        # A numeric/boolean label column that had missing values now mixes
        # numbers with the "Unknown" string, which LabelEncoder rejects with
        # a TypeError. Casting object columns to str makes them uniform; it
        # is a no-op for columns that already contain only strings, and
        # purely numeric columns are left untouched.
        if labels.dtype == object:
            labels = labels.astype(str)
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(labels)
        label_encoders[col] = encoder

    return data, label_encoders
def save_label_encoders(label_encoders):
    """
    Persist the fitted label encoders to disk.

    The object is pickled to the file at LABEL_ENCODERS_PATH, overwriting
    any existing file at that location.

    Args:
        label_encoders (dict): Fitted encoders, as returned by
            load_and_preprocess_data.
    """
    with open(LABEL_ENCODERS_PATH, "wb") as out_file:
        writer = pickle.Pickler(out_file)
        writer.dump(label_encoders)
def load_label_encoders():
    """
    Read back the label encoders stored at LABEL_ENCODERS_PATH.

    NOTE: unpickling can execute arbitrary code, so the file at
    LABEL_ENCODERS_PATH must come from a trusted source.

    Returns:
        The unpickled object (the encoders dict written by
        save_label_encoders, if that is what produced the file).
    """
    with open(LABEL_ENCODERS_PATH, "rb") as in_file:
        reader = pickle.Unpickler(in_file)
        encoders = reader.load()
    return encoders