ntl_clustering / utils /preprocess.py
kawaiipeace's picture
update model
53e2114
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
def load_csv(file_path_or_obj):
    """Read CSV content into a pandas DataFrame.

    Args:
        file_path_or_obj: Anything ``pandas.read_csv`` accepts as its first
            argument (it is forwarded unchanged).

    Returns:
        The DataFrame produced by ``pandas.read_csv`` with default options.
    """
    frame = pd.read_csv(file_path_or_obj)
    return frame
def get_numeric(df: pd.DataFrame, strategy: str = "Fill with Mean") -> pd.DataFrame:
    """Extract the numeric columns of *df* and handle their missing values.

    Args:
        df: Input frame; only columns selected by
            ``select_dtypes(include=['number'])`` are kept.
        strategy: How to treat missing values:
            ``"Fill with Mean"`` replaces them with the column mean,
            ``"Fill with Zero"`` replaces them with ``0``,
            ``"Drop Rows"`` removes every row containing a missing value.
            Any other value leaves missing values untouched.

    Returns:
        A DataFrame holding only the numeric columns, processed according
        to *strategy*.
    """
    numbers = df.select_dtypes(include=['number'])

    if strategy == "Fill with Mean":
        column_means = numbers.mean(numeric_only=True)
        return numbers.fillna(column_means)
    if strategy == "Fill with Zero":
        return numbers.fillna(0)
    if strategy == "Drop Rows":
        return numbers.dropna()

    # Unrecognised strategy: hand back the numeric columns as they are.
    return numbers
def get_text_column(df: pd.DataFrame) -> list:
    """Return the values of the first object-dtype column as strings.

    Args:
        df: Input frame to search for an ``object``-dtype column.

    Returns:
        The non-missing values of the first ``object`` column, each converted
        to ``str``; an empty list when the frame has no such column.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    if object_columns.empty:
        return []

    first_name = object_columns[0]
    # Missing entries are dropped before the string conversion.
    values = df[first_name].dropna()
    return values.astype(str).tolist()
def normalize_data(data: pd.DataFrame, method: str):
    """Scale *data* column-wise and return the fitted scaler alongside it.

    Args:
        data: Numeric frame to scale.
        method: ``"z-score"`` fits a scikit-learn ``StandardScaler``,
            ``"mapminmax"`` fits a ``MinMaxScaler``. Any other value
            (conventionally ``"none"``) disables scaling.

    Returns:
        A ``(frame, scaler)`` tuple. ``frame`` has the same columns and the
        same index as *data*; ``scaler`` is the fitted scaler, or ``None``
        when no scaling was applied (in which case ``frame`` is a copy of
        *data*).
    """
    if method == "z-score":
        scaler = StandardScaler()
    elif method == "mapminmax":
        scaler = MinMaxScaler()
    else:  # "none"
        return data.copy(), None

    scaled = scaler.fit_transform(data)
    # Carry the caller's index over: without it the result gets a fresh
    # RangeIndex, which breaks alignment with the source rows (for example
    # after rows were dropped upstream) and disagrees with the unscaled
    # branch, which keeps the index through ``data.copy()``.
    return pd.DataFrame(scaled, columns=data.columns, index=data.index), scaler
def denormalize_data(scaled_data: pd.DataFrame, scaler):
    """Undo a previous scaling step using the fitted *scaler*.

    Args:
        scaled_data: Frame of scaled values.
        scaler: Object exposing ``inverse_transform`` (such as the scaler
            returned by ``normalize_data``), or ``None`` when the data was
            never scaled.

    Returns:
        *scaled_data* itself when *scaler* is ``None``; otherwise a new
        DataFrame of inverse-transformed values with the same columns and
        the same index as *scaled_data*.
    """
    if scaler is None:
        return scaled_data

    restored = scaler.inverse_transform(scaled_data)
    # Keep the input's row labels so the result stays aligned with the
    # frame it came from instead of being renumbered from zero.
    return pd.DataFrame(
        restored,
        columns=scaled_data.columns,
        index=scaled_data.index,
    )