Spaces:

zaid002
/

multiple-disease-prediction

Sleeping

App Files Files Community

multiple-disease-prediction / data_preprocessing.py

zaid002

Upload 6 files

20fdb7e verified 3 months ago

raw

history blame contribute delete

1.95 kB

	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from sklearn.impute import SimpleImputer
	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline

	def load_csv(path):
	    """Load a CSV file into a pandas DataFrame.

	    Args:
	        path: Location of the CSV file. Passed straight to
	            ``pandas.read_csv``, so anything it accepts is valid here.

	    Returns:
	        pandas.DataFrame: The parsed contents of the file.
	    """
	    # Honor the caller-supplied location instead of a fixed notebook path.
	    return pd.read_csv(path)

	def build_preprocessing_pipeline(df, categorical_cols=None, numeric_cols=None, scale=True):
	    """Build an unfitted ``ColumnTransformer`` for the columns of ``df``.

	    Numeric columns are imputed with the median and, when ``scale`` is
	    true, standardized. Categorical columns are imputed with the most
	    frequent value and one-hot encoded, ignoring categories that were not
	    seen during fitting. Columns in neither list are dropped, and the
	    transformer is configured to always return a dense array.

	    Args:
	        df: DataFrame used only to auto-detect column types when the column
	            lists are not supplied. It is neither modified nor fitted on.
	        categorical_cols: Names of categorical columns. When ``None``, all
	            ``object`` and ``category`` dtype columns of ``df`` are used.
	        numeric_cols: Names of numeric columns. When ``None``, all numeric
	            dtype columns of ``df`` are used. Auto-detection cannot tell
	            features from a numeric target or index column, so the caller
	            must remove those from ``df`` (or pass explicit lists).
	        scale: Whether to standardize the numeric columns after imputation.

	    Returns:
	        tuple: ``(preprocessor, numeric_cols, categorical_cols)`` where
	        ``preprocessor`` is the unfitted ``ColumnTransformer`` and the two
	        lists are the column names that were actually used.
	    """
	    # Auto-detect column groups if the caller did not provide them.
	    if categorical_cols is None:
	        categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
	    if numeric_cols is None:
	        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

	    # Numeric preprocessing. With no numeric columns a 'passthrough'
	    # placeholder is used: an empty list is not a valid transformer and
	    # would be rejected by ColumnTransformer at fit time, whereas
	    # 'passthrough' over an empty column selection contributes nothing.
	    if numeric_cols:
	        numeric_transformer = Pipeline(steps=[
	            ('imputer', SimpleImputer(strategy='median')),
	            ('scaler', StandardScaler() if scale else 'passthrough'),
	        ])
	    else:
	        numeric_transformer = 'passthrough'

	    # Categorical preprocessing, with the same placeholder convention.
	    if categorical_cols:
	        categorical_transformer = Pipeline(steps=[
	            ('imputer', SimpleImputer(strategy='most_frequent')),
	            ('onehot', OneHotEncoder(handle_unknown='ignore')),
	        ])
	    else:
	        categorical_transformer = 'passthrough'

	    # sparse_threshold=0 forces dense output even though the one-hot
	    # encoder may produce a sparse matrix.
	    preprocessor = ColumnTransformer(
	        transformers=[
	            ('num', numeric_transformer, numeric_cols),
	            ('cat', categorical_transformer, categorical_cols),
	        ],
	        remainder='drop',
	        sparse_threshold=0,
	    )

	    return preprocessor, numeric_cols, categorical_cols

	def split_features_target(df, target_col, test_size=0.2, random_state=42):
	    """Split ``df`` into train/test feature frames and target series.

	    The split is stratified on the target when that is possible: the
	    target must have more than one distinct value and every distinct value
	    must occur at least twice. Otherwise a plain random split is used,
	    because a stratified split raises when a class has a single member.

	    Args:
	        df: DataFrame containing both the features and the target column.
	        target_col: Name of the target column; it is removed from the
	            returned feature frames.
	        test_size: Size of the test split, forwarded to
	            ``train_test_split``.
	        random_state: Seed forwarded to ``train_test_split`` so the split
	            is reproducible.

	    Returns:
	        tuple: ``(X_train, X_test, y_train, y_test)``.

	    Raises:
	        KeyError: If ``target_col`` is not a column of ``df``.
	    """
	    X = df.drop(columns=[target_col])
	    y = df[target_col]

	    # Count members per class (missing values form their own class) to
	    # decide whether stratification can succeed.
	    class_counts = y.value_counts(dropna=False)
	    can_stratify = len(class_counts) > 1 and class_counts.min() >= 2

	    X_train, X_test, y_train, y_test = train_test_split(
	        X,
	        y,
	        stratify=y if can_stratify else None,
	        test_size=test_size,
	        random_state=random_state,
	    )
	    return X_train, X_test, y_train, y_test