Spaces:

OxonTechnologies
/

CSS_EDA_Dashboard

Sleeping

App Files Files Community

CSS_EDA_Dashboard / src /utils /feature_class.py

arash7920

Upload 38 files

e869d90 verified 3 months ago

raw

history blame contribute delete

4.9 kB

	"""
	Feature classification module for detecting data types in DataFrames.

	This module provides the DetectFeatureClasses class which automatically
	classifies features as Binary, Categorical, or Continuous based on their
	statistical properties.
	"""

	import numpy as np


	class DetectFeatureClasses:
	    """
	    Classify the columns of a pandas DataFrame as Binary, Categorical or Continuous.

	    Parameters
	    ----------
	    dataframe : pd.DataFrame
	        The input DataFrame containing features to be classified.
	    categorical_threshold : float, optional
	        Relative tolerance passed to ``np.isclose`` when comparing a column's
	        number of unique values with the number of rows. A column whose
	        unique count is *not* within this tolerance of the row count is
	        treated as categorical. Default is 0.5.
	    string_data_policy : str, optional
	        Policy for columns that cannot be converted to float. ``'drop'``
	        reports them in the list of dropped features; any other value
	        (e.g. ``'ignore'``) leaves them unclassified. Default is ``'drop'``.

	    Methods
	    -------
	    feature_classes() -> tuple[dict, list]
	        Returns a mapping of feature name to class and the list of features
	        dropped because of the string data policy.
	    """

	    def __init__(self, dataframe, categorical_threshold=0.5, string_data_policy='drop'):
	        """
	        Store the DataFrame and the classification settings.

	        Parameters
	        ----------
	        dataframe : pd.DataFrame
	            The input DataFrame containing features to be classified.
	        categorical_threshold : float, optional
	            Relative tolerance used to decide whether a column's unique-value
	            count is close to the row count. Default is 0.5.
	        string_data_policy : str, optional
	            ``'drop'`` to report non-convertible columns as dropped, anything
	            else to leave them unclassified. Default is ``'drop'``.
	        """
	        self.dataframe = dataframe
	        self.categorical_threshold = categorical_threshold
	        self.string_data_policy = string_data_policy

	    def _binaries(self):
	        """
	        Identify binary features in the DataFrame.

	        A feature is considered binary if it has at most 2 unique values
	        (as reported by ``Series.unique()``).

	        Returns
	        -------
	        list
	            Column names classified as binary features.
	        """
	        return [
	            column
	            for column in self.dataframe.columns
	            if len(self.dataframe[column].unique()) <= 2
	        ]

	    def _categorical(self):
	        """
	        Identify categorical features based on the categorical threshold.

	        A feature is considered categorical when its number of unique values
	        is not close to the total number of rows, using
	        ``categorical_threshold`` as the relative tolerance of ``np.isclose``.

	        Returns
	        -------
	        list
	            Column names classified as categorical features.
	        """
	        total_rows = len(self.dataframe)
	        categorical_columns = []
	        for column in self.dataframe.columns:
	            unique_count = len(self.dataframe[column].unique())
	            # np.isclose returns a NumPy boolean, which is never the Python
	            # `False` singleton; an identity check (`is False`) would never
	            # match, so negate the value instead.
	            if not np.isclose(unique_count, total_rows, rtol=self.categorical_threshold):
	                categorical_columns.append(column)
	        return categorical_columns

	    def feature_classes(self):
	        """
	        Classify every column as 'Binary', 'Categorical' or 'Continuous'.

	        Binary takes precedence over Categorical. Remaining columns are cast
	        to float; on success they are 'Continuous' and the converted column
	        is written back to ``self.dataframe`` (in-place side effect). Columns
	        that cannot be cast are handled according to ``string_data_policy``.

	        Returns
	        -------
	        dict
	            Feature names mapped to 'Binary', 'Categorical' or 'Continuous'.
	        list
	            Features dropped due to the string data policy. With a policy
	            other than 'drop', non-convertible features appear in neither
	            the dictionary nor this list.
	        """
	        # Sets give O(1) membership tests inside the loop below.
	        binary_columns = set(self._binaries())
	        categorical_columns = set(self._categorical())
	        features_class_types = {}
	        excess_columns = []

	        for feature in self.dataframe.columns:
	            if feature in binary_columns:
	                features_class_types[feature] = 'Binary'
	                continue
	            if feature in categorical_columns:
	                features_class_types[feature] = 'Categorical'
	                continue

	            # Neither binary nor categorical: continuous only if numeric.
	            try:
	                converted = self.dataframe[feature].astype(float)
	            except (ValueError, TypeError):
	                # Non-numeric data (unparsable strings raise ValueError,
	                # uncastable dtypes/objects raise TypeError).
	                if self.string_data_policy == 'drop':
	                    excess_columns.append(feature)
	                # Any other policy: leave the feature unclassified.
	            else:
	                self.dataframe[feature] = converted
	                features_class_types[feature] = 'Continuous'

	        return features_class_types, excess_columns