Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| from sklearn.decomposition import PCA | |
| from sklearn.preprocessing import StandardScaler | |
| from src.preprocess import convert_to_integer | |
def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
    """
    Determines whether PCA should be performed based on cumulative variance threshold and dimension reduction ratio.

    Only the numeric columns of `df` are standardized and fed to PCA. The
    dimension reduction ratio, however, is measured against the total number
    of columns in `df` (numeric and non-numeric alike).

    Parameters:
    - df (DataFrame): The input DataFrame.
    - cumulative_variance_threshold (float): The threshold of explained variance to retain. Default is 0.95.
    - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.

    Returns:
    - perform_pca (bool): Whether PCA should be performed.
    - n_components (int): The number of principal components to retain. If the
      cumulative explained variance never reaches the threshold, all fitted
      components are retained.
    """
    # Remove non-numeric columns
    numeric_df = df.select_dtypes(include=[np.number])

    # Standardizing the Data
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # PCA for Explained Variance (all components kept so the full curve is available)
    pca = PCA()
    pca.fit(scaled_data)

    # Calculate cumulative variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Find the first component count whose cumulative variance meets the threshold.
    # The cumulative sum can stop just short of the threshold because of
    # floating-point round-off (e.g. threshold == 1.0) or contain NaN for
    # degenerate data; in that case fall back to keeping every component
    # instead of raising IndexError.
    reached = np.flatnonzero(cumulative_variance >= cumulative_variance_threshold)
    if reached.size:
        n_components = int(reached[0]) + 1
    else:
        n_components = int(cumulative_variance.size)

    # Calculate the dimension reduction ratio relative to all columns of df
    dim_reduction_ratio = 1 - (n_components / df.shape[1])

    # Check if PCA should be performed based on the dimension reduction ratio.
    # Cast to a native bool so the documented return type holds.
    perform_pca = bool(dim_reduction_ratio >= min_dim_reduction_ratio)
    return perform_pca, n_components
def perform_pca(df, n_components, Y_name=None):
    """
    Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.

    Non-numeric columns are discarded, the remaining features are standardized
    with StandardScaler, and the result is projected onto the principal
    components. The returned frame uses a fresh 0..n-1 index.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain. Any
      other value accepted by sklearn's PCA (e.g. a float variance fraction)
      also works, since output columns are named from the transformed data.
    - Y_name (str, optional): The name of the target column to exclude from PCA. Default is None.

    Returns:
    - pca_df (DataFrame): DataFrame with principal components (columns 'PC1',
      'PC2', ...) and optionally the target column, which is passed through
      `convert_to_integer` after being reattached.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column
    # (errors='ignore' because a non-numeric target is already gone after select_dtypes)
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Standardizing the Data
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_data)

    # Create a new DataFrame with principal components; label by the actual
    # number of components produced rather than by n_components itself
    columns = [f'PC{i}' for i in range(1, principal_components.shape[1] + 1)]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column (index reset so it aligns with pca_df's RangeIndex)
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
    return pca_df
def perform_PCA_for_clustering(df, n_components):
    """
    Projects the given DataFrame onto its principal components for clustering.

    The data is passed to PCA as-is: no column filtering or standardization
    happens here. The result carries a fresh 0..n-1 index.

    Parameters:
    - df (DataFrame): The input DataFrame to apply PCA.
    - n_components (int): The number of principal components to retain.

    Returns:
    - pca_df (DataFrame): DataFrame of the principal components, with columns
      named 'PC1', 'PC2', ..., one per retained component.
    """
    # Fit the decomposition and project the data in one step
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(df)

    # Label each component column with a 1-based number
    component_labels = [f'PC{number}' for number in range(1, n_components + 1)]
    return pd.DataFrame(data=projected, columns=component_labels)
def perform_PCA_for_regression(df, n_components, Y_name=None):
    """
    Applies PCA for regression tasks, excluding a specified target column from the transformation.

    Non-numeric columns are discarded and PCA is fitted on the remaining
    features without standardizing them in this function. The returned frame
    uses a fresh 0..n-1 index.
    NOTE(review): presumably callers scale the features beforehand - confirm.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain. Any
      other value accepted by sklearn's PCA (e.g. a float variance fraction)
      also works, since output columns are named from the transformed data.
    - Y_name (str, optional): The name of the target column to exclude from PCA and append back after transformation. Default is None.

    Returns:
    - pca_df (DataFrame): A new DataFrame with principal components (columns
      'PC1', 'PC2', ...) and the target column, which is passed through
      `convert_to_integer` after being reattached.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column
    # (errors='ignore' because a non-numeric target is already gone after select_dtypes)
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(numeric_df)

    # Create a new DataFrame with principal components; label by the actual
    # number of components produced rather than by n_components itself
    columns = [f'PC{i}' for i in range(1, principal_components.shape[1] + 1)]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column (index reset so it aligns with pca_df's RangeIndex)
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
    return pca_df