Spaces:
Runtime error
Runtime error
| import os | |
| import io | |
| import pandas as pd | |
def read_file(file_path):
    """
    Read a tabular data file from disk into a pandas DataFrame.

    The format is chosen from the file extension, compared case-insensitively
    (so ``data.CSV`` is treated like ``data.csv``).

    :param file_path: Path to a ``.csv``, ``.json``, ``.xls`` or ``.xlsx`` file.
    :return: DataFrame holding the contents of the file.
    :raises ValueError: If the file is larger than 200MB or its extension is not supported.
    """
    # Check the size of the file
    if os.path.getsize(file_path) > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("Too large file")

    # Only look at the final path component so that a dot in a directory name
    # (e.g. "data.v1/table") is not mistaken for an extension.
    base_name = os.path.basename(file_path)
    _, dot, suffix = base_name.rpartition('.')
    file_extension = suffix.lower() if dot else ''

    if file_extension == 'csv':
        # Read CSV file
        return pd.read_csv(file_path)
    elif file_extension == 'json':
        # Read JSON file
        return pd.read_json(file_path)
    elif file_extension == 'xlsx':
        # Read modern Excel workbook
        return pd.read_excel(file_path, engine='openpyxl')
    elif file_extension == 'xls':
        # openpyxl cannot parse the legacy binary .xls format, so leave the
        # engine unset and let pandas pick a reader for the file.
        return pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format: " + file_extension)
def read_file_from_streamlit(uploaded_file):
    """
    Read an uploaded file object into a pandas DataFrame.

    The format is chosen from the extension of ``uploaded_file.name``,
    compared case-insensitively.

    :param uploaded_file: File-like object exposing ``size``, ``name`` and ``read()``
        (presumably a Streamlit UploadedFile, going by the function name).
    :return: DataFrame holding the contents of the upload.
    :raises ValueError: If the upload is larger than 200MB or its extension is not supported.
    """
    # Check the size of the file
    if uploaded_file.size > 200 * 1024 * 1024:  # 200MB in bytes
        raise ValueError("Too large file")

    # Extract the file extension; a name without any dot has no extension.
    _, dot, suffix = uploaded_file.name.rpartition('.')
    file_extension = suffix.lower() if dot else ''

    if file_extension == 'csv':
        # Read CSV file
        return pd.read_csv(uploaded_file)
    elif file_extension == 'json':
        # Read JSON file
        return pd.read_json(uploaded_file)
    elif file_extension == 'xlsx':
        # Use io.BytesIO to hand pandas a seekable binary stream
        return pd.read_excel(io.BytesIO(uploaded_file.read()), engine='openpyxl')
    elif file_extension == 'xls':
        # openpyxl cannot parse the legacy binary .xls format, so leave the
        # engine unset and let pandas pick a reader from the stream contents.
        return pd.read_excel(io.BytesIO(uploaded_file.read()))
    else:
        raise ValueError("Unsupported file format: " + file_extension)
def select_Y(df, Y_name):
    """
    Split a DataFrame into feature columns and a target column.

    :param df: DataFrame to split.
    :param Y_name: Name of the target column.
    :return: ``(X, Y)`` where ``X`` is ``df`` without the target column and ``Y``
        is the target column, or ``-1`` when ``Y_name`` is not a column of ``df``.
    """
    # Guard clause: callers receive the -1 sentinel for an unknown column.
    if Y_name not in df.columns:
        return -1

    target = df[Y_name]
    features = df.drop(Y_name, axis=1)
    return features, target
def check_all_columns_numeric(df):
    """
    Check if all columns in a DataFrame are numeric. Return True if so, False otherwise.

    Uses ``pd.api.types.is_numeric_dtype`` on every column dtype, the same test
    that ``non_numeric_columns_and_head`` applies, so the two functions always
    agree on which columns count as numeric (including narrow and unsigned
    integer widths such as int8 or uint8). An empty DataFrame yields True.

    :param df: Pandas DataFrame to be examined.
    :return: True when every column has a numeric dtype, False otherwise.
    """
    # Iterating df.dtypes (rather than df[col]) also stays correct when column
    # labels are duplicated.
    return all(pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes)
def non_numeric_columns_and_head(df, num_rows=20):
    """
    Find the non-numeric columns of a DataFrame and preview their first rows.

    :param df: Pandas DataFrame to be examined.
    :param num_rows: Number of leading rows to include in the preview (default is 20).
    :return: A tuple with two elements:
        1. List of the column names whose dtype is not numeric.
        2. CSV-formatted string with the first ``num_rows`` rows of those columns.
    """
    is_numeric = pd.api.types.is_numeric_dtype

    # Collect the names of every column that fails the numeric-dtype test
    non_numeric_cols = []
    for name in df.columns:
        if is_numeric(df[name]):
            continue
        non_numeric_cols.append(name)

    # Serialize the leading rows of just those columns
    preview = df[non_numeric_cols].head(num_rows)
    return non_numeric_cols, preview.to_csv()
def contain_null_attributes_info(df):
    """
    Identifies columns with missing values, summarizes their statistics, and reports their data types.

    :param df: A pandas DataFrame to be analyzed.
    :return: A tuple containing:
        - A list of columns that contain null values.
        - A string with one ``name:dtype`` line per such column.
        - A CSV-formatted string containing descriptive statistics (count, mean,
          median, and standard deviation) for these columns. Statistics that
          ``describe`` does not produce for the selected columns (e.g. mean for
          text-only columns) are left empty.
        Returns an empty list, -1, and -1 if no columns with null values are found.
    """
    attributes = df.columns[df.isnull().any()].tolist()
    if not attributes:
        return [], -1, -1

    null_columns = df[attributes]
    summary = null_columns.describe(percentiles=[.5])
    # reindex (not .loc) so that a missing statistic row becomes an empty row
    # instead of raising KeyError when every null column is non-numeric.
    summary = summary.reindex(['count', 'mean', '50%', 'std'])
    description_info = summary.round(2).to_csv()

    types_info = "\n".join(f"{name}:{dtype}" for name, dtype in null_columns.dtypes.items())
    return attributes, types_info, description_info
def attribute_info(df):
    """
    Describe the columns of a DataFrame.

    :param df: Pandas DataFrame to be described.
    :return: A tuple of:
        - the list of column names,
        - a string with one ``name:dtype`` line per column,
        - a CSV-formatted string of the first 10 rows.
    """
    # One "name:dtype" entry per column, in column order
    type_lines = []
    for column, dtype in df.dtypes.items():
        type_lines.append(f"{column}:{dtype}")

    column_names = df.columns.tolist()
    preview_csv = df.head(10).to_csv()
    return column_names, "\n".join(type_lines), preview_csv
def get_data_overview(df):
    """
    Build a textual overview of a DataFrame.

    :param df: Pandas DataFrame to be summarized.
    :return: A tuple of four strings:
        - the shape, as ``str(df.shape)``,
        - CSV of ``df.head()``,
        - CSV of the per-column unique-value counts,
        - CSV of ``df.describe(include='all')``.
    """
    unique_counts = df.nunique()
    statistics = df.describe(include='all')
    first_rows = df.head()
    return (
        str(df.shape),
        first_rows.to_csv(),
        unique_counts.to_csv(),
        statistics.to_csv(),
    )
def get_balance_info(df, Y_name):
    """
    Summarize a DataFrame together with the value distribution of one column.

    :param df: Pandas DataFrame to be summarized.
    :param Y_name: Name of the column whose value counts are reported.
    :return: A tuple of:
        - ``df.shape`` (a tuple, not a string),
        - CSV of ``df.describe()``,
        - a dict mapping each value of ``df[Y_name]`` to its number of occurrences.
    """
    value_counts = df[Y_name].value_counts()
    statistics_csv = df.describe().to_csv()
    return df.shape, statistics_csv, value_counts.to_dict()
def separate_decode_list(decided_dict, Y_name):
    """
    Split a column -> decision mapping into three column lists.

    Decision 1 means convert to integer, 2 means one-hot encode, 3 means drop.
    If ``Y_name`` was assigned to one-hot encoding or to dropping, it is moved
    to the convert-to-integer list instead.

    :param decided_dict: Mapping of column name to decision code (1, 2 or 3).
    :param Y_name: Name of the target column; a falsy value disables the move.
    :return: ``(convert_int_cols, one_hot_cols, drop_cols)``.
    """
    convert_int_cols, one_hot_cols, drop_cols = [], [], []
    for column, decision in decided_dict.items():
        if decision == 1:
            convert_int_cols.append(column)
        if decision == 2:
            one_hot_cols.append(column)
        if decision == 3:
            drop_cols.append(column)

    # The target column is never one-hot encoded or dropped
    for bucket in (one_hot_cols, drop_cols):
        if Y_name and Y_name in bucket:
            bucket.remove(Y_name)
            convert_int_cols.append(Y_name)

    return convert_int_cols, one_hot_cols, drop_cols
def separate_fill_null_list(fill_null_dict):
    """
    Split a column -> fill-strategy mapping into one column list per strategy.

    Strategy codes: 1 mean, 2 median, 3 mode, 4 new category, 5 interpolation.

    :param fill_null_dict: Mapping of column name to strategy code.
    :return: ``(mean_list, median_list, mode_list, new_category_list, interpolation_list)``.
    """
    def columns_with(code):
        # Columns assigned the given strategy code, in mapping order
        return [column for column, choice in fill_null_dict.items() if choice == code]

    mean_list = columns_with(1)
    median_list = columns_with(2)
    mode_list = columns_with(3)
    new_category_list = columns_with(4)
    interpolation_list = columns_with(5)
    return mean_list, median_list, mode_list, new_category_list, interpolation_list
def get_selected_models(model_dict):
    """
    Return the values of the model dictionary as a list, in dictionary order.

    :param model_dict: Dictionary whose values are the selected models.
    :return: New list containing ``model_dict``'s values.
    """
    selected = [*model_dict.values()]
    return selected
def get_model_name(model_no):
    """
    Returns the name of the classification model based on the model number.

    :param model_no: Model number, 1 through 7.
    :return: The model's display name, or None for any other number.
    """
    names = (
        (1, "Logistic Regression"),
        (2, "SVM"),
        (3, "Naive Bayes"),
        (4, "Random Forest"),
        (5, "ADA Boost"),
        (6, "XGBoost"),
        (7, "Gradient Boost"),  # spelling fixed (was "Grandient Boost")
    )
    # Compare with == (rather than a dict lookup) so unhashable or
    # numeric-like inputs behave exactly as with an if/elif chain.
    for number, name in names:
        if model_no == number:
            return name
    return None
def get_cluster_method_name(method):
    """
    Map a clustering method number to its display name.

    :param method: Method number, 1 through 5.
    :return: The method's name, or None for any other number.
    """
    names = (
        (1, "K-Means"),
        (2, "DBSCAN"),
        (3, "Gaussian Mixture"),
        (4, "Agglomerative Clustering"),
        (5, "Spectral Clustering"),
    )
    for number, name in names:
        if method == number:
            return name
    return None
def get_balance_method_name(method):
    """
    Map a balance method number to its display name.

    :param method: Method number, 1 through 4.
    :return: The method's name (number 4 yields the string "None"), or the
        None object for any other number.
    """
    names = (
        (1, "ROS"),
        (2, "SMOTE"),
        (3, "ADASYN"),
        (4, "None"),
    )
    for number, name in names:
        if method == number:
            return name
    return None
def get_regression_method_name(method):
    """
    Map a regression method number to its display name.

    :param method: Method number, 1 through 6.
    :return: The method's name, or None for any other number.
    """
    names = (
        (1, "Linear Regression"),
        (2, "Ridge Regression"),
        (3, "Lasso Regression"),
        (4, "Random Forest"),
        (5, "Gradient Boosting"),
        (6, "Elastic Net"),
    )
    for number, name in names:
        if method == number:
            return name
    return None
def count_unique(df, Y):
    """
    Count the distinct values in one column of a DataFrame.

    :param df: Pandas DataFrame holding the column.
    :param Y: Label used to select the column via ``df[Y]``.
    :return: Result of ``nunique()`` on the selected column.
    """
    selected = df[Y]
    return selected.nunique()