import pandas as pd
import numpy as np
from typing import Optional

from backend.utils.logger import logger


def convert_numpy_types(obj):
    """Recursively convert numpy types to native Python types for JSON serialization."""
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, np.integer):
        # np.integer already covers np.int64, np.int32, etc.
        return int(obj)
    elif isinstance(obj, np.floating):
        # np.floating already covers np.float64, np.float32, etc.
        return float(obj)
    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, dict):
        # Convert keys as well: value_counts().to_dict() can yield np.int64 keys,
        # which json.dumps cannot serialize
        return {convert_numpy_types(key): convert_numpy_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    else:
        return obj


class DatasetAnalyzer:
    """Profiles a DataFrame and returns JSON-serializable metadata about it."""

    def analyze(self, df: pd.DataFrame, target_column: Optional[str] = None) -> dict:
        logger.info("Starting dataset analysis...")

        # Remove all-null columns
        null_columns = df.columns[df.isnull().all()]
        if len(null_columns) > 0:
            logger.warning(f"Removing all-null columns: {list(null_columns)}")
            df = df.drop(columns=null_columns)

        # Remove duplicate rows
        duplicate_rows = df.duplicated().sum()
        if duplicate_rows > 0:
            logger.warning(f"Removing {duplicate_rows} duplicate rows")
            df = df.drop_duplicates()

        # Remove constant columns (nunique() ignores NaN, so a column with one
        # repeated value plus missing entries still counts as constant)
        constant_columns = [col for col in df.columns if df[col].nunique() == 1]
        if len(constant_columns) > 0:
            logger.warning(f"Removing constant columns: {constant_columns}")
            df = df.drop(columns=constant_columns)

        # Ensure at least 2 usable features remain after preprocessing
        usable_features = [col for col in df.columns if col != target_column]
        if len(usable_features) < 2:
            raise ValueError(
                f"Insufficient features: only {len(usable_features)} usable "
                f"features after preprocessing, need at least 2"
            )

        info = {}
        info["num_rows"] = df.shape[0]
        info["num_columns"] = df.shape[1]
        missing_ratio = df.isnull().mean().mean()
        info["missing_ratio"] = missing_ratio
        info["row_count"] = df.shape[0]  # mirrors num_rows; both keys are part of the returned schema
        info["high_dimensional"] = bool(df.shape[1] > 50)
        info["small_data"] = bool(df.shape[0] < 1200)
        info["sparse_data"] = bool(missing_ratio > 0.4)

        # Split features by dtype, excluding the target column
        all_numeric_cols = df.select_dtypes(include="number").columns.tolist()
        all_categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
        info["numeric_cols"] = [col for col in all_numeric_cols if col != target_column]
        info["categorical_cols"] = [col for col in all_categorical_cols if col != target_column]

        # Per-column cardinality
        info["cardinality"] = {col: df[col].nunique() for col in df.columns}

        # Target-specific checks
        if target_column and target_column in df.columns:
            target = df[target_column]
            unique_vals = target.nunique()
            # Heuristic: a numeric target with more than 10 distinct values is
            # treated as regression, anything else as classification
            if pd.api.types.is_numeric_dtype(target) and unique_vals > 10:
                info["target_type"] = "regression"
                info["class_distribution"] = None
                info["imbalance"] = None
            else:
                info["target_type"] = "classification"
                value_counts = target.value_counts(normalize=True)
                info["class_distribution"] = value_counts.to_dict()
                # Flag imbalance when a single class holds more than 80% of rows
                info["imbalance"] = bool(value_counts.max() > 0.8)
        else:
            info["target_type"] = None
            info["class_distribution"] = None
            info["imbalance"] = None

        # NLP detection heuristic: categorical columns whose average string
        # length exceeds 30 characters are treated as free text
        text_columns = [
            col for col in info["categorical_cols"]
            if df[col].astype(str).str.len().mean() > 30
        ]
        info["text_columns"] = text_columns
        info["possible_nlp"] = len(text_columns) > 0

        return convert_numpy_types(info)
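

# --- Usage sketch -----------------------------------------------------------
# A minimal, illustrative driver for DatasetAnalyzer.analyze(). The toy
# DataFrame and its column names ("age", "income", "city", "churn") are
# assumptions made up for this example, not part of the real pipeline, and
# running it assumes backend.utils.logger is importable.
if __name__ == "__main__":
    demo = pd.DataFrame(
        {
            "age": [23, 35, 41, 29, 50],
            "income": [40_000, 52_000, 61_000, 45_000, 80_000],
            "city": ["NYC", "LA", "NYC", "SF", "LA"],
            "churn": [0, 1, 0, 0, 1],
        }
    )
    report = DatasetAnalyzer().analyze(demo, target_column="churn")
    # Expected: target_type == "classification", numeric_cols == ["age", "income"],
    # possible_nlp == False (city strings are far shorter than 30 characters)
    print(report["target_type"], report["numeric_cols"], report["possible_nlp"])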