# ModelSmith-AI — backend/core/dataset_analyzer.py
# (HuggingFace upload metadata: uploaded by ACA050, "Upload 79 files", commit a309487)
import pandas as pd
import numpy as np
from backend.utils.logger import logger
def convert_numpy_types(obj):
    """Recursively convert numpy types to Python types for JSON serialization.

    Handles numpy arrays, all numpy scalar types (int/float/bool/etc.), and
    recurses into dicts, lists, and tuples. Any other object is returned
    unchanged.
    """
    if isinstance(obj, np.ndarray):
        # tolist() converts both the container and its elements to builtins.
        return obj.tolist()
    if isinstance(obj, np.generic):
        # np.generic covers every numpy scalar (np.integer, np.floating,
        # np.bool_, ...); item() yields the corresponding Python builtin.
        return obj.item()
    if isinstance(obj, dict):
        return {key: convert_numpy_types(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_types(item) for item in obj]
    if isinstance(obj, tuple):
        # Preserve tuple-ness; json.dumps serializes tuples as arrays anyway,
        # but nested numpy scalars must still be converted.
        return tuple(convert_numpy_types(item) for item in obj)
    return obj
class DatasetAnalyzer:
    """Cleans a DataFrame and derives metadata used for downstream model selection."""

    def analyze(self, df: pd.DataFrame, target_column: str = None):
        """Clean *df* in a copy-on-write fashion and return a JSON-safe info dict.

        Cleaning: drops all-null columns, duplicate rows, and constant columns.
        The returned dict describes shape, missingness, feature dtypes,
        per-column cardinality, target type (regression vs. classification),
        class imbalance, and a simple free-text/NLP heuristic.

        Args:
            df: Input dataset. Not mutated; drops rebind a local copy.
            target_column: Optional name of the target column; excluded from
                the feature lists and inspected for target-specific stats.

        Returns:
            dict with plain Python types (passed through convert_numpy_types).

        Raises:
            ValueError: if fewer than 2 non-target features survive cleaning.
        """
        logger.info("Starting dataset analysis...")

        # Remove all-null columns
        null_columns = df.columns[df.isnull().all()]
        if len(null_columns) > 0:
            logger.warning(f"Removing all-null columns: {list(null_columns)}")
            df = df.drop(columns=null_columns)

        # Remove duplicate rows
        duplicate_rows = df.duplicated().sum()
        if duplicate_rows > 0:
            logger.warning(f"Removing {duplicate_rows} duplicate rows")
            df = df.drop_duplicates()

        # Remove constant columns (single unique value carries no signal).
        constant_columns = [col for col in df.columns if df[col].nunique() == 1]
        if len(constant_columns) > 0:
            logger.warning(f"Removing constant columns: {constant_columns}")
            df = df.drop(columns=constant_columns)

        # Ensure at least 2 usable features after preprocessing
        usable_features = [col for col in df.columns if col != target_column]
        if len(usable_features) < 2:
            raise ValueError(f"Insufficient features: only {len(usable_features)} usable features after preprocessing, need at least 2")

        info = {}
        info["num_rows"] = df.shape[0]
        info["num_columns"] = df.shape[1]
        info["missing_ratio"] = df.isnull().mean().mean()
        # "row_count" kept alongside "num_rows" for backward compatibility.
        info["row_count"] = df.shape[0]
        # Heuristic flags consumed by model selection; thresholds (50 cols,
        # 1200 rows, 40% missing) are project conventions — TODO confirm.
        info["high_dimensional"] = bool(df.shape[1] > 50)
        info["small_data"] = bool(df.shape[0] < 1200)
        info["sparse_data"] = bool(df.isnull().mean().mean() > 0.4)

        # Feature dtype split; numeric vs. everything else (strings, dates, ...).
        all_numeric_cols = df.select_dtypes(include="number").columns.tolist()
        all_categorical_cols = df.select_dtypes(exclude="number").columns.tolist()
        info["numeric_cols"] = [col for col in all_numeric_cols if col != target_column]
        info["categorical_cols"] = [col for col in all_categorical_cols if col != target_column]
        # NOTE: numeric + non-numeric dtypes partition every column, so the
        # >= 2 feature guarantee above already covers these lists; no second
        # check is needed.

        # Cardinality
        cardinality = {col: df[col].nunique() for col in df.columns}
        info["cardinality"] = cardinality

        # Target-specific checks
        if target_column and target_column in df.columns:
            target = df[target_column]
            unique_vals = target.nunique()
            # is_numeric_dtype generalizes the old ['int64', 'float64'] check
            # so int32/float32 targets are also treated as regression.
            if pd.api.types.is_numeric_dtype(target) and unique_vals > 10:
                info["target_type"] = "regression"
                info["class_distribution"] = None
                info["imbalance"] = None
            else:
                info["target_type"] = "classification"
                value_counts = target.value_counts(normalize=True)
                info["class_distribution"] = value_counts.to_dict()
                # Flag imbalance when one class holds >80% of the rows.
                info["imbalance"] = bool(value_counts.max() > 0.8)
        else:
            info["target_type"] = None
            info["class_distribution"] = None
            info["imbalance"] = None

        # NLP detection heuristic: a non-numeric column whose values average
        # more than 30 characters is treated as free text.
        text_columns = []
        for col in info["categorical_cols"]:
            if df[col].astype(str).str.len().mean() > 30:
                text_columns.append(col)
        info["text_columns"] = text_columns
        info["possible_nlp"] = len(text_columns) > 0

        return convert_numpy_types(info)