import warnings
from typing import Any, Dict

import numpy as np
import pandas as pd

try:
    from sklearn.ensemble import IsolationForest
except ImportError:
    # scikit-learn is only needed by DataValidators.outlier_ratio; the other
    # validators stay usable when it is not installed.
    IsolationForest = None


class DataValidators:
    """Dataset quality metrics computed on a pandas DataFrame.

    Every public method returns a plain ``float`` in the range [0, 1].
    """

    # Numeric columns with fewer non-null values than this are not scored
    # by outlier_ratio.
    _MIN_OUTLIER_SAMPLES = 10
    # Settings forwarded to IsolationForest by outlier_ratio.
    _OUTLIER_CONTAMINATION = 0.1
    _OUTLIER_RANDOM_STATE = 42
    # Object/string columns whose unique/non-null ratio is below this are
    # reported as 'categorical', otherwise as 'text'.
    _CATEGORICAL_UNIQUE_RATIO = 0.1

    def missing_values_ratio(self, df: pd.DataFrame) -> float:
        """Return the share of cells in ``df`` that are missing.

        Args:
            df: Frame to inspect.

        Returns:
            Missing cells divided by total cells, or 0.0 when the frame has
            no cells at all (no rows or no columns).
        """
        if df.size == 0:
            return 0.0
        return float(df.isna().sum().sum() / df.size)

    def duplicate_rows_ratio(self, df: pd.DataFrame) -> float:
        """Return the share of rows that repeat an earlier row.

        Args:
            df: Frame to inspect.

        Returns:
            Number of rows flagged by ``DataFrame.duplicated()`` divided by
            the row count, or 0.0 for a frame without rows.
        """
        row_count = len(df)
        if row_count == 0:
            return 0.0
        return float(df.duplicated().sum() / row_count)

    def data_type_consistency(self, df: pd.DataFrame) -> float:
        """Score how uniform the Python value types are within each column.

        For every column the score is the share of non-null values that have
        the most common Python type; the result is the mean over all columns.
        Columns without any non-null value contribute 0 but still count in
        the denominator.

        Args:
            df: Frame to inspect.

        Returns:
            Mean per-column score, or 0.0 when the frame has no columns.
        """
        column_count = len(df.columns)
        if column_count == 0:
            return 0.0

        consistency_score = 0.0
        for col in df.columns:
            non_null = df[col].dropna()
            if non_null.empty:
                continue
            type_shares = non_null.apply(type).value_counts(normalize=True)
            # Largest share == share of the dominant type.
            consistency_score += type_shares.max()
        return float(consistency_score / column_count)

    def outlier_ratio(self, df: pd.DataFrame) -> float:
        """Return the share of numeric values flagged by IsolationForest.

        Each numeric column is fitted on its own, using its non-null values.
        Columns with fewer than ``_MIN_OUTLIER_SAMPLES`` non-null values are
        ignored. A column whose fit fails is skipped with a
        ``RuntimeWarning`` instead of aborting the whole computation.

        NOTE(review): the forest is created with a fixed ``contamination``,
        which scikit-learn documents as the expected proportion of outliers
        used to set the decision threshold, so the result is presumably
        close to that value whenever at least one column is scored --
        confirm this is the intended metric.

        Args:
            df: Frame to inspect.

        Returns:
            Flagged values divided by scored values, or 0.0 when no column
            was scored.

        Raises:
            ImportError: If a column needs scoring but scikit-learn is not
                installed.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        total_outliers = 0
        total_values = 0
        for col in numeric_cols:
            data = df[col].dropna().to_numpy().reshape(-1, 1)
            if len(data) < self._MIN_OUTLIER_SAMPLES:
                continue
            if IsolationForest is None:
                raise ImportError(
                    "scikit-learn is required for DataValidators.outlier_ratio"
                )
            try:
                forest = IsolationForest(
                    contamination=self._OUTLIER_CONTAMINATION,
                    random_state=self._OUTLIER_RANDOM_STATE,
                )
                labels = forest.fit_predict(data)
            except Exception as exc:
                # Best effort: one unusable column (e.g. infinite values)
                # must not prevent scoring the others, but the skip is
                # reported instead of being silently swallowed.
                warnings.warn(
                    f"outlier_ratio: skipping column {col!r}: {exc}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue
            # fit_predict labels outliers as -1 and inliers as 1.
            total_outliers += int((labels == -1).sum())
            total_values += len(data)

        if total_values == 0:
            return 0.0
        return total_outliers / total_values

    def schema_validity(self, df: pd.DataFrame, schema: Dict[str, str]) -> float:
        """Return the share of schema entries that ``df`` satisfies.

        An entry is satisfied when the column exists and the label returned
        by ``_get_column_type`` equals the expected label. Columns missing
        from ``df`` count as not satisfied.

        Args:
            df: Frame to validate.
            schema: Mapping of column name to expected type label
                ('numeric', 'datetime', 'categorical', 'text' or 'unknown').

        Returns:
            Satisfied entries divided by ``len(schema)``; 1.0 for an empty
            schema.
        """
        if not schema:
            return 1.0
        valid = sum(
            1
            for col, expected_type in schema.items()
            if col in df.columns
            and self._get_column_type(df[col]) == expected_type
        )
        return valid / len(schema)

    def _get_column_type(self, series: pd.Series) -> str:
        """Classify a column as one of the coarse schema type labels.

        Args:
            series: Column to classify.

        Returns:
            'numeric' for integer and float dtypes (including pandas'
            nullable ones), 'datetime' for datetime64 dtypes, 'categorical'
            for the category dtype, 'categorical' or 'text' for
            object/string columns depending on the ratio of unique to
            non-null values, and 'unknown' for everything else, including
            object/string columns that hold no non-null value.
        """
        dtype = series.dtype
        dtype_checks = pd.api.types

        if dtype_checks.is_integer_dtype(dtype) or dtype_checks.is_float_dtype(dtype):
            return 'numeric'
        if dtype_checks.is_datetime64_any_dtype(dtype):
            return 'datetime'
        if isinstance(dtype, pd.CategoricalDtype):
            return 'categorical'
        if dtype_checks.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            non_null_count = series.count()
            if non_null_count == 0:
                # Nothing to measure; also avoids dividing by zero.
                return 'unknown'
            unique_ratio = series.nunique() / non_null_count
            if unique_ratio < self._CATEGORICAL_UNIQUE_RATIO:
                return 'categorical'
            return 'text'
        return 'unknown'