Spaces:

entropy25
/

data-analysis-platform

Build error

App Files Community

entropy25 committed on Aug 9, 2025

Commit

9cba783

verified ·

1 Parent(s): 7583e80

Update data_handler.py

Browse files

Files changed (1) hide show

data_handler.py +673 -202

data_handler.py CHANGED Viewed

@@ -4,38 +4,53 @@ import numpy as np
 import warnings
 from typing import Dict, List, Any, Tuple
 from scipy import stats
 warnings.filterwarnings('ignore')
-# All cached data processing functions
-@st.cache_data
 def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
-    """Load CSV with automatic encoding detection - cached"""
-    import chardet
-    detected = chardet.detect(file_content)
-    encoding = detected['encoding']
     try:
-        from io import BytesIO
-        return pd.read_csv(BytesIO(file_content), encoding=encoding)
     except:
-        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
-        for enc in encodings:
-            try:
-                return pd.read_csv(BytesIO(file_content), encoding=enc)
-            except:
-                continue
-        raise Exception("Cannot read file with any encoding")
-@st.cache_data
 def load_excel_file(file_content: bytes) -> pd.DataFrame:
-    """Load Excel file - cached"""
-    from io import BytesIO
-    return pd.read_excel(BytesIO(file_content))
-@st.cache_data
 def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
-    """Calculate basic statistics - cached"""
     dtype_counts = df.dtypes.value_counts()
     dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
@@ -44,254 +59,710 @@ def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
         'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
         'missing_values': int(df.isnull().sum().sum()),
         'dtypes': dtype_dict,
-        'duplicates': int(df.duplicated().sum())
     }
-@st.cache_data
 def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate column cardinality analysis - cached"""
     cardinality_data = []
     for col in df.columns:
         unique_count = df[col].nunique()
-        unique_ratio = unique_count / len(df)
-        # Determine column type based on cardinality
         if unique_count == 1:
             col_type = "Constant"
-        elif unique_count == len(df):
             col_type = "Unique Identifier"
         elif unique_ratio < 0.05:
             col_type = "Low Cardinality"
         elif unique_ratio < 0.5:
             col_type = "Medium Cardinality"
         else:
             col_type = "High Cardinality"
         cardinality_data.append({
             'Column': col,
             'Unique Count': unique_count,
             'Unique Ratio': unique_ratio,
             'Type': col_type,
-            'Data Type': str(df[col].dtype)
         })
     return pd.DataFrame(cardinality_data)
-@st.cache_data
-def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
-    """Calculate memory optimization suggestions - cached"""
-    suggestions = []
-    current_memory = df.memory_usage(deep=True).sum() / 1024**2
-    potential_savings = 0
-    for col in df.columns:
-        if df[col].dtype == 'object':
-            unique_ratio = df[col].nunique() / len(df)
-            if unique_ratio < 0.5:  # Less than 50% unique values
-                # Estimate category memory usage
-                category_memory = df[col].astype('category').memory_usage(deep=True)
-                object_memory = df[col].memory_usage(deep=True)
-                savings = (object_memory - category_memory) / 1024**2
-                if savings > 0.1:  # More than 0.1MB savings
-                    suggestions.append({
-                        'column': col,
-                        'current_type': 'object',
-                        'suggested_type': 'category',
-                        'savings_mb': savings
-                    })
-                    potential_savings += savings
-    return {
-        'suggestions': suggestions,
-        'current_memory_mb': current_memory,
-        'potential_savings_mb': potential_savings,
-        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
-    }
-@st.cache_data
 def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate missing data analysis - cached"""
     missing_data = df.isnull().sum()
     if missing_data.sum() > 0:
         missing_df = pd.DataFrame({
             'Column': missing_data.index,
             'Missing Count': missing_data.values,
-            'Missing %': (missing_data.values / len(df)) * 100
         })
         return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
     return pd.DataFrame()
-@st.cache_data
 def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate correlation matrix - cached"""
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-    return df[numeric_cols].corr() if len(numeric_cols) > 1 else pd.DataFrame()
-@st.cache_data
 def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
-    """Get column types - cached"""
     return {
         'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
         'categorical': df.select_dtypes(include=['object']).columns.tolist(),
-        'datetime': df.select_dtypes(include=['datetime64']).columns.tolist()
-    }
-@st.cache_data
-def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
-    """Calculate enhanced numeric statistics - cached"""
-    series = df[column].dropna()
-    return {
-        'mean': series.mean(),
-        'median': series.median(),
-        'std': series.std(),
-        'skewness': series.skew(),
-        'kurtosis': series.kurtosis(),
-        'min': series.min(),
-        'max': series.max(),
-        'q25': series.quantile(0.25),
-        'q75': series.quantile(0.75)
     }
-@st.cache_data
 def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
-    """Calculate outliers using IQR method - cached"""
-    Q1 = df[column].quantile(0.25)
-    Q3 = df[column].quantile(0.75)
-    IQR = Q3 - Q1
-    lower_bound = Q1 - 1.5 * IQR
-    upper_bound = Q3 + 1.5 * IQR
-    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]
-@st.cache_data
 def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
-    """Detect columns with mixed data types - cached"""
     mixed_type_issues = []
     for col in df.select_dtypes(include=['object']).columns:
-        # Try to convert to numeric
-        numeric_conversion = pd.to_numeric(df[col], errors='coerce')
-        new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
-        if new_nulls > 0:
-            mixed_type_issues.append({
-                'column': col,
-                'problematic_values': new_nulls,
-                'total_values': len(df[col]),
-                'percentage': (new_nulls / len(df[col])) * 100
-            })
     return mixed_type_issues
-@st.cache_data
-def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
-    """Get value counts for categorical column - cached"""
-    return df[column].value_counts().head(top_n)
-@st.cache_data
-def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
-    """Calculate crosstab between two categorical columns - cached"""
-    return pd.crosstab(df[col1], df[col2])
-@st.cache_data
 def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
-    """Calculate group statistics - cached"""
-    return df.groupby(group_col)[metric_col].agg(['mean', 'median', 'std', 'count'])
-@st.cache_data
 def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
-    """Calculate overall data quality score - cached"""
-    score = 100
-    issues = []
-    # Missing values penalty
-    missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
-    if missing_pct > 0:
-        penalty = min(30, missing_pct * 2)  # Max 30 points penalty
-        score -= penalty
-        issues.append(f"Missing values: {missing_pct:.1f}%")
-    # Duplicates penalty
-    duplicate_pct = (df.duplicated().sum() / len(df)) * 100
-    if duplicate_pct > 0:
-        penalty = min(20, duplicate_pct * 4)  # Max 20 points penalty
-        score -= penalty
-        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
-    # Constant columns penalty
-    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
-    if constant_cols:
-        penalty = min(10, len(constant_cols) * 2)
-        score -= penalty
-        issues.append(f"Constant columns: {len(constant_cols)}")
-    # Mixed types penalty
-    mixed_types = detect_mixed_types(df)
-    if mixed_types:
-        penalty = min(10, len(mixed_types) * 3)
-        score -= penalty
-        issues.append(f"Mixed type columns: {len(mixed_types)}")
-    return {
-        'score': max(0, score),
-        'issues': issues,
-        'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
     }
-def load_data(uploaded_file):
-    """Unified data loading function"""
-    file_content = uploaded_file.read()
-    uploaded_file.seek(0)
-    if uploaded_file.name.endswith('.csv'):
-        return load_csv_with_encoding(file_content, uploaded_file.name)
-    else:
-        return load_excel_file(file_content)
-def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
-    """Apply data cleaning operations"""
-    cleaned_df = df.copy()
-    for operation in operations:
-        if operation['type'] == 'fill_missing':
-            if operation['method'] == 'mean':
-                cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
-                    cleaned_df[operation['column']].mean())
-            elif operation['method'] == 'median':
-                cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
-                    cleaned_df[operation['column']].median())
-            elif operation['method'] == 'mode':
-                cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
-                    cleaned_df[operation['column']].mode().iloc[0] if not cleaned_df[operation['column']].mode().empty else 0)
-            elif operation['method'] == 'drop':
-                cleaned_df = cleaned_df.dropna(subset=[operation['column']])
-        elif operation['type'] == 'remove_duplicates':
-            cleaned_df = cleaned_df.drop_duplicates()
-        elif operation['type'] == 'remove_outliers':
-            Q1 = cleaned_df[operation['column']].quantile(0.25)
-            Q3 = cleaned_df[operation['column']].quantile(0.75)
-            IQR = Q3 - Q1
-            lower_bound = Q1 - 1.5 * IQR
-            upper_bound = Q3 + 1.5 * IQR
-            cleaned_df = cleaned_df[
-                (cleaned_df[operation['column']] >= lower_bound) &
-                (cleaned_df[operation['column']] <= upper_bound)
-            ]
-        elif operation['type'] == 'cap_outliers':
-            Q1 = cleaned_df[operation['column']].quantile(0.25)
-            Q3 = cleaned_df[operation['column']].quantile(0.75)
-            IQR = Q3 - Q1
-            lower_bound = Q1 - 1.5 * IQR
-            upper_bound = Q3 + 1.5 * IQR
-            cleaned_df[operation['column']] = cleaned_df[operation['column']].clip(lower_bound, upper_bound)
-        elif operation['type'] == 'convert_type':
-            if operation['target_type'] == 'category':
-                cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')
-    return cleaned_df

 import warnings
 from typing import Dict, List, Any, Tuple
 from scipy import stats
+import chardet
+from io import BytesIO
 warnings.filterwarnings('ignore')
+# HuggingFace optimized data processing functions with enhanced caching
+@st.cache_data(show_spinner=False)
 def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
+    """Load CSV with automatic encoding detection - optimized for HF"""
+    # Try to detect encoding
     try:
+        detected = chardet.detect(file_content[:10000])  # Sample first 10KB for speed
+        encoding = detected['encoding'] if detected['confidence'] > 0.7 else 'utf-8'
     except:
+        encoding = 'utf-8'
+    # Try detected encoding first, then fallbacks
+    encodings_to_try = [encoding, 'utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
+    for enc in encodings_to_try:
+        try:
+            return pd.read_csv(BytesIO(file_content), encoding=enc)
+        except:
+            continue
+    raise Exception(f"Cannot read CSV file '{filename}' with any supported encoding")
+@st.cache_data(show_spinner=False)
 def load_excel_file(file_content: bytes) -> pd.DataFrame:
+    """Load Excel file - optimized for HF"""
+    try:
+        return pd.read_excel(BytesIO(file_content))
+    except Exception as e:
+        raise Exception(f"Cannot read Excel file: {str(e)}")
+@st.cache_data(show_spinner=False)
 def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
+    """Calculate basic statistics with performance optimization"""
+    # Optimize for large datasets
+    if len(df) > 100000:
+        sample_df = df.sample(n=50000, random_state=42)
+        st.info("📊 Using statistical sample for large dataset analysis")
+    else:
+        sample_df = df
     dtype_counts = df.dtypes.value_counts()
     dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
         'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
         'missing_values': int(df.isnull().sum().sum()),
         'dtypes': dtype_dict,
+        'duplicates': int(df.duplicated().sum()),
+        'sample_used': len(sample_df) != len(df)
+    }
+@st.cache_data(show_spinner=False)
+def calculate_enhanced_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
+    """Calculate comprehensive quality score with business intelligence"""
+    score = 100
+    issues = []
+    recommendations = []
+    critical_issues = []
+    # Missing values analysis (max -30 points)
+    missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
+    if missing_pct > 0:
+        penalty = min(30, missing_pct * 1.5)
+        score -= penalty
+        issues.append(f"Missing values: {missing_pct:.1f}%")
+        if missing_pct > 20:
+            critical_issues.append("High missing value rate")
+            recommendations.append("🚨 Critical: Review data collection processes")
+        elif missing_pct > 5:
+            recommendations.append("🔧 Apply intelligent filling strategies")
+        else:
+            recommendations.append("✅ Missing values within acceptable limits")
+    # Duplicates analysis (max -25 points)
+    duplicate_pct = (df.duplicated().sum() / len(df)) * 100
+    if duplicate_pct > 0:
+        penalty = min(25, duplicate_pct * 3)
+        score -= penalty
+        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
+        if duplicate_pct > 5:
+            critical_issues.append("High duplication rate")
+            recommendations.append("🚨 Investigate data collection pipeline")
+        else:
+            recommendations.append("🗑️ Remove duplicates before analysis")
+    # Outliers analysis (max -20 points)
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    total_outliers = 0
+    problematic_cols = []
+    for col in numeric_cols:
+        try:
+            Q1 = df[col].quantile(0.25)
+            Q3 = df[col].quantile(0.75)
+            IQR = Q3 - Q1
+            if IQR > 0:  # Avoid division by zero
+                outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
+                outlier_pct = (len(outliers) / len(df)) * 100
+                total_outliers += len(outliers)
+                if outlier_pct > 5:
+                    problematic_cols.append(col)
+        except:
+            continue
+    if total_outliers > 0:
+        outlier_overall_pct = (total_outliers / len(df)) * 100
+        penalty = min(20, outlier_overall_pct * 2)
+        score -= penalty
+        issues.append(f"Statistical outliers: {outlier_overall_pct:.1f}%")
+        if problematic_cols:
+            recommendations.append(f"📊 Investigate outliers in: {', '.join(problematic_cols[:3])}")
+    # Type consistency analysis (max -15 points)
+    mixed_type_issues = detect_mixed_types(df)
+    if mixed_type_issues:
+        penalty = min(15, len(mixed_type_issues) * 5)
+        score -= penalty
+        issues.append(f"Type inconsistencies: {len(mixed_type_issues)} columns")
+        recommendations.append("🔧 Standardize data types")
+    # Constant columns analysis (max -10 points)
+    constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
+    if constant_cols:
+        penalty = min(10, len(constant_cols) * 3)
+        score -= penalty
+        issues.append(f"Constant columns: {len(constant_cols)}")
+        recommendations.append("🗑️ Remove uninformative columns")
+    # Grade assignment
+    if score >= 90:
+        grade, color = "A", "#22c55e"
+    elif score >= 80:
+        grade, color = "B", "#3b82f6"
+    elif score >= 70:
+        grade, color = "C", "#f59e0b"
+    elif score >= 60:
+        grade, color = "D", "#f97316"
+    else:
+        grade, color = "F", "#ef4444"
+    return {
+        'score': max(0, score),
+        'grade': grade,
+        'color': color,
+        'issues': issues,
+        'recommendations': recommendations,
+        'critical_issues': critical_issues,
+        'missing_pct': missing_pct,
+        'duplicate_pct': duplicate_pct,
+        'outlier_pct': (total_outliers / len(df)) * 100 if len(df) > 0 else 0,
+        'constant_cols': constant_cols,
+        'mixed_type_cols': len(mixed_type_issues)
     }
+@st.cache_data(show_spinner=False)
 def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
+    """Enhanced column cardinality analysis with business intelligence"""
     cardinality_data = []
     for col in df.columns:
         unique_count = df[col].nunique()
+        unique_ratio = unique_count / len(df) if len(df) > 0 else 0
+        missing_count = df[col].isnull().sum()
+        missing_pct = (missing_count / len(df)) * 100 if len(df) > 0 else 0
+        # Enhanced type classification
         if unique_count == 1:
             col_type = "Constant"
+            business_value = "None - Consider removal"
+        elif unique_count == len(df) - missing_count:
             col_type = "Unique Identifier"
+            business_value = "High - Key for joins"
+        elif unique_ratio < 0.01:
+            col_type = "Very Low Cardinality"
+            business_value = "Medium - Good for flags"
         elif unique_ratio < 0.05:
             col_type = "Low Cardinality"
+            business_value = "High - Perfect for grouping"
         elif unique_ratio < 0.5:
             col_type = "Medium Cardinality"
+            business_value = "Medium - Use for segmentation"
         else:
             col_type = "High Cardinality"
+            business_value = "Low - Avoid in group analysis"
+        # Memory impact estimation
+        if df[col].dtype == 'object' and unique_ratio < 0.5:
+            category_memory = df[col].astype('category').memory_usage(deep=True)
+            object_memory = df[col].memory_usage(deep=True)
+            memory_savings = (object_memory - category_memory) / 1024**2
+            memory_note = f"Save {memory_savings:.1f}MB with category type" if memory_savings > 0.1 else "Optimized"
+        else:
+            memory_note = "Optimized"
         cardinality_data.append({
             'Column': col,
             'Unique Count': unique_count,
             'Unique Ratio': unique_ratio,
+            'Missing %': missing_pct,
             'Type': col_type,
+            'Business Value': business_value,
+            'Data Type': str(df[col].dtype),
+            'Memory Note': memory_note
         })
     return pd.DataFrame(cardinality_data)
+@st.cache_data(show_spinner=False)
 def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
+    """Enhanced missing data analysis with pattern detection"""
     missing_data = df.isnull().sum()
     if missing_data.sum() > 0:
         missing_df = pd.DataFrame({
             'Column': missing_data.index,
             'Missing Count': missing_data.values,
+            'Missing %': (missing_data.values / len(df)) * 100,
+            'Data Type': [str(df[col].dtype) for col in missing_data.index]
         })
+        # Add severity classification
+        def classify_severity(pct):
+            if pct > 50:
+                return "🚨 Critical"
+            elif pct > 20:
+                return "⚠️ High"
+            elif pct > 5:
+                return "🔸 Medium"
+            else:
+                return "🔹 Low"
+        missing_df['Severity'] = missing_df['Missing %'].apply(classify_severity)
+        # Add AI suggestions
+        def get_ai_suggestion(row):
+            col_name = row['Column']
+            missing_pct = row['Missing %']
+            data_type = row['Data Type']
+            if missing_pct > 50:
+                return "Drop column - too many missing values"
+            elif 'int' in data_type or 'float' in data_type:
+                return "Fill with median (robust to outliers)"
+            elif 'object' in data_type:
+                return "Fill with mode (most frequent value)"
+            else:
+                return "Manual review recommended"
+        missing_df['AI Suggestion'] = missing_df.apply(get_ai_suggestion, axis=1)
         return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
     return pd.DataFrame()
+@st.cache_data(show_spinner=False)
 def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate correlation matrix with performance optimization"""
     numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    if len(numeric_cols) > 1:
+        # Use sample for very large datasets
+        if len(df) > 50000:
+            sample_df = df[numeric_cols].sample(n=25000, random_state=42)
+        else:
+            sample_df = df[numeric_cols]
+        return sample_df.corr()
+    return pd.DataFrame()
+@st.cache_data(show_spinner=False)
 def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
+    """Enhanced column type detection with business context"""
     return {
         'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
         'categorical': df.select_dtypes(include=['object']).columns.tolist(),
+        'datetime': df.select_dtypes(include=['datetime64']).columns.tolist(),
+        'boolean': df.select_dtypes(include=['bool']).columns.tolist()
     }
+@st.cache_data(show_spinner=False)
 def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    """Enhanced outlier detection with business context"""
+    try:
+        Q1 = df[column].quantile(0.25)
+        Q3 = df[column].quantile(0.75)
+        IQR = Q3 - Q1
+        if IQR == 0:  # No variation in data
+            return pd.DataFrame()
+        lower_bound = Q1 - 1.5 * IQR
+        upper_bound = Q3 + 1.5 * IQR
+        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
+        # Add outlier context
+        if not outliers.empty:
+            outliers = outliers.copy()
+            outliers['outlier_type'] = outliers[column].apply(
+                lambda x: 'extreme_high' if x > upper_bound else 'extreme_low'
+            )
+            outliers['severity'] = outliers[column].apply(
+                lambda x: abs(x - df[column].median()) / df[column].std() if df[column].std() > 0 else 0
+            )
+        return outliers
+    except Exception as e:
+        st.warning(f"Could not calculate outliers for '{column}': {str(e)}")
+        return pd.DataFrame()
+@st.cache_data(show_spinner=False)
 def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
+    """Enhanced mixed type detection with AI insights"""
     mixed_type_issues = []
     for col in df.select_dtypes(include=['object']).columns:
+        try:
+            # Try numeric conversion
+            numeric_conversion = pd.to_numeric(df[col], errors='coerce')
+            new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
+            if new_nulls > 0:
+                # Analyze the problematic values
+                problematic_mask = pd.to_numeric(df[col], errors='coerce').isnull() & df[col].notnull()
+                problematic_values = df.loc[problematic_mask, col].unique()[:5]  # Top 5 examples
+                mixed_type_issues.append({
+                    'column': col,
+                    'problematic_values': new_nulls,
+                    'total_values': len(df[col]),
+                    'percentage': (new_nulls / len(df[col])) * 100,
+                    'examples': problematic_values.tolist(),
+                    'suggestion': 'Convert to numeric with error handling' if new_nulls < len(df[col]) * 0.1 else 'Keep as text'
+                })
+        except:
+            continue
     return mixed_type_issues
+@st.cache_data(show_spinner=False)
+def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
+    """Enhanced memory optimization with detailed suggestions"""
+    suggestions = []
+    current_memory = df.memory_usage(deep=True).sum() / 1024**2
+    potential_savings = 0
+    for col in df.columns:
+        col_memory = df[col].memory_usage(deep=True) / 1024**2
+        if df[col].dtype == 'object':
+            unique_ratio = df[col].nunique() / len(df)
+            # Category optimization
+            if unique_ratio < 0.5:
+                try:
+                    category_memory = df[col].astype('category').memory_usage(deep=True) / 1024**2
+                    savings = col_memory - category_memory
+                    if savings > 0.1:  # Significant savings
+                        suggestions.append({
+                            'column': col,
+                            'current_type': 'object',
+                            'suggested_type': 'category',
+                            'current_memory_mb': col_memory,
+                            'optimized_memory_mb': category_memory,
+                            'savings_mb': savings,
+                            'savings_pct': (savings / col_memory) * 100
+                        })
+                        potential_savings += savings
+                except:
+                    continue
+        elif df[col].dtype == 'int64':
+            # Integer downcast optimization
+            col_min = df[col].min()
+            col_max = df[col].max()
+            if col_min >= 0:  # Unsigned integers
+                if col_max < 255:
+                    new_type = 'uint8'
+                elif col_max < 65535:
+                    new_type = 'uint16'
+                elif col_max < 4294967295:
+                    new_type = 'uint32'
+                else:
+                    new_type = 'int64'
+            else:  # Signed integers
+                if col_min >= -128 and col_max <= 127:
+                    new_type = 'int8'
+                elif col_min >= -32768 and col_max <= 32767:
+                    new_type = 'int16'
+                elif col_min >= -2147483648 and col_max <= 2147483647:
+                    new_type = 'int32'
+                else:
+                    new_type = 'int64'
+            if new_type != 'int64':
+                try:
+                    optimized_memory = df[col].astype(new_type).memory_usage(deep=True) / 1024**2
+                    savings = col_memory - optimized_memory
+                    if savings > 0.1:
+                        suggestions.append({
+                            'column': col,
+                            'current_type': 'int64',
+                            'suggested_type': new_type,
+                            'current_memory_mb': col_memory,
+                            'optimized_memory_mb': optimized_memory,
+                            'savings_mb': savings,
+                            'savings_pct': (savings / col_memory) * 100
+                        })
+                        potential_savings += savings
+                except:
+                    continue
+    return {
+        'suggestions': suggestions,
+        'current_memory_mb': current_memory,
+        'potential_savings_mb': potential_savings,
+        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0,
+        'optimization_available': len(suggestions) > 0
+    }
+@st.cache_data(show_spinner=False)
+def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
+    """Get value counts with performance optimization"""
+    try:
+        value_counts = df[column].value_counts()
+        # Add percentage information
+        value_counts_pct = (value_counts / len(df)) * 100
+        return value_counts.head(top_n)
+    except:
+        return pd.Series()
+@st.cache_data(show_spinner=False)
 def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
+    """Enhanced group statistics with business insights"""
+    try:
+        # Basic group statistics
+        group_stats = df.groupby(group_col)[metric_col].agg([
+            'count', 'mean', 'median', 'std', 'min', 'max'
+        ]).round(3)
+        # Add business insights
+        group_stats['cv'] = (group_stats['std'] / group_stats['mean']).round(3)  # Coefficient of variation
+        group_stats['range'] = group_stats['max'] - group_stats['min']
+        # Sort by mean for better insights
+        group_stats = group_stats.sort_values('mean', ascending=False)
+        return group_stats
+    except Exception as e:
+        st.error(f"Error calculating group statistics: {str(e)}")
+        return pd.DataFrame()
+@st.cache_data(show_spinner=False)
 def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
+    """Backward compatibility wrapper"""
+    return calculate_enhanced_quality_score(df)
def load_data(uploaded_file) -> pd.DataFrame:
    """Load an uploaded CSV or Excel file into a DataFrame.

    Problems are reported to the user through Streamlit messages rather than
    raised to the caller.

    Args:
        uploaded_file: Upload object exposing ``name``, ``getvalue()`` and
            ``seek()`` (presumably a Streamlit ``UploadedFile`` -- confirm
            against the caller), or ``None`` when nothing was uploaded.

    Returns:
        The loaded DataFrame, or ``None`` when no file was supplied, the file
        exceeds 200MB, the extension is not .csv/.xlsx/.xls, the file holds
        no data, or loading raised an exception.
    """
    if uploaded_file is None:
        return None
    try:
        # Materialise the payload once and reuse it for both the size check
        # and parsing (previously the buffer was copied twice).
        file_content = uploaded_file.getvalue()
        # Leave the pointer at the start for any caller that reads the
        # upload again afterwards.
        uploaded_file.seek(0)

        # Check file size (HuggingFace has limits)
        file_size_mb = len(file_content) / 1024**2
        if file_size_mb > 200:  # 200MB limit for HF
            st.error(f"File too large ({file_size_mb:.1f}MB). Please upload files under 200MB.")
            return None

        # Match the extension case-insensitively so "DATA.CSV" or
        # "Report.XLSX" are accepted too.
        filename = uploaded_file.name
        extension_probe = filename.lower()
        if extension_probe.endswith('.csv'):
            df = load_csv_with_encoding(file_content, filename)
        elif extension_probe.endswith(('.xlsx', '.xls')):
            df = load_excel_file(file_content)
        else:
            st.error("Unsupported file format. Please upload CSV or Excel files.")
            return None

        # Basic validation. The column check comes first: DataFrame.empty is
        # already True for a frame without columns, which would otherwise
        # make the more specific message unreachable.
        if len(df.columns) == 0:
            st.error("No columns detected in the file.")
            return None
        if df.empty:
            st.error("The uploaded file appears to be empty.")
            return None

        # Performance warning for large datasets
        if len(df) > 100000:
            st.warning(f"⚡ Large dataset detected ({len(df):,} rows). Some operations will use sampling for performance.")
        return df
    except Exception as e:
        # UI boundary: surface any loader failure to the user instead of
        # crashing the app.
        st.error(f"Error loading file: {str(e)}")
        st.info("💡 **Troubleshooting Tips:**\n- Ensure CSV files are properly formatted\n- Check for special characters in Excel files\n- Try saving Excel as CSV first")
        return None
def _iqr_bounds(series: pd.Series) -> Tuple[float, float]:
    """Return the (lower, upper) fences Q1 - 1.5*IQR and Q3 + 1.5*IQR."""
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def _is_fillable_numeric(series: pd.Series) -> bool:
    """Tell whether mean/median imputation applies (any numeric dtype except bool)."""
    return (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )


def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> Tuple[pd.DataFrame, List[str]]:
    """Apply a sequence of cleaning operations to a copy of ``df``.

    Each operation is a dict with a ``'type'`` key and, depending on the
    type, ``'column'``, ``'method'`` or ``'target_type'``:

    * ``fill_missing`` -- ``method`` is ``mean``, ``median``, ``mode`` or ``drop``
    * ``remove_duplicates``
    * ``remove_outliers`` / ``cap_outliers`` -- 1.5*IQR fences on ``column``
    * ``convert_type`` -- ``target_type`` is ``category`` or ``numeric``
    * ``drop_column``

    A failing or unrecognised operation never aborts the run; it is recorded
    in the log and processing continues with the next operation.

    Args:
        df: Input frame; it is not modified.
        operations: Operation descriptors, applied in order.

    Returns:
        ``(cleaned_df, operation_log)`` where the log holds one human-readable
        message per applied, skipped or failed operation.
    """
    cleaned_df = df.copy()
    operation_log = []
    for operation in operations:
        # Resolve a label up front so the except handler below can never
        # raise itself when the descriptor is malformed (e.g. missing 'type').
        op_type = operation.get('type', 'unknown') if isinstance(operation, dict) else 'unknown'
        try:
            op_type = operation['type']
            if op_type == 'fill_missing':
                col = operation['column']
                method = operation['method']
                if method in ('mean', 'median'):
                    # Accept every numeric dtype (int32, float32, nullable
                    # Int64, ...), not only int64/float64.
                    if _is_fillable_numeric(cleaned_df[col]):
                        if method == 'mean':
                            fill_value = cleaned_df[col].mean()
                        else:
                            fill_value = cleaned_df[col].median()
                        cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                        operation_log.append(f"Filled missing values in '{col}' with {method} ({fill_value:.2f})")
                    else:
                        operation_log.append(f"Skipped {method} fill for '{col}': column is not numeric")
                elif method == 'mode':
                    mode_values = cleaned_df[col].mode()
                    if not mode_values.empty:
                        fill_value = mode_values.iloc[0]
                        cleaned_df[col] = cleaned_df[col].fillna(fill_value)
                        operation_log.append(f"Filled missing values in '{col}' with mode ('{fill_value}')")
                    else:
                        operation_log.append(f"Skipped mode fill for '{col}': column has no non-missing values")
                elif method == 'drop':
                    original_len = len(cleaned_df)
                    cleaned_df = cleaned_df.dropna(subset=[col])
                    removed = original_len - len(cleaned_df)
                    operation_log.append(f"Dropped {removed} rows with missing values in '{col}'")
                else:
                    operation_log.append(f"Skipped fill_missing for '{col}': unknown method '{method}'")
            elif op_type == 'remove_duplicates':
                original_len = len(cleaned_df)
                cleaned_df = cleaned_df.drop_duplicates()
                removed = original_len - len(cleaned_df)
                if removed > 0:
                    operation_log.append(f"Removed {removed} duplicate rows")
            elif op_type == 'remove_outliers':
                col = operation['column']
                lower_bound, upper_bound = _iqr_bounds(cleaned_df[col])
                is_outlier = (cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)
                # Filter positionally with the boolean mask; filtering by
                # index label would also drop non-outlier rows that share a
                # label with an outlier when the index is not unique.
                cleaned_df = cleaned_df[~is_outlier]
                operation_log.append(f"Removed {int(is_outlier.sum())} outliers from '{col}'")
            elif op_type == 'cap_outliers':
                col = operation['column']
                lower_bound, upper_bound = _iqr_bounds(cleaned_df[col])
                is_outlier = (cleaned_df[col] < lower_bound) | (cleaned_df[col] > upper_bound)
                original_outliers = int(is_outlier.sum())
                cleaned_df[col] = cleaned_df[col].clip(lower_bound, upper_bound)
                operation_log.append(f"Capped {original_outliers} outliers in '{col}' to statistical bounds")
            elif op_type == 'convert_type':
                col = operation['column']
                target_type = operation['target_type']
                if target_type == 'category':
                    cleaned_df[col] = cleaned_df[col].astype('category')
                    operation_log.append(f"Converted '{col}' to category type")
                elif target_type == 'numeric':
                    # Unparseable values become NaN instead of raising.
                    cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')
                    operation_log.append(f"Converted '{col}' to numeric type")
                else:
                    operation_log.append(f"Skipped convert_type for '{col}': unknown target type '{target_type}'")
            elif op_type == 'drop_column':
                col = operation['column']
                cleaned_df = cleaned_df.drop(columns=[col])
                operation_log.append(f"Dropped column '{col}'")
            else:
                operation_log.append(f"Skipped unknown operation type '{op_type}'")
        except Exception as e:
            # Best-effort pipeline: record the failure and keep going.
            operation_log.append(f"Failed to apply {op_type}: {str(e)}")
    return cleaned_df, operation_log
+# HuggingFace specific optimizations
def optimize_dataframe_for_hf(df: pd.DataFrame) -> pd.DataFrame:
    """Return a memory-optimised copy of ``df``.

    Three best-effort passes are applied; a column that cannot be converted
    is left unchanged:

    1. object columns whose distinct-value ratio is below 0.5 become
       ``category``;
    2. ``int64`` columns are downcast to the smallest integer dtype;
    3. ``float64`` columns are downcast via ``pd.to_numeric(downcast='float')``
       (this can reduce precision to float32).

    Args:
        df: Input frame; it is not modified.

    Returns:
        The optimised copy.
    """
    optimized_df = df.copy()
    row_count = len(optimized_df)

    # Convert LOW-cardinality object columns to category. An empty frame is
    # skipped entirely: the ratio below would otherwise divide by zero.
    if row_count:
        for col in optimized_df.select_dtypes(include=['object']).columns:
            try:
                # nunique() lives inside the try because it raises on
                # unhashable cell values (e.g. lists).
                if optimized_df[col].nunique() / row_count < 0.5:
                    optimized_df[col] = optimized_df[col].astype('category')
            except Exception:
                # Best effort: keep the column as-is.
                continue

    # Downcast numeric types for memory efficiency
    for source_dtype, downcast_kind in (('int64', 'integer'), ('float64', 'float')):
        for col in optimized_df.select_dtypes(include=[source_dtype]).columns:
            try:
                optimized_df[col] = pd.to_numeric(optimized_df[col], downcast=downcast_kind)
            except Exception:
                # Best effort: keep the column as-is.
                continue
    return optimized_df
@st.cache_data(show_spinner=False)
def generate_sample_data() -> pd.DataFrame:
    """Generate a demonstration customer dataset with deliberate quality issues.

    Starts from 1000 synthetic customer rows and then injects missing values,
    outliers, negative balances, 35 duplicated rows and 15 non-numeric
    ``'PENDING'`` entries in ``credit_score``, so the cleaning features have
    something to find.

    Note:
        Seeds NumPy's *global* random state with 42 so the output is
        reproducible; ``df.sample`` below relies on that same global state.

    Returns:
        DataFrame with 1035 rows; ``credit_score`` has object dtype because
        it mixes numbers and strings.
    """
    np.random.seed(42)
    n_samples = 1000
    # Create realistic business dataset
    data = {
        'customer_id': [f"CUST_{i:06d}" for i in range(1, n_samples + 1)],
        'age': np.random.normal(35, 12, n_samples),
        'annual_income': np.random.lognormal(10.5, 0.5, n_samples),
        'credit_score': np.random.normal(650, 100, n_samples),
        'account_balance': np.random.normal(5000, 3000, n_samples),
        'region': np.random.choice(['North', 'South', 'East', 'West', 'Central'], n_samples),
        'customer_segment': np.random.choice(['Premium', 'Standard', 'Basic'], n_samples, p=[0.2, 0.5, 0.3]),
        'is_active': np.random.choice([True, False], n_samples, p=[0.8, 0.2]),
        # periods= already yields exactly n_samples dates; no slicing needed.
        'signup_date': pd.date_range('2020-01-01', periods=n_samples, freq='D'),
    }
    df = pd.DataFrame(data)

    # Inject realistic quality issues for demonstration
    # 1. Missing values in income (realistic - some customers don't disclose)
    missing_income_idx = np.random.choice(df.index, size=int(n_samples * 0.15), replace=False)
    df.loc[missing_income_idx, 'annual_income'] = np.nan
    # 2. Missing values in credit score (realistic - new customers)
    missing_credit_idx = np.random.choice(df.index, size=int(n_samples * 0.08), replace=False)
    df.loc[missing_credit_idx, 'credit_score'] = np.nan
    # 3. Outliers in age (data entry errors)
    outlier_age_idx = np.random.choice(df.index, size=25, replace=False)
    df.loc[outlier_age_idx, 'age'] = np.random.uniform(150, 999, 25)  # Obvious errors
    # 4. Outliers in income (legitimate high earners + errors)
    outlier_income_idx = np.random.choice(df.index, size=30, replace=False)
    df.loc[outlier_income_idx, 'annual_income'] = np.random.uniform(500000, 2000000, 30)
    # 5. Negative account balances (overdrafts - realistic)
    negative_balance_idx = np.random.choice(df.index, size=50, replace=False)
    df.loc[negative_balance_idx, 'account_balance'] = np.random.uniform(-5000, -100, 50)
    # 6. Duplicate records (system errors)
    duplicate_records = df.sample(n=35).copy()
    df = pd.concat([df, duplicate_records], ignore_index=True)
    # 7. Mixed types in a column (add some text to numeric column)
    mixed_type_idx = np.random.choice(df.index, size=15, replace=False)
    # Widen to object explicitly first: writing a string into a float64
    # column relies on implicit upcasting, which pandas has deprecated
    # (PDEP-6). The resulting column content is the same.
    df['credit_score'] = df['credit_score'].astype(object)
    df.loc[mixed_type_idx, 'credit_score'] = 'PENDING'
    return df
+# Additional utility functions for HuggingFace deployment
def check_dataset_compatibility(df: pd.DataFrame) -> Dict[str, Any]:
    """Assess ``df`` against row, memory and column-count limits.

    Limits checked: more than 1,000,000 rows, more than 500MB of deep memory
    usage, more than 100 columns.

    Returns:
        Dict with boolean flags ``size_ok``, ``memory_ok`` and ``columns_ok``
        plus ``warnings`` and ``recommendations`` lists holding one entry for
        every limit that was exceeded (in the order rows, memory, columns).
    """
    row_count = len(df)
    column_count = len(df.columns)
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2

    # (flag key, limit exceeded?, warning text, recommendation text)
    checks = (
        (
            'size_ok',
            row_count > 1000000,  # 1M rows
            f"Large dataset: {row_count:,} rows",
            "Consider sampling for interactive analysis",
        ),
        (
            'memory_ok',
            memory_mb > 500,  # 500MB
            f"High memory usage: {memory_mb:.1f}MB",
            "Apply memory optimization techniques",
        ),
        (
            'columns_ok',
            column_count > 100,
            f"Many columns: {column_count}",
            "Focus analysis on key business columns",
        ),
    )

    report = {
        'size_ok': True,
        'memory_ok': True,
        'columns_ok': True,
        'warnings': [],
        'recommendations': [],
    }
    for flag_key, exceeded, warning_text, advice_text in checks:
        if not exceeded:
            continue
        report[flag_key] = False
        report['warnings'].append(warning_text)
        report['recommendations'].append(advice_text)
    return report
def _pick_stratification_column(df: pd.DataFrame, target_size: int):
    """Return the first object/category column with at most ``target_size`` distinct values, else None."""
    for col in df.select_dtypes(include=['object', 'category']).columns:
        try:
            if df[col].nunique(dropna=False) <= target_size:
                return col
        except (TypeError, ValueError):
            # Unhashable cell values or duplicated column labels: not usable.
            continue
    return None


def get_smart_sample(df: pd.DataFrame, target_size: int = 10000) -> pd.DataFrame:
    """Return a reproducible sample of at most ``target_size`` rows.

    When a suitable categorical column exists the sample is stratified on
    it: every group contributes rows in proportion to its share of ``df``
    (at least one row per group, missing values forming their own group).
    Otherwise, or if stratification fails, a plain random sample is drawn.
    All draws use ``random_state=42``.

    Args:
        df: Frame to sample from.
        target_size: Maximum number of rows to return.

    Returns:
        ``df`` itself (not a copy) when it already has at most
        ``target_size`` rows; otherwise a new frame with a fresh 0..n-1 index.
    """
    total_rows = len(df)
    if total_rows <= target_size:
        return df

    # Skip high-cardinality columns such as IDs: with one guaranteed row per
    # group they would make the "sample" nearly as large as the input.
    strat_col = _pick_stratification_column(df, target_size)
    if strat_col is not None:
        try:
            pieces = []
            # Iterating the groups (instead of groupby.apply) keeps the
            # grouping column in every piece; dropna=False keeps rows whose
            # key is missing; observed=True skips empty categories.
            for _, group in df.groupby(strat_col, dropna=False, observed=True):
                quota = min(len(group), max(1, int(target_size * len(group) / total_rows)))
                pieces.append(group.sample(n=quota, random_state=42))
            sample_df = pd.concat(pieces)
            # The one-row minimum per group can push the total slightly over
            # the target; trim so the size contract holds.
            if len(sample_df) > target_size:
                sample_df = sample_df.sample(n=target_size, random_state=42)
            return sample_df.reset_index(drop=True)
        except (TypeError, ValueError):
            # Fall back to random sampling
            pass

    # Random sampling
    return df.sample(n=target_size, random_state=42).reset_index(drop=True)