Update data_handler.py
data_handler.py
CHANGED: +259 -322
@@ -2,359 +2,296 @@ import streamlit as st
 import pandas as pd
 import numpy as np
 import warnings
-from typing import Dict, List, Any
+from typing import Dict, List, Any, Tuple
+from scipy import stats
 warnings.filterwarnings('ignore')
 
+# All cached data processing functions
 @st.cache_data
 def load_csv_with_encoding(file_content: bytes, filename: str) -> pd.DataFrame:
-    """Load CSV with automatic encoding detection"""
+    """Load CSV with automatic encoding detection - cached"""
     import chardet
+
+    detected = chardet.detect(file_content)
+    encoding = detected['encoding']
 
     try:
-        detected = chardet.detect(file_content)
-        encoding = detected['encoding'] if detected['confidence'] > 0.7 else 'utf-8'
-
-        # Try detected encoding first
+        from io import BytesIO
         return pd.read_csv(BytesIO(file_content), encoding=encoding)
-
-        # Fallback encodings
-        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1', 'utf-16']
-
+    except:
+        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
         for enc in encodings:
             try:
                 return pd.read_csv(BytesIO(file_content), encoding=enc)
-            except Exception:
+            except:
                 continue
-
-        # Last resort - try with error handling
-        try:
-            return pd.read_csv(BytesIO(file_content), encoding='utf-8', errors='ignore')
-        except Exception as e:
-            raise Exception(f"Cannot read CSV file: {str(e)}")
+        raise Exception("Cannot read file with any encoding")
 
 @st.cache_data
-def load_excel_file(file_content: bytes
-    """Load Excel file"""
+def load_excel_file(file_content: bytes) -> pd.DataFrame:
+    """Load Excel file - cached"""
     from io import BytesIO
-
-    try:
-        # Try loading Excel file
-        return pd.read_excel(BytesIO(file_content))
-    except Exception as e:
-        # Try different engines
-        engines = ['openpyxl', 'xlrd']
-
-        for engine in engines:
-            try:
-                return pd.read_excel(BytesIO(file_content), engine=engine)
-            except Exception:
-                continue
-
-        raise Exception(f"Cannot read Excel file: {str(e)}")
+    return pd.read_excel(BytesIO(file_content))
 
 @st.cache_data
 def calculate_basic_stats(df: pd.DataFrame) -> Dict[str, Any]:
-    """Calculate
-    ...
+    """Calculate basic statistics - cached"""
+    dtype_counts = df.dtypes.value_counts()
+    dtype_dict = {str(k): int(v) for k, v in dtype_counts.items()}
+
+    return {
+        'shape': df.shape,
+        'memory_usage': float(df.memory_usage(deep=True).sum() / 1024**2),
+        'missing_values': int(df.isnull().sum().sum()),
+        'dtypes': dtype_dict,
+        'duplicates': int(df.duplicated().sum())
+    }
+
+@st.cache_data
+def calculate_column_cardinality(df: pd.DataFrame) -> pd.DataFrame:
+    """Calculate column cardinality analysis - cached"""
+    cardinality_data = []
+
+    for col in df.columns:
+        unique_count = df[col].nunique()
+        unique_ratio = unique_count / len(df)
+
+        # Determine column type based on cardinality
+        if unique_count == 1:
+            col_type = "Constant"
+        elif unique_count == len(df):
+            col_type = "Unique Identifier"
+        elif unique_ratio < 0.05:
+            col_type = "Low Cardinality"
+        elif unique_ratio < 0.5:
+            col_type = "Medium Cardinality"
+        else:
+            col_type = "High Cardinality"
+
+        cardinality_data.append({
+            'Column': col,
+            'Unique Count': unique_count,
+            'Unique Ratio': unique_ratio,
+            'Type': col_type,
+            'Data Type': str(df[col].dtype)
+        })
+
+    return pd.DataFrame(cardinality_data)
+
+@st.cache_data
+def calculate_memory_optimization(df: pd.DataFrame) -> Dict[str, Any]:
+    """Calculate memory optimization suggestions - cached"""
+    suggestions = []
+    current_memory = df.memory_usage(deep=True).sum() / 1024**2
+    potential_savings = 0
+
+    for col in df.columns:
+        if df[col].dtype == 'object':
+            unique_ratio = df[col].nunique() / len(df)
+            if unique_ratio < 0.5:  # Less than 50% unique values
+                # Estimate category memory usage
+                category_memory = df[col].astype('category').memory_usage(deep=True)
+                object_memory = df[col].memory_usage(deep=True)
+                savings = (object_memory - category_memory) / 1024**2
+
+                if savings > 0.1:  # More than 0.1MB savings
+                    suggestions.append({
+                        'column': col,
+                        'current_type': 'object',
+                        'suggested_type': 'category',
+                        'savings_mb': savings
+                    })
+                    potential_savings += savings
+
+    return {
+        'suggestions': suggestions,
+        'current_memory_mb': current_memory,
+        'potential_savings_mb': potential_savings,
+        'potential_savings_pct': (potential_savings / current_memory) * 100 if current_memory > 0 else 0
+    }
 
 @st.cache_data
 def calculate_missing_data(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate
-    ...
-            'Non-Missing Count': len(df) - missing_data.values
-        })
-
-        # Sort by missing percentage (descending)
-        missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
-
-        # Add severity classification
-        missing_df['Severity'] = missing_df['Missing %'].apply(
-            lambda x: 'Critical' if x > 50 else 'High' if x > 20 else 'Medium' if x > 5 else 'Low'
-        )
-
-        return missing_df
-
-        return pd.DataFrame()
-
-    except Exception as e:
-        st.error(f"Error calculating missing data: {str(e)}")
-        return pd.DataFrame()
+    """Calculate missing data analysis - cached"""
+    missing_data = df.isnull().sum()
+    if missing_data.sum() > 0:
+        missing_df = pd.DataFrame({
+            'Column': missing_data.index,
+            'Missing Count': missing_data.values,
+            'Missing %': (missing_data.values / len(df)) * 100
+        })
+        return missing_df[missing_df['Missing Count'] > 0].sort_values('Missing %', ascending=False)
+    return pd.DataFrame()
 
 @st.cache_data
 def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
-    """Calculate correlation matrix"""
-    ...
-        if len(numeric_cols) > 1:
-            # Remove columns with no variance (constant values)
-            variance_cols = []
-            for col in numeric_cols:
-                if df[col].var() > 0:  # Only include columns with variance
-                    variance_cols.append(col)
-
-            if len(variance_cols) > 1:
-                corr_matrix = df[variance_cols].corr()
-                return corr_matrix
-
-        return pd.DataFrame()
-
-    except Exception as e:
-        st.error(f"Error calculating correlation matrix: {str(e)}")
-        return pd.DataFrame()
+    """Calculate correlation matrix - cached"""
+    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+    return df[numeric_cols].corr() if len(numeric_cols) > 1 else pd.DataFrame()
 
 @st.cache_data
-def get_column_types(df):
-    """
-    ...
-        return {
-            'numeric': numeric_cols,
-            'categorical': categorical_cols,
-            'datetime': datetime_cols,
-            'boolean': boolean_cols,
-            'all': df.columns.tolist()
-        }
-    except Exception as e:
-        st.error(f"Error getting column types: {str(e)}")
-        return {
-            'numeric': [],
-            'categorical': [],
-            'datetime': [],
-            'boolean': [],
-            'all': []
-        }
+def get_column_types(df: pd.DataFrame) -> Dict[str, List[str]]:
+    """Get column types - cached"""
+    return {
+        'numeric': df.select_dtypes(include=[np.number]).columns.tolist(),
+        'categorical': df.select_dtypes(include=['object']).columns.tolist(),
+        'datetime': df.select_dtypes(include=['datetime64']).columns.tolist()
+    }
+
+@st.cache_data
+def calculate_numeric_stats(df: pd.DataFrame, column: str) -> Dict[str, float]:
+    """Calculate enhanced numeric statistics - cached"""
+    series = df[column].dropna()
+    return {
+        'mean': series.mean(),
+        'median': series.median(),
+        'std': series.std(),
+        'skewness': series.skew(),
+        'kurtosis': series.kurtosis(),
+        'min': series.min(),
+        'max': series.max(),
+        'q25': series.quantile(0.25),
+        'q75': series.quantile(0.75)
+    }
 
 @st.cache_data
-def detect_outliers(df, column, method='iqr'):
-    """
-    ...
-        if method == 'iqr':
-            Q1 = data.quantile(0.25)
-            Q3 = data.quantile(0.75)
-            IQR = Q3 - Q1
-            lower_bound = Q1 - 1.5 * IQR
-            upper_bound = Q3 + 1.5 * IQR
-
-            outliers = data[(data < lower_bound) | (data > upper_bound)]
-            bounds = {'lower': lower_bound, 'upper': upper_bound, 'Q1': Q1, 'Q3': Q3}
-
-        else:  # z-score method
-            z_scores = np.abs((data - data.mean()) / data.std())
-            outliers = data[z_scores > 3]
-            bounds = {'threshold': 3, 'mean': data.mean(), 'std': data.std()}
-
-        return {
-            'outliers': outliers.tolist(),
-            'bounds': bounds,
-            'count': len(outliers),
-            'percentage': (len(outliers) / len(data)) * 100
-        }
-
-    except Exception as e:
-        st.error(f"Error detecting outliers: {str(e)}")
-        return {'outliers': [], 'bounds': {}, 'count': 0, 'percentage': 0}
+def calculate_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
+    """Calculate outliers using IQR method - cached"""
+    Q1 = df[column].quantile(0.25)
+    Q3 = df[column].quantile(0.75)
+    IQR = Q3 - Q1
+    lower_bound = Q1 - 1.5 * IQR
+    upper_bound = Q3 + 1.5 * IQR
+
+    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]
+
+@st.cache_data
+def detect_mixed_types(df: pd.DataFrame) -> List[Dict[str, Any]]:
+    """Detect columns with mixed data types - cached"""
+    mixed_type_issues = []
+
+    for col in df.select_dtypes(include=['object']).columns:
+        # Try to convert to numeric
+        numeric_conversion = pd.to_numeric(df[col], errors='coerce')
+        new_nulls = numeric_conversion.isnull().sum() - df[col].isnull().sum()
+
+        if new_nulls > 0:
+            mixed_type_issues.append({
+                'column': col,
+                'problematic_values': new_nulls,
+                'total_values': len(df[col]),
+                'percentage': (new_nulls / len(df[col])) * 100
+            })
+
+    return mixed_type_issues
+
+@st.cache_data
+def get_value_counts(df: pd.DataFrame, column: str, top_n: int = 10) -> pd.Series:
+    """Get value counts for categorical column - cached"""
+    return df[column].value_counts().head(top_n)
+
+@st.cache_data
+def calculate_crosstab(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
+    """Calculate crosstab between two categorical columns - cached"""
+    return pd.crosstab(df[col1], df[col2])
+
+@st.cache_data
+def calculate_group_stats(df: pd.DataFrame, group_col: str, metric_col: str) -> pd.DataFrame:
+    """Calculate group statistics - cached"""
+    return df.groupby(group_col)[metric_col].agg(['mean', 'median', 'std', 'count'])
 
 @st.cache_data
-def calculate_data_quality_score(df):
-    """Calculate
-    ...
-        uniqueness = ((df.shape[0] - duplicate_rows) / df.shape[0]) * 100
-        scores['uniqueness'] = uniqueness
-
-        # 3. Consistency (data types)
-        numeric_cols = df.select_dtypes(include=[np.number]).columns
-        consistency_score = 100  # Start with perfect score
-
-        for col in numeric_cols:
-            # Check for mixed types (e.g., numbers stored as strings)
-            non_null_data = df[col].dropna()
-            if len(non_null_data) > 0:
-                try:
-                    pd.to_numeric(non_null_data, errors='raise')
-                except:
-                    consistency_score -= 10  # Penalty for inconsistent types
-
-        scores['consistency'] = max(consistency_score, 0)
-
-        # 4. Validity (basic checks)
-        validity_score = 100
-
-        # Check for extreme outliers in numeric columns
-        for col in numeric_cols:
-            outlier_info = detect_outliers(df, col)
-            if outlier_info['percentage'] > 5:  # More than 5% outliers
-                validity_score -= 5
-
-        scores['validity'] = max(validity_score, 0)
-
-        # Overall quality score (weighted average)
-        overall_score = (
-            scores['completeness'] * 0.4 +
-            scores['uniqueness'] * 0.3 +
-            scores['consistency'] * 0.2 +
-            scores['validity'] * 0.1
-        )
-
-        scores['overall'] = overall_score
-
-        # Quality grade
-        if overall_score >= 90:
-            grade = 'Excellent'
-        elif overall_score >= 80:
-            grade = 'Good'
-        elif overall_score >= 70:
-            grade = 'Fair'
-        elif overall_score >= 60:
-            grade = 'Poor'
-        else:
-            grade = 'Critical'
-
-        scores['grade'] = grade
-
-        return scores
-
-    except Exception as e:
-        st.error(f"Error calculating data quality score: {str(e)}")
-        return {
-            'completeness': 0,
-            'uniqueness': 0,
-            'consistency': 0,
-            'validity': 0,
-            'overall': 0,
-            'grade': 'Unknown'
-        }
+def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
+    """Calculate overall data quality score - cached"""
+    score = 100
+    issues = []
+
+    # Missing values penalty
+    missing_pct = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
+    if missing_pct > 0:
+        penalty = min(30, missing_pct * 2)  # Max 30 points penalty
+        score -= penalty
+        issues.append(f"Missing values: {missing_pct:.1f}%")
+
+    # Duplicates penalty
+    duplicate_pct = (df.duplicated().sum() / len(df)) * 100
+    if duplicate_pct > 0:
+        penalty = min(20, duplicate_pct * 4)  # Max 20 points penalty
+        score -= penalty
+        issues.append(f"Duplicate rows: {duplicate_pct:.1f}%")
+
+    # Constant columns penalty
+    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
+    if constant_cols:
+        penalty = min(10, len(constant_cols) * 2)
+        score -= penalty
+        issues.append(f"Constant columns: {len(constant_cols)}")
+
+    # Mixed types penalty
+    mixed_types = detect_mixed_types(df)
+    if mixed_types:
+        penalty = min(10, len(mixed_types) * 3)
+        score -= penalty
+        issues.append(f"Mixed type columns: {len(mixed_types)}")
+
+    return {
+        'score': max(0, score),
+        'issues': issues,
+        'grade': 'A' if score >= 90 else 'B' if score >= 80 else 'C' if score >= 70 else 'D' if score >= 60 else 'F'
+    }
 
 def load_data(uploaded_file):
-    """
-    ...
+    """Unified data loading function"""
+    file_content = uploaded_file.read()
+    uploaded_file.seek(0)
+
+    if uploaded_file.name.endswith('.csv'):
+        return load_csv_with_encoding(file_content, uploaded_file.name)
+    else:
+        return load_excel_file(file_content)
+
+def apply_data_cleaning(df: pd.DataFrame, operations: List[Dict[str, Any]]) -> pd.DataFrame:
+    """Apply data cleaning operations"""
+    cleaned_df = df.copy()
+
+    for operation in operations:
+        if operation['type'] == 'fill_missing':
+            if operation['method'] == 'mean':
+                cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
+                    cleaned_df[operation['column']].mean())
+            elif operation['method'] == 'median':
+                cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
+                    cleaned_df[operation['column']].median())
+            elif operation['method'] == 'mode':
+                cleaned_df[operation['column']] = cleaned_df[operation['column']].fillna(
+                    cleaned_df[operation['column']].mode().iloc[0] if not cleaned_df[operation['column']].mode().empty else 0)
+            elif operation['method'] == 'drop':
+                cleaned_df = cleaned_df.dropna(subset=[operation['column']])
+
+        elif operation['type'] == 'remove_duplicates':
+            cleaned_df = cleaned_df.drop_duplicates()
 
+        elif operation['type'] == 'remove_outliers':
+            Q1 = cleaned_df[operation['column']].quantile(0.25)
+            Q3 = cleaned_df[operation['column']].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            cleaned_df = cleaned_df[
+                (cleaned_df[operation['column']] >= lower_bound) &
+                (cleaned_df[operation['column']] <= upper_bound)
+            ]
+
+        elif operation['type'] == 'cap_outliers':
+            Q1 = cleaned_df[operation['column']].quantile(0.25)
+            Q3 = cleaned_df[operation['column']].quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            cleaned_df[operation['column']] = cleaned_df[operation['column']].clip(lower_bound, upper_bound)
+
+        elif operation['type'] == 'convert_type':
+            if operation['target_type'] == 'category':
+                cleaned_df[operation['column']] = cleaned_df[operation['column']].astype('category')
+
+    return cleaned_df
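For context, a minimal usage sketch of the new module API follows. It is not part of the commit: the caller module name (e.g. an app.py in the same Space), the uploader label, and the 'age' column are illustrative assumptions.

    # Hypothetical caller, e.g. app.py in the same Space (assumed, not from this commit)
    import streamlit as st
    from data_handler import (
        load_data,
        calculate_basic_stats,
        calculate_data_quality_score,
        apply_data_cleaning,
    )

    uploaded_file = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
    if uploaded_file is not None:
        df = load_data(uploaded_file)      # dispatches on file extension
        stats = calculate_basic_stats(df)  # cached across reruns by @st.cache_data
        st.write(f"Rows x cols: {stats['shape']}, duplicates: {stats['duplicates']}")

        quality = calculate_data_quality_score(df)
        st.write(f"Quality grade: {quality['grade']} ({quality['score']:.0f}/100)")

        # Each operation dict matches one branch in apply_data_cleaning;
        # the column name 'age' is a placeholder.
        cleaned = apply_data_cleaning(df, [
            {'type': 'remove_duplicates'},
            {'type': 'fill_missing', 'column': 'age', 'method': 'median'},
            {'type': 'cap_outliers', 'column': 'age'},
        ])
        st.dataframe(cleaned.head())

One design note on the change itself: the new functions drop the old per-function try/except-with-st.error wrappers and instead rely on @st.cache_data plus exceptions propagating to the caller, which is what makes the smaller line count (+259 -322) possible.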