Spaces:

entropy25
/

data-analysis-platform

Sleeping

App Files Files Community

entropy25 committed on Aug 9, 2025

Commit

e595151

verified ·

1 Parent(s): 546a4bd

Update data_handler.py

Browse files

Files changed (1) hide show

data_handler.py +124 -1

data_handler.py CHANGED Viewed

@@ -149,4 +149,127 @@ def calculate_correlation_matrix(df: pd.DataFrame) -> pd.DataFrame:
                     variance_cols.append(col)
             if len(variance_cols) > 1:
-                corr_matrix = df[variance_cols].corr()

                     variance_cols.append(col)
             if len(variance_cols) > 1:
+                corr_matrix = df[variance_cols].corr()
+                return corr_matrix
+        return pd.DataFrame()
+    except Exception as e:
+        st.error(f"Error calculating correlation matrix: {str(e)}")
+        return pd.DataFrame()
@st.cache_data
def detect_outliers(df: pd.DataFrame, column: str, method: str = 'iqr') -> Dict[str, Any]:
    """Detect outliers in a single numeric column using the IQR or Z-score rule.

    Args:
        df: Frame containing the column to inspect.
        column: Name of the column to analyse.
        method: ``'iqr'`` applies the 1.5 * IQR fence rule; any other value
            applies the Z-score rule with a fixed threshold of 3.

    Returns:
        A dict that always carries the same four keys:

        * ``'outliers'``: list of the values flagged as outliers.
        * ``'bounds'``: parameters of the rule that was applied
          (``lower``/``upper``/``Q1``/``Q3`` for IQR,
          ``threshold``/``mean``/``std`` for Z-score); empty when no rule
          was applied.
        * ``'count'``: number of flagged values.
        * ``'percentage'``: flagged values as a share (0-100) of the
          non-null values in the column.

        If the column is missing, non-numeric, or has no non-null values,
        the result is empty with ``count`` and ``percentage`` set to 0.
        On an unexpected error the message is shown via ``st.error`` and the
        same empty result is returned.
    """
    try:
        if column not in df.columns or not pd.api.types.is_numeric_dtype(df[column]):
            return {'outliers': [], 'bounds': {}, 'count': 0, 'percentage': 0}

        data = df[column].dropna()
        # Nothing to analyse: return early instead of dividing by zero below.
        if data.empty:
            return {'outliers': [], 'bounds': {}, 'count': 0, 'percentage': 0}

        if method == 'iqr':
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            outliers = data[(data < lower_bound) | (data > upper_bound)]
            bounds = {'lower': lower_bound, 'upper': upper_bound, 'Q1': Q1, 'Q3': Q3}
        else:  # z-score method
            mean = data.mean()
            std = data.std()
            if std > 0:
                z_scores = np.abs((data - mean) / std)
                outliers = data[z_scores > 3]
            else:
                # std is 0 (constant column) or NaN (single value): no value
                # can deviate from the mean, so nothing is flagged.
                outliers = data.iloc[0:0]
            bounds = {'threshold': 3, 'mean': mean, 'std': std}

        return {
            'outliers': outliers.tolist(),
            'bounds': bounds,
            'count': len(outliers),
            'percentage': (len(outliers) / len(data)) * 100
        }
    except Exception as e:
        st.error(f"Error detecting outliers: {str(e)}")
        return {'outliers': [], 'bounds': {}, 'count': 0, 'percentage': 0}
@st.cache_data
def calculate_data_quality_score(df: pd.DataFrame) -> Dict[str, Any]:
    """Calculate a weighted data quality score for a frame.

    Four component scores (each 0-100) are combined into an overall score:

    * ``completeness`` (weight 0.4): share of non-null cells.
    * ``uniqueness`` (weight 0.3): share of non-duplicated rows.
    * ``consistency`` (weight 0.2): starts at 100, minus 10 per numeric
      column whose non-null values fail ``pd.to_numeric``.
    * ``validity`` (weight 0.1): starts at 100, minus 5 per numeric column
      where ``detect_outliers`` reports more than 5% outliers.

    Args:
        df: Frame to score.

    Returns:
        A dict with keys ``'completeness'``, ``'uniqueness'``,
        ``'consistency'``, ``'validity'``, ``'overall'`` and ``'grade'``.
        The grade is ``'Excellent'`` (>= 90), ``'Good'`` (>= 80), ``'Fair'``
        (>= 70), ``'Poor'`` (>= 60) or ``'Critical'`` otherwise. For a frame
        with no rows or no columns, or when an unexpected error occurs (also
        reported via ``st.error``), all scores are 0 and the grade is
        ``'Unknown'``.
    """
    try:
        # An empty frame has nothing to score; without this guard the ratios
        # below become 0/0 (NaN) and the grade silently falls to 'Critical'.
        if df.empty:
            return {
                'completeness': 0,
                'uniqueness': 0,
                'consistency': 0,
                'validity': 0,
                'overall': 0,
                'grade': 'Unknown'
            }

        scores = {}

        # 1. Completeness (missing data)
        total_cells = df.shape[0] * df.shape[1]
        missing_cells = df.isnull().sum().sum()
        scores['completeness'] = ((total_cells - missing_cells) / total_cells) * 100

        # 2. Uniqueness (duplicates)
        duplicate_rows = df.duplicated().sum()
        scores['uniqueness'] = ((df.shape[0] - duplicate_rows) / df.shape[0]) * 100

        # 3. Consistency (data types)
        # NOTE(review): only columns that already have a numeric dtype are
        # checked, so this conversion is not expected to fail in practice;
        # confirm whether object columns were meant to be inspected instead.
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        consistency_score = 100  # Start with perfect score
        for col in numeric_cols:
            non_null_data = df[col].dropna()
            if len(non_null_data) > 0:
                try:
                    pd.to_numeric(non_null_data, errors='raise')
                except (ValueError, TypeError):
                    consistency_score -= 10  # Penalty for inconsistent types
        scores['consistency'] = max(consistency_score, 0)

        # 4. Validity (basic checks): penalise columns with many outliers.
        validity_score = 100
        for col in numeric_cols:
            outlier_info = detect_outliers(df, col)
            # .get() tolerates results that carry no 'percentage' entry.
            if outlier_info.get('percentage', 0) > 5:  # More than 5% outliers
                validity_score -= 5
        scores['validity'] = max(validity_score, 0)

        # Overall quality score (weighted average)
        overall_score = (
            scores['completeness'] * 0.4 +
            scores['uniqueness'] * 0.3 +
            scores['consistency'] * 0.2 +
            scores['validity'] * 0.1
        )
        scores['overall'] = overall_score

        # Quality grade: first threshold the score reaches, highest first.
        grade_floors = ((90, 'Excellent'), (80, 'Good'), (70, 'Fair'), (60, 'Poor'))
        scores['grade'] = next(
            (label for floor, label in grade_floors if overall_score >= floor),
            'Critical',
        )
        return scores
    except Exception as e:
        st.error(f"Error calculating data quality score: {str(e)}")
        return {
            'completeness': 0,
            'uniqueness': 0,
            'consistency': 0,
            'validity': 0,
            'overall': 0,
            'grade': 'Unknown'
        }