File size: 1,937 Bytes
04b129a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
Utility functions for Streamlit components
"""

import streamlit as st
import pandas as pd
from typing import Optional


@st.cache_data
def load_data(file_path: str) -> Optional[pd.DataFrame]:
    """Load a tabular file into a DataFrame, cached by Streamlit.

    The reader is chosen from the file extension (matched
    case-insensitively): ``.xlsx``/``.xls`` use ``pd.read_excel`` and
    ``.csv`` uses ``pd.read_csv``.

    Args:
        file_path: Path of the file to load.

    Returns:
        The loaded DataFrame, or ``None`` when the extension is not
        supported or the reader raised. In both failure cases an
        ``st.error`` message is shown.
    """
    # NOTE(review): the function is wrapped in st.cache_data, so a None
    # returned after a failure is presumably cached too - confirm that
    # this is the desired behaviour for transient errors.
    normalized = file_path.lower()
    try:
        if normalized.endswith(('.xlsx', '.xls')):
            return pd.read_excel(file_path)
        if normalized.endswith('.csv'):
            return pd.read_csv(file_path)
    except Exception as e:
        # Broad catch is deliberate: any reader failure is reported in the
        # UI instead of crashing the app.
        st.error(f"Error loading file: {e}")
        return None

    # Previously this path fell through and returned None silently.
    st.error(f"Unsupported file type: {file_path}")
    return None


def display_dataframe_stats(df: pd.DataFrame):
    """Display basic dataframe statistics.

    Renders four side-by-side ``st.metric`` widgets: row count, column
    count, total number of missing values, and memory usage in KB.

    Args:
        df: The dataframe to summarise.
    """
    n_rows, n_cols = df.shape
    # pandas reductions yield numpy scalars; cast to a built-in int so
    # st.metric receives a plain Python number.
    missing_total = int(df.isnull().sum().sum())
    # memory_usage() is called with its defaults; the byte total is
    # converted to kilobytes for display.
    memory_kb = df.memory_usage().sum() / 1024

    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric("Rows", n_rows)
    with col2:
        st.metric("Columns", n_cols)
    with col3:
        st.metric("Missing Values", missing_total)
    with col4:
        st.metric("Memory Usage", f"{memory_kb:.2f} KB")


def display_column_info(df: pd.DataFrame):
    """Display information about dataframe columns.

    Shows a table with one row per column of ``df`` giving its name,
    dtype, non-null count and null count, under a "Column Information"
    subheader.

    Args:
        df: The dataframe whose columns are described.
    """
    st.subheader("Column Information")

    col_info = pd.DataFrame({
        'Column': df.columns,
        # Store dtype names as strings: raw numpy.dtype objects form an
        # object column that is not natively Arrow-serialisable for
        # st.dataframe.
        'Type': df.dtypes.astype(str).values,
        'Non-Null Count': df.count().values,
        'Null Count': df.isnull().sum().values,
    })

    st.dataframe(col_info, use_container_width=True)


def display_data_quality(df: pd.DataFrame):
    """Display data quality metrics.

    Renders three ``st.metric`` widgets under a "Data Quality Assessment"
    subheader: the percentage of non-null cells, the number of duplicated
    rows, and the number of numeric columns.

    Args:
        df: The dataframe to assess.
    """
    st.subheader("Data Quality Assessment")

    col1, col2, col3 = st.columns(3)

    total_cells = df.shape[0] * df.shape[1]
    null_cells = int(df.isnull().sum().sum())

    if total_cells:
        completeness = ((total_cells - null_cells) / total_cells) * 100
        completeness_label = f"{completeness:.2f}%"
    else:
        # An empty frame has no cells, so the ratio is undefined; avoid
        # dividing by zero and say so explicitly.
        completeness_label = "N/A"

    with col1:
        st.metric("Data Completeness", completeness_label)

    with col2:
        # Cast the numpy scalar to a built-in int for st.metric.
        st.metric("Duplicate Rows", int(df.duplicated().sum()))

    with col3:
        st.metric("Numeric Columns", df.select_dtypes(include=['number']).shape[1])