File size: 7,793 Bytes
f2e849e
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbbc735
f2e849e
 
cbbc735
f2e849e
 
 
cbbc735
6762acb
f2e849e
 
cbbc735
 
 
 
f2e849e
 
 
 
 
 
cbbc735
 
 
 
f2e849e
 
 
 
 
 
 
 
6762acb
 
 
 
 
 
 
 
 
 
 
 
cbbc735
 
 
6762acb
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
cbbc735
 
f2e849e
 
 
 
 
 
 
6762acb
cbbc735
 
 
6762acb
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
 
 
cbbc735
f2e849e
cbbc735
f2e849e
cbbc735
 
f2e849e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cbbc735
 
 
 
f2e849e
cbbc735
f2e849e
 
 
cbbc735
f2e849e
 
 
cbbc735
 
f2e849e
 
 
 
 
 
 
6762acb
 
 
 
 
 
 
 
 
 
 
 
cbbc735
 
 
6762acb
cbbc735
f2e849e
 
 
 
 
 
 
 
 
 
 
cbbc735
f2e849e
 
 
cbbc735
6762acb
b3ce1b2
cbbc735
 
f2e849e
e2408de
f2e849e
 
e2408de
 
 
 
 
b3ce1b2
e2408de
f2e849e
 
 
cbbc735
f2e849e
 
 
cbbc735
f2e849e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import streamlit as st
import polars as pl

# Perform a statistical analysis of the parsed log data.
st.title("Statistical Analysis")

# Loading data. Use .get() so that opening this page directly (before the
# Upload page ever initialized `parsed_df`) shows the hint instead of
# raising AttributeError; an explicit None value is handled the same way.
if st.session_state.get("parsed_df") is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

# Three tabs, one per statistical view; the tab handles are reused below.
stat_tab1, stat_tab2, stat_tab3 = st.tabs(
    ["General Information", "Numerical Statistics", "Categorical Variables"]
)

with stat_tab1:
    st.write("### Dataset Overview")

    # Pull the parsed dataframe once; later tabs reuse this binding.
    df = st.session_state.parsed_df
    col1, col2 = st.columns(2)

    with col1:
        st.metric("Number of Rows", df.height)
        size_mb = df.estimated_size() / (1024 * 1024)
        st.metric("Memory Usage", f"{size_mb:.2f} MB")

    with col2:
        st.metric("Number of Columns", df.width)
        st.metric("Missing Values", sum(df.null_count().row(0)))

    # How many columns share each dtype (compared by string form).
    dtype_names = [str(dt) for dt in df.schema.values()]
    dtypes_dict = {name: dtype_names.count(name) for name in set(dtype_names)}
    st.write("### Data Types")
    for dtype, count in dtypes_dict.items():
        st.write(f"- {dtype}: {count} columns")

    # Group the column names under each dtype, one expander per dtype.
    st.write("### Columns by Type")
    for dtype in set(dtype_names):
        cols = [
            name for name, dt in zip(df.columns, df.schema.values()) if str(dt) == dtype
        ]
        with st.expander(f"{dtype} columns ({len(cols)})", expanded=True):
            st.write(", ".join(cols))

with stat_tab2:
    # Summary statistics (describe()) for the numeric columns.
    st.write("### Numerical Summary Statistics")

    # Dtypes eligible for describe(); membership relies on polars dtype
    # equality against the bare dtype classes (same pattern as original).
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    numeric_cols = [
        name
        for name, dtype in zip(df.columns, df.schema.values())
        if dtype in numeric_dtypes
    ]

    if numeric_cols:
        # Allow the user to pick columns; default to the first few (max 5).
        selected_cols = st.multiselect(
            "Select columns for analysis (default shows all):",
            numeric_cols,
            default=numeric_cols[: min(5, len(numeric_cols))],
        )

        if selected_cols:
            # describe() yields count/mean/std/min/quantiles/max per column.
            detailed_stats = df.select(selected_cols).describe()
            st.dataframe(detailed_stats, use_container_width=True)
    else:
        st.info("No numerical columns available for analysis.")

    # Min/max/span analysis for date-like columns.
    st.write("### Datetime Variables Analysis")

    # FIX: the original set also included pl.Time and pl.Duration, but the
    # metrics below crash for those dtypes (timedelta/time values have no
    # usable .strftime here, time - time raises TypeError, and
    # .dt.date()/.dt.month() are not valid on them). Only offer the dtypes
    # this analysis actually supports.
    datetime_dtypes = {pl.Date, pl.Datetime}
    datetime_cols = [
        name
        for name, dtype in zip(df.columns, df.schema.values())
        if dtype in datetime_dtypes
    ]

    if datetime_cols:
        # Allow the user to select which datetime columns to analyze.
        selected_dt_cols = st.multiselect(
            "Select datetime columns for analysis:",
            datetime_cols,
            default=datetime_cols,
        )

        if selected_dt_cols:
            for col in selected_dt_cols:
                with st.expander(f"Datetime analysis: {col}", expanded=True):
                    # Drop nulls so min()/max() return real values.
                    series = df.filter(pl.col(col).is_not_null()).select(pl.col(col))

                    if series.height > 0:
                        # Basic datetime statistics (Python date/datetime
                        # objects via .item(); subtraction gives a timedelta).
                        min_date = series.select(pl.col(col).min()).item()
                        max_date = series.select(pl.col(col).max()).item()
                        time_span = max_date - min_date

                        # Key metrics: range endpoints and span.
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric(
                                "Minimum Date", min_date.strftime("%Y-%m-%d %H:%M:%S")
                            )
                        with col2:
                            st.metric(
                                "Maximum Date", max_date.strftime("%Y-%m-%d %H:%M:%S")
                            )
                        with col3:
                            # timedelta.seconds is the sub-day remainder, so
                            # this reads "N days, H hours" without double
                            # counting.
                            days = time_span.days
                            hours = time_span.seconds // 3600
                            st.metric("Time Span", f"{days} days, {hours} hours")

                        # Cardinality and completeness metrics (computed on
                        # the full df, so a null counts as one unique value).
                        col1, col2, col3 = st.columns(3)
                        with col1:
                            st.metric(
                                "Unique Dates",
                                df.select(pl.col(col).dt.date()).n_unique(),
                            )
                        with col2:
                            missing = df.select(pl.col(col).is_null().sum()).item()
                            st.metric(
                                "Missing Values",
                                missing,
                                f"{missing / df.height * 100:.2f}%",
                            )
                        with col3:
                            st.metric(
                                "Unique Months",
                                df.select(pl.col(col).dt.month()).n_unique(),
                            )
                    else:
                        st.warning(f"No valid datetime values in column '{col}'")
    else:
        st.info("No datetime columns available for analysis.")

with stat_tab3:
    # Anything that is not a numeric dtype is treated as categorical/text.
    numeric_dtypes = {
        pl.Int8,
        pl.Int16,
        pl.Int32,
        pl.Int64,
        pl.UInt8,
        pl.UInt16,
        pl.UInt32,
        pl.UInt64,
        pl.Float32,
        pl.Float64,
    }
    non_numeric_cols = [
        name
        for name, dtype in zip(df.columns, df.schema.values())
        if dtype not in numeric_dtypes
    ]

    if non_numeric_cols:
        st.write("### Categorical Variables Analysis")
        # Default to the first few columns (max 3) to keep the page light.
        selected_cat_cols = st.multiselect(
            "Select categorical columns to analyze:",
            non_numeric_cols,
            default=non_numeric_cols[: min(3, len(non_numeric_cols))],
        )

        if selected_cat_cols:
            for col in selected_cat_cols:
                # n_unique() counts a null as one distinct value.
                unique_count = df.select(pl.col(col)).n_unique()
                with st.expander(f"{col} - {unique_count} unique values"):
                    # Compute the frequency table once: value_counts() yields
                    # a struct {value, count}; unnest it into two columns and
                    # sort by descending frequency. (The original duplicated
                    # this pipeline in both branches, with leftover
                    # French comments — deduplicated here.)
                    counts = df.select(
                        pl.col(col).value_counts().struct.unnest()
                    ).sort("count", descending=True)

                    if unique_count <= 20:
                        # Few enough values: show the full table.
                        st.write(counts)
                    else:
                        # Too many values: show only the 10 most frequent.
                        st.write(f"Top 10 most common values (out of {unique_count})")
                        st.write(counts.head(10))

                    # Missing-value metric for this column.
                    missing = df.select(pl.col(col).is_null().sum()).item()
                    st.metric(
                        "Missing values",
                        missing,
                        f"{missing / df.height * 100:.2f}%",
                    )
    else:
        st.info("No categorical or text columns available for analysis.")