File size: 4,056 Bytes
798602c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import pandas as pd
import numpy as np
import gradio as gr
from pathlib import Path

# Number of decimal places used when formatting summary statistics as strings.
ROUND = 4

def load_dataset(file):
    """Load a CSV or Excel file into a DataFrame.

    Args:
        file: Either an object exposing the file's path as ``.name``, a plain
            path (``str`` or ``Path``), or ``None`` when nothing was uploaded.

    Returns:
        A ``(df, status_message)`` tuple. ``df`` is ``None`` when no file was
        given, the extension is not supported, or reading the file failed.
    """
    if file is None:
        return None, "No file uploaded."

    try:
        # A Path also has a ``.name`` attribute (the final component only), so
        # bare paths must be recognised before falling back to ``file.name``.
        if isinstance(file, (str, Path)):
            path = Path(file)
        else:
            path = Path(file.name)

        # Compare case-insensitively so "DATA.CSV" or "Report.XLSX" load too.
        suffix = path.suffix.lower()

        if suffix == ".csv":
            df = pd.read_csv(path)
        elif suffix in (".xlsx", ".xls"):
            df = pd.read_excel(path)
        else:
            return None, "Unsupported file format."

        return df, f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns."

    except Exception as e:
        # Report any failure as a status message instead of raising.
        return None, f"Error loading file: {e}"


def dataset_summary(df: pd.DataFrame):
    """Build a per-variable descriptive-statistics table.

    Args:
        df: The dataset to summarise, or ``None``.

    Returns:
        ``None`` when ``df`` is ``None``. Otherwise a DataFrame with one row
        per column of ``df`` containing whichever of ``variable``, ``count``,
        ``unique``, ``mean``, ``std``, ``min``, ``25%``, ``50%``, ``75%`` and
        ``max`` are available, in that order. Statistics other than
        ``count`` and ``unique`` are rendered as strings with ``ROUND``
        decimal places when they are ``int``/``float`` values.
    """
    if df is None:
        return None

    summary = (
        df.describe(include="all")
        .transpose()
        .reset_index()
        .rename(columns={"index": "variable"})
    )

    # Set the unique count for every variable from the data itself,
    # replacing any "unique" column that describe() produced.
    summary["unique"] = df.nunique(dropna=True).values

    desired_order = [
        "variable",
        "count",
        "unique",
        "mean",
        "std",
        "min",
        "25%",
        "50%",
        "75%",
        "max",
    ]
    # Take an explicit copy: assigning columns on a column-subset selection
    # otherwise triggers pandas' SettingWithCopyWarning.
    summary = summary[[c for c in desired_order if c in summary.columns]].copy()

    # Format the statistic columns as fixed-precision strings; values that are
    # not int/float are passed through unchanged.
    for col in summary.columns:
        if col not in ("variable", "count", "unique"):
            summary[col] = summary[col].apply(
                lambda x: f"{x:.{ROUND}f}" if isinstance(x, (int, float)) else x
            )

    return summary


def variable_types(df):
    """Return a two-column table (``Variable``, ``Type``) of column dtypes.

    Returns ``None`` when ``df`` is ``None``.
    """
    if df is None:
        return None

    dtype_table = df.dtypes.reset_index()
    column_labels = {"index": "Variable", 0: "Type"}
    return dtype_table.rename(columns=column_labels)


def column_choices_single(cols: list[str]):
    """Return a Gradio update that sets the choices to ``cols`` and clears the value."""
    update = gr.update(choices=cols, value=None)
    return update


def column_choices_multi(cols: list[str]):
    """Return a Gradio update that sets the choices to ``cols`` with an empty list as value."""
    update = gr.update(choices=cols, value=[])
    return update


def category_value_choices(df, col):
    """Return a Gradio update listing the distinct non-null values of ``col``.

    Args:
        df: The loaded dataset, or ``None``.
        col: Name of the column whose values should be offered as choices.

    Returns:
        A ``gr.update`` that hides the component and empties its choices when
        ``df`` or ``col`` is ``None`` or ``col`` is not a column of ``df``;
        otherwise one that shows it with the sorted distinct values as
        choices and an empty selection.
    """
    if df is None or col is None or col not in df.columns:
        return gr.update(visible=False, choices=[], value=[])

    distinct_values = df[col].dropna().unique().tolist()
    try:
        values = sorted(distinct_values)
    except TypeError:
        # Mixed-type columns (e.g. ints alongside strings) cannot be compared
        # directly; fall back to ordering by each value's string form.
        values = sorted(distinct_values, key=str)

    return gr.update(
        visible=True,
        choices=values,
        value=[],   # MUST be a list for multiselect
    )


def infer_column_types(df: pd.DataFrame):
    """Split the column names of ``df`` into numeric and non-numeric groups.

    Returns:
        ``(numeric_cols, categorical_cols)``: two sorted lists of column
        names, where "numeric" means the dtype is matched by ``np.number``.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    other_part = df.select_dtypes(exclude=[np.number])

    numeric_names = sorted(numeric_part.columns.tolist())
    categorical_names = sorted(other_part.columns.tolist())
    return numeric_names, categorical_names


def apply_category_filters(
    df,
    cat_cols,
    val1,
    val2,
    val3,
):
    """Filter ``df`` to the rows matching the selected category values.

    The first three entries of ``cat_cols`` are paired positionally with
    ``val1``, ``val2`` and ``val3``. Each pair with a non-empty selection
    keeps only the rows whose value in that column is among the selection.

    Args:
        df: The loaded dataset, or ``None``.
        cat_cols: Column names to filter on (only the first three are used).
        val1, val2, val3: Selected values for the first, second and third
            column respectively; empty/``None`` means "no filter".

    Returns:
        A ``(filtered_df, status_message)`` tuple. ``filtered_df`` is ``None``
        when ``df`` is ``None``, and a copy of ``df`` when no filter applies.
    """
    if df is None:
        return None, "❌ No data loaded."

    # Only a selection that lines up with a chosen column can take effect; a
    # value left in a slot without a column must not count as a filter.
    active_filters = [
        (col, selected_vals)
        for col, selected_vals in zip((cat_cols or [])[:3], (val1, val2, val3))
        if selected_vals
    ]

    if not active_filters:
        return df.copy(), "⚠️ No filters selected. Using full dataset."

    filtered_df = df.copy()
    for col, selected_vals in active_filters:
        filtered_df = filtered_df[filtered_df[col].isin(selected_vals)]

    return filtered_df, f"✅ Filter applied. Rows remaining: {len(filtered_df)}"

def reclassify_as_categorical(state, column):
    """Move ``column`` from ``state.numeric_cols`` to ``state.categorical_cols``.

    On success ``state.active_filters`` is reset to an empty dict.

    Returns:
        ``(ok, message)`` where ``ok`` is ``True`` only if the column was
        listed as numeric and has been moved.
    """
    if not column or column not in state.numeric_cols:
        return False, f"Column '{column}' is not numeric."

    state.numeric_cols.remove(column)
    state.categorical_cols.append(column)
    # Clear the stored filters whenever a column changes group.
    state.active_filters = {}
    return True, f"Column '{column}' reclassified as categorical."


def reclassify_as_numeric(state, column):
    """Move ``column`` from ``state.categorical_cols`` to ``state.numeric_cols``.

    On success ``state.active_filters`` is reset to an empty dict.

    Returns:
        ``(ok, message)`` where ``ok`` is ``True`` only if the column was
        listed as categorical and has been moved.
    """
    if not column or column not in state.categorical_cols:
        return False, f"Column '{column}' is not categorical."

    state.categorical_cols.remove(column)
    state.numeric_cols.append(column)
    # Clear the stored filters whenever a column changes group.
    state.active_filters = {}
    return True, f"Column '{column}' reclassified as numeric."