File size: 4,592 Bytes
45f4df4
 
 
43f66fe
45f4df4
 
f73fa1b
b719f5b
 
45f4df4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e828170
 
45f4df4
e828170
 
45f4df4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae62cb9
754848a
42f0b99
 
 
754848a
45f4df4
 
 
 
 
e828170
 
 
 
 
 
45f4df4
 
 
 
 
 
 
 
 
 
9d9048a
8910acd
9d9048a
b2f5507
9d9048a
 
e828170
 
 
9d9048a
 
 
 
 
 
e828170
bbb414f
 
c04ed11
5e63ff3
 
 
ae72d84
45f4df4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import streamlit as st
import pandas as pd
import io
import numpy as np
import base64
from sklearn.impute import SimpleImputer
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt


st.set_page_config(page_title="CSV Data Cleaning Tool")

# Inject CSS that hides Streamlit's default chrome (hamburger menu + footer).
# The markup string is passed straight to st.markdown, so it is kept verbatim.
_hide_chrome_css = """
            <style>
            #MainMenu {visibility: hidden;}
            footer {visibility: hidden;}
            </style>
            """
st.markdown(_hide_chrome_css, unsafe_allow_html=True)

st.title("CSV Data Tool")

# Usage hint for the user (Thai): "pick whichever tool section you want to use".
st.markdown("กดเลือกหัว Tool ข้อที่ต้องการจะใช้ได้เลยนะจ๊ะ")

uploaded_files = st.file_uploader("Choose CSV files", type="csv", accept_multiple_files=True)

# One DataFrame per uploaded CSV, in upload order.
dataframes = []

if uploaded_files:
    for file in uploaded_files:
        file.seek(0)  # rewind: Streamlit re-runs may hand back a consumed buffer
        dataframes.append(pd.read_csv(file))

    st.markdown("---")
    st.markdown("Data Cleansing")
    st.markdown("---")

    # BUG FIX: the checkbox was labelled "Remove duplicate columns" but
    # drop_duplicates() removes duplicate ROWS; the label now matches behaviour.
    remove_duplicate_rows = st.checkbox("Remove duplicate rows", value=False)
    if remove_duplicate_rows:
        dataframes = [df.drop_duplicates() for df in dataframes]

    remove_empty_rows = st.checkbox("Remove empty rows", value=False)
    if remove_empty_rows:
        dataframes = [df.dropna(how="all") for df in dataframes]

    impute_mean = st.checkbox("Impute missing values with mean (for int and float columns)", value=False)
    if impute_mean:
        for i, df in enumerate(dataframes):
            numeric_cols = df.select_dtypes(include=['int', 'float']).columns
            # ROBUSTNESS: SimpleImputer raises on an empty column selection.
            if len(numeric_cols) > 0:
                imputer = SimpleImputer(strategy='mean')
                df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
            dataframes[i] = df

    impute_most_frequent = st.checkbox("Impute missing values with most frequent category (for categorical columns)", value=False)
    if impute_most_frequent:
        for i, df in enumerate(dataframes):
            categorical_cols = df.select_dtypes(include=['object']).columns
            if len(categorical_cols) > 0:
                imputer = SimpleImputer(strategy='most_frequent')
                df[categorical_cols] = imputer.fit_transform(df[categorical_cols])
            dataframes[i] = df

    # The sections below act on the LAST uploaded file — the original code did
    # the same implicitly through a leaked loop variable; now it is explicit.
    df = dataframes[-1]

    selected_out = st.selectbox("เลือก columns ที่จะดู Outlier", df.columns)
    # BUG FIX: z-scores are undefined for non-numeric columns — guard the dtype
    # instead of crashing on a string column.
    if selected_out and pd.api.types.is_numeric_dtype(df[selected_out]):
        col = df[selected_out]
        z_scores = np.abs((col - col.mean()) / col.std())
        threshold = 3  # conventional |z| cutoff for outliers
        outliers = df[z_scores > threshold]
        st.write("Outliers:")
        st.write(outliers[selected_out])
    elif selected_out:
        st.write("Selected column is not numeric; outlier detection skipped.")

    for cleaned in dataframes:
        st.dataframe(cleaned)

    st.markdown("---")
    st.markdown("Data transform")
    st.markdown("---")

    selected_values = st.multiselect("เลือกค่าจากคอลัมน์", df.columns)

    convert_to_String = st.checkbox("convert columns to String", value=False)
    convert_to_float = st.checkbox("convert columns to Float", value=False)
    # ROBUSTNESS: skip the cast when no columns are selected.
    if convert_to_String and selected_values:
        df[selected_values] = df[selected_values].astype(str)
    if convert_to_float and selected_values:
        df[selected_values] = df[selected_values].astype(float)

    onehot = st.selectbox("เลือก columns ที่จะ Encoder", df.columns)
    if onehot:
        df = pd.get_dummies(df, columns=[onehot])

    st.markdown("---")
    st.markdown("Distribution")
    st.markdown("---")

    norm = st.multiselect("เลือก columns ที่จะ Scale Data โดยการใช้ Mapping", df.columns)
    if norm:
        # NOTE(review): log is undefined for values <= 0 — such cells become
        # NaN/-inf; confirm inputs are strictly positive.
        df[norm] = df[norm].apply(np.log)

    qq = st.selectbox("QQplot", df.columns)
    # BUG FIX: the original grouped by a hard-coded 'team' column, raising
    # KeyError for any file without one; plot the selected column directly.
    if qq and pd.api.types.is_numeric_dtype(df[qq]):
        fig, ax = plt.subplots()
        stats.probplot(df[qq].dropna(), dist="norm", plot=ax)
        st.pyplot(fig)

    # BUG FIX: the transformed frame was never written back, so the download
    # section exported untransformed data; persist it.
    dataframes[-1] = df

    show_dataframes = st.checkbox("Show DataFrames", value=True)
    if show_dataframes:
        for i, frame in enumerate(dataframes):
            st.write(f"DataFrame {i + 1}")
            st.dataframe(frame)

    
# Offer each cleaned frame as a base64 data-URI download link.
if st.button("Download cleaned data"):
    for idx, frame in enumerate(dataframes, start=1):
        csv_text = frame.to_csv(index=False)
        encoded = base64.b64encode(csv_text.encode()).decode()
        link = (
            f'<a href="data:file/csv;base64,{encoded}" '
            f'download="cleaned_data_{idx}.csv">Download cleaned_data_{idx}.csv</a>'
        )
        st.markdown(link, unsafe_allow_html=True)


# Page footer: blank line, rule, blank line.
st.markdown("")
st.markdown("---")
st.markdown("")