import streamlit as st
import pandas as pd
import io
import numpy as np
import base64
from sklearn.impute import SimpleImputer
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
st.set_page_config(page_title="CSV Data Cleaning Tool")

# Hide Streamlit's default hamburger menu and footer chrome.
hide_streamlit_style = """
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.title("CSV Data Tool")
st.markdown("กดเลือกหัว Tool ข้อที่ต้องการจะใช้ได้เลยนะจ๊ะ")

uploaded_files = st.file_uploader("Choose CSV files", type="csv", accept_multiple_files=True)

dataframes = []
if uploaded_files:
    for file in uploaded_files:
        file.seek(0)  # rewind: Streamlit re-runs the script and may reuse the buffer
        dataframes.append(pd.read_csv(file))

# Everything below requires at least one loaded dataframe; guarding here
# prevents a NameError/IndexError on first render before any upload.
if dataframes:
    st.markdown("---")
    st.markdown("Data Cleansing")
    st.markdown("---")

    # BUG FIX: drop_duplicates() removes duplicate ROWS, but the original
    # label said "columns" — label corrected to match the actual behavior.
    remove_duplicate_rows = st.checkbox("Remove duplicate rows", value=False)
    if remove_duplicate_rows:
        for i, df in enumerate(dataframes):
            dataframes[i] = df.drop_duplicates()

    remove_empty_rows = st.checkbox("Remove empty rows", value=False)
    if remove_empty_rows:
        for i, df in enumerate(dataframes):
            dataframes[i] = df.dropna(how="all")

    impute_mean = st.checkbox("Impute missing values with mean (for int and float columns)", value=False)
    if impute_mean:
        for i, df in enumerate(dataframes):
            numeric_cols = df.select_dtypes(include=['int', 'float']).columns
            if len(numeric_cols):  # SimpleImputer raises on a frame with no columns
                imputer = SimpleImputer(strategy='mean')
                df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
            dataframes[i] = df

    impute_most_frequent = st.checkbox("Impute missing values with most frequent category (for categorical columns)", value=False)
    if impute_most_frequent:
        for i, df in enumerate(dataframes):
            categorical_cols = df.select_dtypes(include=['object']).columns
            if len(categorical_cols):  # SimpleImputer raises on a frame with no columns
                imputer = SimpleImputer(strategy='most_frequent')
                df[categorical_cols] = imputer.fit_transform(df[categorical_cols])
            dataframes[i] = df

    # The single-frame tools below operate on the LAST uploaded dataframe.
    # (This mirrors the original behavior, where the loop variable `df`
    # leaked out of the imputation loops — now it is explicit.)
    df = dataframes[-1]

    # Outlier inspection via z-score.  BUG FIX: only numeric columns are
    # offered — the arithmetic below raises TypeError on object columns.
    numeric_columns = df.select_dtypes(include=['int', 'float']).columns
    selected_out = st.selectbox("เลือก columns ที่จะดู Outlier", numeric_columns)
    if selected_out:
        col_std = df[selected_out].std()
        if col_std:  # avoid division by zero on constant columns
            z_scores = np.abs((df[selected_out] - df[selected_out].mean()) / col_std)
            threshold = 3
            outliers = df[z_scores > threshold]
            st.write("Outliers:")
            st.write(outliers[selected_out])

    for i, frame in enumerate(dataframes):
        st.dataframe(frame)

    st.markdown("---")
    st.markdown("Data transform")
    st.markdown("---")

    selected_values = st.multiselect("เลือกค่าจากคอลัมน์", df.columns)
    convert_to_String = st.checkbox("convert columns to String", value=False)
    convert_to_float = st.checkbox("convert columns to Float", value=False)
    if convert_to_String and selected_values:
        df[selected_values] = df[selected_values].astype(str)
    if convert_to_float and selected_values:
        df[selected_values] = df[selected_values].astype(float)

    onehot = st.selectbox("เลือก columns ที่จะ Encoder", df.columns)
    if onehot:
        # BUG FIX: get_dummies returns a NEW frame; the original rebound the
        # local `df` without writing it back, so the encoding was lost for
        # display/download.  Persist it into the list.
        df = pd.get_dummies(df, columns=[onehot])
        dataframes[-1] = df

    st.markdown("---")
    st.markdown("Distribution")
    st.markdown("---")

    norm = st.multiselect("เลือก columns ที่จะ Scale Data โดยการใช้ Mapping", df.columns)
    if norm:
        # NOTE: log is only defined for positive values; non-positive entries
        # become -inf/NaN, same as the original behavior.
        df[norm] = df[norm].apply(np.log)

    qq = st.selectbox("QQplot", df.columns)
    if qq:
        # BUG FIX: the original grouped by a hard-coded 'team' column, which
        # crashed with KeyError on any CSV lacking it.  Plot the selected
        # column directly (NaNs dropped — probplot cannot handle them).
        fig, ax = plt.subplots()
        stats.probplot(df[qq].dropna(), dist="norm", plot=ax)
        st.pyplot(fig)

    show_dataframes = st.checkbox("Show DataFrames", value=True)
    if show_dataframes:
        for i, frame in enumerate(dataframes):
            st.write(f"DataFrame {i + 1}")
            st.dataframe(frame)

    # Emit one base64 data-URI download link per cleaned dataframe.
    if st.button("Download cleaned data"):
        for i, frame in enumerate(dataframes):
            csv = frame.to_csv(index=False)
            b64 = base64.b64encode(csv.encode()).decode()
            href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data_{i + 1}.csv">Download cleaned_data_{i + 1}.csv</a>'
            st.markdown(href, unsafe_allow_html=True)

st.markdown("")
st.markdown("---")
st.markdown("")