Spaces:
Runtime error
Runtime error
File size: 5,920 Bytes
5464c58 ac98ac9 5464c58 ac98ac9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import pandas as pd
import numpy as np
import streamlit as st
import os
import matplotlib.pyplot as plt
import seaborn as sns
# tabula-py is an optional dependency. When it cannot be imported, read_pdf
# is set to None and callers treat PDF support as unavailable.
try:
    import tabula
    from tabula import read_pdf
except Exception:
    # Deliberately broad so any import-time failure just disables PDF support,
    # but (unlike a bare ``except:``) KeyboardInterrupt/SystemExit propagate.
    read_pdf = None
# ----------- File Upload Handler ----------- #
def file_upload(file):
    """Load an uploaded file into a pandas DataFrame.

    The parser is chosen from the extension of ``file.name``
    (case-insensitive): ``.csv``, ``.xls``/``.xlsx``, ``.json`` or ``.pdf``.
    PDF parsing is attempted only when the module-level ``read_pdf`` is
    truthy.

    Args:
        file: File-like object exposing a ``name`` attribute (``main`` passes
            the value returned by ``st.file_uploader``).

    Returns:
        The loaded DataFrame, or ``None`` when the extension is unsupported,
        PDF support is unavailable, or the parser raised. In each of those
        cases an error is shown with ``st.error``.
    """
    file_ext = os.path.splitext(file.name)[1].lower()
    try:
        if file_ext == '.csv':
            return pd.read_csv(file)
        if file_ext in ('.xls', '.xlsx'):
            return pd.read_excel(file)
        if file_ext == '.json':
            return pd.read_json(file)
        if file_ext == '.pdf' and read_pdf:
            tables = read_pdf(file, pages='all', multiple_tables=False)
            # Report an empty result explicitly instead of letting ``[0]``
            # surface as an opaque "list index out of range".
            if len(tables) == 0:
                raise ValueError("no tables found in PDF")
            return tables[0]
    except Exception as e:
        st.error(f"β οΈ Error loading file: {e}")
        return None
    st.error("β Unsupported file type or missing dependencies for PDF.")
    return None
# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    """Drop every row that contains at least one null value.

    Displays the per-column null counts of ``df`` (taken before dropping)
    and then a success message in the Streamlit app.

    Args:
        df: DataFrame to clean.

    Returns:
        The result of ``df.dropna()``; ``df`` itself is not modified.
    """
    st.info("π Null values before cleaning:")
    st.write(df.isnull().sum())
    df_cleaned = df.dropna()
    st.success("✅ Null values removed.")
    return df_cleaned
def replace_nulls(df, value):
    """Replace every null cell of ``df`` with ``value``.

    Displays the per-column null counts of ``df`` (taken before filling)
    and then a success message in the Streamlit app.

    Args:
        df: DataFrame to clean.
        value: Replacement passed straight to ``DataFrame.fillna``.

    Returns:
        The result of ``df.fillna(value)``; ``df`` itself is not modified.
    """
    st.info("π Null values before replacement:")
    st.write(df.isnull().sum())
    df_filled = df.fillna(value)
    st.success("✅ Null values replaced.")
    return df_filled
def remove_noise(df):
    """Strip a fixed set of noise words from every string cell.

    Each string cell is split on whitespace, words whose lowercase form is
    in the noise set are dropped, and the remaining words are re-joined with
    single spaces. Non-string cells are returned unchanged.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame with the cleaned cells; a success message is shown
        in the Streamlit app.
    """
    noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}

    def clean_text(val):
        if isinstance(val, str):
            return ' '.join(
                word for word in val.split() if word.lower() not in noise_words
            )
        return val

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas
    # 2.1+); fall back to applymap only where map does not exist yet.
    elementwise = df.map if hasattr(df, "map") else df.applymap
    df_cleaned = elementwise(clean_text)
    st.success("✅ Noise words removed.")
    return df_cleaned
def remove_duplicates(df):
    """Drop duplicate rows from ``df``.

    Args:
        df: DataFrame to clean.

    Returns:
        The result of ``df.drop_duplicates()``; ``df`` itself is not
        modified. A success message is shown in the Streamlit app.
    """
    df_deduped = df.drop_duplicates()
    st.success("✅ Duplicate rows removed.")
    return df_deduped
def convert_column_dtype(df, column, dtype):
    """Convert one column of ``df`` to ``dtype`` in place.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Label of the column to convert.
        dtype: Target type, passed straight to ``Series.astype``.

    Returns:
        The same ``df`` object. If the conversion raises, the error is shown
        with ``st.error`` and ``df`` is returned as it was, because the
        exception occurs before the column is assigned.
    """
    try:
        df[column] = df[column].astype(dtype)
        st.success(f"✅ Converted '{column}' to {dtype}")
    except Exception as e:
        st.error(f"β οΈ Conversion error: {e}")
    return df
def detect_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies more than 1.5 * IQR below the first
    quartile or above the third quartile of the column.

    Args:
        df: DataFrame to inspect.
        column: Label of the column to test; it must be among the columns
            selected by ``select_dtypes(include=['float', 'int'])``.

    Returns:
        The outlier rows, or an empty DataFrame (after an ``st.warning``)
        when the column is not among those numeric columns. The number of
        outliers found is reported with ``st.write``.
    """
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns
    if column not in numeric_columns:
        st.warning("β οΈ Column must be numeric to detect outliers.")
        return pd.DataFrame()

    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    is_outlier = (values < low_fence) | (values > high_fence)
    outliers = df[is_outlier]
    st.write(f"π Found {len(outliers)} outliers in column '{column}'")
    return outliers
def plot_distributions(df):
    """Render a histogram with a KDE curve for each numeric column.

    Columns are those selected by ``select_dtypes(include=['float', 'int'])``;
    null values are dropped before plotting. Each figure is shown with
    ``st.pyplot``.

    Args:
        df: DataFrame whose numeric columns are plotted.
    """
    st.subheader("π Data Distributions")
    numeric_cols = df.select_dtypes(include=['float', 'int']).columns
    for col in numeric_cols:
        fig, ax = plt.subplots()
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        st.pyplot(fig)
        # pyplot keeps a reference to every figure it creates; close it once
        # rendered so figures do not pile up across Streamlit reruns.
        plt.close(fig)
def plot_missing_data(df):
    """Render a heatmap of the null mask of ``df``.

    Args:
        df: DataFrame whose ``isnull()`` mask is plotted (no colour bar,
            'viridis' colour map) and shown with ``st.pyplot``.
    """
    st.subheader("π Missing Data Heatmap")
    fig, ax = plt.subplots()
    # Draw on the axes created above rather than on the implicit current axes.
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
    st.pyplot(fig)
    # Release the figure so it does not accumulate across Streamlit reruns.
    plt.close(fig)
def main():
    """Run the Streamlit app: upload a dataset, clean it, explore it, export it.

    Streamlit re-executes this function on every widget interaction, and
    ``st.button`` is true only for the single run that follows its click.
    The working DataFrame is therefore kept in ``st.session_state`` so that
    each cleaning step builds on the previous ones and the CSV export
    contains the cleaned data instead of a freshly re-read upload.
    """
    st.set_page_config(page_title="π§Ή Smart Dataset Cleaner", layout="wide")
    st.title("π§Ή Smart Dataset Cleaner")
    st.caption("β¨ Clean, analyze, and preprocess your dataset with ease")
    uploaded_file = st.file_uploader(
        "π Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"]
    )

    # Session-state slots: identity of the loaded upload, the untouched frame
    # used for the preview, and the frame with all cleaning steps applied.
    source_slot = "_cleaner_source"
    original_slot = "_cleaner_original"
    working_slot = "_cleaner_working"

    if not uploaded_file:
        # Removing the upload discards any previous working data.
        for slot in (source_slot, original_slot, working_slot):
            st.session_state.pop(slot, None)
        st.warning("β οΈ Please upload a supported file to begin.")
        return

    # Parse the upload only when a different file arrives, not on every rerun.
    source_id = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get(source_slot) != source_id:
        loaded = file_upload(uploaded_file)
        if loaded is None:
            return
        st.session_state[source_slot] = source_id
        st.session_state[original_slot] = loaded
        # Copy so the in-place dtype conversion cannot alter the preview frame.
        st.session_state[working_slot] = loaded.copy()
    df = st.session_state[working_slot]

    def commit(cleaned):
        """Store ``cleaned`` as the working frame, display it and return it."""
        st.session_state[working_slot] = cleaned
        st.dataframe(cleaned)
        return cleaned

    st.subheader("π Original Dataset Preview")
    st.dataframe(st.session_state[original_slot].head())

    st.markdown("## π§° Data Cleaning Tools")
    with st.expander("β Replace Null Values"):
        fill_value = st.text_input("Enter value to replace nulls with:")
        if st.button("Replace Nulls"):
            df = commit(replace_nulls(df, fill_value))
    if st.button("π§Ό Remove Empty Rows"):
        df = commit(remove_empty_rows(df))
    if st.button("π§Ή Remove Duplicate Rows"):
        df = commit(remove_duplicates(df))
    if st.button("π Remove Noise Words from Text"):
        df = commit(remove_noise(df))
    with st.expander("π Convert Column DataType"):
        selected_col = st.selectbox("Select column", df.columns)
        dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
        if st.button("Convert"):
            df = commit(convert_column_dtype(df, selected_col, dtype))

    st.markdown("## π Data Visualizations")
    if st.checkbox("π Show Summary Stats"):
        st.write(df.describe(include='all'))
    if st.checkbox("π Plot Column Distributions"):
        plot_distributions(df)
    if st.checkbox("π Show Missing Data Heatmap"):
        plot_missing_data(df)

    st.markdown("## π¨ Outlier Detection")
    outlier_col = st.selectbox(
        "Select numeric column",
        df.select_dtypes(include=['float', 'int']).columns,
    )
    if st.button("Detect Outliers"):
        outliers = detect_outliers(df, outlier_col)
        if not outliers.empty:
            st.write(outliers)

    st.markdown("## πΎ Download Cleaned Dataset")
    file_name = st.text_input("Filename:", "cleaned_dataset.csv")
    if st.button("Download CSV"):
        st.download_button("π Download", df.to_csv(index=False), file_name, mime="text/csv")
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|