# Smart Dataset Cleaner — a Streamlit app to upload, clean, visualize,
# and download tabular datasets (CSV / Excel / JSON / PDF).
# Standard library
import os

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

# tabula-py is optional: when it (or its Java backend) cannot be imported,
# PDF support is disabled gracefully by leaving read_pdf as None.
try:
    from tabula import read_pdf
except Exception:  # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
    read_pdf = None
# ----------- File Upload Handler ----------- #
def file_upload(file):
    """Load an uploaded file into a DataFrame based on its extension.

    Supports CSV, Excel (.xls/.xlsx), JSON, and — when tabula is
    available — PDF. On an unsupported type or a parsing error, the
    problem is reported via Streamlit and None is returned.
    """
    extension = os.path.splitext(file.name)[1].lower()
    # Dispatch table for the formats pandas can read directly.
    loaders = {
        '.csv': pd.read_csv,
        '.xls': pd.read_excel,
        '.xlsx': pd.read_excel,
        '.json': pd.read_json,
    }
    try:
        if extension in loaders:
            return loaders[extension](file)
        if extension == '.pdf' and read_pdf:
            # tabula returns a list of tables; keep the first one.
            return read_pdf(file, pages='all', multiple_tables=False)[0]
        st.error("β Unsupported file type or missing dependencies for PDF.")
        return None
    except Exception as e:
        st.error(f"β οΈ Error loading file: {e}")
        return None
# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    """Report per-column null counts, then return *df* with all
    null-containing rows dropped (the input is left untouched)."""
    st.info("π Null values before cleaning:")
    st.write(df.isnull().sum())
    result = df.dropna()
    st.success("β Null values removed.")
    return result
def replace_nulls(df, value):
    """Report per-column null counts, then return a copy of *df* with
    every null cell replaced by *value*."""
    st.info("π Null values before replacement:")
    st.write(df.isnull().sum())
    filled = df.fillna(value)
    st.success("β Null values replaced.")
    return filled
def remove_noise(df):
    """Strip a small set of common English stop words from every string
    cell of *df*; non-string cells pass through unchanged."""
    noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}

    def strip_noise(cell):
        # Only operate on text; numbers/NaN/etc. are returned as-is.
        if not isinstance(cell, str):
            return cell
        kept = [w for w in cell.split() if w.lower() not in noise_words]
        return ' '.join(kept)

    result = df.applymap(strip_noise)
    st.success("β Noise words removed.")
    return result
def remove_duplicates(df):
    """Return *df* with duplicate rows removed, announcing the result."""
    deduped = df.drop_duplicates()
    st.success("β Duplicate rows removed.")
    return deduped
def convert_column_dtype(df, column, dtype):
    """Cast *column* of *df* to *dtype* in place, reporting the outcome.

    Returns *df* either way so callers can keep using their reference.
    NOTE: mutates the passed-in DataFrame on success.
    """
    try:
        df[column] = df[column].astype(dtype)
        st.success(f"β Converted '{column}' to {dtype}")
    except Exception as e:
        # Conversion failures (e.g. non-numeric text to int) are reported,
        # and the DataFrame is returned unchanged.
        st.error(f"β οΈ Conversion error: {e}")
    return df
def detect_outliers(df, column):
    """Return the rows of *df* whose *column* value falls outside the
    Tukey fences (1.5 * IQR beyond the quartiles).

    A non-numeric *column* yields a warning and an empty DataFrame.
    """
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns
    if column not in numeric_columns:
        st.warning("β οΈ Column must be numeric to detect outliers.")
        return pd.DataFrame()
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    low_fence = q1 - 1.5 * iqr
    high_fence = q3 + 1.5 * iqr
    outliers = df[(df[column] < low_fence) | (df[column] > high_fence)]
    st.write(f"π Found {len(outliers)} outliers in column '{column}'")
    return outliers
def plot_distributions(df):
    """Render one histogram (with KDE overlay) per numeric column of *df*."""
    st.subheader("π Data Distributions")
    for col in df.select_dtypes(include=['float', 'int']).columns:
        fig, ax = plt.subplots()
        # NaNs are dropped so the KDE fit does not choke on missing values.
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        st.pyplot(fig)
def plot_missing_data(df):
    """Render a heatmap of *df*'s null mask so gaps are visible at a glance."""
    st.subheader("π Missing Data Heatmap")
    fig, ax = plt.subplots()
    # Bug fix: draw explicitly onto the axes belonging to `fig`. The original
    # omitted `ax=ax`, relying on matplotlib's implicit "current axes", which
    # is not guaranteed to be the figure handed to st.pyplot.
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
    st.pyplot(fig)
def main():
    """Streamlit entry point: upload, clean, visualize, and download a dataset."""
    st.set_page_config(page_title="π§Ή Smart Dataset Cleaner", layout="wide")
    st.title("π§Ή Smart Dataset Cleaner")
    st.caption("β¨ Clean, analyze, and preprocess your dataset with ease")
    uploaded_file = st.file_uploader("π Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])
    # Guard clauses instead of nested ifs: bail out early when there is
    # nothing to work on.
    if not uploaded_file:
        st.warning("β οΈ Please upload a supported file to begin.")
        return
    df = file_upload(uploaded_file)
    if df is None:
        return

    st.subheader("π Original Dataset Preview")
    st.dataframe(df.head())

    st.markdown("## π§° Data Cleaning Tools")
    # NOTE(review): Streamlit reruns this script top-to-bottom on every
    # interaction, so the `df = ...` rebindings below only last for the
    # current run; persisting them across clicks would require
    # st.session_state.
    with st.expander("β Replace Null Values"):
        fill_value = st.text_input("Enter value to replace nulls with:")
        if st.button("Replace Nulls"):
            df = replace_nulls(df, fill_value)
            st.dataframe(df)
    if st.button("π§Ό Remove Empty Rows"):
        df = remove_empty_rows(df)
        st.dataframe(df)
    if st.button("π§Ή Remove Duplicate Rows"):
        df = remove_duplicates(df)
        st.dataframe(df)
    if st.button("π Remove Noise Words from Text"):
        df = remove_noise(df)
        st.dataframe(df)
    with st.expander("π Convert Column DataType"):
        selected_col = st.selectbox("Select column", df.columns)
        dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
        if st.button("Convert"):
            df = convert_column_dtype(df, selected_col, dtype)
            st.dataframe(df)

    st.markdown("## π Data Visualizations")
    if st.checkbox("π Show Summary Stats"):
        st.write(df.describe(include='all'))
    if st.checkbox("π Plot Column Distributions"):
        plot_distributions(df)
    if st.checkbox("π Show Missing Data Heatmap"):
        plot_missing_data(df)

    st.markdown("## π¨ Outlier Detection")
    outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns)
    if st.button("Detect Outliers"):
        outliers = detect_outliers(df, outlier_col)
        if not outliers.empty:
            st.write(outliers)

    st.markdown("## πΎ Download Cleaned Dataset")
    file_name = st.text_input("Filename:", "cleaned_dataset.csv")
    # Bug fix: the original wrapped st.download_button inside
    # `if st.button("Download CSV")`, which forced two clicks and made the
    # download control vanish on the next rerun. st.download_button is
    # itself a button, so render it directly.
    st.download_button("π Download", df.to_csv(index=False), file_name, mime="text/csv")


if __name__ == "__main__":
    main()