# Smart Dataset Cleaner — a Streamlit app to upload, clean, visualize,
# and download tabular datasets (CSV / Excel / JSON / PDF).
# Standard library
import os

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st

# tabula-py is optional: when it (or its Java backend) cannot be imported,
# PDF support is disabled gracefully by leaving read_pdf as None.
try:
    from tabula import read_pdf
except Exception:  # was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit
    read_pdf = None
# ----------- File Upload Handler ----------- #
def file_upload(file):
    """Load an uploaded file into a DataFrame based on its extension.

    Supports CSV, Excel (.xls/.xlsx), JSON, and — when tabula is
    available — PDF. On an unsupported type or a parsing error, the
    problem is reported via Streamlit and None is returned.
    """
    extension = os.path.splitext(file.name)[1].lower()
    # Dispatch table for the formats pandas can read directly.
    loaders = {
        '.csv': pd.read_csv,
        '.xls': pd.read_excel,
        '.xlsx': pd.read_excel,
        '.json': pd.read_json,
    }
    try:
        if extension in loaders:
            return loaders[extension](file)
        if extension == '.pdf' and read_pdf:
            # tabula returns a list of tables; keep the first one.
            return read_pdf(file, pages='all', multiple_tables=False)[0]
        st.error("β Unsupported file type or missing dependencies for PDF.")
        return None
    except Exception as e:
        st.error(f"β οΈ Error loading file: {e}")
        return None
# ----------- Cleaning Functions ----------- #
def remove_empty_rows(df):
    """Report per-column null counts, then return *df* with all
    null-containing rows dropped (the input is left untouched)."""
    st.info("π Null values before cleaning:")
    st.write(df.isnull().sum())
    result = df.dropna()
    st.success("β Null values removed.")
    return result
def replace_nulls(df, value):
    """Report per-column null counts, then return a copy of *df* with
    every null cell replaced by *value*."""
    st.info("π Null values before replacement:")
    st.write(df.isnull().sum())
    filled = df.fillna(value)
    st.success("β Null values replaced.")
    return filled
def remove_noise(df):
    """Strip a small set of common English stop words from every string
    cell of *df*; non-string cells pass through unchanged."""
    noise_words = {'the', 'is', 'an', 'a', 'in', 'of', 'to'}

    def strip_noise(cell):
        # Only operate on text; numbers/NaN/etc. are returned as-is.
        if not isinstance(cell, str):
            return cell
        kept = [w for w in cell.split() if w.lower() not in noise_words]
        return ' '.join(kept)

    result = df.applymap(strip_noise)
    st.success("β Noise words removed.")
    return result
def remove_duplicates(df):
    """Return *df* with duplicate rows removed, announcing the result."""
    deduped = df.drop_duplicates()
    st.success("β Duplicate rows removed.")
    return deduped
def convert_column_dtype(df, column, dtype):
    """Cast *column* of *df* to *dtype* in place, reporting the outcome.

    Returns *df* either way so callers can keep using their reference.
    NOTE: mutates the passed-in DataFrame on success.
    """
    try:
        df[column] = df[column].astype(dtype)
        st.success(f"β Converted '{column}' to {dtype}")
    except Exception as e:
        # Conversion failures (e.g. non-numeric text to int) are reported,
        # and the DataFrame is returned unchanged.
        st.error(f"β οΈ Conversion error: {e}")
    return df
def detect_outliers(df, column):
    """Return the rows of *df* whose *column* value falls outside the
    Tukey fences (1.5 * IQR beyond the quartiles).

    A non-numeric *column* yields a warning and an empty DataFrame.
    """
    numeric_columns = df.select_dtypes(include=['float', 'int']).columns
    if column not in numeric_columns:
        st.warning("β οΈ Column must be numeric to detect outliers.")
        return pd.DataFrame()
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    low_fence = q1 - 1.5 * iqr
    high_fence = q3 + 1.5 * iqr
    outliers = df[(df[column] < low_fence) | (df[column] > high_fence)]
    st.write(f"π Found {len(outliers)} outliers in column '{column}'")
    return outliers
def plot_distributions(df):
    """Render one histogram (with KDE overlay) per numeric column of *df*."""
    st.subheader("π Data Distributions")
    for col in df.select_dtypes(include=['float', 'int']).columns:
        fig, ax = plt.subplots()
        # NaNs are dropped so the KDE fit does not choke on missing values.
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        st.pyplot(fig)
def plot_missing_data(df):
    """Render a heatmap of *df*'s null mask so gaps are visible at a glance."""
    st.subheader("π Missing Data Heatmap")
    fig, ax = plt.subplots()
    # Bug fix: draw explicitly onto the axes belonging to `fig`. The original
    # omitted `ax=ax`, relying on matplotlib's implicit "current axes", which
    # is not guaranteed to be the figure handed to st.pyplot.
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis', ax=ax)
    st.pyplot(fig)
def main():
    """Streamlit entry point: upload, clean, visualize, and download a dataset."""
    st.set_page_config(page_title="π§Ή Smart Dataset Cleaner", layout="wide")
    st.title("π§Ή Smart Dataset Cleaner")
    st.caption("β¨ Clean, analyze, and preprocess your dataset with ease")
    uploaded_file = st.file_uploader("π Upload your dataset", type=["csv", "xlsx", "xls", "json", "pdf"])
    # Guard clauses instead of nested ifs: bail out early when there is
    # nothing to work on.
    if not uploaded_file:
        st.warning("β οΈ Please upload a supported file to begin.")
        return
    df = file_upload(uploaded_file)
    if df is None:
        return

    st.subheader("π Original Dataset Preview")
    st.dataframe(df.head())

    st.markdown("## π§° Data Cleaning Tools")
    # NOTE(review): Streamlit reruns this script top-to-bottom on every
    # interaction, so the `df = ...` rebindings below only last for the
    # current run; persisting them across clicks would require
    # st.session_state.
    with st.expander("β Replace Null Values"):
        fill_value = st.text_input("Enter value to replace nulls with:")
        if st.button("Replace Nulls"):
            df = replace_nulls(df, fill_value)
            st.dataframe(df)
    if st.button("π§Ό Remove Empty Rows"):
        df = remove_empty_rows(df)
        st.dataframe(df)
    if st.button("π§Ή Remove Duplicate Rows"):
        df = remove_duplicates(df)
        st.dataframe(df)
    if st.button("π Remove Noise Words from Text"):
        df = remove_noise(df)
        st.dataframe(df)
    with st.expander("π Convert Column DataType"):
        selected_col = st.selectbox("Select column", df.columns)
        dtype = st.selectbox("Select target type", ["int", "float", "str", "bool"])
        if st.button("Convert"):
            df = convert_column_dtype(df, selected_col, dtype)
            st.dataframe(df)

    st.markdown("## π Data Visualizations")
    if st.checkbox("π Show Summary Stats"):
        st.write(df.describe(include='all'))
    if st.checkbox("π Plot Column Distributions"):
        plot_distributions(df)
    if st.checkbox("π Show Missing Data Heatmap"):
        plot_missing_data(df)

    st.markdown("## π¨ Outlier Detection")
    outlier_col = st.selectbox("Select numeric column", df.select_dtypes(include=['float', 'int']).columns)
    if st.button("Detect Outliers"):
        outliers = detect_outliers(df, outlier_col)
        if not outliers.empty:
            st.write(outliers)

    st.markdown("## πΎ Download Cleaned Dataset")
    file_name = st.text_input("Filename:", "cleaned_dataset.csv")
    # Bug fix: the original wrapped st.download_button inside
    # `if st.button("Download CSV")`, which forced two clicks and made the
    # download control vanish on the next rerun. st.download_button is
    # itself a button, so render it directly.
    st.download_button("π Download", df.to_csv(index=False), file_name, mime="text/csv")


if __name__ == "__main__":
    main()