# NOTE(review): the three lines below were a scraped Hugging Face "Spaces:
# Sleeping" status banner (page residue, not source code); preserved here
# as a comment so the file remains valid Python.
"""Streamlit front-end for the Smart Data Cleaning Agent.

Pipeline: upload CSV -> analyze -> LLM cleaning plan -> apply plan ->
EDA insights -> LLM-recommended plots -> optional PDF report.
All intermediate artifacts live under /tmp (the only writable path on
Hugging Face Spaces containers).
"""
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

from analyze import analyze_csv
from plan import generate_cleaning_plan
from execute import execute_plan
from insight import generate_insights
from visual_insight import generate_visual_plan
from report import ReportBuilder
from transformers import AutoProcessor, AutoModelForImageTextToText
from transformers import AutoTokenizer

# Temp-safe paths — Spaces containers only guarantee write access under /tmp.
input_path = "/tmp/input.csv"
output_path = "/tmp/output.csv"
report_path = "/tmp/final_report.pdf"
charts_dir = "/tmp/charts"
os.makedirs(charts_dir, exist_ok=True)

# Hugging Face auth/cache settings.
# NOTE(review): hf_token and cache_dir are defined but never used in this
# file — presumably consumed by the imported modules or left over from a
# model-access check; verify before removing.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
cache_dir = "/tmp/hf_cache"

st.set_page_config(page_title="Smart Data Cleaning Agent", layout="wide")
st.title("🧠 Smart Data Cleaning Agent")

uploaded_file = st.file_uploader("📁 Upload a CSV file", type=["csv"])

if uploaded_file:
    # Persist the upload to /tmp so downstream steps can re-read it by path.
    with open(input_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    df = pd.read_csv(input_path)

    st.subheader("📊 Original Data Preview")
    st.dataframe(df.head())

    with st.spinner("🔍 Analyzing CSV..."):
        analysis = analyze_csv(input_path)

    with st.spinner("🧼 Generating Cleaning Plan..."):
        cleaning_plan, cleaning_summary = generate_cleaning_plan(analysis)
    st.subheader("🧹 Cleaning Plan")
    st.json(cleaning_plan)
    st.markdown("### ✅ Cleaning Summary")
    st.markdown(cleaning_summary)

    with st.spinner("🧪 Applying cleaning..."):
        cleaned_df = execute_plan(df.copy(), cleaning_plan)
        cleaned_df.to_csv(output_path, index=False)
    st.subheader("🧼 Cleaned Data Preview")
    st.dataframe(cleaned_df.head())
    st.download_button(
        "⬇️ Download Cleaned CSV",
        cleaned_df.to_csv(index=False),
        file_name="cleaned.csv",
    )

    with st.spinner("🧠 Deriving insights..."):
        insights = generate_insights(analysis["columns"])
    st.subheader("📈 EDA Insights")
    st.text(insights)

    with st.spinner("📊 Generating recommended plots..."):
        visuals = generate_visual_plan(analysis["columns"])
    for vis in visuals:
        st.markdown(f"#### {vis['title']}")
        st.markdown(vis["description"])
        try:
            # Redirect any relative chart paths the model emitted into /tmp.
            safe_code = vis["code"].replace("charts/", f"{charts_dir}/")
            # SECURITY: exec() of model-generated code is inherently unsafe —
            # the restricted globals dict limits what names are reachable but
            # does NOT sandbox execution. Acceptable only because the code is
            # produced by this app's own planner, never by the end user.
            exec(safe_code, {"df": cleaned_df, "plt": plt, "sns": sns, "os": os})
            st.pyplot(plt.gcf())
            plt.clf()  # reset the shared figure so plots don't stack up
        except Exception as e:
            st.error(f"❌ Failed to render: {e}")

    if st.button("📄 Generate PDF Report"):
        report = ReportBuilder(output_path=report_path)
        report.add_title("📊 Smart Data Cleaning Report")
        report.add_section("Cleaning Summary", cleaning_summary)
        report.add_section("EDA Insights", insights)
        for vis in visuals:
            # Recover the chart file path from the generated savefig() call
            # so the saved image can be embedded in the PDF.
            if "savefig('" in vis["code"]:
                path = vis["code"].split("savefig('")[-1].split("')")[0]
                if not path.startswith("/"):
                    # Relative path: the chart was redirected into charts_dir.
                    path = os.path.join(charts_dir, os.path.basename(path))
                report.add_plot(path, vis["description"])
        report.save()
        with open(report_path, "rb") as f:
            st.download_button(
                "⬇️ Download PDF Report", f, file_name="smart_data_report.pdf"
            )