Spaces:
Sleeping
Sleeping
File size: 3,707 Bytes
d2e9523 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
# app.py
import streamlit as st
import pandas as pd
from io import StringIO, BytesIO
import os
from data_clean_final import clean_data, display_llm_report
import tempfile
# Page metadata and layout, followed by the visible page title.
st.set_page_config(
    page_title="LLM Data Cleaner",
    page_icon=":bar_chart:",
    layout="wide",
)
st.title("LLM-Powered Data Cleaner")

# Introductory copy rendered as Markdown directly under the title.
intro_markdown = """
Upload a CSV, TSV or Excel file, and we'll clean it up for you using AI-augmented techniques.
The system will automatically:
- Fix formatting issues
- Handle missing values
- Standardize data entries
- Provide AI-powered suggestions for data improvements
Then, you can download the cleaned data for your analysis.
"""
st.markdown(intro_markdown)

# Upload widget; the `type` argument lists the extensions offered to the user.
accepted_extensions = ["csv", "tsv", "xlsx"]
uploaded_file = st.file_uploader("Choose a file", type=accepted_extensions)
# Main pipeline: runs only once the user has supplied a file. Steps:
#   1. spool the upload to a temporary file and run clean_data() on its path,
#   2. preview the first rows of the original and the cleaned data,
#   3. show size / null-ratio metrics and the LLM report,
#   4. offer the cleaned data for download in the same format as the upload.
if uploaded_file is not None:
    # getvalue() returns the complete payload independent of the current
    # stream position, whereas read() only returns what is left after it.
    file_bytes = uploaded_file.getvalue()
    file_name = uploaded_file.name
    file_type = os.path.splitext(file_name)[1].lower()

    # Clean data and get LLM suggestions.
    cleaning_error = None
    with st.spinner("Cleaning your data..."):
        # clean_data() is called with a path, so the bytes are written to a
        # named temporary file first. delete=False keeps the file on disk after
        # the handle is closed; the finally clause removes it explicitly.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_type)
        temp_file_path = temp_file.name
        try:
            with temp_file:
                temp_file.write(file_bytes)
            cleaned_df, suggestions = clean_data(temp_file_path)
        except Exception as exc:
            # UI boundary: remember the failure and report it once the spinner
            # has closed, in the same style as the preview error further down.
            cleaning_error = exc
        finally:
            # Clean up the temporary file whether or not cleaning succeeded.
            if os.path.exists(temp_file_path):
                os.unlink(temp_file_path)

    if cleaning_error is not None:
        st.error(f"Error cleaning data: {cleaning_error}")
        # Nothing below can work without cleaned_df, so end this script run.
        st.stop()
    st.success("Data cleaned successfully!")

    # Show original data (first 10 rows), parsed straight from the upload.
    st.subheader("Original Data")
    try:
        if file_type == ".tsv":
            original_df = pd.read_csv(BytesIO(file_bytes), sep="\t")
        elif file_type == ".xlsx":
            original_df = pd.read_excel(BytesIO(file_bytes))
        else:
            original_df = pd.read_csv(BytesIO(file_bytes))
        st.dataframe(original_df.head(10), use_container_width=True)
    except Exception as e:
        # The preview is best-effort: a parse failure here must not block the
        # cleaned output below.
        st.error(f"Error loading original data: {str(e)}")

    # Show cleaned data (first 10 rows).
    st.subheader("Cleaned Data Preview")
    st.dataframe(cleaned_df.head(10), use_container_width=True)

    # Data statistics.
    row_count, column_count = cleaned_df.shape
    total_cells = row_count * column_count
    if total_cells:
        null_percentage = round(
            (cleaned_df.isnull().sum().sum() / total_cells) * 100, 2
        )
    else:
        # An empty frame has no cells; report 0 instead of dividing by zero.
        null_percentage = 0.0

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Rows", len(cleaned_df))
    with col2:
        st.metric("Total Columns", len(cleaned_df.columns))
    with col3:
        st.metric("Null Values (%)", f"{null_percentage}%")

    # Show suggestions from LLM.
    display_llm_report(suggestions)

    # Convert to a downloadable payload in the same format as the upload.
    if file_type == ".tsv":
        cleaned_file = cleaned_df.to_csv(index=False, sep="\t").encode("utf-8")
        download_name = "cleaned_data.tsv"
        # IANA-registered media type for tab-separated values.
        mime_type = "text/tab-separated-values"
    elif file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine="xlsxwriter") as writer:
            cleaned_df.to_excel(writer, index=False)
        cleaned_file = output.getvalue()
        download_name = "cleaned_data.xlsx"
        mime_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    else:
        # ".csv", and the fallback for any other extension: mirrors the read
        # path above, which also treats everything else as CSV, and guarantees
        # the three download variables are always bound.
        cleaned_file = cleaned_df.to_csv(index=False).encode("utf-8")
        download_name = "cleaned_data.csv"
        mime_type = "text/csv"

    # Download button.
    st.download_button(
        label="📁 Download Cleaned Data",
        data=cleaned_file,
        file_name=download_name,
        mime=mime_type,
    )