# Spaces: translators-will/LLM-Data-Cleaner (Sleeping)
# File size: 3,707 Bytes
# d2e9523

# app.py

import streamlit as st
import pandas as pd
from io import StringIO, BytesIO
import os
from data_clean_final import clean_data, display_llm_report
import tempfile

# Page-level settings: browser tab title, icon, and the wide layout.
st.set_page_config(
    page_title="LLM Data Cleaner",
    page_icon=":bar_chart:",
    layout="wide",
)

# Headline followed by the introductory blurb describing what the app does.
st.title("LLM-Powered Data Cleaner")

_intro_markdown = """

    Upload a CSV, TSV or Excel file, and we'll clean it up for you using AI-augmented techniques.

    The system will automatically:

    - Fix formatting issues

    - Handle missing values

    - Standardize data entries

    - Provide AI-powered suggestions for data improvements

    

    Then, you can download the cleaned data for your analysis.

    """
st.markdown(_intro_markdown)

# Upload widget; the extensions listed here are the ones the rest of the
# script branches on (".csv", ".tsv", ".xlsx").
_accepted_types = ["csv", "tsv", "xlsx"]
uploaded_file = st.file_uploader("Choose a file", type=_accepted_types)

def _null_percentage(df):
    """Return the percentage of null cells in *df*, rounded to 2 decimals.

    A frame with zero rows or zero columns has no cells at all, so 0.0 is
    returned instead of dividing by zero (which would display as "nan%").
    """
    total_cells = df.shape[0] * df.shape[1]
    if total_cells == 0:
        return 0.0
    return round((df.isnull().sum().sum() / total_cells) * 100, 2)


def _serialize_for_download(df, file_type):
    """Serialize *df* in the format indicated by *file_type*.

    Args:
        df: DataFrame to export.
        file_type: Lower-cased extension including the dot (e.g. ".tsv").

    Returns:
        A ``(payload, download_name, mime_type)`` tuple where ``payload`` is
        the file content as bytes.

    ``.xlsx`` and ``.tsv`` keep their own format; every other extension falls
    back to CSV, mirroring the fallback used when the original file is loaded,
    so the three values are always defined.
    """
    if file_type == ".xlsx":
        output = BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=False)
        return (
            output.getvalue(),
            "cleaned_data.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    if file_type == ".tsv":
        # "text/tab-separated-values" is the IANA-registered media type for TSV.
        return (
            df.to_csv(index=False, sep="\t").encode("utf-8"),
            "cleaned_data.tsv",
            "text/tab-separated-values",
        )
    return df.to_csv(index=False).encode("utf-8"), "cleaned_data.csv", "text/csv"


if uploaded_file:
    # Raw bytes of the upload; reused for the temp file handed to clean_data
    # and for re-parsing the untouched original below.
    file_bytes = uploaded_file.read()
    file_name = uploaded_file.name
    file_type = os.path.splitext(file_name)[1].lower()

    # Clean data and get LLM suggestions
    with st.spinner("Cleaning your data..."):

        # clean_data is called with a path, so the upload is written to disk.
        # delete=False keeps the file after this `with` closes it; it is
        # removed explicitly in the `finally` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=file_type) as temp_file:
            temp_file.write(file_bytes)
            temp_file_path = temp_file.name

        try:
            cleaned_df, suggestions = clean_data(temp_file_path)
        except Exception as e:
            # Report the failure the same way the original-data preview does
            # and end this script run: nothing below can work without
            # cleaned_df. The `finally` clause still runs before the stop
            # propagates.
            st.error(f"Error cleaning data: {e}")
            st.stop()
        else:
            st.success("Data cleaned successfully!")
        finally:
            # Clean up the temporary file
            if os.path.exists(temp_file_path):
                os.unlink(temp_file_path)

    # Show original data (best effort: a parse failure here is reported but
    # does not prevent the cleaned results from being shown).
    st.subheader("Original Data")
    try:
        if file_type == ".tsv":
            original_df = pd.read_csv(BytesIO(file_bytes), sep='\t')
        elif file_type == ".xlsx":
            original_df = pd.read_excel(BytesIO(file_bytes))
        else:
            original_df = pd.read_csv(BytesIO(file_bytes))

        st.dataframe(original_df.head(10), use_container_width=True)

    except Exception as e:
        st.error(f"Error loading original data: {e}")

    # Show cleaned data
    st.subheader("Cleaned Data Preview")
    st.dataframe(cleaned_df.head(10), use_container_width=True)

    # Data statistics
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Total Rows", len(cleaned_df))
    with col2:
        st.metric("Total Columns", len(cleaned_df.columns))
    with col3:
        null_percentage = _null_percentage(cleaned_df)
        st.metric("Null Values (%)", f"{null_percentage}%")

    # Show suggestions from LLM
    display_llm_report(suggestions)

    # Convert to downloadable format (same format as the upload, CSV otherwise)
    cleaned_file, download_name, mime_type = _serialize_for_download(cleaned_df, file_type)

    # Download button
    st.download_button(
        label="📁 Download Cleaned Data",
        data=cleaned_file,
        file_name=download_name,
        mime=mime_type
    )