# Spaces: hackt4d / DDD (status: Sleeping) - File size: 6,545 Bytes

import streamlit as st
import pandas as pd
import plotly.express as px
from langchain_community.llms import OpenAI
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
import textwrap
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
import time

# Seed st.session_state with a default for every key the app relies on.
# Each default comes from a factory so a fresh object is created per session;
# keys that already exist (i.e. on reruns) are left untouched.
for _state_key, _default_factory in (
    ('step', lambda: 1),
    ('dataframes', dict),
    ('chat_history', list),
    ('cleaning_operations', dict),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _default_factory()

def main():
    """Render the page title and the view for the currently active step.

    The active step is read from ``st.session_state.step``. Only steps 1 and 3
    have a view; for any other value nothing is rendered below the title.
    """
    st.title("D^3- Data Dialogue Decisions")

    # Step 2 (data cleaning) is currently disabled, so it has no entry here.
    step_views = {
        1: step_1_upload_and_analyze,
        3: step_3_chat_with_data,
    }
    render_view = step_views.get(st.session_state.step)
    if render_view is not None:
        render_view()

def _go_to_chat_step():
    """Button callback: mark step 3 (chat) as the active step."""
    st.session_state.step = 3


def step_1_upload_and_analyze():
    """Render step 1: upload CSV files, optionally inspect them, move on to chat.

    Every uploaded file is parsed with ``pd.read_csv`` and stored in
    ``st.session_state.dataframes`` under its file name. A file that cannot be
    parsed is reported with ``st.error`` and skipped instead of aborting the
    whole page.
    """
    st.subheader("Step 1: Upload and Analyze Data")

    uploaded_files = st.file_uploader("Upload CSV files", type="csv", accept_multiple_files=True)
    if not uploaded_files:
        return

    for file in uploaded_files:
        try:
            df = pd.read_csv(file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
            # One bad file should not take down the page or block the others.
            st.error(f"Could not read {file.name}: {exc}")
            continue
        st.session_state.dataframes[file.name] = df
        st.success(f"Uploaded: {file.name}")

    # Nothing usable was loaded (every file failed to parse): there is nothing
    # to analyze or chat about, so do not offer the follow-up actions.
    if not st.session_state.dataframes:
        return

    if st.button("Analyze Data"):
        for name, df in st.session_state.dataframes.items():
            st.write(f"Analysis for {name}:")
            st.write(f"Shape: {df.shape}")
            st.write("Columns:")
            st.write(df.columns.tolist())
            st.write("Preview:")
            st.write(df.head())
            st.write("---")

    # The step is switched in an on_click callback rather than after the
    # button returns True: by the time the button's return value is visible,
    # main() has already chosen which step to render for this run, so the
    # change would only show up after a further interaction. Callbacks run
    # before the script body of the rerun triggered by the click.
    st.button("Explore Data", on_click=_go_to_chat_step)

# def step_2_clean_data():
#     st.subheader("Step 2: Clean Data")

#     llm = OpenAI(temperature=0)
    
#     for name, df in st.session_state.dataframes.items():
#         st.write(f"Cleaning recommendations for {name}:")
        
#         # Create a summary of the dataframe
#         summary = f"Dataframe '{name}' summary:\n"
#         summary += f"- Shape: {df.shape}\n"
#         summary += f"- Columns: {', '.join(df.columns)}\n"
#         summary += "- Data types:\n"
#         for col, dtype in df.dtypes.items():
#             summary += f"  - {col}: {dtype}\n"
#         summary += "- Sample data (first 5 rows):\n"
#         summary += df.head().to_string()

#         # Split the summary into smaller chunks
#         chunk_size = 1500  # Reduced chunk size
#         chunks = textwrap.wrap(summary, chunk_size)

#         cleaning_recommendations = []
#         with st.spinner("Analyzing data and generating recommendations..."):
#             for i, chunk in enumerate(chunks):
#                 chunk_result = analyze_chunk(llm, df, chunk)
#                 cleaning_recommendations.append(chunk_result)

#         # Combine all recommendations
#         full_recommendations = "\n".join(cleaning_recommendations)
#         st.write(full_recommendations)
        
#         # Create checkboxes for cleaning operations
#         cleaning_ops = [op.strip() for op in full_recommendations.split('\n') if op.strip()]
#         st.session_state.cleaning_operations[name] = []
#         for op in cleaning_ops:
#             if st.checkbox(op, key=f"{name}_{op}"):
#                 st.session_state.cleaning_operations[name].append(op)

#     if st.button("Apply Cleaning and Proceed to Chat"):
#         for name, ops in st.session_state.cleaning_operations.items():
#             df = st.session_state.dataframes[name]
#             for op in ops:
#                 # Here you would implement the actual cleaning operations
#                 # For now, we'll just print what would be done
#                 st.write(f"Applying to {name}: {op}")
        
#         st.session_state.step = 3
#         st.success("Cleaning operations applied. Proceeding to chat interface.")
#         st.button("Go to Chat Interface")

#     if st.button("Back to Data Upload"):
#         st.session_state.step = 1
#         st.experimental_rerun()

def step_3_chat_with_data():
    """Render step 3: a question box plus the chat history, newest first.

    A non-empty question is passed to ``process_user_input``; the question and
    the returned answer are appended to ``st.session_state.chat_history`` as
    ``("User", text)`` and ``("AI", text)`` tuples.
    """
    st.subheader("Step 3: Chat with your data")

    user_input = st.text_input("Ask a question about your data:")
    if user_input:
        response = process_user_input(user_input)
        st.session_state.chat_history.append(("User", user_input))
        st.session_state.chat_history.append(("AI", response))

    history = st.session_state.chat_history
    # Walk the history newest-first while keeping each entry's original index.
    for index, (role, message) in reversed(list(enumerate(history))):
        if role == "User":
            # Each text_area needs its own key: without one, two identical
            # questions produce two identical widgets and Streamlit rejects
            # the page with a duplicate-widget error. The history is
            # append-only, so the index is a stable identifier.
            st.text_area(
                "You:",
                value=message,
                height=50,
                disabled=True,
                key=f"chat_history_user_{index}",
            )
        else:
            st.write(message)

def process_user_input(user_input):
    """Answer a question about the uploaded data with a pandas dataframe agent.

    All dataframes in ``st.session_state.dataframes`` are concatenated into one
    frame, with a ``source`` column holding the name each frame was stored
    under, and handed to a LangChain pandas agent together with the question.

    Args:
        user_input: The question typed by the user.

    Returns:
        The agent's answer, or a short notice when no data has been uploaded.
    """
    dataframes = st.session_state.dataframes
    if not dataframes:
        # pd.concat raises ValueError on an empty list; answer gracefully.
        return "No data has been uploaded yet. Please upload at least one CSV file first."

    llm = OpenAI(temperature=0)
    combined_df = pd.concat(
        [df.assign(source=name) for name, df in dataframes.items()],
        ignore_index=True,
    )

    # SECURITY NOTE(review): allow_dangerous_code=True lets the agent execute
    # LLM-generated Python in this process. Confirm that is acceptable for the
    # environment this app is deployed in.
    agent = create_pandas_dataframe_agent(
        llm,
        combined_df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        allow_dangerous_code=True,
        handle_parsing_errors=True
    )

    # The prompt starts with the string rendering of the combined dataframe,
    # followed by the instruction and the user's question.
    full_input = f"{combined_df}\nAs a data analyst, process the data to answer the user question.\n\nUser question: {user_input}"

    response = agent.run(full_input)
    return response

def analyze_chunk(llm, df, chunk, timeout=30):
    """Ask a pandas agent for cleaning suggestions on one summary chunk.

    Args:
        llm: Language model passed to ``create_pandas_dataframe_agent``.
        df: Dataframe the agent operates on.
        chunk: Piece of the dataframe summary to include in the prompt.
        timeout: Maximum number of seconds to wait for the agent's answer.

    Returns:
        The agent's answer, or an explanatory message if the call timed out
        or raised.
    """
    # SECURITY NOTE(review): allow_dangerous_code=True lets the agent execute
    # LLM-generated Python in this process.
    agent = create_pandas_dataframe_agent(
        llm,
        df,
        verbose=True,
        agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        allow_dangerous_code=True
    )

    prompt = f"Analyze this part of the dataframe summary and suggest up to 3 specific cleaning operations. Focus on identifying missing values, outliers, and inconsistent data formats.\n\n{chunk}"

    # The executor is deliberately NOT used as a context manager: leaving a
    # `with` block calls shutdown(wait=True), which would block until
    # agent.run finishes and make the timeout ineffective.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(agent.run, prompt)
        return future.result(timeout=timeout)
    except Exception as e:
        # Best-effort boundary: any failure is reported as text to the caller.
        # A timeout carries no message, so fall back to the exception's type.
        detail = str(e) or type(e).__name__
        return f"Analysis timed out or encountered an error: {detail}"
    finally:
        # Return immediately; a still-running agent call finishes in the
        # background worker thread and its result is discarded.
        executor.shutdown(wait=False)

# Run the app's entry point only when this file is executed as the main script.
if __name__ == "__main__":
    main()