# EDA / app.py
# DataWizard9742's picture
# Update app.py
# 6c832f4
import io

import numpy as np
import openai
import pandas as pd
import streamlit as st
from scipy import stats
def get_column_descriptions(df, openai_key):
    """Ask the OpenAI completion endpoint to describe every column of *df*.

    Args:
        df: DataFrame whose column names and dtypes are put into the prompts.
        openai_key: API key assigned to the module-level ``openai.api_key``.

    Returns:
        A dict mapping each column name to the stripped completion text, or
        to an error message string when anything fails for that column.
    """
    # NOTE(review): `openai.Completion` with "text-davinci-003" looks like the
    # legacy completions interface -- confirm it exists in the installed
    # openai package version.
    openai.api_key = openai_key

    result = {}
    for column in df.columns:
        try:
            # One prompt per column, built from its name and dtype.
            dtype_name = df[column].dtype
            prompt = f"Explain the meaning of a dataset column named '{column}' in the context of {dtype_name} data type."
            completion = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=60,
            )
            text = completion.choices[0].text.strip()
        except Exception as e:
            # Best effort: a failure for one column must not abort the rest,
            # so the error is recorded in place of the description.
            text = f"Error in generating description: {e}"
        result[column] = text
    return result
def analyze_dataframe(df):
    """Compute the summary artifacts the app displays for one DataFrame.

    Args:
        df: The pandas DataFrame to analyze.

    Returns:
        A 6-tuple ``(info, memory, missing_values, outliers, duplicates,
        stats_df)``:

        * ``info`` -- the text report produced by ``df.info()``, as a string.
        * ``memory`` -- ``df.memory_usage(deep=True)``.
        * ``missing_values`` -- count of null cells per column.
        * ``outliers`` -- boolean frame over the numeric columns, True where
          the absolute z-score exceeds 3.
        * ``duplicates`` -- the rows flagged by ``df.duplicated()``.
        * ``stats_df`` -- ``df.describe(include='all')``.
    """
    # DataFrame.info() writes its report to a stream and returns None, so the
    # report is captured in a buffer in order to hand the text to the caller.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info = info_buffer.getvalue()

    memory = df.memory_usage(deep=True)

    # Missing values
    missing_values = df.isnull().sum()

    # Outliers (using Z-score for numerical columns)
    def _flag_outliers(col):
        # By default zscore propagates NaN, which turns every z-score of a
        # column containing a missing value into NaN and hides all of its
        # outliers. "omit" ignores the NaNs when computing mean and std;
        # NaN cells themselves compare False and are never flagged.
        values = col.to_numpy(dtype=float, na_value=np.nan)
        z_scores = stats.zscore(values, nan_policy="omit")
        return pd.Series(np.abs(z_scores) > 3, index=col.index)

    outliers = df.select_dtypes(include=[np.number]).apply(_flag_outliers)

    # Duplicates
    duplicates = df[df.duplicated()]

    # Statistics for each column
    stats_df = df.describe(include='all')

    return info, memory, missing_values, outliers, duplicates, stats_df
def main():
    """Render the Streamlit page: upload a file, analyze it, show the results.

    Reads an OpenAI API key from a sidebar password field and a CSV/Excel
    file from an uploader. Once a file is present it is loaded into a
    DataFrame, passed to ``analyze_dataframe``, and each returned artifact is
    written to the page. Column descriptions are requested through
    ``get_column_descriptions`` only when a key was entered.
    """
    st.title('Data Analysis App with GPT-3.5 Column Descriptions')

    # API key for OpenAI
    openai_key = st.sidebar.text_input("Enter your OpenAI API key", type="password")

    uploaded_file = st.file_uploader("Upload your CSV or Excel file.", type=["csv", "xlsx"])
    if uploaded_file is None:
        # Nothing uploaded yet: only the title and input widgets are shown.
        return

    # Read data. The extension check is case-insensitive so that a name such
    # as "DATA.CSV" is parsed as CSV instead of being handed to read_excel.
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)

    # Analyze data
    info, memory, missing_values, outliers, duplicates, stats_df = analyze_dataframe(df)

    # Get column descriptions (skipped when no API key was entered)
    if openai_key:
        column_descriptions = get_column_descriptions(df, openai_key)
        st.write("Column Descriptions:")
        for col, desc in column_descriptions.items():
            st.markdown(f"**{col}:** {desc}")

    st.write("Basic Information:")
    st.text(info)

    st.write("Memory Usage:")
    st.write(memory)

    st.write("Missing Values:")
    st.write(missing_values)

    st.write("Outliers:")
    st.write(outliers)

    st.write("Duplicates:")
    st.write(duplicates)

    st.write("Statistics:")
    st.write(stats_df)
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()