# File size: 2,551 Bytes
# 6c832f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import io

import numpy as np
import openai
import pandas as pd
import streamlit as st
from scipy import stats

def get_column_descriptions(df, openai_key, model="gpt-3.5-turbo-instruct"):
    """Ask an OpenAI completion model to describe every column of ``df``.

    One request is sent per column. A failure for one column is stored as
    that column's description instead of aborting the remaining columns.

    Args:
        df: DataFrame whose column labels and dtypes are put into the prompts.
        openai_key: OpenAI API key; assigned to the module-level
            ``openai.api_key`` setting.
        model: Completion model to query. Defaults to
            ``gpt-3.5-turbo-instruct`` because the previously hard-coded
            ``text-davinci-003`` has been retired by OpenAI.

    Returns:
        dict mapping each column label to the generated description, or to an
        ``"Error in generating description: ..."`` message when the request
        for that column failed.
    """
    openai.api_key = openai_key

    # openai>=1.0 exposes the ``OpenAI`` client class and removed the legacy
    # ``openai.Completion`` resource; older releases only have the latter.
    uses_v1_sdk = hasattr(openai, "OpenAI")

    descriptions = {}
    for column in df.columns:
        try:
            # Generate a prompt for each column
            prompt = f"Explain the meaning of a dataset column named '{column}' in the context of {df[column].dtype} data type."
            if uses_v1_sdk:
                response = openai.completions.create(model=model, prompt=prompt, max_tokens=60)
            else:
                response = openai.Completion.create(model=model, prompt=prompt, max_tokens=60)
            descriptions[column] = response.choices[0].text.strip()
        except Exception as e:
            # Deliberately broad: one failing column must not abort the rest.
            descriptions[column] = f"Error in generating description: {e}"

    return descriptions

def analyze_dataframe(df):
    """Compute the summary artefacts shown by the app for ``df``.

    Args:
        df: DataFrame to analyse.

    Returns:
        A 6-tuple ``(info, memory, missing_values, outliers, duplicates,
        stats_df)``:

        * ``info`` -- text of the ``DataFrame.info()`` report (``str``).
        * ``memory`` -- per-column memory usage from
          ``memory_usage(deep=True)``.
        * ``missing_values`` -- count of null entries per column.
        * ``outliers`` -- boolean DataFrame over the numeric columns, True
          where the absolute z-score exceeds 3.
        * ``duplicates`` -- rows flagged by ``DataFrame.duplicated()``.
        * ``stats_df`` -- ``describe(include='all')`` output.
    """
    # DataFrame.info() prints its report and returns None, so capture the
    # text through ``buf`` to hand a usable string back to the caller.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info = info_buffer.getvalue()

    memory = df.memory_usage(deep=True)
    # Missing values
    missing_values = df.isnull().sum()

    # Outliers (population z-score, ddof=0, for numerical columns). pandas
    # skips NaN when computing mean/std, so a single missing value no longer
    # turns the whole column's z-scores into NaN; NaN cells compare as False.
    numeric = df.select_dtypes(include=[np.number])
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)
    outliers = z_scores.abs() > 3

    # Duplicates
    duplicates = df[df.duplicated()]
    # Statistics for each column
    stats_df = df.describe(include='all')
    return info, memory, missing_values, outliers, duplicates, stats_df

def main():
    """Render the Streamlit page.

    Collects an optional OpenAI API key and an uploaded CSV/Excel file, runs
    ``analyze_dataframe`` on the loaded data and writes each result section
    to the page. Column descriptions are requested only when a key was
    entered.
    """
    st.title('Data Analysis App with GPT-3.5 Column Descriptions')

    # API key for OpenAI
    openai_key = st.sidebar.text_input("Enter your OpenAI API key", type="password")

    uploaded_file = st.file_uploader("Upload your CSV or Excel file.", type=["csv", "xlsx"])

    # Nothing to analyse until a file has been provided.
    if uploaded_file is None:
        return

    # Read data. The extension is compared case-insensitively so that a file
    # named e.g. "DATA.CSV" is not handed to the Excel reader.
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)

    # Analyze data
    info, memory, missing_values, outliers, duplicates, stats_df = analyze_dataframe(df)

    # Get column descriptions
    if openai_key:
        column_descriptions = get_column_descriptions(df, openai_key)
        st.write("Column Descriptions:")
        for col, desc in column_descriptions.items():
            st.markdown(f"**{col}:** {desc}")

    st.write("Basic Information:")
    st.text(info)

    st.write("Memory Usage:")
    st.write(memory)

    st.write("Missing Values:")
    st.write(missing_values)

    st.write("Outliers:")
    st.write(outliers)

    st.write("Duplicates:")
    st.write(duplicates)

    st.write("Statistics:")
    st.write(stats_df)

# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()