DataWizard9742 committed on
Commit
6c832f4
·
1 Parent(s): c3a4867

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py CHANGED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from scipy import stats
5
+ import openai
6
+
7
def get_column_descriptions(df, openai_key):
    """Ask the OpenAI completion API for a short description of each column.

    Args:
        df: DataFrame whose column names and dtypes are put into the prompts.
        openai_key: API key; assigned to ``openai.api_key`` before any request.

    Returns:
        A dict keyed by column name. Each value is the stripped completion
        text, or an ``"Error in generating description: ..."`` message when
        anything went wrong for that column (failures never propagate).
    """
    # NOTE(review): this relies on the legacy `openai.Completion` interface and
    # the "text-davinci-003" engine - confirm both are still available with the
    # openai package version this app is deployed with.
    openai.api_key = openai_key

    def _describe(name):
        # One request per column; the prompt carries only the name and dtype.
        dtype = df[name].dtype
        prompt = f"Explain the meaning of a dataset column named '{name}' in the context of {dtype} data type."
        reply = openai.Completion.create(
            engine="text-davinci-003",
            prompt=prompt,
            max_tokens=60,
        )
        return reply.choices[0].text.strip()

    descriptions = {}
    for column in df.columns:
        try:
            text = _describe(column)
        except Exception as e:
            # Best effort: record the failure for this column and keep going.
            text = f"Error in generating description: {e}"
        descriptions[column] = text

    return descriptions
21
+
22
def analyze_dataframe(df):
    """Compute basic diagnostics for a DataFrame.

    Args:
        df: The pandas DataFrame to analyze.

    Returns:
        A 6-tuple ``(info, memory, missing_values, outliers, duplicates,
        stats_df)``:

        * ``info`` - the text report produced by ``df.info()``, as a string.
        * ``memory`` - per-column memory usage from ``memory_usage(deep=True)``.
        * ``missing_values`` - count of null values per column.
        * ``outliers`` - boolean DataFrame over the numeric columns, True where
          the absolute z-score exceeds 3.
        * ``duplicates`` - the rows flagged by ``df.duplicated()``.
        * ``stats_df`` - ``df.describe(include='all')``.
    """
    import io  # local import: only needed to capture the df.info() report

    # DataFrame.info() writes its report to a buffer and returns None, so the
    # text has to be captured explicitly to be usable by the caller.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info = info_buffer.getvalue()

    memory = df.memory_usage(deep=True)

    # Missing values
    missing_values = df.isnull().sum()

    # Outliers (using Z-score for numerical columns)
    def _outlier_mask(col):
        values = col.to_numpy(dtype=float, na_value=np.nan)
        # nan_policy="omit" keeps a single missing value from turning every
        # z-score in the column into NaN (which would hide all outliers).
        z_scores = stats.zscore(values, nan_policy="omit")
        # NaN > 3 is False, so missing entries are never reported as outliers.
        return pd.Series(np.abs(z_scores) > 3, index=col.index)

    outliers = df.select_dtypes(include=[np.number]).apply(_outlier_mask)

    # Duplicates
    duplicates = df[df.duplicated()]

    # Statistics for each column
    stats_df = df.describe(include='all')

    return info, memory, missing_values, outliers, duplicates, stats_df
34
+
35
def main():
    """Render the Streamlit page: upload a table, analyze it, show the results.

    The page asks for an OpenAI API key in the sidebar and a CSV/XLSX upload.
    Once a file is uploaded it is loaded with pandas, passed to
    ``analyze_dataframe``, and each result is written to the page. Column
    descriptions are requested via ``get_column_descriptions`` only when an
    API key was entered.
    """
    st.title('Data Analysis App with GPT-3.5 Column Descriptions')

    # API key for OpenAI
    openai_key = st.sidebar.text_input("Enter your OpenAI API key", type="password")

    uploaded_file = st.file_uploader("Upload your CSV or Excel file.", type=["csv", "xlsx"])

    # Nothing to analyze until the user has uploaded a file.
    if uploaded_file is None:
        return

    # Read data. The extension is compared case-insensitively so that a file
    # named e.g. "DATA.CSV" is not handed to the Excel reader.
    if uploaded_file.name.lower().endswith('.csv'):
        df = pd.read_csv(uploaded_file)
    else:
        df = pd.read_excel(uploaded_file)

    # Analyze data
    info, memory, missing_values, outliers, duplicates, stats_df = analyze_dataframe(df)

    # Get column descriptions (skipped when no API key was entered)
    if openai_key:
        column_descriptions = get_column_descriptions(df, openai_key)
        st.write("Column Descriptions:")
        for col, desc in column_descriptions.items():
            st.markdown(f"**{col}:** {desc}")

    st.write("Basic Information:")
    st.text(info)

    # The remaining results are all rendered the same way: heading, then value.
    sections = (
        ("Memory Usage:", memory),
        ("Missing Values:", missing_values),
        ("Outliers:", outliers),
        ("Duplicates:", duplicates),
        ("Statistics:", stats_df),
    )
    for heading, result in sections:
        st.write(heading)
        st.write(result)
77
+
78
# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()