import io

import numpy as np
import openai
import pandas as pd
import streamlit as st
from scipy import stats


def get_column_descriptions(df: pd.DataFrame, openai_key: str) -> dict:
    """Ask the OpenAI completion endpoint to describe every column of *df*.

    One request is sent per column. A failing request does not abort the
    loop: the error text is stored as that column's description instead.

    Args:
        df: DataFrame whose column names and dtypes are put into the prompts.
        openai_key: API key assigned to ``openai.api_key`` before any request.

    Returns:
        Mapping of column name to the generated description, or to an
        ``"Error in generating description: ..."`` message for that column.
    """
    openai.api_key = openai_key
    descriptions = {}
    for column in df.columns:
        try:
            # Generate a prompt for each column
            prompt = (
                f"Explain the meaning of a dataset column named '{column}' "
                f"in the context of {df[column].dtype} data type."
            )
            # NOTE(review): this looks like the legacy Completion interface;
            # confirm it is still available in the installed openai SDK.
            response = openai.Completion.create(
                engine="text-davinci-003",
                prompt=prompt,
                max_tokens=60,
            )
            descriptions[column] = response.choices[0].text.strip()
        except Exception as e:
            # Deliberate best-effort: one failed column must not hide the
            # descriptions of the others, so the error is shown in its place.
            descriptions[column] = f"Error in generating description: {e}"
    return descriptions


def _zscore_outlier_mask(column: pd.Series, threshold: float = 3.0) -> pd.Series:
    """Return a boolean Series marking values whose |z-score| exceeds *threshold*."""
    # nan_policy="omit" stops a single missing value from turning every
    # z-score of the column into NaN, which would hide all outliers.
    z_scores = np.asarray(stats.zscore(column.to_numpy(), nan_policy="omit"))
    # NaN z-scores (missing values, constant columns) compare as False.
    return pd.Series(np.abs(z_scores) > threshold, index=column.index)


def analyze_dataframe(df: pd.DataFrame) -> tuple:
    """Compute the summary artefacts that the app displays for *df*.

    Args:
        df: The DataFrame to analyse.

    Returns:
        A 6-tuple ``(info, memory, missing_values, outliers, duplicates,
        stats_df)``:

        * ``info`` - text of the ``df.info()`` report.
        * ``memory`` - ``df.memory_usage(deep=True)``.
        * ``missing_values`` - number of null cells per column.
        * ``outliers`` - boolean frame over the numeric columns, True where
          the absolute z-score is greater than 3.
        * ``duplicates`` - rows flagged by ``df.duplicated()``.
        * ``stats_df`` - ``df.describe(include="all")``.
    """
    # DataFrame.info() prints its report and returns None, so the text is
    # captured in a buffer to make it displayable by the caller.
    buffer = io.StringIO()
    df.info(buf=buffer)
    info = buffer.getvalue()

    memory = df.memory_usage(deep=True)

    # Missing values
    missing_values = df.isnull().sum()

    # Outliers (using Z-score for numerical columns)
    numeric_df = df.select_dtypes(include=[np.number])
    outliers = numeric_df.apply(_zscore_outlier_mask)

    # Duplicates
    duplicates = df[df.duplicated()]

    # Statistics for each column
    stats_df = df.describe(include="all")

    return info, memory, missing_values, outliers, duplicates, stats_df


def _read_uploaded_file(uploaded_file) -> pd.DataFrame:
    """Parse an upload as CSV when its name ends in ``.csv``, otherwise as Excel."""
    # Compare case-insensitively so that e.g. "DATA.CSV" is not sent to the
    # Excel reader.
    if uploaded_file.name.lower().endswith(".csv"):
        return pd.read_csv(uploaded_file)
    return pd.read_excel(uploaded_file)


def main() -> None:
    """Render the Streamlit page: upload a file, analyse it, show the results."""
    st.title('Data Analysis App with GPT-3.5 Column Descriptions')

    # API key for OpenAI
    openai_key = st.sidebar.text_input("Enter your OpenAI API key", type="password")

    uploaded_file = st.file_uploader(
        "Upload your CSV or Excel file.", type=["csv", "xlsx"]
    )
    if uploaded_file is None:
        return

    # Read data
    try:
        df = _read_uploaded_file(uploaded_file)
    except ValueError as exc:
        # pandas reports empty, malformed or undecodable input through
        # ValueError subclasses; show the message instead of a traceback.
        st.error(f"Could not read the uploaded file: {exc}")
        return

    # Analyze data
    info, memory, missing_values, outliers, duplicates, stats_df = analyze_dataframe(df)

    # Get column descriptions
    if openai_key:
        column_descriptions = get_column_descriptions(df, openai_key)
        st.write("Column Descriptions:")
        for col, desc in column_descriptions.items():
            st.markdown(f"**{col}:** {desc}")

    st.write("Basic Information:")
    st.text(info)

    st.write("Memory Usage:")
    st.write(memory)

    st.write("Missing Values:")
    st.write(missing_values)

    st.write("Outliers:")
    st.write(outliers)

    st.write("Duplicates:")
    st.write(duplicates)

    st.write("Statistics:")
    st.write(stats_df)


if __name__ == "__main__":
    main()