Spaces:
Runtime error
Runtime error
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from scipy import stats | |
| import openai | |
def get_column_descriptions(df, openai_key):
    """Ask the OpenAI completion API for a short description of every column.

    Args:
        df: DataFrame whose column names and dtypes are used to build the prompts.
        openai_key: API key; assigned to the module-level ``openai.api_key``.

    Returns:
        dict mapping each column name to the stripped text of the first
        completion choice, or to an error message if anything failed for
        that column.
    """
    openai.api_key = openai_key
    result = {}
    for name in df.columns:
        try:
            # One request per column; the prompt mentions the column name and its dtype.
            question = f"Explain the meaning of a dataset column named '{name}' in the context of {df[name].dtype} data type."
            completion = openai.Completion.create(
                engine="text-davinci-003",
                prompt=question,
                max_tokens=60,
            )
            text = completion.choices[0].text.strip()
        except Exception as err:
            # Best effort: a failure on one column must not abort the others.
            text = f"Error in generating description: {err}"
        result[name] = text
    return result
def analyze_dataframe(df):
    """Compute summary diagnostics for a DataFrame.

    Args:
        df: The pandas DataFrame to analyze.

    Returns:
        A 6-tuple ``(info, memory, missing_values, outliers, duplicates, stats_df)``:

        - info: the report written by ``df.info()``, as a string.
        - memory: result of ``df.memory_usage(deep=True)``.
        - missing_values: number of null values per column.
        - outliers: boolean DataFrame over the numeric columns, True where
          the absolute z-score exceeds 3.
        - duplicates: the rows flagged by ``df.duplicated()``.
        - stats_df: ``df.describe(include='all')``.
    """
    import io  # local import: only needed to capture the df.info() report

    # df.info() writes its report to a stream and returns None, so capture
    # the text in a buffer instead of using the return value.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info = info_buffer.getvalue()

    memory = df.memory_usage(deep=True)

    # Missing values
    missing_values = df.isnull().sum()

    # Outliers (using Z-score for numerical columns).
    # nan_policy="omit" keeps a single missing value from turning the whole
    # column's z-scores into NaN (which would hide every outlier); the NaN
    # positions themselves compare as False.
    numeric_columns = df.select_dtypes(include=[np.number])
    outliers = numeric_columns.apply(
        lambda x: np.abs(stats.zscore(x, nan_policy="omit")) > 3
    )

    # Duplicates
    duplicates = df[df.duplicated()]

    # Statistics for each column
    stats_df = df.describe(include='all')

    return info, memory, missing_values, outliers, duplicates, stats_df
def main():
    """Render the Streamlit page: upload a file, analyze it, and show the results.

    Reads a CSV or Excel upload into a DataFrame, runs ``analyze_dataframe``
    on it, optionally requests per-column descriptions when an OpenAI API key
    was entered in the sidebar, and writes every result to the page.
    """
    st.title('Data Analysis App with GPT-3.5 Column Descriptions')

    # API key for OpenAI; column descriptions are skipped when it is left blank.
    openai_key = st.sidebar.text_input("Enter your OpenAI API key", type="password")
    uploaded_file = st.file_uploader("Upload your CSV or Excel file.", type=["csv", "xlsx"])
    if uploaded_file is None:
        return

    # Read data. The extension is compared case-insensitively so that a file
    # named e.g. "DATA.CSV" is not handed to the Excel reader.
    try:
        if uploaded_file.name.lower().endswith('.csv'):
            df = pd.read_csv(uploaded_file)
        else:
            df = pd.read_excel(uploaded_file)
    except Exception as exc:
        # UI boundary: report an unreadable/empty file instead of a raw traceback.
        st.error(f"Could not read the uploaded file: {exc}")
        return

    # Analyze data
    info, memory, missing_values, outliers, duplicates, stats_df = analyze_dataframe(df)

    # Get column descriptions
    if openai_key:
        column_descriptions = get_column_descriptions(df, openai_key)
        st.write("Column Descriptions:")
        for col, desc in column_descriptions.items():
            st.markdown(f"**{col}:** {desc}")

    st.write("Basic Information:")
    st.text(info)
    st.write("Memory Usage:")
    st.write(memory)
    st.write("Missing Values:")
    st.write(missing_values)
    st.write("Outliers:")
    st.write(outliers)
    st.write("Duplicates:")
    st.write(duplicates)
    st.write("Statistics:")
    st.write(stats_df)
# Script entry point: build the page only when this file is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()