""" AI-Powered EDA & Feature Engineering Assistant This application enables users to upload a CSV dataset, and utilizes LLMs to analyze the dataset to provide EDA and feature engineering recommendations. """ import streamlit as st import pandas as pd import os import base64 from io import BytesIO from dotenv import load_dotenv from typing import Dict, List, Any, Optional import time import logging import plotly.express as px import numpy as np # Import LangChain memory components from langchain.memory import ConversationBufferMemory from langchain_core.messages import AIMessage, HumanMessage # Import local modules from eda_analysis import DatasetAnalyzer from llm_inference import LLMInference # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Load environment variables load_dotenv() # Set page configuration - must be the first Streamlit command st.set_page_config( page_title="AI-Powered EDA & Feature Engineering Assistant", page_icon="📊", layout="wide", initial_sidebar_state="expanded" ) # Initialize our classes @st.cache_resource def get_llm_inference(): try: return LLMInference() except Exception as e: st.error(f"Error initializing LLM inference: {str(e)}") return None llm_inference = get_llm_inference() # Session state initialization if "dataset_analyzer" not in st.session_state: st.session_state.dataset_analyzer = DatasetAnalyzer() if "dataset_loaded" not in st.session_state: st.session_state.dataset_loaded = False if "dataset_info" not in st.session_state: st.session_state.dataset_info = {} if "visualizations" not in st.session_state: st.session_state.visualizations = {} if "eda_insights" not in st.session_state: st.session_state.eda_insights = "" if "feature_engineering_recommendations" not in st.session_state: st.session_state.feature_engineering_recommendations = "" if "data_quality_insights" not in st.session_state: 
if "data_quality_insights" not in st.session_state:
    st.session_state.data_quality_insights = ""
if "active_tab" not in st.session_state:
    st.session_state.active_tab = "welcome"


# Add new functions to support the updated UI
def initialize_session_state():
    """Initialize session state variables needed for the application"""
    # Initialize session variables with appropriate defaults
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []
    # Initialize conversation memory for LangChain
    if "conversation_memory" not in st.session_state:
        st.session_state.conversation_memory = ConversationBufferMemory(
            memory_key="chat_history",
            return_messages=True
        )
    # For dataframe and related variables, ensure proper initialization.
    # df should not be in session_state until a proper DataFrame is loaded.
    if "descriptive_stats" not in st.session_state:
        st.session_state.descriptive_stats = None
    if "selected_columns" not in st.session_state:
        st.session_state.selected_columns = []
    if "filtered_df" not in st.session_state:
        st.session_state.filtered_df = None
    if "ai_insights" not in st.session_state:
        st.session_state.ai_insights = None
    if "loading_insights" not in st.session_state:
        st.session_state.loading_insights = False
    if "selected_tab" not in st.session_state:
        st.session_state.selected_tab = 'tab-overview'
    if "dataset_name" not in st.session_state:
        st.session_state.dataset_name = ""
    # Logging initialization
    logger.info("Session state initialized")


def apply_custom_css():
    """Apply additional custom CSS that's not already in the main CSS block"""
    st.markdown(""" """, unsafe_allow_html=True)


def _build_dataset_info(df):
    """Summarize `df` into the dict shape the LLM prompt helpers expect.

    Collects shape, dtypes, per-column missing counts/percentages, basic
    stats, strong pairwise correlations (|r| > 0.5, top 5), and a small
    sample of rows.
    """
    num_rows, num_cols_count = df.shape
    # Format missing values for better readability: {col: (count, percent)}
    missing_series = df.isnull().sum()
    missing_cols = missing_series[missing_series > 0]
    missing_values = {}
    for col in missing_cols.index:
        count = missing_cols[col]
        percent = round(count / len(df) * 100, 2)
        missing_values[col] = (count, percent)

    # Get numerical columns and their correlations if applicable
    num_cols = df.select_dtypes(include=['number']).columns
    correlations = "No numerical columns to calculate correlations."
    if len(num_cols) > 1:
        corr_matrix = df[num_cols].corr()
        # Only keep strong correlations; lower triangle avoids duplicates
        corr_pairs = []
        for i in range(len(num_cols)):
            for j in range(i):
                val = corr_matrix.iloc[i, j]
                if abs(val) > 0.5:
                    corr_pairs.append((num_cols[i], num_cols[j], val))
        if corr_pairs:
            corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True)
            formatted_corrs = [f"{col1} and {col2}: {val:.3f}"
                               for col1, col2, val in corr_pairs[:5]]  # Top 5
            correlations = "\n".join(formatted_corrs)

    return {
        "shape": f"{num_rows} rows, {num_cols_count} columns",
        "columns": df.columns.tolist(),
        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
        "missing_values": missing_values,
        "basic_stats": df.describe().to_string(),
        "correlations": correlations,
        "sample_data": df.head(5).to_string(),
    }


def generate_ai_insights():
    """Generate AI-powered insights about the dataset.

    Returns a dict mapping category name -> list of insight strings.
    Tries the LLM backend first; on any failure (or when the backend is
    unavailable) falls back to deterministic template-based insights.
    """
    # Make sure we have a dataframe to analyze
    if 'df' not in st.session_state:
        logger.warning("Cannot generate AI insights: No dataframe in session state")
        return {}

    df = st.session_state.df
    insights = {}

    # Try to use the LLM for insights generation first
    try:
        if llm_inference is not None:
            dataset_info = _build_dataset_info(df)

            # Generate EDA insights with better error handling
            logger.info("Requesting EDA insights from LLM")
            try:
                eda_insights = llm_inference.generate_eda_insights(dataset_info)
                # Responses shorter than ~50 chars are treated as failures
                if eda_insights and isinstance(eda_insights, str) and len(eda_insights) > 50:
                    insights["EDA Insights"] = [eda_insights.strip()]
                    logger.info("Successfully generated EDA insights")
                else:
                    logger.warning(
                        f"EDA insights response was invalid: {type(eda_insights)}, "
                        f"length: {len(eda_insights) if isinstance(eda_insights, str) else 'N/A'}")
            except Exception as e:
                logger.error(f"Error generating EDA insights: {str(e)}")

            # Generate feature engineering recommendations
            if "EDA Insights" in insights:  # Only proceed if EDA worked
                logger.info("Requesting feature engineering recommendations from LLM")
                try:
                    fe_insights = llm_inference.generate_feature_engineering_recommendations(dataset_info)
                    if fe_insights and isinstance(fe_insights, str) and len(fe_insights) > 50:
                        insights["Feature Engineering Recommendations"] = [fe_insights.strip()]
                        logger.info("Successfully generated feature engineering recommendations")
                    else:
                        logger.warning(
                            f"Feature engineering response was invalid: {type(fe_insights)}, "
                            f"length: {len(fe_insights) if isinstance(fe_insights, str) else 'N/A'}")
                except Exception as e:
                    logger.error(f"Error generating feature engineering recommendations: {str(e)}")

                # Generate data quality insights
                logger.info("Requesting data quality insights from LLM")
                try:
                    dq_insights = llm_inference.generate_data_quality_insights(dataset_info)
                    if dq_insights and isinstance(dq_insights, str) and len(dq_insights) > 50:
                        insights["Data Quality Insights"] = [dq_insights.strip()]
                        logger.info("Successfully generated data quality insights")
                    else:
                        logger.warning(
                            f"Data quality response was invalid: {type(dq_insights)}, "
                            f"length: {len(dq_insights) if isinstance(dq_insights, str) else 'N/A'}")
                except Exception as e:
                    logger.error(f"Error generating data quality insights: {str(e)}")

            # If we have at least one type of insights, consider it a success
            if insights:
                st.session_state['loading_insights'] = False
                logger.info("Successfully generated AI insights using LLM")
                return insights
            logger.warning("All LLM generated insights failed or were too short. "
                           "Falling back to template insights.")
        else:
            logger.warning("LLM inference is not available. Falling back to template insights.")
    except Exception as e:
        logger.error(f"Error in generate_ai_insights(): {str(e)}. Falling back to template insights.")

    # If LLM fails or is not available, generate template-based insights
    logger.info("Falling back to template-based insights generation")

    # Add missing values insights
    missing_data = df.isnull().sum()
    missing_percent = (missing_data / len(df)) * 100
    missing_cols = missing_data[missing_data > 0]
    missing_insights = []
    if len(missing_cols) > 0:
        missing_insights.append(f"Found {len(missing_cols)} columns with missing values.")
        for col in missing_cols.index[:3]:  # Show details for top 3
            missing_insights.append(
                f"Column '{col}' has {missing_data[col]} missing values ({missing_percent[col]:.2f}%).")
        if len(missing_cols) > 3:
            missing_insights.append(f"And {len(missing_cols) - 3} more columns have missing values.")
        # Add recommendation
        if any(missing_percent > 50):
            high_missing = missing_percent[missing_percent > 50].index.tolist()
            missing_insights.append(
                f"Consider dropping columns with >50% missing values: {', '.join(high_missing[:3])}.")
        else:
            missing_insights.append("Consider using imputation techniques for columns with missing values.")
    else:
        missing_insights.append("No missing values found in the dataset. Great job!")
    insights["Missing Values Analysis"] = missing_insights

    # Add distribution insights
    num_cols = df.select_dtypes(include=['number']).columns
    dist_insights = []
    if len(num_cols) > 0:
        for col in num_cols[:3]:  # Analyze top 3 numeric columns
            # Check for skewness
            skew = df[col].skew()
            if abs(skew) > 1:
                direction = "right" if skew > 0 else "left"
                dist_insights.append(
                    f"Column '{col}' is {direction}-skewed (skewness: {skew:.2f}). "
                    f"Consider log transformation.")
            # Check for outliers using IQR
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))][col].count()
            if outliers > 0:
                pct = (outliers / len(df)) * 100
                dist_insights.append(
                    f"Column '{col}' has {outliers} outliers ({pct:.2f}%). Consider outlier treatment.")
        if len(num_cols) > 3:
            dist_insights.append(f"Additional {len(num_cols) - 3} numerical columns not analyzed here.")
    else:
        dist_insights.append("No numerical columns found for distribution analysis.")
    insights["Distribution Insights"] = dist_insights

    # Add correlation insights
    corr_insights = []
    if len(num_cols) > 1:
        corr_matrix = df[num_cols].corr()
        high_corr = []
        # Find high correlations (lower triangle only)
        for i in range(len(corr_matrix.columns)):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > 0.7:
                    high_corr.append((corr_matrix.columns[i], corr_matrix.columns[j],
                                      corr_matrix.iloc[i, j]))
        if high_corr:
            corr_insights.append(f"Found {len(high_corr)} pairs of highly correlated features.")
            for col1, col2, corr_val in high_corr[:3]:  # Show top 3
                corr_direction = "positively" if corr_val > 0 else "negatively"
                corr_insights.append(
                    f"'{col1}' and '{col2}' are strongly {corr_direction} correlated (r={corr_val:.2f}).")
            if len(high_corr) > 3:
                corr_insights.append(f"And {len(high_corr) - 3} more highly correlated pairs found.")
            corr_insights.append("Consider removing some highly correlated features to reduce dimensionality.")
        else:
            corr_insights.append("No strong correlations found between features.")
    else:
        corr_insights.append("Need at least 2 numerical columns to analyze correlations.")
    insights["Correlation Analysis"] = corr_insights

    # Add feature engineering recommendations
    fe_insights = []
    # Check for date columns: object columns that parse cleanly as datetimes.
    date_cols = []
    for col in df.columns:
        if df[col].dtype == 'object':
            try:
                pd.to_datetime(df[col])
                date_cols.append(col)
            except (ValueError, TypeError):
                # FIX: was a bare `except: pass`, which also swallowed
                # KeyboardInterrupt/SystemExit. pd.to_datetime signals
                # unparseable input via ValueError (incl. ParserError) or
                # TypeError, so catch only those.
                pass
    if date_cols:
        fe_insights.append(f"Found {len(date_cols)} potential date columns: {', '.join(date_cols[:3])}.")
        fe_insights.append("Consider extracting year, month, day, weekday from these columns.")
    # Check for categorical columns
    cat_cols = df.select_dtypes(include=['object']).columns
    if len(cat_cols) > 0:
        fe_insights.append(f"Found {len(cat_cols)} categorical columns.")
        fe_insights.append("Consider one-hot encoding or label encoding for categorical features.")
        # Check for high cardinality
        high_card_cols = []
        for col in cat_cols:
            if df[col].nunique() > 10:
                high_card_cols.append((col, df[col].nunique()))
        if high_card_cols:
            fe_insights.append("Some categorical columns have high cardinality:")
            for col, card in high_card_cols[:2]:
                fe_insights.append(
                    f"Column '{col}' has {card} unique values. Consider grouping less common categories.")
    # Suggest polynomial features if few numeric features
    if 1 < len(num_cols) < 5:
        fe_insights.append("Consider creating polynomial features or interaction terms between numerical features.")
    insights["Feature Engineering Recommendations"] = fe_insights

    # Add a slight delay to simulate processing
    time.sleep(1)
    # Mark that the insights are loaded
    st.session_state['loading_insights'] = False
    logger.info("Template-based insights generation completed")
    return insights
def display_chat_interface():
    """Display a chat interface for interacting with the data.

    Renders the chat history, example-question buttons (when history is
    empty), a clear-history control, and the chat input. Messages are
    delegated to `process_chat_message`.
    """
    # NOTE(review): the HTML wrapper markup originally passed to these
    # st.markdown(..., unsafe_allow_html=True) calls was lost when the file
    # was mangled; the literals below keep only the text that survived.
    st.markdown('', unsafe_allow_html=True)
    st.markdown('💬 Chat with Your Data', unsafe_allow_html=True)

    # Initialize chat history if not present
    if "chat_history" not in st.session_state:
        st.session_state.chat_history = []

    # Make sure we have data to chat about
    if 'df' not in st.session_state or st.session_state.df is None:
        st.error("No dataset loaded. Please upload a CSV file to chat with your data.")
        # Show a preview of chat capabilities
        st.markdown("""
            What can I help you with?
            Once you upload a dataset, you can ask questions like:
            """, unsafe_allow_html=True)
        st.markdown('', unsafe_allow_html=True)
        return

    # Add a button to clear chat history
    col1, col2 = st.columns([4, 1])
    with col2:
        if st.button("Clear Chat", key="clear_chat"):
            st.session_state.chat_history = []
            # Reset conversation memory so the LLM forgets prior context too
            if "conversation_memory" in st.session_state:
                st.session_state.conversation_memory = ConversationBufferMemory(
                    memory_key="chat_history",
                    return_messages=True
                )
            logger.info("Chat history and memory cleared")
            st.rerun()

    # Display chat history
    for message in st.session_state.chat_history:
        if message["role"] == "user":
            st.chat_message("user").write(message["content"])
        else:
            st.chat_message("assistant").write(message["content"])

    # If no chat history, show some example questions
    if not st.session_state.chat_history:
        st.info("Ask me anything about your dataset! I can help you understand "
                "patterns, identify issues, and suggest improvements.")
        st.markdown("### Example questions you can ask:")
        # Create a grid of example questions using columns
        col1, col2 = st.columns(2)
        with col1:
            example_questions = [
                "What are the key patterns in this dataset?",
                "Which columns have missing values?",
                "What kind of feature engineering would help?"
            ]
            for i, question in enumerate(example_questions):
                if st.button(question, key=f"example_q_{i}"):
                    process_chat_message(question)
                    st.rerun()
        with col2:
            more_questions = [
                "How are the numerical variables distributed?",
                "What are the strongest correlations?",
                "How can I prepare this data for modeling?"
            ]
            for i, question in enumerate(more_questions):
                # Offset keys by 3 so they don't collide with col1's buttons
                if st.button(question, key=f"example_q_{i+3}"):
                    process_chat_message(question)
                    st.rerun()

    # Input area for new messages
    user_input = st.chat_input("Ask a question about your data...", key="chat_input")
    if user_input:
        # Add user message to chat history and answer it
        process_chat_message(user_input)
        st.rerun()
    st.markdown('', unsafe_allow_html=True)
def display_descriptive_tab():
    """Render the descriptive-statistics tab.

    Shows the numerical summary table, dataset-overview counts, and a
    missing-values table plus bar chart. Requires `df` and
    `descriptive_stats` in session state.
    """
    # NOTE(review): wrapper HTML for these markdown calls was lost in the
    # mangled source; only the surviving text is reproduced.
    st.markdown('', unsafe_allow_html=True)
    st.markdown('📊 Descriptive Statistics', unsafe_allow_html=True)

    # Make sure we access the data from session state
    if 'df' not in st.session_state or 'descriptive_stats' not in st.session_state:
        st.error("No dataset loaded. Please upload a CSV file.")
        st.markdown('', unsafe_allow_html=True)
        return

    df = st.session_state.df
    descriptive_stats = st.session_state.descriptive_stats

    # Display descriptive statistics in a more visually appealing way
    col1, col2 = st.columns([3, 1])
    with col1:
        st.subheader("Numerical Summary")
        # Styled dataframe: blue gradient per column, 2-decimal formatting
        st.dataframe(descriptive_stats.style.background_gradient(cmap='Blues', axis=0)
                     .format(precision=2, na_rep="Missing"),
                     use_container_width=True)
    with col2:
        st.subheader("Dataset Overview")
        # Display dataset information in a cleaner format
        total_rows = df.shape[0]
        total_cols = df.shape[1]
        numeric_cols = len(df.select_dtypes(include=['number']).columns)
        cat_cols = len(df.select_dtypes(include=['object', 'category']).columns)
        date_cols = len(df.select_dtypes(include=['datetime']).columns)
        st.markdown(f"""
            {total_rows:,} Rows
            {total_cols} Columns
            {numeric_cols} Numerical
            {cat_cols} Categorical
            {date_cols} Date/Time
            """, unsafe_allow_html=True)

    # Add missing values information with visualization
    st.subheader("Missing Values")
    col1, col2 = st.columns([2, 3])
    with col1:
        # Calculate missing values; note `missing_data` is rebound from a
        # Series of counts to a two-column DataFrame below.
        missing_data = df.isnull().sum()
        missing_percent = (missing_data / len(df)) * 100
        missing_data = pd.DataFrame({
            'Missing Values': missing_data,
            'Percentage (%)': missing_percent.round(2)
        })
        missing_data = missing_data[missing_data['Missing Values'] > 0].sort_values(
            'Missing Values', ascending=False)
        if not missing_data.empty:
            st.dataframe(missing_data.style.background_gradient(cmap='Reds', subset=['Percentage (%)'])
                         .format({'Percentage (%)': '{:.2f}%'}),
                         use_container_width=True)
        else:
            st.success("No missing values found in the dataset! 🎉")
    with col2:
        if not missing_data.empty:
            # Create a horizontal bar chart for missing values
            fig = px.bar(missing_data, x='Percentage (%)', y=missing_data.index,
                         orientation='h', color='Percentage (%)',
                         color_continuous_scale='Reds',
                         title='Missing Values by Column')
            fig.update_layout(
                # Grow the chart with the number of affected columns
                height=max(350, len(missing_data) * 30),
                xaxis_title='Missing (%)',
                yaxis_title='',
                coloraxis_showscale=False,
                margin=dict(l=0, r=10, t=30, b=0)
            )
            st.plotly_chart(fig, use_container_width=True)
    st.markdown('', unsafe_allow_html=True)
def display_distribution_tab():
    """Render the data-distribution tab.

    Lets the user pick a chart type (Histogram, Box Plot, Violin Plot,
    Distribution Plot) and a set of columns, then draws the corresponding
    Plotly figures with summary statistics.
    """
    # NOTE(review): wrapper HTML for these markdown calls was lost in the
    # mangled source; only the surviving text is reproduced.
    st.markdown('', unsafe_allow_html=True)
    st.markdown('📈 Data Distribution', unsafe_allow_html=True)

    # Make sure we access the data from session state
    if 'df' not in st.session_state:
        st.error("No dataset loaded. Please upload a CSV file.")
        st.markdown('', unsafe_allow_html=True)
        return

    df = st.session_state.df

    # Add filters for better UX
    col1, col2 = st.columns([1, 1])
    with col1:
        chart_type = st.selectbox(
            "Select Chart Type",
            ["Histogram", "Box Plot", "Violin Plot", "Distribution Plot"],
            key="chart_type_select"
        )
    with col2:
        if chart_type != "Distribution Plot":
            # NOTE: all three non-"Distribution Plot" types are numerical, so
            # the "Categorical" branch here is currently unreachable; kept for
            # parity with the original selection logic.
            column_type = "Numerical" if chart_type in ["Histogram", "Box Plot", "Violin Plot"] else "Categorical"
            columns_to_show = (list(df.select_dtypes(include=['number']).columns)
                               if column_type == "Numerical"
                               else list(df.select_dtypes(include=['object', 'category']).columns))
            selected_columns = st.multiselect(
                f"Select {column_type} Columns to Visualize",
                options=columns_to_show,
                default=list(columns_to_show[:min(3, len(columns_to_show))]),
                key="column_select"
            )
        else:
            num_cols = list(df.select_dtypes(include=['number']).columns)
            selected_columns = st.multiselect(
                "Select Numerical Columns",
                options=num_cols,
                default=list(num_cols[:min(3, len(num_cols))]),
                key="column_select"
            )

    # Display selected charts
    if selected_columns:
        if chart_type == "Histogram":
            col1, col2 = st.columns([3, 1])
            with col2:
                bins = st.slider("Number of bins", min_value=5, max_value=100, value=30, key="hist_bins")
                kde = st.checkbox("Show KDE", value=True, key="show_kde")
            with col1:
                pass
            # Display histograms with better styling
            for column in selected_columns:
                st.markdown(f'{column}', unsafe_allow_html=True)
                fig = px.histogram(df, x=column, nbins=bins,
                                   title=f"Histogram of {column}",
                                   marginal="box" if kde else None,
                                   color_discrete_sequence=['rgba(99, 102, 241, 0.7)'])
                fig.update_layout(
                    template="plotly_white",
                    height=400,
                    margin=dict(l=10, r=10, t=40, b=10),
                    xaxis_title=column,
                    yaxis_title="Frequency",
                    bargap=0.1
                )
                st.plotly_chart(fig, use_container_width=True)
                # Show basic statistics
                stats = df[column].describe().to_dict()
                st.markdown(f"""
                    Mean: {stats['mean']:.2f}
                    Median: {stats['50%']:.2f}
                    Std Dev: {stats['std']:.2f}
                    Min: {stats['min']:.2f}
                    Max: {stats['max']:.2f}
                    """, unsafe_allow_html=True)

        elif chart_type == "Box Plot":
            for column in selected_columns:
                st.markdown(f'{column}', unsafe_allow_html=True)
                fig = px.box(df, y=column, title=f"Box Plot of {column}",
                             color_discrete_sequence=['rgba(99, 102, 241, 0.7)'])
                fig.update_layout(
                    template="plotly_white",
                    height=400,
                    margin=dict(l=10, r=10, t=40, b=10),
                    yaxis_title=column
                )
                st.plotly_chart(fig, use_container_width=True)
                # Show outlier information using the 1.5*IQR fences
                q1 = df[column].quantile(0.25)
                q3 = df[column].quantile(0.75)
                iqr = q3 - q1
                lower_bound = q1 - 1.5 * iqr
                upper_bound = q3 + 1.5 * iqr
                outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)][column]
                st.markdown(f"""
                    Q1 (25%): {q1:.2f}
                    Median: {df[column].median():.2f}
                    Q3 (75%): {q3:.2f}
                    IQR: {iqr:.2f}
                    Outliers: {len(outliers)} ({(len(outliers)/len(df)*100):.2f}%)
                    """, unsafe_allow_html=True)

        elif chart_type == "Violin Plot":
            for column in selected_columns:
                st.markdown(f'{column}', unsafe_allow_html=True)
                fig = px.violin(df, y=column, box=True, points="all",
                                title=f"Violin Plot of {column}",
                                color_discrete_sequence=['rgba(99, 102, 241, 0.7)'])
                fig.update_layout(
                    template="plotly_white",
                    height=400,
                    margin=dict(l=10, r=10, t=40, b=10),
                    yaxis_title=column
                )
                fig.update_traces(marker=dict(size=3, opacity=0.5))
                st.plotly_chart(fig, use_container_width=True)

        elif chart_type == "Distribution Plot":
            if len(selected_columns) >= 2:
                chart_options = st.radio(
                    "Select Distribution Plot Type",
                    ["Scatter Plot", "Correlation Heatmap"],
                    horizontal=True
                )
                if chart_options == "Scatter Plot":
                    col1, col2 = st.columns([3, 1])
                    with col2:
                        x_axis = st.selectbox("X-axis", options=selected_columns, index=0)
                        y_axis = st.selectbox("Y-axis", options=selected_columns,
                                              index=min(1, len(selected_columns)-1))
                        color_option = st.selectbox("Color by", options=["None"] + df.columns.tolist())
                    with col1:
                        if color_option != "None":
                            fig = px.scatter(df, x=x_axis, y=y_axis, color=color_option,
                                             title=f"{y_axis} vs {x_axis} (colored by {color_option})",
                                             opacity=0.7,
                                             marginal_x="histogram", marginal_y="histogram")
                        else:
                            fig = px.scatter(df, x=x_axis, y=y_axis,
                                             title=f"{y_axis} vs {x_axis}",
                                             opacity=0.7,
                                             marginal_x="histogram", marginal_y="histogram")
                        fig.update_layout(
                            template="plotly_white",
                            height=600,
                            margin=dict(l=10, r=10, t=40, b=10),
                        )
                        st.plotly_chart(fig, use_container_width=True)
                elif chart_options == "Correlation Heatmap":
                    # Calculate correlation matrix
                    corr_matrix = df[selected_columns].corr()
                    fig = px.imshow(corr_matrix,
                                    text_auto=".2f",
                                    color_continuous_scale="RdBu_r",
                                    zmin=-1, zmax=1,
                                    title="Correlation Heatmap")
                    fig.update_layout(
                        template="plotly_white",
                        height=600,
                        margin=dict(l=10, r=10, t=40, b=10),
                    )
                    st.plotly_chart(fig, use_container_width=True)
                    # Show highest correlations (diagonal removed)
                    corr_df = corr_matrix.stack().reset_index()
                    corr_df.columns = ['Variable 1', 'Variable 2', 'Correlation']
                    corr_df = corr_df[corr_df['Variable 1'] != corr_df['Variable 2']]
                    corr_df = corr_df.sort_values('Correlation', ascending=False).head(5)
                    st.markdown("##### Top 5 Highest Correlations")
                    st.dataframe(corr_df.style.background_gradient(cmap='Blues')
                                 .format({'Correlation': '{:.2f}'}),
                                 use_container_width=True)
            else:
                st.warning("Please select at least 2 numerical columns to see distribution plots")
    else:
        st.info("Please select at least one column to visualize")
    st.markdown('', unsafe_allow_html=True)
def display_ai_insights_tab():
    """Render the AI-insights tab.

    Shows previously generated insights from `st.session_state.ai_insights`
    (LLM output as one long string per category, or template output as a
    list of short strings), plus Generate / Regenerate controls that set
    `loading_insights` and rerun the app.
    """
    # NOTE(review): wrapper HTML for these markdown calls was lost in the
    # mangled source; only the surviving text is reproduced.
    st.markdown('', unsafe_allow_html=True)
    st.markdown('🧠 AI-Generated Insights', unsafe_allow_html=True)

    # Make sure we access the data from session state
    if 'df' not in st.session_state:
        st.error("No dataset loaded. Please upload a CSV file.")
        st.markdown('', unsafe_allow_html=True)
        return

    if st.session_state.get('loading_insights', False):
        with st.spinner("Generating AI insights about your data..."):
            time.sleep(0.1)  # Small delay to ensure UI updates

    # AI insights section
    if ('ai_insights' in st.session_state and st.session_state.ai_insights
            and len(st.session_state.ai_insights) > 0):
        insights = st.session_state.ai_insights
        for i, (category, insight_list) in enumerate(insights.items()):
            # Expand only the first two categories by default
            with st.expander(f"{category}", expanded=i < 2):
                # Check if the insights are from LLM (single string) or
                # template (list of strings)
                if (len(insight_list) == 1 and isinstance(insight_list[0], str)
                        and len(insight_list[0]) > 100):
                    # This is likely an LLM-generated insight (single long string)
                    st.markdown(insight_list[0])
                else:
                    # Template-based insights (list of strings)
                    for insight in insight_list:
                        st.markdown(f"💡 {insight}", unsafe_allow_html=True)

        # Add regenerate button: clears cached insights and reruns
        if st.button("Regenerate Insights", key="regenerate_insights"):
            st.session_state['loading_insights'] = True
            st.session_state['ai_insights'] = None
            logger.info("User requested regeneration of AI insights")
            st.rerun()
    else:
        if not st.session_state.get('loading_insights', False):
            # Show generate button if insights are not loading and not available
            st.markdown("""
                🧠 Generate AI-powered insights about your dataset to discover
                patterns, anomalies, and suggestions for feature engineering.
                """, unsafe_allow_html=True)
            if st.button("Generate Insights", key="generate_insights"):
                st.session_state['loading_insights'] = True
                logger.info("User initiated AI insights generation")
                st.rerun()
    st.markdown('', unsafe_allow_html=True)
def display_welcome_page():
    """Display a welcome page with information about the application"""
    # Use Streamlit columns and components instead of raw HTML
    st.title("Welcome to AI-Powered EDA & Feature Engineering Assistant")
    st.write("""
    Upload your CSV dataset and leverage the power of AI to analyze, visualize, and improve your data.
    This tool helps you understand your data better and prepare it for machine learning models.
    """)

    # Feature cards
    st.subheader("Key Features")
    # Use Streamlit columns to create a grid layout
    col1, col2 = st.columns(2)
    with col1:
        st.markdown("#### 📊 Exploratory Data Analysis")
        st.write("Quickly understand your dataset with automatic statistical analysis and visualizations")
        st.markdown("#### 🧠 AI-Powered Insights")
        st.write("Get intelligent recommendations about patterns, anomalies, and opportunities in your data")
        st.markdown("#### ⚡ Feature Engineering")
        st.write("Transform and enhance your features to improve machine learning model performance")
    with col2:
        st.markdown("#### 📈 Interactive Visualizations")
        st.write("Explore distributions, relationships, and outliers with dynamic charts")
        st.markdown("#### 💬 Chat Interface")
        st.write("Ask questions about your data and get AI-powered answers in natural language")
        st.markdown("#### 🔄 Data Transformation")
        st.write("Clean, transform, and prepare your data for modeling with guided workflows")

    # Usage section
    st.subheader("How to use")
    st.markdown("""
    1. **Upload** your CSV dataset using the sidebar on the left
    2. **Explore** automatically generated statistics and visualizations
    3. **Generate** AI insights to better understand your data
    4. **Chat** with AI to ask specific questions about your dataset
    5. **Transform** your features based on recommendations
    """)

    # Upload prompt
    st.info("👈 Please upload a CSV file using the sidebar to get started")


def display_relationships_tab():
    """Display correlations and relationships between variables"""
    # NOTE(review): wrapper HTML for these markdown calls was lost in the
    # mangled source; only the surviving text is reproduced.
    st.markdown('', unsafe_allow_html=True)
    st.markdown('🔄 Relationships & Correlations', unsafe_allow_html=True)

    # Make sure we have data to visualize
    if 'df' not in st.session_state or st.session_state.df is None:
        st.error("No dataset loaded. Please upload a CSV file.")
        st.markdown('', unsafe_allow_html=True)
        return

    df = st.session_state.df

    # Select numerical columns for correlation analysis
    num_cols = df.select_dtypes(include=['number']).columns
    if len(num_cols) < 2:
        st.warning("At least 2 numerical columns are needed for correlation analysis.")
        st.markdown('', unsafe_allow_html=True)
        return

    # Correlation matrix heatmap
    st.subheader("Correlation Matrix")
    corr_matrix = df[num_cols].corr()
    fig = px.imshow(
        corr_matrix,
        text_auto=".2f",
        color_continuous_scale="RdBu_r",
        zmin=-1, zmax=1,
        aspect="auto",
        title="Correlation Heatmap"
    )
    fig.update_layout(
        height=600,
        width=800,
        title_font_size=20,
        margin=dict(l=10, r=10, t=30, b=10)
    )
    st.plotly_chart(fig, use_container_width=True)

    # Show top correlations
    st.subheader("Top Correlations")
    # Extract the lower triangle of the matrix into (feature, feature, r) rows
    corr_pairs = []
    for i in range(len(num_cols)):
        for j in range(i):
            corr_pairs.append({
                'Feature 1': num_cols[i],
                'Feature 2': num_cols[j],
                'Correlation': corr_matrix.iloc[i, j]
            })
    # Convert to dataframe and sort by |r|
    corr_df = pd.DataFrame(corr_pairs)
    sorted_corr = corr_df.sort_values('Correlation', key=abs, ascending=False).head(10)
    # Show table with styled background
    st.dataframe(
        sorted_corr.style.background_gradient(cmap='RdBu_r', subset=['Correlation'])
        .format({'Correlation': '{:.3f}'}),
        use_container_width=True
    )

    # Scatter plot matrix
    st.subheader("Scatter Plot Matrix")
    # Convert num_cols to a list before using it in multiselect
    num_cols = list(df.select_dtypes(include=['number']).columns)
    selected_cols = st.multiselect(
        "Select columns for scatter plot matrix (max 5 recommended)",
        options=num_cols,
        default=list(num_cols[:min(4, len(num_cols))])
    )
    if selected_cols:
        if len(selected_cols) > 5:
            st.warning("More than 5 columns may make the plot hard to read.")
        color_col = st.selectbox("Color by", options=["None"] + df.columns.tolist())
        # Only pass the color parameter if not "None"
        if color_col != "None":
            fig = px.scatter_matrix(
                df,
                dimensions=selected_cols,
                color=color_col,
                opacity=0.7,
                title="Scatter Plot Matrix"
            )
        else:
            fig = px.scatter_matrix(
                df,
                dimensions=selected_cols,
                opacity=0.7,
                title="Scatter Plot Matrix"
            )
        fig.update_layout(
            height=700,
            title_font_size=18,
            margin=dict(l=10, r=10, t=30, b=10)
        )
        st.plotly_chart(fig, use_container_width=True)
    st.markdown('', unsafe_allow_html=True)
if len(num_cols) > 1: # Calculate correlations corr_matrix = df[num_cols].corr() # Get top 5 correlations (absolute values) corr_pairs = [] for i in range(len(num_cols)): for j in range(i): val = corr_matrix.iloc[i, j] if abs(val) > 0.5: # Only show strong correlations corr_pairs.append((num_cols[i], num_cols[j], val)) # Sort by absolute correlation and format if corr_pairs: corr_pairs.sort(key=lambda x: abs(x[2]), reverse=True) formatted_corrs = [] for col1, col2, val in corr_pairs[:5]: # Top 5 formatted_corrs.append(f"{col1} and {col2}: {val:.3f}") correlations = "\n".join(formatted_corrs) # Create dataset_info dictionary for LLM dataset_info = { "shape": f"{num_rows} rows, {num_cols} columns", "columns": df.columns.tolist(), "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}, "missing_values": missing_values, "basic_stats": df.describe().to_string(), "correlations": correlations, "sample_data": df.head(5).to_string() } # Generate response using LLM with memory logger.info(f"Sending question to LLM with memory: {user_message}") # Convert chat history to LangChain format for the memory object if needed if len(st.session_state.chat_history) > 1 and "conversation_memory" in st.session_state: # Use the memory-enabled version to maintain conversation context response = llm_inference.answer_with_memory( user_message, dataset_info, st.session_state.conversation_memory ) else: # If it's the first message, just use the regular question answering response = llm_inference.answer_dataset_question(user_message, dataset_info) # Initialize the memory with this first exchange if "conversation_memory" in st.session_state: st.session_state.conversation_memory.save_context( {"input": user_message}, {"output": response} ) # Log the raw response for debugging logger.info(f"Raw LLM response: {response[:100]}...") # If response is not empty and is a valid string if response and isinstance(response, str) and len(response) > 10: # Clean up the response if needed 
cleaned_response = response.strip() # Add to chat history st.session_state.chat_history.append({"role": "assistant", "content": cleaned_response}) return else: logger.warning(f"LLM response too short or invalid: {response}") raise Exception("LLM response too short or invalid") else: raise Exception("LLM not available") except Exception as e: logger.warning(f"Error using LLM for chat response: {str(e)}. Falling back to templates.") # Fall back happens below # If we're here, either there's no dataframe, LLM failed, or response was invalid # Use template-based responses as fallback if 'df' in st.session_state and st.session_state.df is not None: df = st.session_state.df # Simple response templates responses = { "missing": f"I found {df.isnull().sum().sum()} missing values across the dataset. The columns with the most missing values are: {df.isnull().sum().sort_values(ascending=False).head(3).index.tolist()}.", "pattern": "Looking at the data, I can see several interesting patterns. The numerical features show varied distributions, and there might be some correlations worth exploring further.", "feature": "Based on the data, I'd recommend feature engineering steps like handling missing values, encoding categorical variables, and possibly creating interaction terms for highly correlated features.", "distribution": f"The numerical variables show different distributions. Some appear to be normally distributed while others show skewness. Let me know if you want to see visualizations for specific columns.", "correlation": "I detected several strong correlations in the dataset. You might want to look at the correlation heatmap in the Relationships tab for more details.", "prepare": "To prepare this data for modeling, I suggest: 1) Handling missing values, 2) Encoding categorical variables, 3) Feature scaling, and 4) Possibly dimensionality reduction if you have many features." 
} # Simple keyword matching for demo purposes if "missing" in user_message.lower(): response = responses["missing"] elif "pattern" in user_message.lower(): response = responses["pattern"] elif "feature" in user_message.lower() or "engineering" in user_message.lower(): response = responses["feature"] elif "distribut" in user_message.lower(): response = responses["distribution"] elif "correlat" in user_message.lower() or "relation" in user_message.lower(): response = responses["correlation"] elif "prepare" in user_message.lower() or "model" in user_message.lower(): response = responses["prepare"] else: # Generic response response = "I analyzed your dataset and found some interesting insights. You can explore different aspects of your data using the tabs above. Is there anything specific you'd like to know about your data?" else: response = "Please upload a dataset first so I can analyze it and answer your questions." # Add AI response to chat history st.session_state.chat_history.append({"role": "assistant", "content": response}) def main(): """Main function to run the application""" # Initialize session state at the beginning initialize_session_state() # Apply CSS styling apply_custom_css() # Sidebar for file upload and settings with st.sidebar: st.markdown('', unsafe_allow_html=True) # File uploader st.markdown('', unsafe_allow_html=True) # Load example dataset with st.expander("Or use an example dataset"): example_datasets = { "Iris": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv", "Tips": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv", "Titanic": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv", "Diamonds": "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv" } selected_example = st.selectbox("Select example dataset", list(example_datasets.keys())) if st.button("Load Example", key="load_example_btn"): try: # Load the selected example dataset df = 
pd.read_csv(example_datasets[selected_example]) # Verify we have a valid dataframe if df is not None and not df.empty: st.session_state['df'] = df st.session_state['descriptive_stats'] = df.describe() st.session_state['dataset_name'] = selected_example st.success(f"Loaded {selected_example} dataset!") else: st.error(f"The {selected_example} dataset appears to be empty.") except Exception as e: st.error(f"Error loading example dataset: {str(e)}") # Only show these sections if a dataset is loaded if 'df' in st.session_state: # Dataset Info st.markdown('', unsafe_allow_html=True) # Column filters st.markdown('', unsafe_allow_html=True) # Feature Engineering options with Streamlit buttons instead of JavaScript st.markdown('', unsafe_allow_html=True) # If data is uploaded, process it if uploaded_file is not None and ('df' not in st.session_state or st.session_state.get('df') is None): try: # Attempt to read the CSV file df = pd.read_csv(uploaded_file) # Verify that we have a valid dataframe before storing in session state if df is not None and not df.empty: st.session_state['df'] = df st.session_state['descriptive_stats'] = df.describe() st.session_state['dataset_name'] = uploaded_file.name st.success(f"Successfully loaded dataset: {uploaded_file.name}") else: st.error("The uploaded file appears to be empty.") except Exception as e: st.error(f"Error reading CSV file: {str(e)}") # Create navigation tabs using Streamlit st.write("### Navigation") tabs = ["Overview", "Distribution", "Relationships", "AI Insights", "Chat"] # Create columns for each tab cols = st.columns(len(tabs)) # Handle tab selection using Streamlit buttons for i, tab in enumerate(tabs): with cols[i]: if st.button(tab, key=f"tab_{tab.lower()}"): st.session_state['selected_tab'] = f"tab-{tab.lower().replace(' ', '-')}" st.rerun() # Show selected tab indicator selected_tab_name = st.session_state['selected_tab'].replace('tab-', '').replace('-', ' ').title() st.markdown(f"
Selected: {selected_tab_name}
", unsafe_allow_html=True) # Show welcome message if no data is uploaded if 'df' not in st.session_state: display_welcome_page() else: # Display content based on selected tab if st.session_state['selected_tab'] == 'tab-overview': display_descriptive_tab() elif st.session_state['selected_tab'] == 'tab-distribution': display_distribution_tab() elif st.session_state['selected_tab'] == 'tab-relationships': display_relationships_tab() elif st.session_state['selected_tab'] == 'tab-ai-insights' or st.session_state['selected_tab'] == 'tab-ai': display_ai_insights_tab() elif st.session_state['selected_tab'] == 'tab-chat': display_chat_interface() # After all tabs are rendered, check if we have a regenerate action # This is processed at the end to avoid session state changes during rendering if (st.session_state.get('loading_insights', False) and ('ai_insights' not in st.session_state or st.session_state.get('ai_insights') is None)): logger.info("Generating AI insights at end of main function") try: st.session_state['ai_insights'] = generate_ai_insights() logger.info(f"Generated insights: {len(st.session_state['ai_insights'])} categories") st.session_state['loading_insights'] = False except Exception as e: logger.error(f"Error generating insights in main function: {str(e)}") st.session_state['loading_insights'] = False st.session_state['ai_insights'] = {} # Set to empty dict to prevent repeated failures finally: st.rerun() if __name__ == "__main__": main()