import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import re
from datetime import datetime, timedelta
def data_chatbot(df):
"""
Advanced chatbot that provides data access and visualizations based on user questions
"""
st.markdown("""
""", unsafe_allow_html=True)
# Initialize session state
if "chat_messages" not in st.session_state:
st.session_state.chat_messages = []
if "last_viz" not in st.session_state:
st.session_state.last_viz = None
if "last_data" not in st.session_state:
st.session_state.last_data = None
# Main layout
main_col, viz_col = st.columns([1, 1])
with main_col:
# Chat history
chat_container = st.container()
with chat_container:
if not st.session_state.chat_messages:
st.info("""
👋 **Hi! I can show you data and create visualizations. Try asking:**
**📊 Show Data:**
• "Show me the first 10 rows"
• "Show me data where age > 30"
• "Display top 5 by sales"
**📈 Create Visualizations:**
• "Show me a bar chart of category"
• "Plot histogram of age"
• "Create scatter plot of price vs quantity"
• "Show trend of sales over time"
**🔍 Analyze:**
• "What's the average of salary?"
• "Show statistics for all columns"
• "Find outliers in price"
""")
for msg in st.session_state.chat_messages:
if msg["role"] == "user":
st.markdown(f'👤 You: {msg["content"]}
', unsafe_allow_html=True)
else:
st.markdown(f'{msg["content"]}
', unsafe_allow_html=True)
# Input area
st.markdown("
", unsafe_allow_html=True)
input_col1, input_col2 = st.columns([5, 1])
with input_col1:
user_query = st.text_input("", placeholder="💬 Ask a question or request a visualization...",
key="chat_input", label_visibility="collapsed")
with input_col2:
send_button = st.button("📤 Ask", use_container_width=True)
if send_button and user_query:
# Add user message
st.session_state.chat_messages.append({"role": "user", "content": user_query})
# Process query and get response with data/viz
with st.spinner("🔍 Processing your request..."):
response, viz_data, table_data = process_query_with_viz(user_query, df)
# Add bot response
st.session_state.chat_messages.append({"role": "bot", "content": response})
# Store visualization and data for display
if viz_data:
st.session_state.last_viz = viz_data
if table_data is not None:
st.session_state.last_data = table_data
st.rerun()
with viz_col:
# Display visualizations and data
if st.session_state.last_viz:
st.markdown('', unsafe_allow_html=True)
st.markdown("### 📊 Generated Visualization")
display_visualization(st.session_state.last_viz)
st.markdown('
', unsafe_allow_html=True)
if st.session_state.last_data is not None:
st.markdown('', unsafe_allow_html=True)
st.markdown("### 📋 Data Result")
st.dataframe(st.session_state.last_data, use_container_width=True, height=300)
st.markdown('
', unsafe_allow_html=True)
# Quick action buttons
st.markdown("---")
st.markdown("### 🔍 Quick Actions")
col1, col2, col3, col4, col5 = st.columns(5)
actions = [
("📊 First 10 Rows", "Show me first 10 rows", col1),
("📈 Bar Chart", "Show bar chart of first categorical column", col2),
("📉 Histogram", "Plot histogram of first numeric column", col3),
("🔎 Filter", "Show rows where value > average", col4),
("📋 Statistics", "Show me statistics", col5)
]
for label, query, col in actions:
if col.button(label, use_container_width=True):
st.session_state.chat_messages.append({"role": "user", "content": query})
response, viz_data, table_data = process_query_with_viz(query, df)
st.session_state.chat_messages.append({"role": "bot", "content": response})
if viz_data:
st.session_state.last_viz = viz_data
if table_data is not None:
st.session_state.last_data = table_data
st.rerun()
# Clear button
col1, col2, col3 = st.columns([1, 1, 1])
with col2:
if st.button("🗑️ Clear Chat & Visualizations", use_container_width=True):
st.session_state.chat_messages = []
st.session_state.last_viz = None
st.session_state.last_data = None
st.rerun()
def process_query_with_viz(query, df):
"""Process query and return response with visualization and data"""
query_lower = query.lower().strip()
# Get column information
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
datetime_cols = df.select_dtypes(include=['datetime64']).columns.tolist()
all_cols = df.columns.tolist()
# Extract numbers from query
numbers = re.findall(r'\d+', query_lower)
n = int(numbers[0]) if numbers else 10
# 1. SHOW DATA - First/Last/Random rows
if any(word in query_lower for word in ['first', 'head', 'top']):
return show_first_rows(df, n)
elif any(word in query_lower for word in ['last', 'tail', 'bottom']):
return show_last_rows(df, n)
elif 'random' in query_lower or 'sample' in query_lower:
return show_random_rows(df, n)
# 2. FILTER DATA
elif any(word in query_lower for word in ['find', 'where', 'filter', 'search', 'with']):
return filter_data(query_lower, df)
# 3. SORT DATA
elif 'sort' in query_lower or 'order by' in query_lower:
return sort_data(query_lower, df)
# 4. BAR CHART
elif any(word in query_lower for word in ['bar chart', 'bar plot', 'bar graph', 'count plot']):
return create_bar_chart(query_lower, df, categorical_cols)
# 5. HISTOGRAM
elif any(word in query_lower for word in ['histogram', 'distribution', 'hist', 'frequency']):
return create_histogram(query_lower, df, numeric_cols)
# 6. SCATTER PLOT
elif any(word in query_lower for word in ['scatter', 'scatter plot', 'scatterplot', 'relationship']):
return create_scatter_plot(query_lower, df, numeric_cols)
# 7. LINE CHART / TREND
elif any(word in query_lower for word in ['line chart', 'line plot', 'trend', 'over time']):
return create_line_chart(query_lower, df, numeric_cols, datetime_cols)
# 8. BOX PLOT
elif any(word in query_lower for word in ['box plot', 'boxplot', 'box', 'outliers']):
return create_box_plot(query_lower, df, numeric_cols, categorical_cols)
# 9. PIE CHART
elif any(word in query_lower for word in ['pie chart', 'pie', 'proportion', 'percentage']):
return create_pie_chart(query_lower, df, categorical_cols)
# 10. HEATMAP / CORRELATION
elif any(word in query_lower for word in ['heatmap', 'correlation', 'corr', 'heat map']):
return create_heatmap(df, numeric_cols)
# 11. VIOLIN PLOT
elif 'violin' in query_lower:
return create_violin_plot(query_lower, df, numeric_cols, categorical_cols)
# 12. STATISTICS
elif any(word in query_lower for word in ['statistics', 'stats', 'describe', 'summary']):
return show_statistics(query_lower, df, numeric_cols, all_cols)
# 13. COLUMN INFORMATION
elif any(word in query_lower for word in ['column info', 'column details', 'info about']):
return show_column_info(query_lower, df, all_cols)
# 14. MISSING VALUES
elif any(word in query_lower for word in ['missing', 'null', 'na', 'empty']):
return show_missing_values(df)
# 15. OUTLIERS
elif 'outlier' in query_lower:
return detect_outliers(query_lower, df, numeric_cols)
# 16. UNIQUE VALUES
elif any(word in query_lower for word in ['unique', 'distinct', 'categories']):
return show_unique_values(query_lower, df, all_cols, categorical_cols)
# 17. COMPARE COLUMNS
elif 'compare' in query_lower:
return compare_columns(query_lower, df, numeric_cols, categorical_cols)
# 18. HELP
elif any(word in query_lower for word in ['help', 'what can you do', 'capabilities']):
return show_help(), None, None
# 19. DEFAULT - Try to understand if asking about a specific column
else:
return handle_general_query(query_lower, df, numeric_cols, categorical_cols, all_cols)
def show_first_rows(df, n=10):
"""Show first n rows"""
data = df.head(n)
response = f"### 👁️ First {n} Rows\n\nHere's the data you requested:"
return response, None, data
def show_last_rows(df, n=10):
"""Show last n rows"""
data = df.tail(n)
response = f"### 👁️ Last {n} Rows\n\nHere's the data you requested:"
return response, None, data
def show_random_rows(df, n=5):
"""Show random n rows"""
data = df.sample(min(n, len(df)))
response = f"### 🎲 Random Sample of {n} Rows\n\nHere's a random sample from your data:"
return response, None, data
def filter_data(query, df):
"""Filter data based on conditions"""
# Common patterns
patterns = [
(r'(\w+)\s*>\s*(\d+\.?\d*)', '>'),
(r'(\w+)\s*<\s*(\d+\.?\d*)', '<'),
(r'(\w+)\s*>=\s*(\d+\.?\d*)', '>='),
(r'(\w+)\s*<=\s*(\d+\.?\d*)', '<='),
(r'(\w+)\s*=\s*(\d+\.?\d*)', '=='),
(r'(\w+)\s*==\s*(\d+\.?\d*)', '=='),
(r'(\w+)\s*contains\s*["\']?([^"\']+)["\']?', 'contains'),
(r'(\w+)\s*is\s*["\']?([^"\']+)["\']?', '=='),
]
for pattern, op in patterns:
match = re.search(pattern, query.lower())
if match:
col = match.group(1)
val = match.group(2)
# Find matching column
for c in df.columns:
if c.lower() == col:
try:
if op in ['>', '<', '>=', '<=']:
val = float(val)
if op == '>':
filtered = df[df[c] > val]
condition = f"{c} > {val}"
elif op == '<':
filtered = df[df[c] < val]
condition = f"{c} < {val}"
elif op == '>=':
filtered = df[df[c] >= val]
condition = f"{c} >= {val}"
elif op == '<=':
filtered = df[df[c] <= val]
condition = f"{c} <= {val}"
elif op == 'contains':
filtered = df[df[c].astype(str).str.contains(val, case=False, na=False)]
condition = f"{c} contains '{val}'"
else:
if df[c].dtype in ['int64', 'float64']:
filtered = df[df[c] == float(val)]
else:
filtered = df[df[c].astype(str).str.lower() == val.lower()]
condition = f"{c} = {val}"
if len(filtered) > 0:
response = f"### 🔍 Found {len(filtered)} rows where {condition}\n\nShowing first 20 results:"
return response, None, filtered.head(20)
else:
return f"❌ No rows found where {condition}", None, None
except:
pass
return "❌ I couldn't understand the filter condition. Try something like: 'show rows where age > 30'", None, None
def sort_data(query, df):
"""Sort data by column"""
# Extract column name
for col in df.columns:
if col.lower() in query:
sort_col = col
break
else:
sort_col = df.columns[0] if len(df.columns) > 0 else None
if not sort_col:
return "❌ Please specify a column to sort by", None, None
# Determine order
if 'desc' in query or 'highest' in query or 'largest' in query:
ascending = False
order = "descending"
else:
ascending = True
order = "ascending"
# Get number
numbers = re.findall(r'\d+', query)
n = int(numbers[0]) if numbers else 20
sorted_df = df.sort_values(sort_col, ascending=ascending).head(n)
response = f"### 📊 Sorted by {sort_col} ({order})\n\nShowing top {n} results:"
return response, None, sorted_df
def create_bar_chart(query, df, categorical_cols):
"""Create bar chart for categorical column"""
# Find requested column
col = None
for c in categorical_cols:
if c.lower() in query:
col = c
break
if not col and categorical_cols:
col = categorical_cols[0]
if col:
value_counts = df[col].value_counts().head(20)
fig = px.bar(
x=value_counts.index,
y=value_counts.values,
title=f"Bar Chart of {col} (Top 20)",
labels={'x': col, 'y': 'Count'},
color_discrete_sequence=['#667eea']
)
fig.update_layout(
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(color='#2c3e50'),
xaxis_tickangle=-45,
height=500
)
response = f"### 📊 Bar Chart of '{col}'\n\nHere's the distribution of values:"
return response, fig, None
return "❌ No categorical column found for bar chart", None, None
def create_histogram(query, df, numeric_cols):
"""Create histogram for numeric column"""
# Find requested column
col = None
for c in numeric_cols:
if c.lower() in query:
col = c
break
if not col and numeric_cols:
col = numeric_cols[0]
if col:
fig = px.histogram(
df,
x=col,
nbins=30,
title=f"Histogram of {col}",
marginal="box",
color_discrete_sequence=['#667eea']
)
fig.update_layout(
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(color='#2c3e50'),
height=500
)
# Add statistics
data = df[col].dropna()
stats = f"Mean: {data.mean():.2f} | Median: {data.median():.2f} | Std: {data.std():.2f}"
response = f"### 📊 Histogram of '{col}'\n\n{stats}"
return response, fig, None
return "❌ No numeric column found for histogram", None, None
def create_scatter_plot(query, df, numeric_cols):
"""Create scatter plot between two numeric columns"""
# Find two numeric columns
cols = []
for col in numeric_cols:
if col.lower() in query:
cols.append(col)
if len(cols) >= 2:
x_col, y_col = cols[0], cols[1]
elif len(numeric_cols) >= 2:
x_col, y_col = numeric_cols[0], numeric_cols[1]
else:
return "❌ Need at least 2 numeric columns for scatter plot", None, None
fig = px.scatter(
df,
x=x_col,
y=y_col,
title=f"Scatter Plot: {y_col} vs {x_col}",
trendline="ols",
opacity=0.6,
color_discrete_sequence=['#667eea']
)
fig.update_layout(
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(color='#2c3e50'),
height=500
)
# Calculate correlation
corr = df[x_col].corr(df[y_col])
response = f"### 📊 Scatter Plot: {y_col} vs {x_col}\n\nCorrelation: {corr:.4f}"
return response, fig, None
def create_line_chart(query, df, numeric_cols, datetime_cols):
"""Create line chart for time series or sequential data"""
# Find date column
date_col = None
for col in datetime_cols:
if col.lower() in query:
date_col = col
break
if not date_col and datetime_cols:
date_col = datetime_cols[0]
# Find value column
val_col = None
for col in numeric_cols:
if col.lower() in query:
val_col = col
break
if not val_col and numeric_cols:
val_col = numeric_cols[0]
if date_col and val_col:
# Sort by date
plot_df = df[[date_col, val_col]].dropna().sort_values(date_col)
fig = px.line(
plot_df,
x=date_col,
y=val_col,
title=f"Trend of {val_col} over Time",
color_discrete_sequence=['#667eea']
)
fig.update_layout(
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(color='#2c3e50'),
height=500
)
response = f"### 📈 Line Chart: {val_col} over Time"
return response, fig, None
return "❌ Need a datetime column and numeric column for line chart", None, None
def create_box_plot(query, df, numeric_cols, categorical_cols):
"""Create box plot"""
# Find numeric column
num_col = None
for col in numeric_cols:
if col.lower() in query:
num_col = col
break
if not num_col and numeric_cols:
num_col = numeric_cols[0]
# Find categorical column for grouping
cat_col = None
for col in categorical_cols:
if col.lower() in query:
cat_col = col
break
if num_col:
if cat_col:
fig = px.box(
df,
x=cat_col,
y=num_col,
title=f"Box Plot of {num_col} by {cat_col}",
color_discrete_sequence=['#667eea']
)
response = f"### 📊 Box Plot: {num_col} grouped by {cat_col}"
else:
fig = px.box(
df,
y=num_col,
title=f"Box Plot of {num_col}",
color_discrete_sequence=['#667eea']
)
response = f"### 📊 Box Plot of {num_col}"
fig.update_layout(
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(color='#2c3e50'),
height=500
)
return response, fig, None
return "❌ No numeric column found for box plot", None, None
def create_pie_chart(query, df, categorical_cols):
"""Create pie chart for categorical column"""
# Find categorical column
col = None
for c in categorical_cols:
if c.lower() in query:
col = c
break
if not col and categorical_cols:
col = categorical_cols[0]
if col:
value_counts = df[col].value_counts().head(10)
fig = px.pie(
values=value_counts.values,
names=value_counts.index,
title=f"Pie Chart of {col} (Top 10)",
hole=0.3,
color_discrete_sequence=px.colors.qualitative.Set3
)
fig.update_layout(
height=500,
showlegend=True
)
response = f"### 🥧 Pie Chart of '{col}'\n\nProportion of values:"
return response, fig, None
return "❌ No categorical column found for pie chart", None, None
def create_heatmap(df, numeric_cols):
"""Create correlation heatmap"""
if len(numeric_cols) < 2:
return "❌ Need at least 2 numeric columns for correlation heatmap", None, None
corr_matrix = df[numeric_cols].corr()
fig = px.imshow(
corr_matrix,
text_auto=True,
aspect="auto",
color_continuous_scale='RdBu_r',
title="Correlation Heatmap",
zmin=-1, zmax=1
)
fig.update_layout(
height=600,
plot_bgcolor='white',
paper_bgcolor='white'
)
response = "### 🔥 Correlation Heatmap\n\nStrong correlations are shown in dark red/blue:"
return response, fig, None
def create_violin_plot(query, df, numeric_cols, categorical_cols):
"""Create violin plot"""
# Find numeric column
num_col = None
for col in numeric_cols:
if col.lower() in query:
num_col = col
break
if not num_col and numeric_cols:
num_col = numeric_cols[0]
# Find categorical column for grouping
cat_col = None
for col in categorical_cols:
if col.lower() in query:
cat_col = col
break
if num_col:
if cat_col:
fig = px.violin(
df,
x=cat_col,
y=num_col,
title=f"Violin Plot of {num_col} by {cat_col}",
box=True,
points="all",
color_discrete_sequence=['#667eea']
)
response = f"### 🎻 Violin Plot: {num_col} grouped by {cat_col}"
else:
fig = px.violin(
df,
y=num_col,
title=f"Violin Plot of {num_col}",
box=True,
points="all",
color_discrete_sequence=['#667eea']
)
response = f"### 🎻 Violin Plot of {num_col}"
fig.update_layout(
plot_bgcolor='white',
paper_bgcolor='white',
font=dict(color='#2c3e50'),
height=500
)
return response, fig, None
return "❌ No numeric column found for violin plot", None, None
def show_statistics(query, df, numeric_cols, all_cols):
"""Show statistics for columns"""
# Check if asking about specific column
for col in all_cols:
if col.lower() in query and col in numeric_cols:
data = df[col].dropna()
stats_data = pd.DataFrame({
'Statistic': ['Count', 'Mean', 'Std Dev', 'Min', '25%', '50%', '75%', 'Max', 'Skewness', 'Kurtosis'],
'Value': [
len(data),
f"{data.mean():.4f}",
f"{data.std():.4f}",
f"{data.min():.4f}",
f"{data.quantile(0.25):.4f}",
f"{data.median():.4f}",
f"{data.quantile(0.75):.4f}",
f"{data.max():.4f}",
f"{data.skew():.4f}",
f"{data.kurtosis():.4f}"
]
})
response = f"### 📊 Statistics for '{col}'"
return response, None, stats_data
# General statistics for all numeric columns
if numeric_cols:
stats_df = df[numeric_cols].describe().T
stats_df['skew'] = df[numeric_cols].skew()
stats_df['kurtosis'] = df[numeric_cols].kurtosis()
response = "### 📈 Summary Statistics for Numeric Columns"
return response, None, stats_df
return "❌ No numeric columns found for statistics", None, None
def show_column_info(query, df, all_cols):
"""Show information about specific column or all columns"""
# Check if asking about specific column
for col in all_cols:
if col.lower() in query:
info_data = pd.DataFrame({
'Property': ['Data Type', 'Unique Values', 'Missing Values', 'Missing %', 'Sample Values'],
'Value': [
str(df[col].dtype),
df[col].nunique(),
df[col].isnull().sum(),
f"{(df[col].isnull().sum()/len(df)*100):.2f}%",
str(df[col].dropna().iloc[:3].tolist())
]
})
response = f"### 📋 Column Information: '{col}'"
return response, None, info_data
# General column information
col_info = pd.DataFrame({
'Column': df.columns,
'Data Type': df.dtypes.astype(str),
'Unique Values': [df[col].nunique() for col in df.columns],
'Missing Values': df.isnull().sum().values,
'Missing %': (df.isnull().sum().values / len(df) * 100).round(2)
})
response = "### 📋 All Columns Information"
return response, None, col_info
def show_missing_values(df):
"""Show missing values analysis"""
missing = df.isnull().sum()
missing = missing[missing > 0]
if len(missing) == 0:
return "✅ **Good news!** No missing values found in the dataset.", None, None
missing_data = pd.DataFrame({
'Column': missing.index,
'Missing Count': missing.values,
'Missing %': (missing.values / len(df) * 100).round(2)
}).sort_values('Missing %', ascending=False)
total_missing = missing.sum()
total_cells = df.shape[0] * df.shape[1]
response = f"### 🔍 Missing Values Analysis\n\n**Total Missing:** {total_missing} out of {total_cells} cells ({total_missing/total_cells*100:.2f}%)"
return response, None, missing_data
def detect_outliers(query, df, numeric_cols):
"""Detect outliers in numeric columns"""
# Check if asking about specific column
target_cols = []
for col in numeric_cols:
if col.lower() in query:
target_cols.append(col)
if not target_cols:
target_cols = numeric_cols[:3] # Check first 3 numeric columns
outlier_data = []
for col in target_cols:
data = df[col].dropna()
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1
outliers = data[(data < Q1 - 1.5 * IQR) | (data > Q3 + 1.5 * IQR)]
outlier_data.append({
'Column': col,
'Outliers Count': len(outliers),
'Outliers %': f"{(len(outliers)/len(data)*100):.2f}%",
'Normal Range': f"[{Q1 - 1.5 * IQR:.4f}, {Q3 + 1.5 * IQR:.4f}]",
'Severity': 'High' if len(outliers)/len(data)*100 > 10 else 'Medium' if len(outliers)/len(data)*100 > 5 else 'Low'
})
outlier_df = pd.DataFrame(outlier_data)
response = "### ⚠️ Outlier Detection Results"
return response, None, outlier_df
def show_unique_values(query, df, all_cols, categorical_cols):
"""Show unique values in columns"""
# Check if asking about specific column
for col in all_cols:
if col.lower() in query:
value_counts = df[col].value_counts().reset_index()
value_counts.columns = [col, 'Count']
value_counts['Percentage'] = (value_counts['Count'] / len(df) * 100).round(2)
response = f"### 🎯 Unique Values in '{col}'\n\n**Total Unique:** {df[col].nunique()}"
return response, None, value_counts.head(20)
# Show for categorical columns
if categorical_cols:
unique_data = []
for col in categorical_cols[:10]:
unique_data.append({
'Column': col,
'Unique Values': df[col].nunique(),
'Most Common': df[col].value_counts().index[0] if len(df[col].value_counts()) > 0 else 'N/A',
'Most Common Count': df[col].value_counts().values[0] if len(df[col].value_counts()) > 0 else 0
})
unique_df = pd.DataFrame(unique_data)
response = "### 🎯 Unique Values in Categorical Columns"
return response, None, unique_df
return "❌ No categorical columns found", None, None
def compare_columns(query, df, numeric_cols, categorical_cols):
"""Compare two columns"""
# Find two columns to compare
cols = []
for col in df.columns:
if col.lower() in query:
cols.append(col)
if len(cols) >= 2:
col1, col2 = cols[0], cols[1]
if col1 in numeric_cols and col2 in numeric_cols:
# Numeric comparison
comparison_data = pd.DataFrame({
'Metric': ['Mean', 'Median', 'Std Dev', 'Min', 'Max'],
col1: [
df[col1].mean(),
df[col1].median(),
df[col1].std(),
df[col1].min(),
df[col1].max()
],
col2: [
df[col2].mean(),
df[col2].median(),
df[col2].std(),
df[col2].min(),
df[col2].max()
]
})
response = f"### 🔄 Comparison: {col1} vs {col2}"
return response, None, comparison_data
elif col1 in categorical_cols and col2 in categorical_cols:
# Categorical comparison - crosstab
cross_tab = pd.crosstab(df[col1], df[col2])
response = f"### 🔄 Cross-tabulation: {col1} vs {col2}"
return response, None, cross_tab
return "❌ Please specify two columns to compare", None, None
def show_help():
"""Show help information"""
help_text = """
### 🤖 I Can Help You With:
**📊 Show Data:**
• "Show me first 10 rows"
• "Show me last 5 rows"
• "Show random sample of 10 rows"
• "Find rows where age > 30"
• "Sort by price descending"
• "Top 5 by sales"
**📈 Create Visualizations:**
• "Show bar chart of category"
• "Plot histogram of age"
• "Create scatter plot of price vs quantity"
• "Show line chart of sales over time"
• "Create box plot of salary"
• "Show pie chart of region"
• "Display correlation heatmap"
• "Create violin plot of price"
**🔍 Analyze Data:**
• "Show statistics for all columns"
• "Tell me about [column name]"
• "Any missing values?"
• "Find outliers in price"
• "Show unique values in category"
• "Compare age and income"
**Just ask naturally and I'll show you the data and visualizations!**
"""
return help_text
def handle_general_query(query, df, numeric_cols, categorical_cols, all_cols):
"""Handle general queries that don't match specific patterns"""
# Check if asking about a specific column
for col in all_cols:
if col.lower() in query:
if col in numeric_cols:
data = df[col].dropna()
return f"**{col}** - Mean: {data.mean():.2f}, Min: {data.min():.2f}, Max: {data.max():.2f}", None, None
else:
return f"**{col}** - Unique values: {df[col].nunique()}, Most common: {df[col].value_counts().index[0] if len(df[col].value_counts()) > 0 else 'N/A'}", None, None
# Check for dataset size
if 'size' in query or 'large' in query or 'big' in query:
size_mb = df.memory_usage(deep=True).sum() / 1024**2
return f"Dataset size: {size_mb:.2f} MB ({df.shape[0]:,} rows × {df.shape[1]} columns)", None, None
# Default response
return "❌ I didn't understand. Try asking for data, visualizations, or type 'help'", None, None
def display_visualization(fig):
"""Display the visualization"""
st.plotly_chart(fig, use_container_width=True)
# Simple version for quick integration
def run_simple_chatbot(df):
"""Simplified chatbot version"""
st.markdown("### 💬 Simple Data Chat")
if "simple_msgs" not in st.session_state:
st.session_state.simple_msgs = []
# Chat display
for msg in st.session_state.simple_msgs:
if msg["role"] == "user":
st.info(f"👤 {msg['content']}")
else:
st.success(f"🤖 {msg['content']}")
# Input
user_input = st.text_input("Ask:", key="simple_chat_input")
if st.button("Send") and user_input:
st.session_state.simple_msgs.append({"role": "user", "content": user_input})
# Simple responses
response = "I don't understand. Try: rows, columns, missing, stats, chart"
if "row" in user_input.lower():
response = f"Dataset has {df.shape[0]} rows"
elif "column" in user_input.lower():
response = f"Dataset has {df.shape[1]} columns: {', '.join(df.columns[:5])}"
elif "missing" in user_input.lower():
missing = df.isnull().sum().sum()
response = f"Found {missing} missing values" if missing > 0 else "No missing values"
elif "stat" in user_input.lower():
numeric = df.select_dtypes(include=[np.number]).columns
if len(numeric) > 0:
response = f"Mean of {numeric[0]}: {df[numeric[0]].mean():.2f}"
elif "chart" in user_input.lower() or "plot" in user_input.lower():
response = "📊 Creating visualization... (check the plot above)"
# Simple histogram
numeric = df.select_dtypes(include=[np.number]).columns
if len(numeric) > 0:
fig = px.histogram(df, x=numeric[0], title=f"Distribution of {numeric[0]}")
st.plotly_chart(fig, use_container_width=True)
st.session_state.simple_msgs.append({"role": "bot", "content": response})
st.rerun()
if st.button("Clear Chat"):
st.session_state.simple_msgs = []
st.rerun()