import streamlit as st
import pandas as pd
import seaborn as sns
import duckdb  # For in-memory SQL querying
import matplotlib.pyplot as plt

st.set_page_config(page_title="Tips EDA & Insights", layout="wide")

# --- App Header ---
st.title("💸 Tipping Behavior Analyzer")
st.markdown("""
Welcome to the interactive explorer for the Seaborn Tips dataset!
Analyze tipping behavior by gender, day, and party size.
""")


# --- Data Loading & Caching ---
@st.cache_data(show_spinner=True)
def load_data() -> pd.DataFrame:
    """Load the Seaborn tips dataset and add a ``tip_pct`` column.

    ``tip_pct`` is ``tip / total_bill * 100``. As a side effect the
    enriched frame is written to ``tips.parquet`` in the working
    directory, which is the file ``query_duckdb`` reads from.

    Returns:
        The tips DataFrame including the ``tip_pct`` column.
    """
    tips = sns.load_dataset("tips")
    tips['tip_pct'] = (tips['tip'] / tips['total_bill']) * 100
    tips.to_parquet("tips.parquet")
    return tips


tips = load_data()


# --- DuckDB Query Function ---
@st.cache_data(show_spinner=True)
def query_duckdb(gender: str, day: str) -> pd.DataFrame:
    """Return the rows of ``tips.parquet`` matching a gender and a day.

    The filter values are passed as bound parameters rather than being
    interpolated into the SQL text, so quoting inside a value can never
    alter the statement.

    Args:
        gender: Value compared against the ``sex`` column.
        day: Value compared against the ``day`` column.

    Returns:
        A DataFrame with every column of the matching rows.
    """
    query = """
        SELECT *
        FROM 'tips.parquet'
        WHERE sex = ?
          AND day = ?
    """
    return duckdb.execute(query, [gender, day]).df()


# --- Sidebar Controls ---
st.sidebar.header("🔎 Filter Data")
gender = st.sidebar.selectbox("Select Gender", options=tips['sex'].unique())
day = st.sidebar.selectbox("Select Day", options=tips['day'].unique())
party_size = st.sidebar.slider(
    "Party Size",
    int(tips['size'].min()),
    int(tips['size'].max()),
    int(tips['size'].min()),
)
hue_option = st.sidebar.selectbox(
    "Color by (hue)", options=['smoker', 'time', 'day', 'sex']
)

filtered = query_duckdb(gender, day)
filtered = filtered[filtered['size'] == party_size]

# --- KPI Section ---
st.subheader("📊 Key Performance Indicator")
# The mean of an empty selection is NaN; track that explicitly so the UI
# shows a clear "no data" state instead of "nan%".
has_rows = not filtered.empty
mean_tip = filtered['tip_pct'].mean() if has_rows else None
if has_rows:
    st.metric("Average Tip Percentage", f"{mean_tip:.2f}%")
else:
    st.metric("Average Tip Percentage", "N/A")
    st.info("No records match the selected filters.")

# --- Data Table ---
st.subheader("🗃️ Filtered Data")
st.dataframe(filtered)

# --- Visualizations ---
# NOTE: the charts below are drawn from the full dataset (`tips`), not from
# the filtered selection; only the hue follows the sidebar.
st.subheader("📈 Visualizations")
col1, col2 = st.columns(2)

with col1:
    st.markdown("#### Tip Percentage Distribution by Gender")
    fig1, ax1 = plt.subplots()
    sns.boxplot(data=tips, x="sex", y="tip_pct", hue=hue_option, ax=ax1)
    ax1.set_title(f"Tip Percentage Distribution by Gender (Hue: {hue_option})")
    st.pyplot(fig1)
    # Release the figure; pyplot keeps every open figure alive across reruns.
    plt.close(fig1)

with col2:
    st.markdown("#### Tip Percentage vs. Party Size")
    fig2, ax2 = plt.subplots()
    sns.scatterplot(data=tips, x="size", y="tip_pct", hue=hue_option, ax=ax2)
    ax2.set_title(f"Tip Percentage vs. Party Size (Hue: {hue_option})")
    st.pyplot(fig2)
    plt.close(fig2)

st.markdown("#### Average Tip Percentage by Day")
# `day` is categorical; pass `observed` explicitly to keep the current
# behaviour and avoid the pandas FutureWarning about its default changing.
mean_tip_by_day = tips.groupby('day', observed=False)['tip_pct'].mean()
st.bar_chart(mean_tip_by_day)

# --- Dynamic Insight ---
st.subheader("💡 Insight")
if has_rows:
    st.write(
        f"On **{day}s**, for **{gender}** customers in a party of size **{party_size}**, "
        f"the average tip percentage is **{mean_tip:.2f}%**."
    )
else:
    st.write(
        f"On **{day}s**, there are no recorded **{gender}** customers "
        f"in a party of size **{party_size}**."
    )

# --- Cache Invalidation Patterns ---
# ...existing code...

# --- Authors & Plot Explanations ---
st.markdown("---")
st.header("👨‍💻 Project Contributors")
st.markdown("""
**Muhammad Ibrahim**

**Asalun Hye Arnob**

---

### 📊 Plot Explanations

- **Tip Percentage Distribution by Gender (Box Plot):**
  This plot shows how tip percentages vary between male and female customers. The box represents the middle 50% of values, the line inside is the median, and dots outside the box are outliers. It helps us compare tipping habits by gender.

- **Tip Percentage vs. Party Size (Scatter Plot):**
  This plot displays individual tip percentages for each party size. Each dot is a meal. It helps us see if larger groups tend to tip more or less, and spot any patterns or clusters.

- **Average Tip Percentage by Day (Bar Chart):**
  This chart shows the average tip percentage for each day of the week. It helps us identify which days have higher or lower tipping rates.

---

**Project Summary:**
We loaded and cleaned the tips dataset, created a tip percentage variable, and built interactive visualizations to explore how tipping behavior varies by gender, day, and party size. Our app uses DuckDB for fast queries and Streamlit for a user-friendly interface.
""")