#Import import streamlit as st import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import numpy as np import random from PIL import Image import os from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score ##setup st.sidebar.title("Citi Bike Dashboard 🚲") page = st.sidebar.selectbox("Select Page", ["Introduction", "Visualization", "Model Prediction"]) # Add "Automated Report 📑" if using ydata_profiling image_citibike = Image.open('src/citibike.png') st.image(image_citibike, width=300) ## Upload all data root_folder = "src/CitiBike_Trip_Data.zip" all_data = [] # Walk through all subdirectories for root, dirs, files in os.walk(root_folder): for file in files: if file.endswith(".csv"): file_path = os.path.join(root, file) try: df = pd.read_csv(file_path) all_data.append(df) except Exception as e: print(f"Error reading {file_path}: {e}") # Combine all DataFrames final_df = pd.concat(all_data, ignore_index=True) ## Page 1 if page == "Introduction": st.subheader("01 Introduction") st.markdown(""" Welcome to the Citi Bike Explorer Page 🚲\n We will explore riding trends through thorough visual analysis and attempt to predict usertype based on age, gender, biking hours, and so much more. In other words, we will try to look for correlations between membership with other user demographics. \n This dashboard uses data from 2020. While Citi Bike does present data as recent to 2025 May, they do not provide specific user demographics such as gender and age, which is why we used the dataset from 2020.\n """) # Preview st.markdown("##### 📊 Dataset Preview") rows = st.slider("Select a number of rows to display", 5, 20, 5) st.dataframe(final_df.head(rows)) # Dictionary st.markdown("##### 📖 Dictionary of Columns") feature_info = { "tripduration": "Duration of the trip in seconds.", "starttime": "Start time and date when the trip began.", "stoptime": "End time and date when the trip ended.", "start station id": "Unique ID of the station where the trip started.", "start station name": "Name of the station where the trip started.", "start station latitude": "Latitude coordinate of the start station.", "start station longitude": "Longitude coordinate of the start station.", "end station id": "Unique ID of the station where the trip ended.", "end station name": "Name of the station where the trip ended.", "end station latitude": "Latitude coordinate of the end station.", "end station longitude": "Longitude coordinate of the end station.", "bikeid": "ID of the bicycle used during the trip.", "usertype": "Type of user. ('Customer' refers to short-term pass users, while 'Subscriber' refers to annual members.)", "birth year": "Year of birth of the rider.", "gender": "Gender of the rider. (0 = unknown, 1 = male, 2 = female.)" } desc_df = pd.DataFrame(feature_info.items(), columns=["Feature", "Description"]) #Display dicitonary st.dataframe(desc_df, use_container_width=True) #Summary Statistics st.markdown("##### 📖 Summary Statistics") st.dataframe(final_df.describe()) st.markdown(""" Before we go on to the Visualization page, we want to share the results of two brief graphs that allowed us to decide on the trajectory of our visualization. """) #Display age, gender current_year = 2020 df = final_df.copy() # Calculate age df['age'] = current_year - df['birth year'] df = df[(df['gender'].isin([1, 2])) & (df['age'].between(15, 90))] df['age_group'] = pd.cut( df['age'], bins=[15, 25, 35, 45, 55, 65, 75, 90], labels=['16–25', '26–35', '36–45', '46–55', '56–65', '66–75', '76–90'] ) st.markdown("##### 👫 Demographics by Age, Gender, and Usertype") # Pie chart filtered_df = final_df[final_df['gender'].isin([1, 2])].copy() gender_map = {1: "Male", 2: "Female"} filtered_df['gender_label'] = filtered_df['gender'].map(gender_map) filtered_df['group'] = filtered_df['gender_label'] + " " + filtered_df['usertype'] group_counts = filtered_df['group'].value_counts() if not group_counts.empty: st.markdown("##### 1️⃣ Pie Chart on Gender and Usertype") fig, ax = plt.subplots() ax.pie( group_counts.values, labels=group_counts.index, autopct='%1.1f%%', startangle=140, colors=plt.cm.Pastel1.colors ) ax.axis('equal') st.pyplot(fig) else: st.warning("No data available to generate pie chart. Try adjusting filters.") # bar chart st.markdown("##### 2️⃣ Bar Chart on Age, Gender, and Usertype") st.markdown("###### This bar chart visualizes the distribution of male and female Citi Bike riders across age groups, with an optional filter to view patterns by user type.") # filter usertype_filter = st.multiselect( "Select Usertype(s) to Include", options=df['usertype'].unique(), default=df['usertype'].unique() ) filtered_df = df[df['usertype'].isin(usertype_filter)] age_gender_counts = filtered_df.groupby(['age_group', 'gender']).size().unstack(fill_value=0) fig1, ax1 = plt.subplots(figsize=(10, 6)) age_gender_counts.plot(kind='bar', stacked=True, ax=ax1, color=['blue', 'pink']) ax1.set_title("Age Group by Gender (Stacked)") ax1.set_xlabel("Age Group") ax1.set_ylabel("Number of Riders") ax1.legend(title='Gender', labels=['Male (1)', 'Female (2)']) st.pyplot(fig1) # analysis st.markdown("##### 📝 Analysis") st.markdown(""" 1️⃣ **Subscribers outnumber short-term (guest) customers**, indicating stronger engagement from long-term users.\n 2️⃣ **Male riders are more active than female riders** across both user types.\n 3️⃣ **Gender distribution is more balanced among guest customers**, while subscribers are predominantly male. """) ## Page 2 elif page == "Visualization": st.subheader("02 Data Visualization") filtered_df = final_df[final_df['gender'].isin([1, 2])].copy() df = final_df.copy() df = df[df['gender'].isin([1, 2])] # Filter out unknown gender df['gender_label'] = df['gender'].map({1: 'Male', 2: 'Female'}) df = df[df['tripduration'] <= 3600] # Keep trips under 1 hour # Plot st.subheader("1️⃣ Trip Duration by Gender (Boxplot)") fig, ax = plt.subplots(figsize=(8, 5)) sns.boxplot(data=df, x='gender_label', y='tripduration', ax=ax) ax.set_title("Distribution of Trip Durations by Gender") ax.set_ylabel("Trip Duration (seconds)") ax.set_xlabel("Gender") st.pyplot(fig) st.subheader("2️⃣ Average Trip Duration by Gender and Usertype") # Group by gender and usertype avg_duration = filtered_df.groupby(['gender', 'usertype'])['tripduration'].mean().reset_index() # Plot fig, ax = plt.subplots() sns.barplot(data=avg_duration, x='gender', y='tripduration', hue='usertype', ax=ax) ax.set_xticklabels(['Male (1)', 'Female (2)']) ax.set_ylabel("Average Trip Duration (seconds)") ax.set_title("Trip Duration Trends by Gender and Usertype") st.pyplot(fig) st.subheader("3️⃣ Trip Frequency by Gender and Usertype") trip_counts = filtered_df.groupby(['gender', 'usertype']).size().reset_index(name='trip_count') # Plot fig, ax = plt.subplots() sns.barplot(data=trip_counts, x='gender', y='trip_count', hue='usertype', ax=ax) ax.set_xticklabels(['Male (1)', 'Female (2)']) ax.set_ylabel("Number of Trips") ax.set_title("How Often Do Different Groups Ride?") st.pyplot(fig) st.subheader("4️⃣ Birth Year Density by Usertype (Violin Plot)") # Remove outliers violin_df = final_df[(final_df['birth year'] > 1920) & (final_df['birth year'] < 2010)] # Remove unknowns violin_df = violin_df[violin_df['usertype'].notna() & violin_df['birth year'].notna()] fig, ax = plt.subplots(figsize=(8, 5)) sns.violinplot(data=violin_df, x='usertype', y='birth year', ax=ax) ax.set_title("Density of Birth Years by Usertype") st.pyplot(fig) st.subheader("5️⃣ Time of Day Usage by Age Group") time_df = final_df.copy() time_df = time_df.dropna(subset=['birth year', 'starttime']) time_df['age'] = 2020 - time_df['birth year'] time_df['age_group'] = pd.cut(time_df['age'], bins=[15, 25, 35, 50, 65, 100], labels=['16–25', '26–35', '36–50', '51–65', '66+']) time_df['hour'] = pd.to_datetime(time_df['starttime']).dt.hour fig, ax = plt.subplots(figsize=(10, 5)) sns.histplot(data=time_df, x='hour', hue='age_group', multiple='stack', bins=24) ax.set_title("Ride Start Time by Age Group") ax.set_xlabel("Hour of Day") st.pyplot(fig) ## Page 3 elif page == "Model Prediction": from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import LabelEncoder st.subheader("03 Model Prediction") #Linear Regression Model df = final_df.copy() df = df[df['gender'].isin([1, 2])] df = df.dropna(subset=['birth year', 'tripduration', 'usertype']) df['age'] = 2020 - df['birth year'] df = df[(df['age'] >= 15) & (df['age'] <= 90)] df['usertype_encoded'] = df['usertype'].map({'Customer': 0, 'Subscriber': 1}) X = df[['age', 'gender', 'tripduration']] y = df['usertype_encoded'] model = LinearRegression() model.fit(X, y) y_pred = model.predict(X) mae = mean_absolute_error(y, y_pred) y_pred_class = (y_pred >= 0.5).astype(int) accuracy = accuracy_score(y, y_pred_class) st.subheader("1️⃣ Linear Regression Model Evaluation") st.markdown(f"- **Mean Absolute Error (MAE)**: `{mae:.4f}`") st.markdown(f"- **Classification Accuracy (threshold @ 0.5)**: `{accuracy * 100:.2f}%`") #Logistic Regression df = final_df.copy() df = df[df['gender'].isin([1, 2])] df['age'] = 2020 - df['birth year'] df = df[(df['age'] > 10) & (df['age'] < 90)] df = df.dropna(subset=['tripduration']) df['usertype_encoded'] = LabelEncoder().fit_transform(df['usertype']) # Subscriber=1, Customer=0 X = df[['age', 'gender', 'tripduration']] y = df['usertype_encoded'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) model = LogisticRegression() model.fit(X_train, y_train) st.markdown("### 2️⃣ Predicted Usertype Based on User Info") #User Interaction age_input = st.slider("Select rider's age:", min_value=15, max_value=90, value=30) gender_input = st.selectbox("Select rider's gender:", options=["Male", "Female"]) trip_duration_input = st.slider("Estimated trip duration (in seconds):", min_value=60, max_value=7200, value=900) gender_code = 1 if gender_input == "Male" else 2 input_df = pd.DataFrame([[age_input, gender_code, trip_duration_input]], columns=['age', 'gender', 'tripduration']) prediction = model.predict(input_df)[0] proba = model.predict_proba(input_df)[0] usertype_label = "Subscriber" if prediction == 1 else "Customer" st.markdown(f"### 🚲 Predicted Usertype: **{usertype_label}**") st.markdown(f"- Probability of being a Subscriber: **{proba[1]*100:.2f}%**") st.markdown(f"- Probability of being a Customer: **{proba[0]*100:.2f}%**")