Citi_Bike_Aanalysis / src /streamlit_app.py
Daniella
Update src/streamlit_app.py
9e01ca7 verified
#Import
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
from PIL import Image
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
## Setup: sidebar navigation and header image
# Pages offered in the sidebar (add "Automated Report 📑" if using ydata_profiling).
page_options = ["Introduction", "Visualization", "Model Prediction"]

st.sidebar.title("Citi Bike Dashboard 🚲")
page = st.sidebar.selectbox("Select Page", page_options)

# Logo displayed at the top of the main area on every page.
image_citibike = Image.open('src/citibike.png')
st.image(image_citibike, width=300)
## Load all trip data
root_folder = "src/CitiBike_Trip_Data.zip"


@st.cache_data(show_spinner="Loading Citi Bike trip data...")
def _load_trip_data(source):
    """Read every CSV found under *source* and combine them into one DataFrame.

    *source* may be either a directory (searched recursively) or a zip
    archive (all ``.csv`` members are read). Files that cannot be parsed are
    reported on stdout and skipped so one bad file does not abort the load.

    The result is cached by Streamlit so the CSVs are not re-read each time
    the script reruns.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when no CSV could
        be read.
    """
    import zipfile  # local import: only this loader needs it

    frames = []

    def _read(handle, label):
        # Best-effort read: log and skip files that fail to parse.
        try:
            frames.append(pd.read_csv(handle))
        except (OSError, ValueError) as e:
            print(f"Error reading {label}: {e}")

    if os.path.isdir(source):
        # Walk through all subdirectories in a deterministic order.
        for folder, subdirs, names in os.walk(source):
            subdirs.sort()
            for name in sorted(names):
                if name.endswith(".csv"):
                    path = os.path.join(folder, name)
                    _read(path, path)
    elif zipfile.is_zipfile(source):
        # os.walk() yields nothing for an archive file, so read members directly.
        with zipfile.ZipFile(source) as archive:
            for member in sorted(archive.namelist()):
                # Skip macOS resource-fork entries, which are not real CSV data.
                if member.startswith("__MACOSX/") or not member.endswith(".csv"):
                    continue
                with archive.open(member) as handle:
                    _read(handle, f"{source}:{member}")

    if not frames:
        # pd.concat([]) raises ValueError; let the caller decide what to show.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Combine all DataFrames
final_df = _load_trip_data(root_folder)
if final_df.empty:
    st.error(f"No trip data could be loaded from '{root_folder}'.")
    st.stop()
## Page 1
# Introduction page: dataset preview, column dictionary, summary statistics,
# and two demographic charts (gender/usertype pie, age/gender stacked bars).
if page == "Introduction":
    st.subheader("01 Introduction")
    st.markdown("""
Welcome to the Citi Bike Explorer Page 🚲\n
We will explore riding trends through thorough visual analysis and attempt to predict usertype based on age, gender, biking hours, and so much more. In other words, we will try to look for correlations between membership with other user demographics. \n
This dashboard uses data from 2020. While Citi Bike does present data as recent to 2025 May, they do not provide specific user demographics such as gender and age, which is why we used the dataset from 2020.\n
""")

    # Preview
    st.markdown("##### 📊 Dataset Preview")
    rows = st.slider("Select a number of rows to display", 5, 20, 5)
    st.dataframe(final_df.head(rows))

    # Dictionary
    st.markdown("##### 📖 Dictionary of Columns")
    feature_info = {
        "tripduration": "Duration of the trip in seconds.",
        "starttime": "Start time and date when the trip began.",
        "stoptime": "End time and date when the trip ended.",
        "start station id": "Unique ID of the station where the trip started.",
        "start station name": "Name of the station where the trip started.",
        "start station latitude": "Latitude coordinate of the start station.",
        "start station longitude": "Longitude coordinate of the start station.",
        "end station id": "Unique ID of the station where the trip ended.",
        "end station name": "Name of the station where the trip ended.",
        "end station latitude": "Latitude coordinate of the end station.",
        "end station longitude": "Longitude coordinate of the end station.",
        "bikeid": "ID of the bicycle used during the trip.",
        "usertype": "Type of user. ('Customer' refers to short-term pass users, while 'Subscriber' refers to annual members.)",
        "birth year": "Year of birth of the rider.",
        "gender": "Gender of the rider. (0 = unknown, 1 = male, 2 = female.)",
    }
    desc_df = pd.DataFrame(feature_info.items(), columns=["Feature", "Description"])
    # Display dictionary
    st.dataframe(desc_df, use_container_width=True)

    # Summary Statistics
    st.markdown("##### 📖 Summary Statistics")
    st.dataframe(final_df.describe())
    st.markdown("""
Before we go on to the Visualization page, we want to share the results of two brief graphs that allowed us to decide on the trajectory of our visualization.
""")

    # Riders with a known gender; shared by both charts below.
    current_year = 2020
    known_gender = final_df['gender'].isin([1, 2])

    st.markdown("##### 👫 Demographics by Age, Gender, and Usertype")

    # Pie chart: only the two columns it needs are copied.
    filtered_df = final_df.loc[known_gender, ['gender', 'usertype']].copy()
    gender_map = {1: "Male", 2: "Female"}
    filtered_df['gender_label'] = filtered_df['gender'].map(gender_map)
    filtered_df['group'] = filtered_df['gender_label'] + " " + filtered_df['usertype']
    group_counts = filtered_df['group'].value_counts()
    if not group_counts.empty:
        st.markdown("##### 1️⃣ Pie Chart on Gender and Usertype")
        fig, ax = plt.subplots()
        ax.pie(
            group_counts.values,
            labels=group_counts.index,
            autopct='%1.1f%%',
            startangle=140,
            colors=plt.cm.Pastel1.colors,
        )
        ax.axis('equal')
        st.pyplot(fig)
        plt.close(fig)  # release the figure; otherwise figures pile up across reruns
    else:
        st.warning("No data available to generate pie chart. Try adjusting filters.")

    # Bar chart
    st.markdown("##### 2️⃣ Bar Chart on Age, Gender, and Usertype")
    st.markdown("###### This bar chart visualizes the distribution of male and female Citi Bike riders across age groups, with an optional filter to view patterns by user type.")

    # Calculate age. The lower bound is 16 because the first bin below is
    # (15, 25], so an age of exactly 15 would not fall into any age group.
    ages = current_year - final_df['birth year']
    df = final_df.loc[known_gender & ages.between(16, 90), ['gender', 'usertype', 'birth year']].copy()
    df['age'] = current_year - df['birth year']
    df['age_group'] = pd.cut(
        df['age'],
        bins=[15, 25, 35, 45, 55, 65, 75, 90],
        labels=['16–25', '26–35', '36–45', '46–55', '56–65', '66–75', '76–90'],
    )

    # Filter (missing usertypes are excluded from the choices)
    usertype_options = sorted(df['usertype'].dropna().unique())
    usertype_filter = st.multiselect(
        "Select Usertype(s) to Include",
        options=usertype_options,
        default=usertype_options,
    )
    filtered_df = df[df['usertype'].isin(usertype_filter)]
    if filtered_df.empty:
        # Plotting an empty frame raises, e.g. when every usertype is deselected.
        st.warning("No riders match the selected usertype(s).")
    else:
        age_gender_counts = (
            filtered_df.groupby(['age_group', 'gender'], observed=False)
            .size()
            .unstack(fill_value=0)
            # Always keep both gender columns, in a fixed order, so the colours
            # and legend labels below line up even if one gender is absent.
            .reindex(columns=[1, 2], fill_value=0)
        )
        fig1, ax1 = plt.subplots(figsize=(10, 6))
        age_gender_counts.plot(kind='bar', stacked=True, ax=ax1, color=['blue', 'pink'])
        ax1.set_title("Age Group by Gender (Stacked)")
        ax1.set_xlabel("Age Group")
        ax1.set_ylabel("Number of Riders")
        ax1.legend(title='Gender', labels=['Male (1)', 'Female (2)'])
        st.pyplot(fig1)
        plt.close(fig1)

    # Analysis
    st.markdown("##### 📝 Analysis")
    st.markdown("""
1️⃣ **Subscribers outnumber short-term (guest) customers**, indicating stronger engagement from long-term users.\n
2️⃣ **Male riders are more active than female riders** across both user types.\n
3️⃣ **Gender distribution is more balanced among guest customers**, while subscribers are predominantly male.
""")
## Page 2
elif page == "Visualization":
st.subheader("02 Data Visualization")
filtered_df = final_df[final_df['gender'].isin([1, 2])].copy()
df = final_df.copy()
df = df[df['gender'].isin([1, 2])] # Filter out unknown gender
df['gender_label'] = df['gender'].map({1: 'Male', 2: 'Female'})
df = df[df['tripduration'] <= 3600] # Keep trips under 1 hour
# Plot
st.subheader("1️⃣ Trip Duration by Gender (Boxplot)")
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='gender_label', y='tripduration', ax=ax)
ax.set_title("Distribution of Trip Durations by Gender")
ax.set_ylabel("Trip Duration (seconds)")
ax.set_xlabel("Gender")
st.pyplot(fig)
st.subheader("2️⃣ Average Trip Duration by Gender and Usertype")
# Group by gender and usertype
avg_duration = filtered_df.groupby(['gender', 'usertype'])['tripduration'].mean().reset_index()
# Plot
fig, ax = plt.subplots()
sns.barplot(data=avg_duration, x='gender', y='tripduration', hue='usertype', ax=ax)
ax.set_xticklabels(['Male (1)', 'Female (2)'])
ax.set_ylabel("Average Trip Duration (seconds)")
ax.set_title("Trip Duration Trends by Gender and Usertype")
st.pyplot(fig)
st.subheader("3️⃣ Trip Frequency by Gender and Usertype")
trip_counts = filtered_df.groupby(['gender', 'usertype']).size().reset_index(name='trip_count')
# Plot
fig, ax = plt.subplots()
sns.barplot(data=trip_counts, x='gender', y='trip_count', hue='usertype', ax=ax)
ax.set_xticklabels(['Male (1)', 'Female (2)'])
ax.set_ylabel("Number of Trips")
ax.set_title("How Often Do Different Groups Ride?")
st.pyplot(fig)
st.subheader("4️⃣ Birth Year Density by Usertype (Violin Plot)")
# Remove outliers
violin_df = final_df[(final_df['birth year'] > 1920) & (final_df['birth year'] < 2010)]
# Remove unknowns
violin_df = violin_df[violin_df['usertype'].notna() & violin_df['birth year'].notna()]
fig, ax = plt.subplots(figsize=(8, 5))
sns.violinplot(data=violin_df, x='usertype', y='birth year', ax=ax)
ax.set_title("Density of Birth Years by Usertype")
st.pyplot(fig)
st.subheader("5️⃣ Time of Day Usage by Age Group")
time_df = final_df.copy()
time_df = time_df.dropna(subset=['birth year', 'starttime'])
time_df['age'] = 2020 - time_df['birth year']
time_df['age_group'] = pd.cut(time_df['age'], bins=[15, 25, 35, 50, 65, 100], labels=['16–25', '26–35', '36–50', '51–65', '66+'])
time_df['hour'] = pd.to_datetime(time_df['starttime']).dt.hour
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=time_df, x='hour', hue='age_group', multiple='stack', bins=24)
ax.set_title("Ride Start Time by Age Group")
ax.set_xlabel("Hour of Day")
st.pyplot(fig)
## Page 3
elif page == "Model Prediction":
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
st.subheader("03 Model Prediction")
#Linear Regression Model
df = final_df.copy()
df = df[df['gender'].isin([1, 2])]
df = df.dropna(subset=['birth year', 'tripduration', 'usertype'])
df['age'] = 2020 - df['birth year']
df = df[(df['age'] >= 15) & (df['age'] <= 90)]
df['usertype_encoded'] = df['usertype'].map({'Customer': 0, 'Subscriber': 1})
X = df[['age', 'gender', 'tripduration']]
y = df['usertype_encoded']
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)
mae = mean_absolute_error(y, y_pred)
y_pred_class = (y_pred >= 0.5).astype(int)
accuracy = accuracy_score(y, y_pred_class)
st.subheader("1️⃣ Linear Regression Model Evaluation")
st.markdown(f"- **Mean Absolute Error (MAE)**: `{mae:.4f}`")
st.markdown(f"- **Classification Accuracy (threshold @ 0.5)**: `{accuracy * 100:.2f}%`")
#Logistic Regression
df = final_df.copy()
df = df[df['gender'].isin([1, 2])]
df['age'] = 2020 - df['birth year']
df = df[(df['age'] > 10) & (df['age'] < 90)]
df = df.dropna(subset=['tripduration'])
df['usertype_encoded'] = LabelEncoder().fit_transform(df['usertype']) # Subscriber=1, Customer=0
X = df[['age', 'gender', 'tripduration']]
y = df['usertype_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)
st.markdown("### 2️⃣ Predicted Usertype Based on User Info")
#User Interaction
age_input = st.slider("Select rider's age:", min_value=15, max_value=90, value=30)
gender_input = st.selectbox("Select rider's gender:", options=["Male", "Female"])
trip_duration_input = st.slider("Estimated trip duration (in seconds):", min_value=60, max_value=7200, value=900)
gender_code = 1 if gender_input == "Male" else 2
input_df = pd.DataFrame([[age_input, gender_code, trip_duration_input]], columns=['age', 'gender', 'tripduration'])
prediction = model.predict(input_df)[0]
proba = model.predict_proba(input_df)[0]
usertype_label = "Subscriber" if prediction == 1 else "Customer"
st.markdown(f"### 🚲 Predicted Usertype: **{usertype_label}**")
st.markdown(f"- Probability of being a Subscriber: **{proba[1]*100:.2f}%**")
st.markdown(f"- Probability of being a Customer: **{proba[0]*100:.2f}%**")