Spaces:
Sleeping
Sleeping
| #Import | |
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import numpy as np | |
| import random | |
| from PIL import Image | |
| import os | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score | |
## setup
# Sidebar: dashboard title and the page selector whose value drives the
# if/elif page routing further down the script.
_page_options = ["Introduction", "Visualization", "Model Prediction"]  # Add "Automated Report π" if using ydata_profiling
st.sidebar.title("Citi Bike Dashboard π²")
page = st.sidebar.selectbox(label="Select Page", options=_page_options)

# Banner image; rendered before any page-specific content.
image_citibike = Image.open('src/citibike.png')
st.image(image_citibike, width=300)
## Upload all data
root_folder = "src/CitiBike_Trip_Data.zip"


@st.cache_data
def _load_trip_data(source):
    """Read every ``.csv`` under *source* and combine them into one DataFrame.

    *source* may be a directory tree (walked recursively) or a zip archive
    (CSV members are read straight from the archive).  Calling ``os.walk`` on
    an archive file yields nothing, which previously left the list of frames
    empty and made ``pd.concat`` raise ``ValueError``.

    The function is cached because Streamlit re-executes the whole script on
    every widget interaction; without caching all CSVs are re-parsed each time.

    Args:
        source: Path to a directory or a zip archive containing CSV files.

    Returns:
        A ``(frame, problems)`` tuple.  ``frame`` is the concatenation of all
        readable CSVs (an empty DataFrame when none could be read) and
        ``problems`` is a list of human-readable error strings, one per file
        that failed to load.
    """
    import zipfile  # local import: only this loader needs it

    frames = []
    problems = []

    def _read(handle, label):
        # Best-effort on purpose: one unreadable file must not abort the load.
        try:
            frames.append(pd.read_csv(handle))
        except Exception as e:
            problems.append(f"Error reading {label}: {e}")

    if os.path.isdir(source):
        # Walk through all subdirectories
        for root, _dirs, files in os.walk(source):
            for file in files:
                if file.endswith(".csv"):
                    file_path = os.path.join(root, file)
                    _read(file_path, file_path)
    elif zipfile.is_zipfile(source):
        with zipfile.ZipFile(source) as archive:
            for member in archive.namelist():
                if member.endswith(".csv"):
                    with archive.open(member) as handle:
                        _read(handle, f"{source}:{member}")
    else:
        problems.append(
            f"Error reading {source}: not a directory or a zip archive"
        )

    # Combine all DataFrames (pd.concat raises on an empty list, so guard it).
    combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    return combined, problems


final_df, _load_problems = _load_trip_data(root_folder)
for _problem in _load_problems:
    print(_problem)

if final_df.empty:
    # Every page below needs data; fail with a readable message instead of a
    # traceback and halt this script run.
    st.error(f"No trip data could be loaded from '{root_folder}'.")
    st.stop()
## Page 1
# Introduction page: dataset preview, column dictionary, summary statistics,
# and two demographic charts (pie by gender/usertype, stacked bar by age group).
if page == "Introduction":
    st.subheader("01 Introduction")
    st.markdown("""
Welcome to the Citi Bike Explorer Page π²\n
We will explore riding trends through thorough visual analysis and attempt to predict usertype based on age, gender, biking hours, and so much more. In other words, we will try to look for correlations between membership with other user demographics. \n
This dashboard uses data from 2020. While Citi Bike does present data as recent to 2025 May, they do not provide specific user demographics such as gender and age, which is why we used the dataset from 2020.\n
""")

    # Preview
    st.markdown("##### π Dataset Preview")
    rows = st.slider("Select a number of rows to display", 5, 20, 5)
    st.dataframe(final_df.head(rows))

    # Dictionary
    st.markdown("##### π Dictionary of Columns")
    feature_info = {
        "tripduration": "Duration of the trip in seconds.",
        "starttime": "Start time and date when the trip began.",
        "stoptime": "End time and date when the trip ended.",
        "start station id": "Unique ID of the station where the trip started.",
        "start station name": "Name of the station where the trip started.",
        "start station latitude": "Latitude coordinate of the start station.",
        "start station longitude": "Longitude coordinate of the start station.",
        "end station id": "Unique ID of the station where the trip ended.",
        "end station name": "Name of the station where the trip ended.",
        "end station latitude": "Latitude coordinate of the end station.",
        "end station longitude": "Longitude coordinate of the end station.",
        "bikeid": "ID of the bicycle used during the trip.",
        "usertype": "Type of user. ('Customer' refers to short-term pass users, while 'Subscriber' refers to annual members.)",
        "birth year": "Year of birth of the rider.",
        "gender": "Gender of the rider. (0 = unknown, 1 = male, 2 = female.)"
    }
    desc_df = pd.DataFrame(feature_info.items(), columns=["Feature", "Description"])
    # Display dictionary
    st.dataframe(desc_df, use_container_width=True)

    # Summary Statistics
    st.markdown("##### π Summary Statistics")
    st.dataframe(final_df.describe())
    st.markdown("""
Before we go on to the Visualization page, we want to share the results of two brief graphs that allowed us to decide on the trajectory of our visualization.
""")

    # Display age, gender
    current_year = 2020
    df = final_df.copy()
    # Calculate age
    df['age'] = current_year - df['birth year']
    # .copy() so the column assignment below writes to an independent frame
    # rather than a slice of the unfiltered one.
    df = df[(df['gender'].isin([1, 2])) & (df['age'].between(15, 90))].copy()
    df['age_group'] = pd.cut(
        df['age'],
        bins=[15, 25, 35, 45, 55, 65, 75, 90],
        labels=['16β25', '26β35', '36β45', '46β55', '56β65', '66β75', '76β90']
    )

    st.markdown("##### π« Demographics by Age, Gender, and Usertype")

    # Pie chart
    filtered_df = final_df[final_df['gender'].isin([1, 2])].copy()
    gender_map = {1: "Male", 2: "Female"}
    filtered_df['gender_label'] = filtered_df['gender'].map(gender_map)
    filtered_df['group'] = filtered_df['gender_label'] + " " + filtered_df['usertype']
    group_counts = filtered_df['group'].value_counts()
    if not group_counts.empty:
        st.markdown("##### 1οΈβ£ Pie Chart on Gender and Usertype")
        fig, ax = plt.subplots()
        ax.pie(
            group_counts.values,
            labels=group_counts.index,
            autopct='%1.1f%%',
            startangle=140,
            colors=plt.cm.Pastel1.colors
        )
        ax.axis('equal')
        st.pyplot(fig)
        # Streamlit reruns the script on every interaction; close the figure
        # so pyplot does not keep accumulating open figures.
        plt.close(fig)
    else:
        st.warning("No data available to generate pie chart. Try adjusting filters.")

    # bar chart
    st.markdown("##### 2οΈβ£ Bar Chart on Age, Gender, and Usertype")
    st.markdown("###### This bar chart visualizes the distribution of male and female Citi Bike riders across age groups, with an optional filter to view patterns by user type.")
    # filter
    usertype_filter = st.multiselect(
        "Select Usertype(s) to Include",
        options=df['usertype'].unique(),
        default=df['usertype'].unique()
    )
    filtered_df = df[df['usertype'].isin(usertype_filter)]
    if filtered_df.empty:
        # An empty selection would otherwise reach DataFrame.plot with no
        # numeric data and raise.
        st.warning("No data available to generate bar chart. Try adjusting filters.")
    else:
        age_gender_counts = (
            filtered_df
            .groupby(['age_group', 'gender'], observed=False)  # keep empty age groups on the axis
            .size()
            .unstack(fill_value=0)
            # Force both gender columns in a fixed order so the colours and
            # legend labels below always line up, even if one gender is absent.
            .reindex(columns=[1, 2], fill_value=0)
        )
        fig1, ax1 = plt.subplots(figsize=(10, 6))
        age_gender_counts.plot(kind='bar', stacked=True, ax=ax1, color=['blue', 'pink'])
        ax1.set_title("Age Group by Gender (Stacked)")
        ax1.set_xlabel("Age Group")
        ax1.set_ylabel("Number of Riders")
        ax1.legend(title='Gender', labels=['Male (1)', 'Female (2)'])
        st.pyplot(fig1)
        plt.close(fig1)

    # analysis
    st.markdown("##### π Analysis")
    st.markdown("""
1οΈβ£ **Subscribers outnumber short-term (guest) customers**, indicating stronger engagement from long-term users.\n
2οΈβ£ **Male riders are more active than female riders** across both user types.\n
3οΈβ£ **Gender distribution is more balanced among guest customers**, while subscribers are predominantly male.
""")
| ## Page 2 | |
| elif page == "Visualization": | |
| st.subheader("02 Data Visualization") | |
| filtered_df = final_df[final_df['gender'].isin([1, 2])].copy() | |
| df = final_df.copy() | |
| df = df[df['gender'].isin([1, 2])] # Filter out unknown gender | |
| df['gender_label'] = df['gender'].map({1: 'Male', 2: 'Female'}) | |
| df = df[df['tripduration'] <= 3600] # Keep trips under 1 hour | |
| # Plot | |
| st.subheader("1οΈβ£ Trip Duration by Gender (Boxplot)") | |
| fig, ax = plt.subplots(figsize=(8, 5)) | |
| sns.boxplot(data=df, x='gender_label', y='tripduration', ax=ax) | |
| ax.set_title("Distribution of Trip Durations by Gender") | |
| ax.set_ylabel("Trip Duration (seconds)") | |
| ax.set_xlabel("Gender") | |
| st.pyplot(fig) | |
| st.subheader("2οΈβ£ Average Trip Duration by Gender and Usertype") | |
| # Group by gender and usertype | |
| avg_duration = filtered_df.groupby(['gender', 'usertype'])['tripduration'].mean().reset_index() | |
| # Plot | |
| fig, ax = plt.subplots() | |
| sns.barplot(data=avg_duration, x='gender', y='tripduration', hue='usertype', ax=ax) | |
| ax.set_xticklabels(['Male (1)', 'Female (2)']) | |
| ax.set_ylabel("Average Trip Duration (seconds)") | |
| ax.set_title("Trip Duration Trends by Gender and Usertype") | |
| st.pyplot(fig) | |
| st.subheader("3οΈβ£ Trip Frequency by Gender and Usertype") | |
| trip_counts = filtered_df.groupby(['gender', 'usertype']).size().reset_index(name='trip_count') | |
| # Plot | |
| fig, ax = plt.subplots() | |
| sns.barplot(data=trip_counts, x='gender', y='trip_count', hue='usertype', ax=ax) | |
| ax.set_xticklabels(['Male (1)', 'Female (2)']) | |
| ax.set_ylabel("Number of Trips") | |
| ax.set_title("How Often Do Different Groups Ride?") | |
| st.pyplot(fig) | |
| st.subheader("4οΈβ£ Birth Year Density by Usertype (Violin Plot)") | |
| # Remove outliers | |
| violin_df = final_df[(final_df['birth year'] > 1920) & (final_df['birth year'] < 2010)] | |
| # Remove unknowns | |
| violin_df = violin_df[violin_df['usertype'].notna() & violin_df['birth year'].notna()] | |
| fig, ax = plt.subplots(figsize=(8, 5)) | |
| sns.violinplot(data=violin_df, x='usertype', y='birth year', ax=ax) | |
| ax.set_title("Density of Birth Years by Usertype") | |
| st.pyplot(fig) | |
| st.subheader("5οΈβ£ Time of Day Usage by Age Group") | |
| time_df = final_df.copy() | |
| time_df = time_df.dropna(subset=['birth year', 'starttime']) | |
| time_df['age'] = 2020 - time_df['birth year'] | |
| time_df['age_group'] = pd.cut(time_df['age'], bins=[15, 25, 35, 50, 65, 100], labels=['16β25', '26β35', '36β50', '51β65', '66+']) | |
| time_df['hour'] = pd.to_datetime(time_df['starttime']).dt.hour | |
| fig, ax = plt.subplots(figsize=(10, 5)) | |
| sns.histplot(data=time_df, x='hour', hue='age_group', multiple='stack', bins=24) | |
| ax.set_title("Ride Start Time by Age Group") | |
| ax.set_xlabel("Hour of Day") | |
| st.pyplot(fig) | |
| ## Page 3 | |
| elif page == "Model Prediction": | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import LabelEncoder | |
| st.subheader("03 Model Prediction") | |
| #Linear Regression Model | |
| df = final_df.copy() | |
| df = df[df['gender'].isin([1, 2])] | |
| df = df.dropna(subset=['birth year', 'tripduration', 'usertype']) | |
| df['age'] = 2020 - df['birth year'] | |
| df = df[(df['age'] >= 15) & (df['age'] <= 90)] | |
| df['usertype_encoded'] = df['usertype'].map({'Customer': 0, 'Subscriber': 1}) | |
| X = df[['age', 'gender', 'tripduration']] | |
| y = df['usertype_encoded'] | |
| model = LinearRegression() | |
| model.fit(X, y) | |
| y_pred = model.predict(X) | |
| mae = mean_absolute_error(y, y_pred) | |
| y_pred_class = (y_pred >= 0.5).astype(int) | |
| accuracy = accuracy_score(y, y_pred_class) | |
| st.subheader("1οΈβ£ Linear Regression Model Evaluation") | |
| st.markdown(f"- **Mean Absolute Error (MAE)**: `{mae:.4f}`") | |
| st.markdown(f"- **Classification Accuracy (threshold @ 0.5)**: `{accuracy * 100:.2f}%`") | |
| #Logistic Regression | |
| df = final_df.copy() | |
| df = df[df['gender'].isin([1, 2])] | |
| df['age'] = 2020 - df['birth year'] | |
| df = df[(df['age'] > 10) & (df['age'] < 90)] | |
| df = df.dropna(subset=['tripduration']) | |
| df['usertype_encoded'] = LabelEncoder().fit_transform(df['usertype']) # Subscriber=1, Customer=0 | |
| X = df[['age', 'gender', 'tripduration']] | |
| y = df['usertype_encoded'] | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) | |
| model = LogisticRegression() | |
| model.fit(X_train, y_train) | |
| st.markdown("### 2οΈβ£ Predicted Usertype Based on User Info") | |
| #User Interaction | |
| age_input = st.slider("Select rider's age:", min_value=15, max_value=90, value=30) | |
| gender_input = st.selectbox("Select rider's gender:", options=["Male", "Female"]) | |
| trip_duration_input = st.slider("Estimated trip duration (in seconds):", min_value=60, max_value=7200, value=900) | |
| gender_code = 1 if gender_input == "Male" else 2 | |
| input_df = pd.DataFrame([[age_input, gender_code, trip_duration_input]], columns=['age', 'gender', 'tripduration']) | |
| prediction = model.predict(input_df)[0] | |
| proba = model.predict_proba(input_df)[0] | |
| usertype_label = "Subscriber" if prediction == 1 else "Customer" | |
| st.markdown(f"### π² Predicted Usertype: **{usertype_label}**") | |
| st.markdown(f"- Probability of being a Subscriber: **{proba[1]*100:.2f}%**") | |
| st.markdown(f"- Probability of being a Customer: **{proba[0]*100:.2f}%**") | |