Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
| from sklearn.cluster import KMeans | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.metrics import silhouette_score | |
| from statsmodels.datasets import get_rdataset | |
| from scipy.cluster.hierarchy import dendrogram, linkage, fcluster | |
# Global look-and-feel for matplotlib / seaborn figures: reset matplotlib to
# its stock style, then let seaborn apply the white grid and "husl" palette.
plt.style.use("default")
sns.set_theme(palette="husl", style="whitegrid")
def load_arrests_data():
    """Fetch the ``USArrests`` R dataset and return its ``data`` attribute.

    The dataset is retrieved through ``statsmodels.datasets.get_rdataset``;
    whatever object that call exposes as ``.data`` is returned unchanged.
    """
    return get_rdataset('USArrests').data
def create_categorical_plot(df, column, target='Survived'):
    """Return a dark-themed bar chart of the mean of ``target`` per ``column`` value.

    Args:
        df: Data frame containing both ``column`` and ``target``.
        column: Name of the column to group by (x axis).
        target: Name of the column that is averaged within each group
            (y axis and bar colour). Defaults to ``'Survived'``.

    Returns:
        The Plotly Express bar figure.
    """
    # One row per group, holding the group's mean of the target column.
    group_means = df.groupby(column)[target].mean().reset_index()
    axis_labels = {target: 'Survival Rate', column: column}

    chart = px.bar(
        group_means,
        x=column,
        y=target,
        title=f'Survival Rate by {column}',
        labels=axis_labels,
        color=target,
        color_continuous_scale='RdBu',
    )

    background = 'rgb(30, 30, 30)'
    chart.update_layout(
        plot_bgcolor=background,
        paper_bgcolor=background,
        font={'color': 'white'},
    )
    return chart
def create_numeric_plot(df, column, target='Survived'):
    """Return a dark-themed box plot of ``column`` split by ``target``.

    Args:
        df: Data frame containing both ``column`` and ``target``.
        column: Name of the column plotted on the y axis.
        target: Name of the column used for the x axis and box colour.
            Defaults to ``'Survived'``.

    Returns:
        The Plotly Express box figure.
    """
    axis_labels = {target: 'Survived', column: column}
    palette = px.colors.qualitative.Set1

    chart = px.box(
        df,
        x=target,
        y=column,
        title=f'{column} Distribution by Survival',
        labels=axis_labels,
        color=target,
        color_discrete_sequence=palette,
    )

    background = 'rgb(30, 30, 30)'
    chart.update_layout(
        plot_bgcolor=background,
        paper_bgcolor=background,
        font={'color': 'white'},
    )
    return chart
# Dark layout shared by the hand-built Plotly figures on this page.
_DARK_LAYOUT = dict(
    plot_bgcolor='rgb(30, 30, 30)',
    paper_bgcolor='rgb(30, 30, 30)',
    font=dict(color='white'),
)


def _align_cluster_labels(reference_labels, other_labels):
    """Relabel ``other_labels`` so its cluster ids best match ``reference_labels``.

    Cluster ids are arbitrary per algorithm, so "cluster 0" of one method is
    not comparable with "cluster 0" of another. This builds the overlap
    (contingency) table between the two labelings and picks the one-to-one
    id mapping that maximises the number of points on which they agree.

    Args:
        reference_labels: Array-like of cluster ids that define the target ids.
        other_labels: Array-like of cluster ids (same length) to relabel.

    Returns:
        A NumPy array with ``other_labels`` translated into the id space of
        ``reference_labels``. Ids of ``other_labels`` left without a partner
        receive fresh ids that do not collide with any existing id.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list (scipy is already a dependency of this module).
    from scipy.optimize import linear_sum_assignment

    reference = np.asarray(reference_labels)
    other = np.asarray(other_labels)
    if reference.size == 0 or other.size == 0:
        return other.copy()

    ref_ids = np.unique(reference)
    other_ids = np.unique(other)
    overlap = np.array([
        [np.sum((other == other_id) & (reference == ref_id)) for ref_id in ref_ids]
        for other_id in other_ids
    ])
    rows, cols = linear_sum_assignment(overlap, maximize=True)
    mapping = {other_ids[row]: ref_ids[col] for row, col in zip(rows, cols)}

    # The table may be rectangular; give any unmatched id a brand-new id.
    next_free = int(max(ref_ids.max(), other_ids.max())) + 1
    for other_id in other_ids:
        if other_id not in mapping:
            mapping[other_id] = next_free
            next_free += 1

    return np.array([mapping[label] for label in other])


def _dendrogram_line_coords(dendro):
    """Flatten scipy dendrogram links into one x/y pair of lists for a line trace.

    ``dendro['icoord']`` / ``dendro['dcoord']`` hold one 4-point polyline per
    link. A ``None`` is appended after each link so a single Plotly trace
    draws them as separate segments instead of one connected path.
    """
    xs, ys = [], []
    for link_x, link_y in zip(dendro['icoord'], dendro['dcoord']):
        xs.extend(list(link_x) + [None])
        ys.extend(list(link_y) + [None])
    return xs, ys


def _render_intro():
    """Render the data-loading code sample, learning objectives and lab overview."""
    # Code Example: Loading and Basic Data Exploration
    with st.expander("Code Example: Loading and Exploring Data"):
        st.code("""
        # Load the data
        from statsmodels.datasets import get_rdataset
        USArrests = get_rdataset('USArrests').data
        # Basic data exploration
        print("Dataset shape:", USArrests.shape)
        print("\\nVariables:", USArrests.columns.tolist())
        print("\\nFirst 5 states:")
        print(USArrests.head())
        # Basic statistics
        print("\\nData Summary:")
        print(USArrests.describe())
        """, language="python")

    # Introduction Section with Learning Objectives
    st.header("Learning Objectives")
    st.markdown("""
    In this week, you'll master:
    1. **Unsupervised Learning**: Discover hidden patterns in crime data without predefined categories
    2. **K-Means Clustering**: Learn to divide states into distinct safety profiles
    3. **Hierarchical Clustering**: Create a "family tree" of state crime patterns
    4. **Data Preprocessing**: Understand why scaling is crucial for fair comparisons
    """)

    # Interactive Overview
    st.header("Lab Overview")
    st.write("""
    Welcome to your hands-on clustering lab! You'll be working as a policy analyst for the Department of Justice,
    analyzing crime patterns across US states. Your mission: discover hidden safety profiles that could inform
    federal resource allocation and crime prevention strategies.
    """)


def _render_data_exploration():
    """Render Exercise 1 (dataset overview and correlations) and return the data."""
    st.header("Exercise 1: Data Detective Work")
    st.write("Let's start by understanding our dataset - the US Arrests data.")
    df = load_arrests_data()

    # Code Example: Data Visualization
    with st.expander("Code Example: Creating Visualizations"):
        st.code("""
        # Create correlation heatmap
        import plotly.express as px
        fig = px.imshow(df.corr(),
                        labels=dict(color="Correlation"),
                        color_continuous_scale="RdBu")
        fig.show()
        # Create box plots
        fig = px.box(df, title="Data Distribution")
        fig.show()
        """, language="python")

    # Interactive Data Exploration
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Dataset Overview")
        st.write(f"Number of states: {len(df)}")
        st.write(f"Number of variables: {len(df.columns)}")
        st.write("\nVariables:", df.columns.tolist())
        # Interactive data summary
        st.subheader("Data Summary")
        st.dataframe(df.describe())
    with col2:
        st.subheader("First 5 States")
        st.dataframe(df.head())
        # Interactive correlation heatmap
        st.subheader("Correlation Heatmap")
        fig = px.imshow(df.corr(),
                        labels=dict(color="Correlation"),
                        color_continuous_scale="RdBu")
        st.plotly_chart(fig)
    return df


def _render_scaling(df):
    """Render Exercise 2 and return ``df`` standardised with ``StandardScaler``."""
    st.header("Exercise 2: The Scaling Challenge")

    # Code Example: Data Scaling
    with st.expander("Code Example: Scaling Data"):
        st.code("""
        # Import StandardScaler
        from sklearn.preprocessing import StandardScaler
        # Create and fit the scaler
        scaler = StandardScaler()
        df_scaled = scaler.fit_transform(df)
        # Convert back to DataFrame
        df_scaled = pd.DataFrame(df_scaled,
                                 columns=df.columns,
                                 index=df.index)
        # Compare original vs scaled data
        print("Original data ranges:")
        print(df.describe())
        print("\\nScaled data ranges:")
        print(df_scaled.describe())
        """, language="python")

    # Explanation of scaling
    st.markdown("""
    ### Why Do We Need Scaling?
    In our crime data, we have variables measured in very different scales:
    - Murder rates: typically 0-20 per 100,000
    - Assault rates: typically 50-350 per 100,000
    - Urban population: 0-100 percentage
    - Rape rates: typically 0-50 per 100,000

    Without scaling, variables with larger numbers (like Assault) would dominate our analysis,
    making smaller-scale variables (like Murder) less influential. This would be like comparing
    dollars to cents - the cents would seem insignificant even if they were important!
    """)

    # Show original data ranges
    st.subheader("Original Data Ranges")
    col1, col2 = st.columns(2)
    with col1:
        # Create a bar chart of variances (computed once, used for height and colour)
        variances = df.var()
        fig_var = px.bar(
            x=df.columns,
            y=variances,
            title="Variance of Each Variable (Before Scaling)",
            labels={'x': 'Crime Variables', 'y': 'Variance'},
            color=variances,
            color_continuous_scale='Viridis'
        )
        st.plotly_chart(fig_var)
        st.write("""
        Notice how Assault has a much larger variance (6,945) compared to Murder (19).
        This means Assault would dominate our clustering if we didn't scale the data!
        """)
    with col2:
        # Create box plots of original data
        fig_box = px.box(df, title="Original Data Distribution")
        fig_box.update_layout(
            xaxis_title="Crime Variables",
            yaxis_title="Rate per 100,000"
        )
        st.plotly_chart(fig_box)

    # Explain standardization
    st.markdown("""
    ### What is Standardization?
    Standardization (also called Z-score normalization) transforms our data so that:
    1. Each variable has a mean of 0
    2. Each variable has a standard deviation of 1

    The formula is: z = (x - μ) / σ
    - x is the original value
    - μ is the mean of the variable
    - σ is the standard deviation of the variable
    """)

    # Scale the data, keeping the state index and column names
    scaler = StandardScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df), columns=df.columns, index=df.index
    )

    # Show scaled data
    st.subheader("After Scaling")
    fig_scaled = px.box(df_scaled, title="Scaled Data Distribution")
    fig_scaled.update_layout(
        xaxis_title="Crime Variables",
        yaxis_title="Standardized Values"
    )
    st.plotly_chart(fig_scaled)
    st.write("""
    After scaling, all variables are on the same scale:
    - Mean = 0
    - Standard Deviation = 1
    - Values typically range from -3 to +3
    """)

    # Show before/after comparison for a few states
    st.write("### Before vs After Scaling (Sample States)")
    comparison_df = pd.DataFrame({
        'State': df.index[:5],
        'Original Murder': df['Murder'][:5],
        'Scaled Murder': df_scaled['Murder'][:5],
        'Original Assault': df['Assault'][:5],
        'Scaled Assault': df_scaled['Assault'][:5]
    })
    st.dataframe(comparison_df)
    st.write("""
    Notice how the relative differences between states are preserved,
    but now all variables contribute equally to our analysis!
    """)

    # Why scaling matters for clustering
    st.markdown("""
    ### Why Scaling Matters for Clustering
    In clustering, we measure distances between data points. Without scaling:
    - States might be grouped together just because they have similar assault rates
    - Important differences in murder rates might be ignored

    With scaling:
    - All variables contribute equally to the distance calculations
    - We can find true patterns in the data, not just patterns in the largest numbers
    """)
    return df_scaled


def _render_elbow(df_scaled):
    """Render Exercise 3: the elbow plot and WCSS table for K = 1..10."""
    st.header("Exercise 3: Finding the Right Number of Groups")

    # Code Example: Elbow Method
    with st.expander("Code Example: Finding Optimal K"):
        st.code("""
        # Calculate inertias for different K values
        inertias = []
        K_values = range(1, 11)
        for k in K_values:
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
            kmeans.fit(df_scaled)
            inertias.append(kmeans.inertia_)
        # Create elbow plot
        import plotly.graph_objects as go
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=list(K_values),
            y=inertias,
            mode='lines+markers',
            name='Inertia'
        ))
        fig.update_layout(
            title='Finding the Optimal Number of Clusters',
            xaxis_title='Number of Clusters (K)',
            yaxis_title='Within-Cluster Sum of Squares'
        )
        fig.show()
        """, language="python")

    st.markdown("""
    ### The Elbow Method Explained
    The elbow method helps us find the optimal number of clusters (K) by looking at how the "within-cluster sum of squares"
    (WCSS) changes as we increase the number of clusters. Think of it like this:
    - **What is WCSS?** It's a measure of how spread out the points are within each cluster
    - **Lower WCSS** means points are closer to their cluster center (better clustering)
    - **Higher WCSS** means points are more spread out from their cluster center

    As we increase K:
    1. WCSS always decreases (more clusters = tighter groups)
    2. The rate of decrease slows down
    3. We look for the "elbow" - where adding more clusters doesn't help much anymore
    """)

    # Calculate inertias for different K values
    k_values = list(range(1, 11))
    inertias = []
    for n_clusters in k_values:
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=20)
        model.fit(df_scaled)
        inertias.append(model.inertia_)

    # Create interactive elbow plot
    fig_elbow = go.Figure()
    fig_elbow.add_trace(go.Scatter(
        x=k_values,
        y=inertias,
        mode='lines+markers',
        name='Inertia'
    ))
    fig_elbow.update_layout(
        title='Finding the Optimal Number of State Crime Profiles',
        xaxis_title='Number of Clusters (K)',
        yaxis_title='Within-Cluster Sum of Squares',
        **_DARK_LAYOUT
    )
    st.plotly_chart(fig_elbow)

    # Interpretation guide
    st.markdown("""
    ### How to Interpret the Elbow Plot
    Look at the plot above and ask yourself:
    1. **Where is the "elbow"?**
       - The point where the line starts to level off
       - Adding more clusters doesn't give much improvement
       - In our case, it's around K=4
    2. **What do the numbers mean?**
       - K=1: All states in one group (not useful)
       - K=2: Basic high/low crime split
       - K=3: More nuanced grouping
       - K=4: Our "elbow" - good balance of detail and simplicity
       - K>4: Diminishing returns - more complexity without much benefit
    3. **Why not just use more clusters?**
       - More clusters = more complex to interpret
       - Small clusters might not be meaningful
       - Goal is to find the simplest model that captures the main patterns
    """)

    # Show the actual values; the first K has no predecessor, hence the 0
    st.write("### WCSS Values for Each K")
    improvements = [0] + [
        previous - current for previous, current in zip(inertias, inertias[1:])
    ]
    wcss_df = pd.DataFrame({
        'Number of Clusters (K)': k_values,
        'Within-Cluster Sum of Squares': inertias,
        'Improvement from Previous K': improvements
    })
    st.dataframe(wcss_df)

    st.markdown("""
    ### Making the Decision
    Based on our elbow plot and the numbers above:
    1. The biggest improvements happen from K=1 to K=4
    2. After K=4, the improvements get much smaller
    3. K=4 gives us a good balance of:
       - Capturing meaningful patterns
       - Keeping the model simple enough to interpret
       - Having enough states in each cluster to be meaningful

    This is why we'll use K=4 for our clustering analysis!
    """)


def _render_kmeans(df, df_scaled):
    """Render Exercise 4 and return ``(k, cluster_labels, df_clustered)``.

    ``k`` is the slider value, ``cluster_labels`` the K-means assignment on
    ``df_scaled`` and ``df_clustered`` a copy of ``df`` with a ``Cluster`` column.
    """
    st.header("Exercise 4: K-Means State Profiling")

    # Code Example: K-Means Clustering
    with st.expander("Code Example: K-Means Implementation"):
        st.code("""
        # Perform K-means clustering
        from sklearn.cluster import KMeans
        # Create and fit the model
        kmeans = KMeans(
            n_clusters=4, # Number of clusters
            random_state=42, # For reproducibility
            n_init=20 # Number of times to run with different centroids
        )
        cluster_labels = kmeans.fit_predict(df_scaled)
        # Add cluster labels to original data
        df_clustered = df.copy()
        df_clustered['Cluster'] = cluster_labels
        # Visualize the clusters
        import plotly.express as px
        fig = px.scatter(df_clustered,
                         x='Murder',
                         y='Assault',
                         color='Cluster',
                         hover_data=['UrbanPop', 'Rape'],
                         title='State Crime Profiles')
        fig.show()
        # Show cluster centers
        centers_df = pd.DataFrame(
            kmeans.cluster_centers_,
            columns=df.columns
        )
        print("Cluster Centers:")
        print(centers_df)
        """, language="python")

    st.markdown("""
    ### What is K-Means Clustering?
    K-means is an unsupervised learning algorithm that groups similar data points together. Think of it like organizing
    students into study groups based on their interests:
    1. **Initialization**:
       - We randomly place K "centers" (centroids) in our data space
       - Each center represents the "average" of its cluster
       - In our case, each center represents a typical crime profile
    2. **Assignment**:
       - Each state is assigned to its nearest center
       - "Nearest" is measured by Euclidean distance
       - States with similar crime patterns end up in the same cluster
    3. **Update**:
       - Centers move to the average position of their assigned states
       - This process repeats until centers stop moving
       - The algorithm converges when states are optimally grouped
    """)

    # Visualize the process
    st.subheader("K-Means in Action")
    st.write("""
    Let's see how K-means works with our state crime data. We'll use K=4 clusters to find distinct crime profiles.
    """)

    # Let user choose number of clusters
    k = st.slider("Choose number of clusters (K)", 2, 6, 4)

    # Perform K-means clustering
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    cluster_labels = kmeans.fit_predict(df_scaled)

    # Add cluster labels to original (unscaled) data so plots use real units
    df_clustered = df.copy()
    df_clustered['Cluster'] = cluster_labels

    # Create interactive scatter plot
    fig = px.scatter(df_clustered,
                     x='Murder',
                     y='Assault',
                     color='Cluster',
                     hover_data=['UrbanPop', 'Rape'],
                     title='State Crime Profiles')
    st.plotly_chart(fig)

    # Explain hyperparameters
    st.markdown("""
    ### K-Means Hyperparameters Explained
    1. **n_clusters (K)**
       - The number of groups we want to create
       - We chose K=4 based on the elbow method
       - Each cluster represents a distinct crime profile
    2. **random_state**
       - Controls the random initialization of centroids
       - Setting it to 42 ensures reproducible results
       - Different values might give slightly different clusters
    3. **n_init**
       - Number of times to run the algorithm with different initial centroids
       - We use 20 to find the best possible clustering
       - Higher values give more reliable results but take longer
    4. **max_iter**
       - Maximum number of iterations for each run
       - Default is 300, which is usually enough
       - Algorithm stops earlier if it converges
    5. **algorithm**
       - 'auto': Automatically chooses the best algorithm
       - 'full': Traditional K-means
       - 'elkan': More efficient for well-separated clusters
    """)

    # Show cluster centers (these are in standardized units)
    st.subheader("Cluster Centers (Typical Crime Profiles)")
    centers_df = pd.DataFrame(
        kmeans.cluster_centers_,
        columns=df.columns
    )
    st.dataframe(centers_df)
    st.write("""
    Each row represents the "average" crime profile for that cluster. For example:
    - High values in Murder and Assault indicate a high-crime cluster
    - High UrbanPop with low crime rates might indicate urban safety
    - Low values across all metrics might indicate rural safety
    """)

    # Display cluster analysis
    st.subheader("State Crime Profiles Analysis")
    for cluster_num in range(k):
        cluster_states = df_clustered[df_clustered['Cluster'] == cluster_num]
        st.write(f"\n**CLUSTER {cluster_num}: {len(cluster_states)} states**")
        st.write("States:", ", ".join(cluster_states.index.tolist()))
        st.write("Average characteristics:")
        avg_profile = cluster_states[['Murder', 'Assault', 'UrbanPop', 'Rape']].mean()
        st.write(avg_profile)

    # Explain the results
    st.markdown("""
    ### Interpreting the Results
    Each cluster represents a distinct crime profile:
    1. **Cluster Characteristics**
       - Look at the average values for each crime type
       - Compare urban population percentages
       - Identify the defining features of each cluster
    2. **State Groupings**
       - States in the same cluster have similar crime patterns
       - Geographic proximity doesn't always mean similar profiles
       - Some states might surprise you with their cluster membership
    3. **Policy Implications**
       - Clusters help identify states with similar challenges
       - Can guide resource allocation and policy development
       - Enables targeted interventions based on crime profiles
    """)
    return k, cluster_labels, df_clustered


def _render_hierarchical(df, df_scaled, k):
    """Render Exercise 5 and return zero-based hierarchical labels for ``k`` clusters."""
    st.header("Exercise 5: Hierarchical Clustering Exploration")

    # Code Example: Hierarchical Clustering
    with st.expander("Code Example: Hierarchical Clustering"):
        st.code("""
        # Create hierarchical clustering
        from scipy.cluster.hierarchy import linkage, dendrogram
        # Create linkage matrix
        linkage_matrix = linkage(df_scaled, method='complete')
        # Plot dendrogram
        import plotly.graph_objects as go
        dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)
        # Each link is four points; None separates the links within one trace
        xs, ys = [], []
        for link_x, link_y in zip(dendro['icoord'], dendro['dcoord']):
            xs.extend(list(link_x) + [None])
            ys.extend(list(link_y) + [None])
        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=xs,
            y=ys,
            mode='lines',
            line=dict(color='white')
        ))
        fig.update_layout(
            title='State Crime Pattern Family Tree',
            xaxis_title='States',
            yaxis_title='Distance Between Groups'
        )
        fig.show()
        # Cut the tree to get clusters
        from scipy.cluster.hierarchy import fcluster
        hierarchical_labels = fcluster(linkage_matrix, k, criterion='maxclust') - 1
        """, language="python")

    st.markdown("""
    ### What is Hierarchical Clustering?
    Hierarchical clustering creates a tree-like structure (dendrogram) that shows how data points are related at different levels.
    Think of it like building a family tree for states based on their crime patterns:
    1. **Bottom-Up Approach (Agglomerative)**:
       - Start with each state as its own cluster
       - Find the two closest states and merge them
       - Continue merging until all states are in one cluster
       - Creates a complete hierarchy of relationships
    2. **Distance Measurement**:
       - Complete Linkage: Uses the maximum distance between states
       - Average Linkage: Uses the average distance between states
       - Single Linkage: Uses the minimum distance between states
       - We use complete linkage for more distinct clusters
    """)

    # Create hierarchical clustering
    linkage_matrix = linkage(df_scaled, method='complete')

    # Create interactive dendrogram. The links are flattened into one trace,
    # because a nested list of per-link coordinates is not a valid line trace.
    dendro = dendrogram(linkage_matrix, labels=df.index.tolist(), no_plot=True)
    line_x, line_y = _dendrogram_line_coords(dendro)
    fig_dendro = go.Figure()
    fig_dendro.add_trace(go.Scatter(
        x=line_x,
        y=line_y,
        mode='lines',
        line=dict(color='white')
    ))
    fig_dendro.update_layout(
        title='State Crime Pattern Family Tree',
        xaxis_title='States',
        yaxis_title='Distance Between Groups',
        **_DARK_LAYOUT
    )
    # scipy places leaf i at x = 5 + 10 * i; label those ticks with state names.
    leaf_names = dendro['ivl']
    fig_dendro.update_xaxes(
        tickmode='array',
        tickvals=[5 + 10 * position for position in range(len(leaf_names))],
        ticktext=leaf_names,
        tickangle=-90
    )
    st.plotly_chart(fig_dendro)

    # Explain how to read the dendrogram
    st.markdown("""
    ### How to Read the Dendrogram
    1. **Height of Connections**:
       - Higher connections = more different groups
       - Lower connections = more similar groups
       - The height shows how different two groups are
    2. **Cutting the Tree**:
       - Draw a horizontal line to create clusters
       - Where you cut determines the number of clusters
       - We'll cut at a height that gives us 4 clusters (like K-means)
    """)

    # Cut the tree to get clusters; fcluster ids start at 1, so shift to 0-based
    return fcluster(linkage_matrix, k, criterion='maxclust') - 1


def _render_comparison(df, df_clustered, cluster_labels, hierarchical_labels, k):
    """Render the side-by-side K-means vs hierarchical comparison.

    Hierarchical cluster ids are first matched to the K-means ids (see
    ``_align_cluster_labels``) so that colours and the agreement count compare
    like with like rather than arbitrary id numbers.
    """
    st.header("Comparing K-Means and Hierarchical Clustering")

    aligned_labels = _align_cluster_labels(cluster_labels, hierarchical_labels)
    n_hierarchical = len(np.unique(aligned_labels))

    # Create side-by-side comparison
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("K-Means Clustering")
        fig_kmeans = px.scatter(df_clustered,
                                x='Murder',
                                y='Assault',
                                color='Cluster',
                                title=f'K-Means Clustering (K={k})',
                                hover_data=['UrbanPop', 'Rape'])
        st.plotly_chart(fig_kmeans)
        st.markdown("""
        **K-Means Characteristics**:
        - Requires specifying number of clusters upfront
        - Creates clusters of similar size
        - Works well with spherical clusters
        - Faster for large datasets
        - Can be sensitive to outliers
        """)
    with col2:
        st.subheader("Hierarchical Clustering")
        df_hierarchical = df.copy()
        df_hierarchical['Cluster'] = aligned_labels
        fig_hierarchical = px.scatter(df_hierarchical,
                                      x='Murder',
                                      y='Assault',
                                      color='Cluster',
                                      title=f'Hierarchical Clustering ({n_hierarchical} clusters)',
                                      hover_data=['UrbanPop', 'Rape'])
        st.plotly_chart(fig_hierarchical)
        st.markdown("""
        **Hierarchical Clustering Characteristics**:
        - Creates a complete hierarchy of clusters
        - Can handle non-spherical clusters
        - More flexible in cluster shapes
        - Slower for large datasets
        - Less sensitive to outliers
        """)

    # Show agreement between methods
    st.subheader("Comparing the Results")
    comparison_df = pd.DataFrame({
        'State': df.index,
        'K-Means Cluster': cluster_labels,
        'Hierarchical Cluster': aligned_labels
    })

    # Count agreements
    same_cluster = comparison_df['K-Means Cluster'] == comparison_df['Hierarchical Cluster']
    agreements = int(same_cluster.sum())
    agreement_percentage = (agreements / len(comparison_df)) * 100
    st.write(f"Methods agreed on {agreements} out of {len(comparison_df)} states ({agreement_percentage:.1f}%)")
    st.caption(
        "Hierarchical cluster numbers were matched to the K-Means numbers "
        "by maximum overlap before comparing."
    )

    # Show states where methods disagree
    disagreements = comparison_df[~same_cluster]
    if not disagreements.empty:
        st.write("States where the methods disagreed:")
        st.dataframe(disagreements)

    st.markdown("""
    ### When to Use Each Method
    1. **Use K-Means when**:
       - You know the number of clusters
       - Your data has spherical clusters
       - You need fast computation
       - You want clusters of similar size
    2. **Use Hierarchical Clustering when**:
       - You don't know the number of clusters
       - You want to explore the hierarchy
       - Your clusters might be non-spherical
       - You need to handle outliers carefully

    In our case, both methods found similar patterns, suggesting our clusters are robust!
    """)


def _render_policy_brief():
    """Render Exercise 6 (policy brief summary) and the additional resources list."""
    st.header("Exercise 6: Policy Brief Creation")

    # Code Example: Creating Final Visualizations
    with st.expander("Code Example: Creating Policy Brief Visualizations"):
        st.code("""
        # Create a comprehensive visualization
        import plotly.graph_objects as go
        from plotly.subplots import make_subplots
        # Create subplots
        fig = make_subplots(rows=2, cols=2)
        # Plot 1: Murder vs Assault by cluster
        for i in range(k):
            cluster_data = df_clustered[df_clustered['Cluster'] == i]
            fig.add_trace(
                go.Scatter(
                    x=cluster_data['Murder'],
                    y=cluster_data['Assault'],
                    mode='markers',
                    name=f'Cluster {i}'
                ),
                row=1, col=1
            )
        # Plot 2: Urban Population vs Rape by cluster
        for i in range(k):
            cluster_data = df_clustered[df_clustered['Cluster'] == i]
            fig.add_trace(
                go.Scatter(
                    x=cluster_data['UrbanPop'],
                    y=cluster_data['Rape'],
                    mode='markers',
                    name=f'Cluster {i}'
                ),
                row=1, col=2
            )
        # Update layout
        fig.update_layout(
            title_text="State Crime Profile Analysis",
            showlegend=True
        )
        fig.show()
        """, language="python")

    st.write("""
    Based on our analysis, here's a summary of findings and recommendations:

    **Key Findings:**
    - We identified distinct crime profiles among US states
    - Each cluster represents a unique pattern of crime rates and urban population
    - Some states show surprising similarities despite geographic distance

    **Policy Recommendations:**
    1. High-Priority States: Focus on states in high-crime clusters
    2. Resource Allocation: Distribute federal crime prevention funds based on cluster profiles
    3. Best Practice Sharing: Encourage states within the same cluster to share successful strategies
    """)

    # Additional Resources
    st.header("Additional Resources")
    st.write("""
    - [Scikit-learn Clustering Documentation](https://scikit-learn.org/stable/modules/clustering.html)
    - [KNN Documentation](https://scikit-learn.org/stable/modules/neighbors.html)
    """)


def show():
    """Render the "Week 7: Clustering Lab" Streamlit page.

    The page walks through six exercises on the US Arrests data in order:
    data exploration, scaling, choosing K with the elbow method, K-means
    profiling (K chosen with a slider), hierarchical clustering, a comparison
    of the two methods, and a closing policy brief. Each exercise lives in
    its own private helper; this function only wires their inputs and outputs
    together. It takes no arguments and returns ``None``.
    """
    st.title("Week 7: Clustering Lab - State Crime Pattern Analysis")
    _render_intro()
    df = _render_data_exploration()
    df_scaled = _render_scaling(df)
    _render_elbow(df_scaled)
    k, cluster_labels, df_clustered = _render_kmeans(df, df_scaled)
    hierarchical_labels = _render_hierarchical(df, df_scaled, k)
    _render_comparison(df, df_clustered, cluster_labels, hierarchical_labels, k)
    _render_policy_brief()