Spaces:
Sleeping
Sleeping
| import altair as alt | |
| # import numpy as np # Removed as it is not accessed | |
| import pandas as pd | |
| import streamlit as st | |
| import geopandas as gpd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import ipywidgets | |
# Page heading shown at the top of the app.
st.title("Crime Data Analysis")

# Read the incident records that every chart below is built from.
df = pd.read_csv("crime_data.csv")

# Notes from an earlier inspection of the data (df.isna().sum(), df.dtypes):
#   - there are no NaN values in the dataframe;
#   - "crm_cd_desc" is the only categorical (object) column.

# Test code: peek at the first five rows. The returned frame is not assigned
# to anything.
df.head(5)
# Plot 1: Pie chart (matplotlib draft).
# Count the incidents per crime type, then keep only the types whose count is
# above the mean count so the pie is not crowded with tiny slices.
crm_tot = df["crm_cd_desc"].value_counts()
mean_crm = crm_tot.mean()
crm_tot_filtered = crm_tot[crm_tot > mean_crm]

# Method comes from: https://matplotlib.org/stable/gallery/pie_and_polar_charts/pie_features.html.
# The size is passed straight to subplots(): a separate plt.figure(figsize=...)
# call would only open a second, empty figure and leave this one at the
# default size.
fig, ax = plt.subplots(figsize=(12, 12))
ax.pie(
    crm_tot_filtered,
    labels=crm_tot_filtered.index,
    autopct='%1.1f%%',
    # Push the labels and percentages outside the wedges.
    labeldistance=1.5,
    pctdistance=1.2,
)
# This draft is not rendered in the app. Close the figure so that repeated
# script runs do not accumulate open matplotlib figures; replace the close
# with st.pyplot(fig) to display the chart instead.
plt.close(fig)
#-----
### Use this one!!!
# A more detailed version of the pie chart: an Altair donut chart of the ten
# most frequent crime types.
# Filter the top 10 crime types.
top_crimes = (
    df["crm_cd_desc"]
    .value_counts()
    .nlargest(10)
    # Name the label and count columns explicitly. Renaming the columns that
    # reset_index() produces by default is fragile, because those default
    # names differ between pandas versions.
    .rename_axis("Crime Type")
    .reset_index(name="Count")
)
# Calculate each crime type's share of the top-10 total (a fraction in 0-1).
top_crimes["Percentage"] = top_crimes["Count"] / top_crimes["Count"].sum()
# Create the pie chart; innerRadius turns it into a donut.
chart = alt.Chart(top_crimes).mark_arc(innerRadius=50).encode(
    theta=alt.Theta(field="Count", type="quantitative"),
    color=alt.Color(field="Crime Type", type="nominal", legend=alt.Legend(title="Crime Type")),
    # The ".1%" format shows the fraction as a percentage with one decimal.
    tooltip=["Crime Type", "Count", alt.Tooltip("Percentage:Q", format=".1%")]
).properties(
    title="Top 10 Crime Types Distribution"
)
# Display the plot.
st.altair_chart(chart, theme="streamlit", use_container_width=True)
#------
### Use this one!!!
# Heat map: number of incidents per crime type (rows) and year (columns).
# Count the crime types and keep the 10 types that have the most cases.
top_crimes = df['crm_cd_desc'].value_counts().nlargest(10).index
df_top = df[df['crm_cd_desc'].isin(top_crimes)]
# Group by crime type and year; combinations with no incidents become 0.
heatmap1_data = df_top.groupby(['crm_cd_desc', 'year']).size().unstack(fill_value=0)
# Create the heat map on an explicit figure/axes pair so the exact figure can
# be handed to Streamlit.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap1_data, annot=True, fmt="d", cmap="YlOrRd", ax=ax)
ax.set_title("Top 10 Crime Types by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Crime Type")
fig.tight_layout()
# Display the plot. This is a matplotlib figure, so it is rendered with
# st.pyplot; st.altair_chart only accepts Altair charts, not a DataFrame.
st.pyplot(fig)
# Release the figure so repeated script runs do not accumulate open figures.
plt.close(fig)
#------
### Use this one!!!
# Stacked bar chart: incidents per year, split by crime type.
# The ten most frequent crime types are ranked before the 2025 rows are dropped.
top_crimes = df['crm_cd_desc'].value_counts().nlargest(10).index

# Drop the 2025 records; `df` stays filtered for the rest of the script.
_not_2025 = df['year'] != 2025
df = df[_not_2025]

# Keep only the incidents that belong to one of the top crime types.
_in_top_crimes = df['crm_cd_desc'].isin(top_crimes)
df_top = df[_in_top_crimes]

# One row per (year, crime type) pair with its number of incidents.
stacked_year_df = (
    df_top
    .groupby(['year', 'crm_cd_desc'])
    .size()
    .reset_index(name='count')
)

# Build the chart step by step: mark, encodings, then size and title.
_bars = alt.Chart(stacked_year_df).mark_bar()
_bars = _bars.encode(
    x=alt.X('year:O', title='Year'),
    y=alt.Y('count:Q', stack='zero', title='Number of Incidents'),
    color=alt.Color('crm_cd_desc:N', title='Crime Type'),
    tooltip=['year', 'crm_cd_desc', 'count'],
)
bar_chart = _bars.properties(
    title='Stacked Crime Composition by Year (Top 10 Crime Types)',
    width=600,
    height=400,
)

# Display the plot.
st.altair_chart(bar_chart, theme="streamlit", use_container_width=True)
#----
### Use this one!!!
# Plot 3: Line chart of yearly incident counts for the five most frequent crime types.
# 2025 is not finished yet, so including it would distort the trend.
_not_2025 = df['year'] != 2025
df = df[_not_2025]

# The five crime types with the most cases.
top5_crimes = df["crm_cd_desc"].value_counts().nlargest(5).index

# Number of incidents for every (year, crime type) pair.
_pair_sizes = df.groupby(["year", "crm_cd_desc"]).size()
yearly_crime_counts = _pair_sizes.reset_index(name="Count")

# Keep only the rows that belong to the top five crime types.
_is_top5 = yearly_crime_counts["crm_cd_desc"].isin(top5_crimes)
filtered_crimes = yearly_crime_counts[_is_top5]

# Build the line plot: one line per crime type, with a point marker per year.
_lines = alt.Chart(filtered_crimes).mark_line(point=True)
_lines = _lines.encode(
    x=alt.X("year:O", title="Year"),
    y=alt.Y("Count:Q", title="Number of Incidents"),
    color=alt.Color("crm_cd_desc:N", title="Crime Type"),
    tooltip=["year", "crm_cd_desc", "Count"],
)
line_chart = _lines.properties(
    title="Yearly Trends of Top 5 Crime Types",
    width=700,
    height=400,
)

# Display the plot.
st.altair_chart(line_chart, theme="streamlit", use_container_width=True)
#----
# Plot 4: Map.
# Load the geojson file once; both maps below are drawn on top of it.
gdf_counties = gpd.read_file("County_Boundary.geojson")

# Upper bound on the number of incidents drawn on one map.
MAP_SAMPLE_SIZE = 300


def crime_map(year, crime=None):
    """Render a map of sampled incident locations for one year.

    The incidents are read from the module-level ``df`` and drawn as red
    points over ``gdf_counties``. The figure is sent to the Streamlit page
    with ``st.pyplot``.

    Args:
        year: Value of the ``year`` column to plot.
        crime: Optional value of the ``crm_cd_desc`` column. When omitted,
            incidents of every crime type in that year are eligible.

    Returns:
        None. When no incident matches the selection, an info message is
        shown instead of a map.
    """
    selection = df['year'] == year
    title = f"Crime Map - {year}"
    if crime is not None:
        selection = selection & (df['crm_cd_desc'] == crime)
        title = f"{crime} - {year}"

    matches = df[selection]
    if matches.empty:
        st.info(f"No incidents to plot for: {title}")
        return

    # sample() raises ValueError when asked for more rows than exist, so cap
    # the sample size at the number of matching incidents. The fixed seed
    # keeps the plotted points stable between reruns.
    df_filtered = matches.sample(
        n=min(MAP_SAMPLE_SIZE, len(matches)),
        random_state=1,
    )
    gdf_points = gpd.GeoDataFrame(
        df_filtered,
        geometry=gpd.points_from_xy(df_filtered['lon'], df_filtered['lat']),
        crs="EPSG:4326"
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    gdf_counties.plot(ax=ax, color='lightgray', edgecolor='white')
    gdf_points.plot(ax=ax, color='red', markersize=10, alpha=0.6)
    ax.set_title(title)
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    ax.grid(True)
    # Display the plot, then release the figure so repeated script runs do
    # not accumulate open matplotlib figures.
    st.pyplot(fig)
    plt.close(fig)


# Create the year dropdown menu. st.selectbox returns the selected value and
# the script is rerun whenever the selection changes.
selected_year = st.selectbox("Year:", sorted(df['year'].unique()))
# Display the map of all crime types for the selected year.
crime_map(selected_year)

### Use this one!!!
# Identify the top 10 crime types.
top_10_crimes = df['crm_cd_desc'].value_counts().nlargest(10).index.tolist()
# Create the crime type dropdown, limited to the top 10 crime types.
selected_crime = st.selectbox("Crime Type:", sorted(top_10_crimes))
# Display the map for the selected year and crime type.
crime_map(selected_year, selected_crime)