Spaces:

spatel54
/

Crime_Dataset

Sleeping

File size: 6,805 Bytes

caa42f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13eaffe

import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

# Page title shown at the top of the app.
st.title("Crime Data Analysis")


# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv("crime_data.csv")

# Check NaN values and types.
# df.isna().sum() # No NaN value in our dataframe.
# df.dtypes # Only "crm_cd_desc" is categorical variable(object).

# Test code: preview the first five rows.
# NOTE(review): this is a bare expression whose value is otherwise discarded;
# it presumably only shows up in the app through Streamlit's "magic"
# rendering of bare expressions — confirm that is intended.
df.head(5)

# Plot 1: Pie chart of the crime types whose case count is above the mean.
# Count the number of cases per crime type.
crm_tot = df["crm_cd_desc"].value_counts()

# Mean number of cases per crime type; used as the cut-off below.
mean_crm = crm_tot.mean()

# Keep only the crime types that have more cases than the mean.
crm_tot_filtered = crm_tot[crm_tot > mean_crm]

# This chart used to be drawn through `plt`, which is not imported in this
# file (NameError) and was never handed to Streamlit. It is drawn with Altair
# instead so it can be displayed with st.altair_chart.
# rename_axis + reset_index(name=...) fixes the column names explicitly, so
# they do not depend on how the installed pandas names value_counts() output.
pie_source = crm_tot_filtered.rename_axis("Crime Type").reset_index(name="Count")

# Share of each crime type among the plotted (above-mean) crime types.
pie_source["Percentage"] = pie_source["Count"] / pie_source["Count"].sum()

pie_chart = alt.Chart(pie_source).mark_arc().encode(
    theta=alt.Theta(field="Count", type="quantitative"),
    color=alt.Color(field="Crime Type", type="nominal", legend=alt.Legend(title="Crime Type")),
    tooltip=["Crime Type", "Count", alt.Tooltip("Percentage:Q", format=".1%")],
).properties(
    title="Crime Types Above the Mean Case Count"
)

# Display the plot.
st.altair_chart(pie_chart, theme="streamlit", use_container_width=True)

#-----

### Use this one!!!
# A more detailed version pie chart based on the previous one.
# Keep the 10 crime types with the most cases as a two-column frame.
# rename_axis + reset_index(name=...) names the columns explicitly. The
# previous rename({"index": ..., "crm_cd_desc": "Count"}) relied on the column
# names produced by older pandas; with pandas >= 2.0 value_counts() names the
# columns "crm_cd_desc"/"count", which turned "Count" into the crime labels.
top_crimes = (
    df["crm_cd_desc"]
    .value_counts()
    .nlargest(10)
    .rename_axis("Crime Type")
    .reset_index(name="Count")
)

# Calculate the share of each crime type among the top 10 (a 0-1 fraction;
# the tooltip below formats it as a percentage).
top_crimes["Percentage"] = top_crimes["Count"] / top_crimes["Count"].sum()


# Create the pie chart (a donut, because of the inner radius).
chart = alt.Chart(top_crimes).mark_arc(innerRadius=50).encode(
    theta=alt.Theta(field="Count", type="quantitative"),
    color=alt.Color(field="Crime Type", type="nominal", legend=alt.Legend(title="Crime Type")),
    tooltip=["Crime Type", "Count", alt.Tooltip("Percentage:Q", format=".1%")]
).properties(
    title="Top 10 Crime Types Distribution"
)

# Display the plot.
st.altair_chart(chart, theme="streamlit", use_container_width=True)


#------

### Use this one!!!
# Count the crime types and keep the 10 that have the most cases.
top_crimes = df['crm_cd_desc'].value_counts().nlargest(10).index
df_top = df[df['crm_cd_desc'].isin(top_crimes)]

# Pivot to crime type (rows) x year (columns); combinations with no cases
# are filled with 0.
heatmap1_data = df_top.groupby(['crm_cd_desc', 'year']).size().unstack(fill_value=0)

# st.altair_chart expects an Altair chart, not a DataFrame, and the previous
# drawing code used `sns`/`plt`, which are not imported in this file. Build
# the heat map with Altair instead. Altair wants long-form data, so melt the
# pivot table back into one row per (crime type, year) while keeping the
# zero-filled cells.
heatmap1_long = (
    heatmap1_data
    .reset_index()
    .melt(id_vars='crm_cd_desc', var_name='year', value_name='count')
    .infer_objects()  # melt leaves the year labels as generic objects
)

# Axes shared by the coloured cells and the count labels.
heatmap1_base = alt.Chart(heatmap1_long).encode(
    x=alt.X('year:O', title='Year'),
    y=alt.Y('crm_cd_desc:N', title='Crime Type'),
)

# Create the heat map: coloured cells plus the count written in each cell.
heatmap1_chart = (
    heatmap1_base.mark_rect().encode(
        color=alt.Color('count:Q', scale=alt.Scale(scheme='yelloworangered'), title='Count'),
        tooltip=['crm_cd_desc', 'year', 'count'],
    )
    + heatmap1_base.mark_text(baseline='middle').encode(
        text=alt.Text('count:Q', format='d'),
    )
).properties(
    title="Top 10 Crime Types by Year"
)

# Display the plot.
st.altair_chart(heatmap1_chart, theme="streamlit", use_container_width=True)


#------

### Use this one!!!
# Rank the crime types by number of cases and remember the ten most frequent.
# (The ranking is taken before the 2025 rows are removed below.)
top_crimes = df['crm_cd_desc'].value_counts().nlargest(10).index

# Drop the 2025 rows, then narrow the data down to the ten selected types.
not_2025 = df['year'].ne(2025)
df = df.loc[not_2025]
in_top_crimes = df['crm_cd_desc'].isin(top_crimes)
df_top = df.loc[in_top_crimes]

# Number of incidents for every (year, crime type) pair, in long form.
stacked_year_df = (
    df_top
    .groupby(['year', 'crm_cd_desc'])
    .size()
    .reset_index(name='count')
)

# Stacked bar chart: one bar per year, split by crime type.
bar_chart = (
    alt.Chart(stacked_year_df)
    .mark_bar()
    .encode(
        x=alt.X(field='year', type='ordinal', title='Year'),
        y=alt.Y(field='count', type='quantitative', stack='zero', title='Number of Incidents'),
        color=alt.Color(field='crm_cd_desc', type='nominal', title='Crime Type'),
        tooltip=['year', 'crm_cd_desc', 'count'],
    )
    .properties(
        title='Stacked Crime Composition by Year (Top 10 Crime Types)',
        width=600,
        height=400,
    )
)

# Display the plot.
st.altair_chart(bar_chart, theme="streamlit", use_container_width=True)


#----

### Use this one!!!
# Plot 3: Line chart.
# 2025 has not ended yet, so its partial counts would distort the trend.
df = df.loc[df['year'].ne(2025)]

# The five crime types with the most cases overall.
top5_crimes = df["crm_cd_desc"].value_counts().nlargest(5).index

# Number of cases for every (year, crime type) pair.
yearly_crime_counts = df.groupby(["year", "crm_cd_desc"]).size().reset_index(name="Count")

# Keep only the rows that belong to one of the five selected crime types.
is_top5 = yearly_crime_counts["crm_cd_desc"].isin(top5_crimes)
filtered_crimes = yearly_crime_counts.loc[is_top5]

# Line chart with point markers: one line per crime type across the years.
line_chart = (
    alt.Chart(filtered_crimes)
    .mark_line(point=True)
    .encode(
        x=alt.X(field="year", type="ordinal", title="Year"),
        y=alt.Y(field="Count", type="quantitative", title="Number of Incidents"),
        color=alt.Color(field="crm_cd_desc", type="nominal", title="Crime Type"),
        tooltip=["year", "crm_cd_desc", "Count"],
    )
    .properties(
        title="Yearly Trends of Top 5 Crime Types",
        width=700,
        height=400,
    )
)

# Display the plot.
st.altair_chart(line_chart, theme="streamlit", use_container_width=True)

#----
# Plot 4: Map.
# NOTE(review): `gpd` and `ipywidgets` are used below but are not among the
# imports at the top of this file — confirm where they should come from.

# Load the county boundaries from the geojson file.
county_boundary_path = "County_Boundary.geojson"
gdf_counties = gpd.read_file(county_boundary_path)

# Create the dropdown menu listing every year present in the data, ascending.
available_years = sorted(df['year'].unique())
year_dropdown = ipywidgets.Dropdown(
    description='Year:',
    options=available_years,
)

# Create the map.
def crime_map(year):
    """Plot a random sample of one year's crime locations over the county map.

    Reads the module-level ``df`` (columns ``year``, ``lon`` and ``lat``) and
    ``gdf_counties``, and draws through ``gpd`` and ``plt``.

    NOTE(review): ``gpd`` and ``plt`` are not imported in this file's import
    block — confirm where they are expected to come from.

    Args:
        year: Value compared against ``df['year']`` to select the rows to plot.
    """
    max_points = 300
    df_year = df[df['year'] == year]

    # DataFrame.sample raises ValueError when n is larger than the number of
    # rows, so never ask for more points than the selected year contains.
    sample_size = min(max_points, len(df_year))
    df_filtered = df_year.sample(n=sample_size, random_state=1)

    # Turn the sampled rows into point geometries in lon/lat (EPSG:4326).
    gdf_points = gpd.GeoDataFrame(
        df_filtered,
        geometry=gpd.points_from_xy(df_filtered['lon'], df_filtered['lat']),
        crs="EPSG:4326"
    )

    # County polygons as the background layer, crime points drawn on top.
    fig, ax = plt.subplots(figsize=(10, 10))
    gdf_counties.plot(ax=ax, color='lightgray', edgecolor='white')
    gdf_points.plot(ax=ax, color='red', markersize=10, alpha=0.6)
    ax.set_title(f"Crime Map - {year}")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    plt.grid(True)
    plt.show()
    
# Display the plot: hand crime_map and the year dropdown to ipywidgets.interact.
# NOTE(review): `ipywidgets` is not imported at the top of this file, and
# interact is presumably a notebook mechanism — confirm it works in this app.
ipywidgets.interact(crime_map, year=year_dropdown)




### Use this one!!!
# NOTE(review): `gpd` and `ipywidgets` are used below but are not among the
# imports at the top of this file — confirm where they should come from.

# Load the county boundaries for the map.
gdf_counties = gpd.read_file("County_Boundary.geojson")

# Names of the ten crime types with the most cases, most frequent first.
crime_type_counts = df['crm_cd_desc'].value_counts()
top_10_crimes = crime_type_counts.nlargest(10).index.tolist()

# Rows of the main DataFrame that belong to one of those ten crime types.
is_top_10 = df['crm_cd_desc'].isin(top_10_crimes)
df_top = df.loc[is_top_10]

# Create the dropdown offering the ten crime types in alphabetical order.
crime_options = sorted(top_10_crimes)
crime_dropdown = ipywidgets.Dropdown(
    description="Crime Type:",
    options=crime_options,
)

# Create the map.
def crime_map(year, crime):
    """Plot a random sample of one crime type's locations in one year.

    Reads the module-level ``df`` (columns ``year``, ``crm_cd_desc``, ``lon``
    and ``lat``) and ``gdf_counties``, and draws through ``gpd`` and ``plt``.

    NOTE(review): ``gpd`` and ``plt`` are not imported in this file's import
    block — confirm where they are expected to come from.

    Args:
        year: Value compared against ``df['year']``.
        crime: Value compared against ``df['crm_cd_desc']``.
    """
    max_points = 300
    matches = df[(df['year'] == year) & (df['crm_cd_desc'] == crime)]

    # A single year/crime-type combination can easily hold fewer than 300
    # rows, and DataFrame.sample raises ValueError when n exceeds the number
    # of rows, so cap the sample size at what is available.
    sample_size = min(max_points, len(matches))
    df_filtered = matches.sample(n=sample_size, random_state=1)

    # Turn the sampled rows into point geometries in lon/lat (EPSG:4326).
    gdf_points = gpd.GeoDataFrame(
        df_filtered,
        geometry=gpd.points_from_xy(df_filtered['lon'], df_filtered['lat']),
        crs="EPSG:4326"
    )

    # County polygons as the background layer, crime points drawn on top.
    fig, ax = plt.subplots(figsize=(10, 10))
    gdf_counties.plot(ax=ax, color='lightgray', edgecolor='white')
    gdf_points.plot(ax=ax, color='red', markersize=10, alpha=0.6)
    ax.set_title(f"{crime} - {year}")
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    plt.grid(True)
    plt.show()
    
# Display the plot: hand crime_map plus the year and crime-type dropdowns to
# ipywidgets.interact.
# NOTE(review): `ipywidgets` is not imported at the top of this file, and
# interact is presumably a notebook mechanism — confirm it works in this app.
ipywidgets.interact(crime_map, year=year_dropdown, crime=crime_dropdown)