Spaces:
Sleeping
Sleeping
Upload app.py
Browse filesHomework_5.1 app.py file
app.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Importing necessary libraries
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import altair as alt
|
| 5 |
+
|
| 6 |
+
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Location of the licenses CSV that the app explores.
DATA_URL = 'https://github.com/UIUC-iSchool-DataViz/is445_data/raw/main/licenses_fall2022.csv'

# Columns removed outright (the original script notes they are >90% missing).
SPARSE_COLUMNS = ['Title', 'Prefix', 'Suffix', 'BusinessDBA']

# Columns that must be present in a row for the row to be kept.
REQUIRED_COLUMNS = [
    'License Number', 'First Name', 'Last Name', 'Effective Date',
    'Expiration Date', 'City', 'Zip', 'County', 'Specialty/Qualifier',
    'Controlled Substance Schedule', 'Action',
]

# Date column -> name of the derived "year" column used for plotting.
YEAR_COLUMNS = {
    'Original Issue Date': 'Original Issue Year',
    'Effective Date': 'Effective Date Year',
    'Expiration Date': 'Expiration Date Year',
    'LastModifiedDate': 'Last Modified Date Year',
}

# Format the date columns are parsed with; non-matching values become NaT.
DATE_FORMAT = '%m/%d/%Y'

# Write-up shown under Visualization 1 (rendered as Markdown by st.write).
LICENSES_BY_TYPE_WRITEUP = """
**Licenses by Type**: This visualization highlights the distribution of licenses across different types.
The bar chart was chosen for its simplicity in comparing counts across license types. I chose to sort types in descending order for clarity.
Colors were selected to make each bar distinct, and axis labels were added for easy interpretation.
If I had more time, I might add interactivity to allow users to filter by other variables.
"""

# Write-up shown under Visualization 2 (rendered as Markdown by st.write).
LICENSES_OVER_TIME_WRITEUP = """
**Licenses Over Time**: This line chart visualizes the trend in the number of licenses issued over time.
I chose a line chart to emphasize changes over years. Blue was used to create a consistent look. Axis labels and title are provided for clarity.
If I had more time, I would explore monthly or seasonal patterns by adjusting the time grouping.
"""


# ---------------------------------------------------------------------------
# Data preparation (pure pandas, no Streamlit calls)
# ---------------------------------------------------------------------------

def load_licenses(source: str = DATA_URL) -> pd.DataFrame:
    """Read the licenses CSV found at *source* into a DataFrame."""
    return pd.read_csv(source)


def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with missing data dealt with.

    Steps, in order:
      1. drop ``SPARSE_COLUMNS``;
      2. replace missing ``Middle`` values with the string ``'N/A'``;
      3. drop every row that lacks a value in any of ``REQUIRED_COLUMNS``.

    The input frame is left untouched.

    Raises:
        KeyError: if any of the expected columns is absent from *df*.
    """
    return (
        df.drop(columns=SPARSE_COLUMNS)
        # Assign the filled column back instead of calling
        # ``df['Middle'].fillna(..., inplace=True)``: the in-place form acts
        # on a column selection and is not guaranteed to update the frame.
        .assign(Middle=lambda frame: frame['Middle'].fillna('N/A'))
        .dropna(subset=REQUIRED_COLUMNS)
    )


def add_plotting_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with parsed dates, year columns and categories.

    * Every key of ``YEAR_COLUMNS`` is parsed with ``DATE_FORMAT``; values
      that do not match become ``NaT`` (``errors='coerce'``).
    * For each date column a year column (the mapped value in
      ``YEAR_COLUMNS``) is added.
    * Remaining ``object`` columns are converted to ``category``.
    """
    out = df.copy()
    for date_col, year_col in YEAR_COLUMNS.items():
        out[date_col] = pd.to_datetime(out[date_col], errors='coerce', format=DATE_FORMAT)
        out[year_col] = out[date_col].dt.year
    for col in out.select_dtypes(include='object').columns:
        out[col] = out[col].astype('category')
    return out


def count_by_license_type(df: pd.DataFrame) -> pd.DataFrame:
    """Return a ``['License Type', 'Count']`` frame, most frequent type first."""
    counts = df['License Type'].value_counts()
    return counts.rename_axis('License Type').reset_index(name='Count')


def count_by_issue_year(df: pd.DataFrame) -> pd.DataFrame:
    """Return a ``['Year', 'License Count']`` frame, one row per issue year.

    When an ``_id`` column exists its non-null entries are counted per year;
    otherwise the number of rows per year is used. Rows whose
    ``Original Issue Year`` is missing are excluded by the group-by.
    """
    grouped = df.groupby('Original Issue Year')
    counts = grouped['_id'].count() if '_id' in df.columns else grouped.size()
    per_year = counts.rename('License Count').rename_axis('Year').reset_index()
    # Unparseable dates turn the year column into floats (NaN forces float);
    # the surviving group keys are whole years, so store them as integers.
    per_year['Year'] = per_year['Year'].astype(int)
    return per_year


# ---------------------------------------------------------------------------
# Charts
# ---------------------------------------------------------------------------

def build_type_chart(category_counts: pd.DataFrame) -> "alt.Chart":
    """Build the bar chart of licence counts per licence type."""
    return alt.Chart(category_counts).mark_bar().encode(
        # Encoding types are spelled out so they do not depend on dtype
        # inference (the column may be a pandas categorical).
        x=alt.X('License Type:N', sort='-y', title='License Type'),
        y=alt.Y('Count:Q', title='Number of Licenses'),
        color=alt.Color('License Type:N', legend=None),
    ).properties(
        width=600,
        height=400,
        title="Number of Licenses by Type",
    )


def build_time_chart(time_data: pd.DataFrame) -> "alt.Chart":
    """Build the line chart of licences issued per year."""
    return alt.Chart(time_data).mark_line().encode(
        # format='d' prints years as 2015 rather than the default "2,015".
        x=alt.X('Year:Q', title='Year', axis=alt.Axis(format='d')),
        y=alt.Y('License Count:Q', title='Number of Licenses'),
        color=alt.value('blue'),
    ).properties(
        width=600,
        height=400,
        title="Number of Licenses Issued Over Time",
    )


# ---------------------------------------------------------------------------
# Streamlit page
# ---------------------------------------------------------------------------

def main() -> None:
    """Render the page: raw-data overview, cleaning summary, two charts."""
    # Cache the download so Streamlit reruns do not fetch the CSV again.
    df = st.cache_data(load_licenses)(DATA_URL)
    st.write("Dataset Preview", df.head())

    # Streamlit has no direct equivalent of df.info(), so dtypes, shape and
    # missing-value statistics are shown separately.
    st.write("Data Types and Columns")
    st.write(df.dtypes)
    st.write("Data Dimensionality:", df.shape)

    missing_values = df.isnull().sum()
    st.write("Missing Values per Column", missing_values)
    missing_percentage = (missing_values / len(df)) * 100
    st.write("Percentage of Missing Values per Column", missing_percentage)

    # Clean, then report what is still missing and the new shape.
    df = handle_missing_values(df)
    st.write("Missing Values After Cleaning", df.isnull().sum())
    st.write("Data Dimensionality After Cleaning:", df.shape)

    df = add_plotting_columns(df)

    # Visualization 1: bar chart of licenses by type
    st.subheader("Licenses by Type")
    st.altair_chart(build_type_chart(count_by_license_type(df)))
    st.write(LICENSES_BY_TYPE_WRITEUP)

    # Visualization 2: line chart of licenses issued over time
    st.subheader("Licenses Over Time")
    st.altair_chart(build_time_chart(count_by_issue_year(df)))
    st.write(LICENSES_OVER_TIME_WRITEUP)


# `streamlit run` executes this file as the `__main__` module, so the page is
# still rendered on every run; importing the module no longer triggers the
# download or any Streamlit calls.
if __name__ == "__main__":
    main()
|