Spaces:

ralate2
/

Homework_5.1

Sleeping

App Files Files Community

ralate2 committed on Nov 15, 2024

Commit

3c2cf47

verified ·

1 Parent(s): 3dbec17

Upload app.py

Browse files

Homework_5.1 app.py file

Files changed (1) hide show

app.py +111 -0

app.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# Importing necessary libraries
+import streamlit as st
+import pandas as pd
+import altair as alt
+# Loading the Dataset
+url = 'https://github.com/UIUC-iSchool-DataViz/is445_data/raw/main/licenses_fall2022.csv'
+df = pd.read_csv(url)
+st.write("Dataset Preview", df.head())
+# Checking the dataset columns and types (Streamlit doesn't support df.info(), so we show the data types and column names separately)
+st.write("Data Types and Columns")
+st.write(df.dtypes)
+# Checking Data Dimensionality
+st.write("Data Dimensionality:", df.shape)
+# Checking for missing values in the dataset
+missing_values = df.isnull().sum()
+st.write("Missing Values per Column", missing_values)
+# Checking for percentage of missing values
+missing_percentage = (df.isnull().sum() / len(df)) * 100
+st.write("Percentage of Missing Values per Column", missing_percentage)
+# Dropping columns with more than 90% missing values
+df.drop(columns=['Title', 'Prefix', 'Suffix', 'BusinessDBA'], inplace=True)
+# Filling missing values in 'Middle' column
+df['Middle'].fillna('N/A', inplace=True)
+# Dropping rows with missing values for important columns (where missing data is below 10%)
+df.dropna(subset=['License Number', 'First Name', 'Last Name', 'Effective Date',
+                  'Expiration Date', 'City', 'Zip', 'County', 'Specialty/Qualifier',
+                  'Controlled Substance Schedule', 'Action'], inplace=True)
+# Verifying that there are no missing values left in the dataset
+missing_values = df.isnull().sum()
+st.write("Missing Values After Cleaning", missing_values)
+# Checking data dimensionality after handling missing values
+st.write("Data Dimensionality After Cleaning:", df.shape)
+# Converting date columns to datetime
+date_columns = ['Original Issue Date', 'Effective Date', 'Expiration Date', 'LastModifiedDate']
+for col in date_columns:
+    df[col] = pd.to_datetime(df[col], errors='coerce', format='%m/%d/%Y')
+# Creating new columns with the year of each date for easier plotting
+df['Original Issue Year'] = df['Original Issue Date'].dt.year
+df['Effective Date Year'] = df['Effective Date'].dt.year
+df['Expiration Date Year'] = df['Expiration Date'].dt.year
+df['Last Modified Date Year'] = df['LastModifiedDate'].dt.year
+# Converting object columns to category for better memory management
+for col in df.select_dtypes(include='object').columns:
+    df[col] = df[col].astype('category')
+# Visualization 1: Bar chart of licenses by type
+st.subheader("Licenses by Type")
+category_counts = df['License Type'].value_counts().reset_index()
+category_counts.columns = ['License Type', 'Count']
+chart1 = alt.Chart(category_counts).mark_bar().encode(
+    x=alt.X('License Type', sort='-y', title='License Type'),
+    y=alt.Y('Count', title='Number of Licenses'),
+    color=alt.Color('License Type', legend=None)
+).properties(
+    width=600,
+    height=400,
+    title="Number of Licenses by Type"
+)
+st.altair_chart(chart1)
+# Write-up for Visualization 1
+st.write("""
+**Licenses by Type**: This visualization highlights the distribution of licenses across different types.
+The bar chart was chosen for its simplicity in comparing counts across license types. I chose to sort types in descending order for clarity.
+Colors were selected to make each bar distinct, and axis labels were added for easy interpretation.
+If I had more time, I might add interactivity to allow users to filter by other variables.
+""")
+# Visualization 2: Line chart showing licenses issued over time
+st.subheader("Licenses Over Time")
+# Ensure the '_id' column exists; otherwise, use another unique identifier if available
+if '_id' in df.columns:
+    time_data = df.groupby('Original Issue Year')['_id'].count().reset_index()
+else:
+    time_data = df.groupby('Original Issue Year').size().reset_index(name='License Count')
+time_data.columns = ['Year', 'License Count']
+chart2 = alt.Chart(time_data).mark_line().encode(
+    x=alt.X('Year', title='Year'),
+    y=alt.Y('License Count', title='Number of Licenses'),
+    color=alt.value('blue')
+).properties(
+    width=600,
+    height=400,
+    title="Number of Licenses Issued Over Time"
+)
+st.altair_chart(chart2)
+# Write-up for Visualization 2
+st.write("""
+**Licenses Over Time**: This line chart visualizes the trend in the number of licenses issued over time.
+I chose a line chart to emphasize changes over years. Blue was used to create a consistent look. Axis labels and title are provided for clarity.
+If I had more time, I would explore monthly or seasonal patterns by adjusting the time grouping.
+""")