ralate2 committed on
Commit
3c2cf47
·
verified ·
1 Parent(s): 3dbec17

Upload app.py

Browse files

Homework_5.1 app.py file

Files changed (1) hide show
  1. app.py +111 -0
app.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Importing necessary libraries
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import altair as alt
5
+
6
+ # Loading the Dataset
7
+ url = 'https://github.com/UIUC-iSchool-DataViz/is445_data/raw/main/licenses_fall2022.csv'
8
+ df = pd.read_csv(url)
9
+ st.write("Dataset Preview", df.head())
10
+
11
+ # Checking the dataset columns and types (df.info() prints to stdout and returns None, so st.write() cannot render it; we show the data types and column names instead)
12
+ st.write("Data Types and Columns")
13
+ st.write(df.dtypes)
14
+
15
+ # Checking Data Dimensionality
16
+ st.write("Data Dimensionality:", df.shape)
17
+
18
+ # Checking for missing values in the dataset
19
+ missing_values = df.isnull().sum()
20
+ st.write("Missing Values per Column", missing_values)
21
+
22
+ # Checking for percentage of missing values
23
+ missing_percentage = (df.isnull().sum() / len(df)) * 100
24
+ st.write("Percentage of Missing Values per Column", missing_percentage)
25
+
26
+ # Dropping columns with more than 90% missing values
27
+ df.drop(columns=['Title', 'Prefix', 'Suffix', 'BusinessDBA'], inplace=True)
28
+
29
+ # Filling missing values in 'Middle' column
30
+ df['Middle'].fillna('N/A', inplace=True)
31
+
32
+ # Dropping rows with missing values for important columns (where missing data is below 10%)
33
+ df.dropna(subset=['License Number', 'First Name', 'Last Name', 'Effective Date',
34
+ 'Expiration Date', 'City', 'Zip', 'County', 'Specialty/Qualifier',
35
+ 'Controlled Substance Schedule', 'Action'], inplace=True)
36
+
37
+ # Reporting the remaining missing values per column after cleaning (columns outside the dropna subset may still contain gaps)
38
+ missing_values = df.isnull().sum()
39
+ st.write("Missing Values After Cleaning", missing_values)
40
+
41
+ # Checking data dimensionality after handling missing values
42
+ st.write("Data Dimensionality After Cleaning:", df.shape)
43
+
44
+ # Converting date columns to datetime
45
+ date_columns = ['Original Issue Date', 'Effective Date', 'Expiration Date', 'LastModifiedDate']
46
+ for col in date_columns:
47
+ df[col] = pd.to_datetime(df[col], errors='coerce', format='%m/%d/%Y')
48
+
49
+ # Creating new columns with the year of each date for easier plotting
50
+ df['Original Issue Year'] = df['Original Issue Date'].dt.year
51
+ df['Effective Date Year'] = df['Effective Date'].dt.year
52
+ df['Expiration Date Year'] = df['Expiration Date'].dt.year
53
+ df['Last Modified Date Year'] = df['LastModifiedDate'].dt.year
54
+
55
+ # Converting object columns to category for better memory management
56
+ for col in df.select_dtypes(include='object').columns:
57
+ df[col] = df[col].astype('category')
58
+
59
+ # Visualization 1: Bar chart of licenses by type
60
+ st.subheader("Licenses by Type")
61
+
62
+ category_counts = df['License Type'].value_counts().reset_index()
63
+ category_counts.columns = ['License Type', 'Count']
64
+
65
+ chart1 = alt.Chart(category_counts).mark_bar().encode(
66
+ x=alt.X('License Type', sort='-y', title='License Type'),
67
+ y=alt.Y('Count', title='Number of Licenses'),
68
+ color=alt.Color('License Type', legend=None)
69
+ ).properties(
70
+ width=600,
71
+ height=400,
72
+ title="Number of Licenses by Type"
73
+ )
74
+ st.altair_chart(chart1)
75
+
76
+ # Write-up for Visualization 1
77
+ st.write("""
78
+ **Licenses by Type**: This visualization highlights the distribution of licenses across different types.
79
+ The bar chart was chosen for its simplicity in comparing counts across license types. I chose to sort types in descending order for clarity.
80
+ Colors were selected to make each bar distinct, and axis labels were added for easy interpretation.
81
+ If I had more time, I might add interactivity to allow users to filter by other variables.
82
+ """)
83
+
84
+ # Visualization 2: Line chart showing licenses issued over time
85
+ st.subheader("Licenses Over Time")
86
+
87
+ # Count licenses per issue year: count non-null '_id' values when that column exists; otherwise fall back to the number of rows in each group
88
+ if '_id' in df.columns:
89
+ time_data = df.groupby('Original Issue Year')['_id'].count().reset_index()
90
+ else:
91
+ time_data = df.groupby('Original Issue Year').size().reset_index(name='License Count')
92
+
93
+ time_data.columns = ['Year', 'License Count']
94
+
95
+ chart2 = alt.Chart(time_data).mark_line().encode(
96
+ x=alt.X('Year', title='Year'),
97
+ y=alt.Y('License Count', title='Number of Licenses'),
98
+ color=alt.value('blue')
99
+ ).properties(
100
+ width=600,
101
+ height=400,
102
+ title="Number of Licenses Issued Over Time"
103
+ )
104
+ st.altair_chart(chart2)
105
+
106
+ # Write-up for Visualization 2
107
+ st.write("""
108
+ **Licenses Over Time**: This line chart visualizes the trend in the number of licenses issued over time.
109
+ I chose a line chart to emphasize changes over years. Blue was used to create a consistent look. Axis labels and title are provided for clarity.
110
+ If I had more time, I would explore monthly or seasonal patterns by adjusting the time grouping.
111
+ """)