Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files
KIVA___BDS24_Assignment_Karmiris_Tryfonas.ipynb
CHANGED
|
@@ -4894,7 +4894,7 @@
|
|
| 4894 |
"plt.xlabel('Sector')\n",
|
| 4895 |
"plt.xticks(rotation=90)\n",
|
| 4896 |
"plt.ylabel('Count')\n",
|
| 4897 |
-
"plt.legend(title='
|
| 4898 |
"plt.show()"
|
| 4899 |
]
|
| 4900 |
},
|
|
|
|
| 4894 |
"plt.xlabel('Sector')\n",
|
| 4895 |
"plt.xticks(rotation=90)\n",
|
| 4896 |
"plt.ylabel('Count')\n",
|
| 4897 |
+
"plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
|
| 4898 |
"plt.show()"
|
| 4899 |
]
|
| 4900 |
},
|
__pycache__/dual_axis_and_boxplot.cpython-312.pyc
ADDED
|
Binary file (4.82 kB). View file
|
|
|
__pycache__/repayment_interval_chart.cpython-312.pyc
ADDED
|
Binary file (2.97 kB). View file
|
|
|
app.py
CHANGED
|
@@ -3,35 +3,50 @@ import streamlit as st
|
|
| 3 |
import pandas as pd
|
| 4 |
import altair as alt
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
-
from scipy.stats import zscore
|
| 7 |
import seaborn as sns
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
file_path= 'kiva_loans.csv'
|
| 11 |
-
|
| 12 |
df_kiva_loans = pd.read_csv(file_path)
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
df_kiva_loans.dropna(subset=['partner_id','borrower_genders'], inplace=True)
|
| 18 |
|
| 19 |
# Calculate Z-scores
|
| 20 |
z_scores = zscore(df_kiva_loans['funded_amount'])
|
| 21 |
-
|
| 22 |
-
# Get boolean array indicating the presence of outliers
|
| 23 |
df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
|
| 24 |
df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# Slider to select the number of top values to display
|
| 31 |
-
num_columns = st.
|
| 32 |
"Select Number of Columns to Display",
|
| 33 |
min_value=5,
|
| 34 |
-
max_value=
|
| 35 |
value=10, # default value
|
| 36 |
step=1
|
| 37 |
)
|
|
@@ -50,7 +65,7 @@ else: # sector
|
|
| 50 |
x_column = 'sector'
|
| 51 |
count_column = 'count'
|
| 52 |
|
| 53 |
-
# Create a bar plot
|
| 54 |
fig, ax1 = plt.subplots(figsize=(12, 9))
|
| 55 |
plt.xticks(rotation=90)
|
| 56 |
|
|
@@ -71,7 +86,66 @@ ax2.tick_params(axis='y', labelcolor=color)
|
|
| 71 |
# Add titles and labels
|
| 72 |
plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
|
| 73 |
fig.tight_layout()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
import altair as alt
|
| 5 |
import matplotlib.pyplot as plt
|
|
|
|
| 6 |
import seaborn as sns
|
| 7 |
+
from scipy.stats import zscore
|
| 8 |
|
| 9 |
+
# Load data
|
| 10 |
+
file_path = 'kiva_loans.csv'
|
|
|
|
| 11 |
df_kiva_loans = pd.read_csv(file_path)
|
| 12 |
|
| 13 |
+
# Clean data
|
| 14 |
+
df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1)
|
| 15 |
+
df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True)
|
|
|
|
| 16 |
|
| 17 |
# Calculate Z-scores
|
| 18 |
z_scores = zscore(df_kiva_loans['funded_amount'])
|
|
|
|
|
|
|
| 19 |
df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
|
| 20 |
df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
|
| 21 |
|
| 22 |
+
# Streamlit App Title
|
| 23 |
+
st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')
|
| 24 |
+
|
| 25 |
+
# Display a preview (first rows) of the cleaned data
|
| 26 |
+
st.table(df_kiva_loans_cleaned.head())
|
| 27 |
+
|
| 28 |
+
# Altair histogram of funded amounts (static chart, no interactive controls)
|
| 29 |
+
st.subheader('Distribution of Funded Amounts')
|
| 30 |
+
# Altair chart: simple distribution of funded amounts
|
| 31 |
+
chart = alt.Chart(df_kiva_loans_cleaned).mark_bar().encode(
|
| 32 |
+
alt.X('funded_amount', bin=alt.Bin(maxbins=50)), # Use funded_amount for distribution
|
| 33 |
+
y='count()',
|
| 34 |
+
).properties(
|
| 35 |
+
title='Distribution of Funded Amounts'
|
| 36 |
+
)
|
| 37 |
+
st.altair_chart(chart, use_container_width=True)
|
| 38 |
+
|
| 39 |
+
# Dropdown and slider for Matplotlib dual-axis plot
|
| 40 |
+
st.subheader('Top Values by Selected Variable')
|
| 41 |
+
|
| 42 |
+
# Dropdown for plot type
|
| 43 |
+
plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
|
| 44 |
|
| 45 |
# Slider to select the number of top values to display
|
| 46 |
+
num_columns = st.slider(
|
| 47 |
"Select Number of Columns to Display",
|
| 48 |
min_value=5,
|
| 49 |
+
max_value=50,
|
| 50 |
value=10, # default value
|
| 51 |
step=1
|
| 52 |
)
|
|
|
|
| 65 |
x_column = 'sector'
|
| 66 |
count_column = 'count'
|
| 67 |
|
| 68 |
+
# Create a dual-axis bar plot using Matplotlib
|
| 69 |
fig, ax1 = plt.subplots(figsize=(12, 9))
|
| 70 |
plt.xticks(rotation=90)
|
| 71 |
|
|
|
|
| 86 |
# Add titles and labels
|
| 87 |
plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
|
| 88 |
fig.tight_layout()
|
| 89 |
+
st.pyplot(fig)
|
| 90 |
+
|
| 91 |
+
# Boxplot of funded amount, shown after the dual-axis plot
|
| 92 |
+
st.subheader('Funded Amount vs. Selected Variable')
|
| 93 |
+
|
| 94 |
+
# Filter the data based on the selected variable and number of top values
|
| 95 |
+
if plot_type == 'sector':
|
| 96 |
+
top_values_boxplot = df_kiva_loans.groupby('sector')['funded_amount'].agg('sum').nlargest(num_columns).index
|
| 97 |
+
filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_boxplot)]
|
| 98 |
+
elif plot_type == 'country':
|
| 99 |
+
top_values_boxplot = df_kiva_loans.groupby('country')['funded_amount'].agg('sum').nlargest(num_columns).index
|
| 100 |
+
filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_boxplot)]
|
| 101 |
+
else: # repayment_interval
|
| 102 |
+
filtered_df_boxplot = df_kiva_loans_cleaned
|
| 103 |
+
|
| 104 |
+
# Create a boxplot
|
| 105 |
+
fig, ax = plt.subplots(figsize=(12, 6))
|
| 106 |
+
if plot_type != 'repayment_interval':
|
| 107 |
+
# Use sorted values for 'sector' and 'country'
|
| 108 |
+
top_values_sorted = df_kiva_loans.groupby(plot_type)['funded_amount'].agg('sum').nlargest(num_columns).index
|
| 109 |
+
sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, order=top_values_sorted, ax=ax)
|
| 110 |
+
else:
|
| 111 |
+
# No specific sorting needed for 'repayment_interval'
|
| 112 |
+
sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, ax=ax)
|
| 113 |
+
|
| 114 |
+
plt.title('Funded Amount by Selected Variable')
|
| 115 |
+
plt.xlabel(plot_type)
|
| 116 |
+
plt.ylabel('Funded Amount')
|
| 117 |
+
plt.xticks(rotation=45)
|
| 118 |
+
st.pyplot(fig)
|
| 119 |
+
|
| 120 |
+
# Dropdown for Seaborn countplot
|
| 121 |
+
st.subheader('Repayment Interval by Selected Variable')
|
| 122 |
+
|
| 123 |
+
# Dropdown for selecting variable for Seaborn countplot
|
| 124 |
+
plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])
|
| 125 |
+
|
| 126 |
+
# Slider to select the number of top values to display for Seaborn countplot
|
| 127 |
+
num_top_values = st.slider(
|
| 128 |
+
"Select Number of Top Values to Display",
|
| 129 |
+
min_value=5,
|
| 130 |
+
max_value=50,
|
| 131 |
+
value=10, # default value
|
| 132 |
+
step=1
|
| 133 |
+
)
|
| 134 |
|
| 135 |
+
# Filter the data based on the selected variable and number of top values
|
| 136 |
+
if plot_var == 'sector':
|
| 137 |
+
top_values_plot = df_kiva_loans.groupby('sector')['funded_amount'].agg('count').nlargest(num_top_values).index
|
| 138 |
+
filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_plot)]
|
| 139 |
+
elif plot_var == 'country':
|
| 140 |
+
top_values_plot = df_kiva_loans.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
|
| 141 |
+
filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
|
| 142 |
+
|
| 143 |
+
# Create Seaborn countplot
|
| 144 |
+
fig, ax = plt.subplots(figsize=(10, 6))
|
| 145 |
+
sns.countplot(x='repayment_interval', hue=plot_var, data=filtered_df_plot, ax=ax)
|
| 146 |
+
plt.title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
|
| 147 |
+
plt.xlabel('Repayment Interval')
|
| 148 |
+
plt.xticks(rotation=90)
|
| 149 |
+
plt.ylabel('Count')
|
| 150 |
+
plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
|
| 151 |
+
st.pyplot(fig)
|
data/kiva_mpi_region_locations.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/loan_theme_ids.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48f3d922eef1d329ba913d0ec3c1b88714014f45ec5940a4084c311f4a455baa
|
| 3 |
+
size 31641314
|
data/loan_themes_by_region.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|