Tryfonas committed on
Commit
fb89c2f
·
verified ·
1 Parent(s): 00549a0

Upload folder using huggingface_hub

Browse files
KIVA___BDS24_Assignment_Karmiris_Tryfonas.ipynb CHANGED
@@ -4894,7 +4894,7 @@
4894
  "plt.xlabel('Sector')\n",
4895
  "plt.xticks(rotation=90)\n",
4896
  "plt.ylabel('Count')\n",
4897
- "plt.legend(title='Repayment Interval', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
4898
  "plt.show()"
4899
  ]
4900
  },
 
4894
  "plt.xlabel('Sector')\n",
4895
  "plt.xticks(rotation=90)\n",
4896
  "plt.ylabel('Count')\n",
4897
+ "plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
4898
  "plt.show()"
4899
  ]
4900
  },
__pycache__/dual_axis_and_boxplot.cpython-312.pyc ADDED
Binary file (4.82 kB). View file
 
__pycache__/repayment_interval_chart.cpython-312.pyc ADDED
Binary file (2.97 kB). View file
 
app.py CHANGED
@@ -3,35 +3,50 @@ import streamlit as st
3
  import pandas as pd
4
  import altair as alt
5
  import matplotlib.pyplot as plt
6
- from scipy.stats import zscore
7
  import seaborn as sns
 
8
 
9
-
10
- file_path= 'kiva_loans.csv'
11
-
12
  df_kiva_loans = pd.read_csv(file_path)
13
 
14
- df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time','funded_time','posted_time','tags'], axis=1)
15
-
16
- # Drop NAs only on specific columns, not all of them; this doesn't affect the current task, but may be needed for later use
17
- df_kiva_loans.dropna(subset=['partner_id','borrower_genders'], inplace=True)
18
 
19
  # Calculate Z-scores
20
  z_scores = zscore(df_kiva_loans['funded_amount'])
21
-
22
- # Get boolean array indicating the presence of outliers
23
  df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
24
  df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
25
 
26
- st.title('BDS24_Weekly_Assignment_Week 2| Tryfonas Karmiris')
27
- # Sidebar selection for the type of plot
28
- plot_type = st.sidebar.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # Slider to select the number of top values to display
31
- num_columns = st.sidebar.slider(
32
  "Select Number of Columns to Display",
33
  min_value=5,
34
- max_value=20,
35
  value=10, # default value
36
  step=1
37
  )
@@ -50,7 +65,7 @@ else: # sector
50
  x_column = 'sector'
51
  count_column = 'count'
52
 
53
- # Create a bar plot with dual axes
54
  fig, ax1 = plt.subplots(figsize=(12, 9))
55
  plt.xticks(rotation=90)
56
 
@@ -71,7 +86,66 @@ ax2.tick_params(axis='y', labelcolor=color)
71
  # Add titles and labels
72
  plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
73
  fig.tight_layout()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
-
76
- # Display the plot in Streamlit
77
- st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import pandas as pd
4
  import altair as alt
5
  import matplotlib.pyplot as plt
 
6
  import seaborn as sns
7
+ from scipy.stats import zscore
8
 
9
+ # Load data
10
+ file_path = 'kiva_loans.csv'
 
11
  df_kiva_loans = pd.read_csv(file_path)
12
 
13
+ # Clean data
14
+ df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1)
15
+ df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True)
 
16
 
17
  # Calculate Z-scores
18
  z_scores = zscore(df_kiva_loans['funded_amount'])
 
 
19
  df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
20
  df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
21
 
22
+ # Streamlit App Title
23
+ st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')
24
+
25
+ # Display the cleaned data table
26
+ st.table(df_kiva_loans_cleaned.head())
27
+
28
+ # Altair chart section (static histogram; no dropdown or slider is used here)
29
+ st.subheader('Distribution of Funded Amounts')
30
+ # Altair chart: simple distribution of funded amounts
31
+ chart = alt.Chart(df_kiva_loans_cleaned).mark_bar().encode(
32
+ alt.X('funded_amount', bin=alt.Bin(maxbins=50)), # Use funded_amount for distribution
33
+ y='count()',
34
+ ).properties(
35
+ title='Distribution of Funded Amounts'
36
+ )
37
+ st.altair_chart(chart, use_container_width=True)
38
+
39
+ # Dropdown and slider for Matplotlib dual-axis plot
40
+ st.subheader('Top Values by Selected Variable')
41
+
42
+ # Dropdown for plot type
43
+ plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
44
 
45
  # Slider to select the number of top values to display
46
+ num_columns = st.slider(
47
  "Select Number of Columns to Display",
48
  min_value=5,
49
+ max_value=50,
50
  value=10, # default value
51
  step=1
52
  )
 
65
  x_column = 'sector'
66
  count_column = 'count'
67
 
68
+ # Create a dual-axis bar plot using Matplotlib
69
  fig, ax1 = plt.subplots(figsize=(12, 9))
70
  plt.xticks(rotation=90)
71
 
 
86
  # Add titles and labels
87
  plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
88
  fig.tight_layout()
89
+ st.pyplot(fig)
90
+
91
+ # Boxplot (or Violin Plot) after the dual-axis plot
92
+ st.subheader('Funded Amount vs. Selected Variable')
93
+
94
+ # Filter the data based on the selected variable and number of top values
95
+ if plot_type == 'sector':
96
+ top_values_boxplot = df_kiva_loans.groupby('sector')['funded_amount'].agg('sum').nlargest(num_columns).index
97
+ filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_boxplot)]
98
+ elif plot_type == 'country':
99
+ top_values_boxplot = df_kiva_loans.groupby('country')['funded_amount'].agg('sum').nlargest(num_columns).index
100
+ filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_boxplot)]
101
+ else: # repayment_interval
102
+ filtered_df_boxplot = df_kiva_loans_cleaned
103
+
104
+ # Create a boxplot
105
+ fig, ax = plt.subplots(figsize=(12, 6))
106
+ if plot_type != 'repayment_interval':
107
+ # Use sorted values for 'sector' and 'country'
108
+ top_values_sorted = df_kiva_loans.groupby(plot_type)['funded_amount'].agg('sum').nlargest(num_columns).index
109
+ sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, order=top_values_sorted, ax=ax)
110
+ else:
111
+ # No specific sorting needed for 'repayment_interval'
112
+ sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, ax=ax)
113
+
114
+ plt.title('Funded Amount by Selected Variable')
115
+ plt.xlabel(plot_type)
116
+ plt.ylabel('Funded Amount')
117
+ plt.xticks(rotation=45)
118
+ st.pyplot(fig)
119
+
120
+ # Dropdown for Seaborn countplot
121
+ st.subheader('Repayment Interval by Selected Variable')
122
+
123
+ # Dropdown for selecting variable for Seaborn countplot
124
+ plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])
125
+
126
+ # Slider to select the number of top values to display for Seaborn countplot
127
+ num_top_values = st.slider(
128
+ "Select Number of Top Values to Display",
129
+ min_value=5,
130
+ max_value=50,
131
+ value=10, # default value
132
+ step=1
133
+ )
134
 
135
+ # Filter the data based on the selected variable and number of top values
136
+ if plot_var == 'sector':
137
+ top_values_plot = df_kiva_loans.groupby('sector')['funded_amount'].agg('count').nlargest(num_top_values).index
138
+ filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_plot)]
139
+ elif plot_var == 'country':
140
+ top_values_plot = df_kiva_loans.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
141
+ filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
142
+
143
+ # Create Seaborn countplot
144
+ fig, ax = plt.subplots(figsize=(10, 6))
145
+ sns.countplot(x='repayment_interval', hue=plot_var, data=filtered_df_plot, ax=ax)
146
+ plt.title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
147
+ plt.xlabel('Repayment Interval')
148
+ plt.xticks(rotation=90)
149
+ plt.ylabel('Count')
150
+ plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
151
+ st.pyplot(fig)
data/kiva_mpi_region_locations.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/loan_theme_ids.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48f3d922eef1d329ba913d0ec3c1b88714014f45ec5940a4084c311f4a455baa
3
+ size 31641314
data/loan_themes_by_region.csv ADDED
The diff for this file is too large to render. See raw diff