# Streamlit dashboard for the Kiva loans dataset.
#
# Loads `kiva_loans.csv`, drops unused columns and incomplete rows, flags
# funded-amount outliers by Z-score, and renders four views: a histogram of
# funded amounts, a dual-axis "top N" chart, a boxplot, and a countplot.

# Import necessary libraries
import streamlit as st
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore


def show_figure(fig):
    """Render a Matplotlib figure in the app, then close it.

    Figures created through ``plt.subplots`` stay registered with pyplot
    until they are closed explicitly. Streamlit re-runs this script on every
    widget interaction, so without the close call the open figures pile up.
    """
    st.pyplot(fig)
    plt.close(fig)


# Load data
# NOTE(review): this read runs again on every rerun of the script; consider
# Streamlit's data caching if the installed version provides it.
file_path = 'kiva_loans.csv'
df_kiva_loans = pd.read_csv(file_path)

# Clean data: drop columns the dashboard never uses, then rows that are
# missing a partner id or the borrower genders.
df_kiva_loans = df_kiva_loans.drop(
    ['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'],
    axis=1,
)
df_kiva_loans = df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'])

# Calculate Z-scores and flag rows more than 3 standard deviations from the
# mean funded amount as outliers.
z_scores = zscore(df_kiva_loans['funded_amount'])
df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]

# Streamlit App Title
st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')

# Display the first rows of the cleaned data table
st.table(df_kiva_loans_cleaned.head())

# ---------------------------------------------------------------------------
# Altair chart: simple distribution of funded amounts
# ---------------------------------------------------------------------------
st.subheader('Distribution of Funded Amounts')

# Only `funded_amount` is encoded, so hand the chart just that column instead
# of the whole frame; the rendered histogram is the same.
chart = alt.Chart(df_kiva_loans_cleaned[['funded_amount']]).mark_bar().encode(
    alt.X('funded_amount', bin=alt.Bin(maxbins=50)),
    y='count()',
).properties(
    title='Distribution of Funded Amounts'
)
st.altair_chart(chart, use_container_width=True)

# ---------------------------------------------------------------------------
# Matplotlib dual-axis plot: total funded amount (bars) and loan count (line)
# ---------------------------------------------------------------------------
st.subheader('Top Values by Selected Variable')

# Dropdown for the grouping variable
plot_type = st.selectbox(
    "Select Variable to Display",
    ['country', 'repayment_interval', 'sector'],
)

# Slider to select the number of top values to display
num_columns = st.slider(
    "Select Number of Columns to Display",
    min_value=5,
    max_value=50,
    value=10,  # default value
    step=1,
)

# The selected option is itself the column name, so one groupby covers every
# choice. Ranking uses the full (outlier-inclusive) data.
x_column = plot_type
count_column = 'count'
top_values = (
    df_kiva_loans.groupby(x_column)['funded_amount']
    .agg(['sum', 'count'])
    .nlargest(num_columns, 'sum')
    .reset_index()
)

fig, ax1 = plt.subplots(figsize=(12, 9))

# Bar plot for funded_amount on the left axis
color = 'tab:blue'
ax1.set_xlabel(x_column.replace("_", " ").title())
ax1.set_ylabel('Funded Amount', color=color)
ax1.bar(top_values[x_column], top_values['sum'], color=color, alpha=0.6,
        label='Funded Amount')
ax1.tick_params(axis='y', labelcolor=color)
ax1.tick_params(axis='x', labelrotation=90)

# Second y-axis (sharing x) for the loan count
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Count', color=color)
ax2.plot(top_values[x_column], top_values[count_column], color=color,
         marker='o', linestyle='-', linewidth=2, label='Count')
ax2.tick_params(axis='y', labelcolor=color)

# Title set on an explicit Axes rather than through pyplot's current axes
ax1.set_title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
fig.tight_layout()
show_figure(fig)

# ---------------------------------------------------------------------------
# Boxplot of funded amount for the same selected variable
# ---------------------------------------------------------------------------
st.subheader('Funded Amount vs. Selected Variable')

fig, ax = plt.subplots(figsize=(12, 6))
if plot_type != 'repayment_interval':
    # For 'sector' and 'country': keep only the top groups by total funded
    # amount and draw the boxes in that ranking order.
    top_values_sorted = (
        df_kiva_loans.groupby(plot_type)['funded_amount']
        .sum()
        .nlargest(num_columns)
        .index
    )
    filtered_df_boxplot = df_kiva_loans_cleaned[
        df_kiva_loans_cleaned[plot_type].isin(top_values_sorted)
    ]
    sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot,
                order=top_values_sorted, ax=ax)
else:
    # 'repayment_interval': no filtering and no specific ordering
    filtered_df_boxplot = df_kiva_loans_cleaned
    sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot,
                ax=ax)

ax.set_title('Funded Amount by Selected Variable')
ax.set_xlabel(plot_type)
ax.set_ylabel('Funded Amount')
ax.tick_params(axis='x', labelrotation=45)
show_figure(fig)

# ---------------------------------------------------------------------------
# Seaborn countplot: repayment interval split by sector or country
# ---------------------------------------------------------------------------
st.subheader('Repayment Interval by Selected Variable')

# Dropdown for selecting the hue variable of the countplot
plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])

# Slider to select the number of top values to display for the countplot
num_top_values = st.slider(
    "Select Number of Top Values to Display",
    min_value=5,
    max_value=50,
    value=10,  # default value
    step=1,
)

# Keep only the groups with the most loans (count of funded_amount values).
# Using the selected column name directly means `filtered_df_plot` is always
# bound, whatever options the selectbox offers.
top_values_plot = (
    df_kiva_loans.groupby(plot_var)['funded_amount']
    .count()
    .nlargest(num_top_values)
    .index
)
filtered_df_plot = df_kiva_loans_cleaned[
    df_kiva_loans_cleaned[plot_var].isin(top_values_plot)
]

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='repayment_interval', hue=plot_var, data=filtered_df_plot,
              ax=ax)
ax.set_title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
ax.set_xlabel('Repayment Interval')
ax.tick_params(axis='x', labelrotation=90)
ax.set_ylabel('Count')
# Legend sits outside the plotting area, to the right of the axes
ax.legend(title=plot_var.replace("_", " ").title(),
          bbox_to_anchor=(1.05, 1), loc='upper left')
show_figure(fig)