Spaces:
Sleeping
Sleeping
| # Import necessary libraries | |
| import streamlit as st | |
| import pandas as pd | |
| import altair as alt | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from scipy.stats import zscore | |
| # Function to load and clean data | |
| def load_and_clean_data(file_path): | |
| # Load data | |
| df_kiva_loans = pd.read_csv(file_path) | |
| # Clean data | |
| df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1) | |
| df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True) | |
| # Calculate Z-scores | |
| z_scores = zscore(df_kiva_loans['funded_amount']) | |
| df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3) | |
| df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']] | |
| return df_kiva_loans_cleaned | |
| # Load the cleaned data | |
| file_path = 'kiva_loans.csv' | |
| df_kiva_loans_cleaned = load_and_clean_data(file_path) | |
| # Streamlit App Title | |
| st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris') | |
| # Sidebar for navigation | |
| st.sidebar.title("Navigation") | |
| page = st.sidebar.radio("Select a page:", ["Introduction", "Data Overview", "Top Values by Selected Variable", "Repayment Interval by Selected Variable", "Country Comparison Deepdive", "Sector Comparison Deepdive"]) | |
| # Introduction Page | |
| if page == "Introduction": | |
| st.subheader("Introduction") | |
| st.write(""" | |
| This application provides insights into Kiva loans data. | |
| You can explore the distribution of funded amounts, | |
| analyze top values by selected variables, and visualize | |
| relationships between funded amounts and various factors. | |
| """) | |
| # Data Overview Page | |
| elif page == "Data Overview": | |
| st.subheader("Data Overview") | |
| st.write("Here is a preview of the cleaned Kiva loans data:") | |
| # Display the cleaned data table | |
| st.table(df_kiva_loans_cleaned.head()) | |
| # Distribution of Funded Amounts | |
| st.subheader('Distribution of Funded Amounts') | |
| chart = alt.Chart(df_kiva_loans_cleaned).mark_bar().encode( | |
| alt.X('funded_amount', bin=alt.Bin(maxbins=50)), # Use funded_amount for distribution | |
| y='count()', | |
| ).properties( | |
| title='Distribution of Funded Amounts' | |
| ) | |
| st.altair_chart(chart, use_container_width=True) | |
| st.write("This chart shows the distribution of funded amounts for Kiva loans. The x-axis represents the funded amount, while the y-axis shows the count of loans that fall within each bin.") | |
| # Page 3: Top Values by Selected Variable | |
| elif page == "Top Values by Selected Variable": | |
| st.subheader('Top Values by Selected Variable') | |
| # Dropdown for plot type | |
| plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector']) | |
| # Slider to select the number of top values to display | |
| num_columns = st.slider( | |
| "Select Number of Columns to Display", | |
| min_value=5, | |
| max_value=50, | |
| value=10, # default value | |
| step=1 | |
| ) | |
| # Select the top values based on the selected variable and number of columns | |
| if plot_type == 'country': | |
| top_values = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index() | |
| x_column = 'country' | |
| count_column = 'count' | |
| description = f"This chart displays the top {num_columns} countries by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans." | |
| elif plot_type == 'repayment_interval': | |
| top_values = df_kiva_loans_cleaned.groupby('repayment_interval')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index() | |
| x_column = 'repayment_interval' | |
| count_column = 'count' | |
| description = f"This chart shows the top {num_columns} repayment intervals by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans." | |
| else: # sector | |
| top_values = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index() | |
| x_column = 'sector' | |
| count_column = 'count' | |
| description = f"This chart illustrates the top {num_columns} sectors by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans." | |
| # Display description | |
| st.write(description) | |
| # Create a dual-axis bar plot using Matplotlib | |
| fig, ax1 = plt.subplots(figsize=(12, 9)) | |
| plt.xticks(rotation=90) | |
| # Bar plot for funded_amount | |
| color = 'tab:blue' | |
| ax1.set_xlabel(x_column.replace("_", " ").title()) | |
| ax1.set_ylabel('Funded Amount', color=color) | |
| ax1.bar(top_values[x_column], top_values['sum'], color=color, alpha=0.6, label='Funded Amount') | |
| ax1.tick_params(axis='y', labelcolor=color) | |
| # Create a second y-axis for count | |
| ax2 = ax1.twinx() | |
| color = 'tab:red' | |
| ax2.set_ylabel('Count', color=color) | |
| ax2.plot(top_values[x_column], top_values[count_column], color=color, marker='o', linestyle='-', linewidth=2, label='Count') | |
| ax2.tick_params(axis='y', labelcolor=color) | |
| # Add titles and labels | |
| plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}') | |
| fig.tight_layout() | |
| st.pyplot(fig) | |
| # Boxplot after the dual-axis plot | |
| st.subheader('Funded Amount vs. Selected Variable') | |
| # Filter the data based on the selected variable and number of top values | |
| if plot_type == 'sector': | |
| top_values_boxplot = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg('sum').nlargest(num_columns).index | |
| filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_boxplot)] | |
| elif plot_type == 'country': | |
| top_values_boxplot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('sum').nlargest(num_columns).index | |
| filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_boxplot)] | |
| else: # repayment_interval | |
| filtered_df_boxplot = df_kiva_loans_cleaned | |
| # Create a boxplot | |
| fig, ax = plt.subplots(figsize=(12, 6)) | |
| if plot_type != 'repayment_interval': | |
| top_values_sorted = df_kiva_loans_cleaned.groupby(plot_type)['funded_amount'].agg('sum').nlargest(num_columns).index | |
| sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, order=top_values_sorted, ax=ax) | |
| else: | |
| sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, ax=ax) | |
| plt.title('Funded Amount by Selected Variable') | |
| plt.xlabel(plot_type) | |
| plt.ylabel('Funded Amount') | |
| plt.xticks(rotation=45) | |
| st.pyplot(fig) | |
| # Display description for boxplot | |
| st.write(f"This boxplot shows the distribution of funded amounts for the top {num_columns} {plot_type.replace('_', ' ')}. It provides insights into the spread and outliers of funded amounts.") | |
| # Page 4: Other Plots | |
| elif page == "Repayment Interval by Selected Variable": | |
| st.subheader('Repayment Interval by Selected Variable') | |
| # Dropdown for selecting variable for Seaborn countplot | |
| plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country']) | |
| # Slider to select the number of top values to display for Seaborn countplot | |
| num_top_values = st.slider( | |
| "Select Number of Top Values to Display", | |
| min_value=5, | |
| max_value=50, | |
| value=10, # default value | |
| step=1 | |
| ) | |
| # Filter the data based on the selected variable and number of top values | |
| if plot_var == 'sector': | |
| top_values_plot = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg('count').nlargest(num_top_values).index | |
| filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_plot)] | |
| description = f"This countplot shows the distribution of repayment intervals for the top {num_top_values} sectors based on the number of loans." | |
| elif plot_var == 'country': | |
| top_values_plot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index | |
| filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)] | |
| description = f"This countplot illustrates the distribution of repayment intervals for the top {num_top_values} countries based on the number of loans." | |
| # Display description | |
| st.write(description) | |
| # Create a count plot | |
| fig, ax = plt.subplots(figsize=(10, 6)) | |
| # Count the occurrences of repayment intervals for the filtered data | |
| count_data = filtered_df_plot.groupby('repayment_interval')[plot_var].value_counts().unstack(fill_value=0) | |
| # Calculate total counts for sorting | |
| total_counts = count_data.sum(axis=1) | |
| # Sort the repayment intervals based on the total count of loans in descending order | |
| sorted_index = total_counts.sort_values(ascending=False).index | |
| count_data = count_data.loc[sorted_index] | |
| # Create a grouped bar plot | |
| count_data.plot(kind='bar', ax=ax, position=0, width=0.8) | |
| plt.title(f'Repayment Interval by {plot_var.replace("_", " ").title()}') | |
| plt.xlabel('Repayment Interval') | |
| plt.ylabel('Count of Loans') | |
| plt.xticks(rotation=45) | |
| plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left') | |
| st.pyplot(fig) | |
| # Page 5: Country Comparison | |
| elif page == "Country Comparison Deepdive": | |
| st.subheader("Country Comparison Deepdive") | |
| # Multi-select for countries | |
| selected_countries = st.multiselect("Select Countries to Compare", options=df_kiva_loans_cleaned['country'].unique()) | |
| # Option to choose between count or sum of funded amounts | |
| aggregation_option = st.radio("Select Aggregation Type:", ("Count", "Sum")) | |
| if selected_countries: | |
| # Filter the data based on selected countries | |
| filtered_data = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(selected_countries)] | |
| # Create a combined bar plot for sector summary | |
| st.subheader("Total Funded Amounts by Sector for Selected Countries") | |
| if aggregation_option == "Sum": | |
| sector_summary = filtered_data.groupby(['country', 'sector']).agg( | |
| total_funded_amount=('funded_amount', 'sum') | |
| ).reset_index() | |
| else: # Count | |
| sector_summary = filtered_data.groupby(['country', 'sector']).agg( | |
| total_funded_amount=('funded_amount', 'count') | |
| ).reset_index() | |
| fig, ax = plt.subplots(figsize=(12, 6)) | |
| sns.barplot(x='sector', y='total_funded_amount', hue='country', data=sector_summary, ax=ax) | |
| plt.title(f'Total Funded Amount by Sector for Selected Countries ({aggregation_option})') | |
| plt.xlabel('Sector') | |
| plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans') | |
| plt.xticks(rotation=45) | |
| st.pyplot(fig) | |
| # Create a combined bar plot for repayment summary | |
| st.subheader("Total Funded Amounts by Repayment Interval for Selected Countries") | |
| if aggregation_option == "Sum": | |
| repayment_summary = filtered_data.groupby(['country', 'repayment_interval']).agg( | |
| total_funded_amount=('funded_amount', 'sum') | |
| ).reset_index() | |
| else: # Count | |
| repayment_summary = filtered_data.groupby(['country', 'repayment_interval']).agg( | |
| total_funded_amount=('funded_amount', 'count') | |
| ).reset_index() | |
| fig, ax = plt.subplots(figsize=(12, 6)) | |
| sns.barplot(x='repayment_interval', y='total_funded_amount', hue='country', data=repayment_summary, ax=ax) | |
| plt.title(f'Total Funded Amount by Repayment Interval for Selected Countries ({aggregation_option})') | |
| plt.xlabel('Repayment Interval') | |
| plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans') | |
| plt.xticks(rotation=45) | |
| st.pyplot(fig) | |
| else: | |
| st.write("Please select one or more countries to compare.") | |
| # Page 6: Sector Comparison | |
| elif page == "Sector Comparison Deepdive": | |
| st.subheader("Sector Comparison Deepdive") | |
| # Multi-select for sectors | |
| selected_sectors = st.multiselect("Select Sectors to Compare", options=df_kiva_loans_cleaned['sector'].unique()) | |
| # Option to choose between count or sum of funded amounts | |
| aggregation_option = st.radio("Select Aggregation Type:", ("Count", "Sum")) | |
| if selected_sectors: | |
| # Filter the data based on selected sectors | |
| filtered_data = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(selected_sectors)] | |
| # Create a combined bar plot for sector summary by country | |
| st.subheader("Total Funded Amounts by Country for Selected Sectors") | |
| if aggregation_option == "Sum": | |
| country_summary = filtered_data.groupby(['country', 'sector']).agg( | |
| total_funded_amount=('funded_amount', 'sum') | |
| ).reset_index() | |
| else: # Count | |
| country_summary = filtered_data.groupby(['country', 'sector']).agg( | |
| total_funded_amount=('funded_amount', 'count') | |
| ).reset_index() | |
| fig, ax = plt.subplots(figsize=(12, 6)) | |
| sns.barplot(x='country', y='total_funded_amount', hue='sector', data=country_summary, ax=ax) | |
| plt.title(f'Total Funded Amount by Country for Selected Sectors ({aggregation_option})') | |
| plt.xlabel('Country') | |
| plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans') | |
| plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left') | |
| plt.xticks(rotation=90) | |
| st.pyplot(fig) | |
| # Create a combined bar plot for repayment summary | |
| st.subheader("Total Funded Amounts by Repayment Interval for Selected Sectors") | |
| if aggregation_option == "Sum": | |
| repayment_summary = filtered_data.groupby(['repayment_interval', 'sector']).agg( | |
| total_funded_amount=('funded_amount', 'sum') | |
| ).reset_index() | |
| else: # Count | |
| repayment_summary = filtered_data.groupby(['repayment_interval', 'sector']).agg( | |
| total_funded_amount=('funded_amount', 'count') | |
| ).reset_index() | |
| fig, ax = plt.subplots(figsize=(12, 6)) | |
| sns.barplot(x='repayment_interval', y='total_funded_amount', hue='sector', data=repayment_summary, ax=ax) | |
| plt.title(f'Total Funded Amount by Repayment Interval for Selected Sectors ({aggregation_option})') | |
| plt.xlabel('Repayment Interval') | |
| plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans') | |
| plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left') | |
| plt.xticks(rotation=90) | |
| st.pyplot(fig) | |
| else: | |
| st.write("Please select one or more sectors to compare.") | |