Spaces:

Tryfonas
/

WeeklyAssignment_Part2

Sleeping

App Files Community

Tryfonas committed on Sep 12, 2024

Commit

be26401

verified ·

1 Parent(s): fb89c2f

Upload folder using huggingface_hub

Browse files

Files changed (1) hide show

app.py +303 -137

app.py CHANGED Viewed

@@ -6,146 +6,312 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 from scipy.stats import zscore
-# Load data
-file_path = 'kiva_loans.csv'
-df_kiva_loans = pd.read_csv(file_path)
-# Clean data
-df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1)
-df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True)
-# Calculate Z-scores
-z_scores = zscore(df_kiva_loans['funded_amount'])
-df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
-df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
 # Streamlit App Title
 st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')
-# Display the cleaned data table
-st.table(df_kiva_loans_cleaned.head())
-# Dropdown and slider for Altair chart
-st.subheader('Distribution of Funded Amounts')
-# Altair chart: simple distribution of funded amounts
-chart = alt.Chart(df_kiva_loans_cleaned).mark_bar().encode(
-    alt.X('funded_amount', bin=alt.Bin(maxbins=50)),  # Use funded_amount for distribution
-    y='count()',
-).properties(
-    title='Distribution of Funded Amounts'
-)
-st.altair_chart(chart, use_container_width=True)
-# Dropdown and slider for Matplotlib dual-axis plot
-st.subheader('Top Values by Selected Variable')
-# Dropdown for plot type
-plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
-# Slider to select the number of top values to display
-num_columns = st.slider(
-    "Select Number of Columns to Display",
-    min_value=5,
-    max_value=50,
-    value=10,  # default value
-    step=1
-)
-# Select the top values based on the selected variable and number of columns
-if plot_type == 'country':
-    top_values = df_kiva_loans.groupby('country')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
-    x_column = 'country'
-    count_column = 'count'
-elif plot_type == 'repayment_interval':
-    top_values = df_kiva_loans.groupby('repayment_interval')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
-    x_column = 'repayment_interval'
-    count_column = 'count'
-else:  # sector
-    top_values = df_kiva_loans.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
-    x_column = 'sector'
-    count_column = 'count'
-# Create a dual-axis bar plot using Matplotlib
-fig, ax1 = plt.subplots(figsize=(12, 9))
-plt.xticks(rotation=90)
-# Bar plot for funded_amount
-color = 'tab:blue'
-ax1.set_xlabel(x_column.replace("_", " ").title())
-ax1.set_ylabel('Funded Amount', color=color)
-ax1.bar(top_values[x_column], top_values['sum'], color=color, alpha=0.6, label='Funded Amount')
-ax1.tick_params(axis='y', labelcolor=color)
-# Create a second y-axis for count
-ax2 = ax1.twinx()
-color = 'tab:red'
-ax2.set_ylabel('Count', color=color)
-ax2.plot(top_values[x_column], top_values[count_column], color=color, marker='o', linestyle='-', linewidth=2, label='Count')
-ax2.tick_params(axis='y', labelcolor=color)
-# Add titles and labels
-plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
-fig.tight_layout()
-st.pyplot(fig)
-# Boxplot (or Violin Plot) after the dual-axis plot
-st.subheader('Funded Amount vs. Selected Variable')
-# Filter the data based on the selected variable and number of top values
-if plot_type == 'sector':
-    top_values_boxplot = df_kiva_loans.groupby('sector')['funded_amount'].agg('sum').nlargest(num_columns).index
-    filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_boxplot)]
-elif plot_type == 'country':
-    top_values_boxplot = df_kiva_loans.groupby('country')['funded_amount'].agg('sum').nlargest(num_columns).index
-    filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_boxplot)]
-else:  # repayment_interval
-    filtered_df_boxplot = df_kiva_loans_cleaned
-# Create a boxplot
-fig, ax = plt.subplots(figsize=(12, 6))
-if plot_type != 'repayment_interval':
-    # Use sorted values for 'sector' and 'country'
-    top_values_sorted = df_kiva_loans.groupby(plot_type)['funded_amount'].agg('sum').nlargest(num_columns).index
-    sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, order=top_values_sorted, ax=ax)
-else:
-    # No specific sorting needed for 'repayment_interval'
-    sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, ax=ax)
-plt.title('Funded Amount by Selected Variable')
-plt.xlabel(plot_type)
-plt.ylabel('Funded Amount')
-plt.xticks(rotation=45)
-st.pyplot(fig)
-# Dropdown for Seaborn countplot
-st.subheader('Repayment Interval by Selected Variable')
-# Dropdown for selecting variable for Seaborn countplot
-plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])
-# Slider to select the number of top values to display for Seaborn countplot
-num_top_values = st.slider(
-    "Select Number of Top Values to Display",
-    min_value=5,
-    max_value=50,
-    value=10,  # default value
-    step=1
-)
-# Filter the data based on the selected variable and number of top values
-if plot_var == 'sector':
-    top_values_plot = df_kiva_loans.groupby('sector')['funded_amount'].agg('count').nlargest(num_top_values).index
-    filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_plot)]
-elif plot_var == 'country':
-    top_values_plot = df_kiva_loans.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
-    filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
-# Create Seaborn countplot
-fig, ax = plt.subplots(figsize=(10, 6))
-sns.countplot(x='repayment_interval', hue=plot_var, data=filtered_df_plot, ax=ax)
-plt.title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
-plt.xlabel('Repayment Interval')
-plt.xticks(rotation=90)
-plt.ylabel('Count')
-plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
-st.pyplot(fig)

 import seaborn as sns
 from scipy.stats import zscore
+# Function to load and clean data
+@st.cache_data
+def load_and_clean_data(file_path):
+    # Load data
+    df_kiva_loans = pd.read_csv(file_path)
+    # Clean data
+    df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1)
+    df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True)
+    # Calculate Z-scores
+    z_scores = zscore(df_kiva_loans['funded_amount'])
+    df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
+    df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
+    return df_kiva_loans_cleaned
+# Load the cleaned data
+file_path = 'kiva_loans.csv'
+df_kiva_loans_cleaned = load_and_clean_data(file_path)
 # Streamlit App Title
 st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')
+# Sidebar for navigation
+st.sidebar.title("Navigation")
+page = st.sidebar.radio("Select a page:", ["Introduction", "Data Overview", "Top Values by Selected Variable", "Repayment Interval by Selected Variable", "Country Comparison Deepdive", "Sector Comparison Deepdive"])
+# Introduction Page
+if page == "Introduction":
+    st.subheader("Introduction")
+    st.write("""
+        This application provides insights into Kiva loans data.
+        You can explore the distribution of funded amounts,
+        analyze top values by selected variables, and visualize
+        relationships between funded amounts and various factors.
+    """)
+# Data Overview Page
+elif page == "Data Overview":
+    st.subheader("Data Overview")
+    st.write("Here is a preview of the cleaned Kiva loans data:")
+    # Display the cleaned data table
+    st.table(df_kiva_loans_cleaned.head())
+    # Distribution of Funded Amounts
+    st.subheader('Distribution of Funded Amounts')
+    chart = alt.Chart(df_kiva_loans_cleaned).mark_bar().encode(
+        alt.X('funded_amount', bin=alt.Bin(maxbins=50)),  # Use funded_amount for distribution
+        y='count()',
+    ).properties(
+        title='Distribution of Funded Amounts'
+    )
+    st.altair_chart(chart, use_container_width=True)
+    st.write("This chart shows the distribution of funded amounts for Kiva loans. The x-axis represents the funded amount, while the y-axis shows the count of loans that fall within each bin.")
+# Page 3: Top Values by Selected Variable
+elif page == "Top Values by Selected Variable":
+    st.subheader('Top Values by Selected Variable')
+    # Dropdown for plot type
+    plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
+    # Slider to select the number of top values to display
+    num_columns = st.slider(
+        "Select Number of Columns to Display",
+        min_value=5,
+        max_value=50,
+        value=10,  # default value
+        step=1
+    )
+    # Select the top values based on the selected variable and number of columns
+    if plot_type == 'country':
+        top_values = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
+        x_column = 'country'
+        count_column = 'count'
+        description = f"This chart displays the top {num_columns} countries by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans."
+    elif plot_type == 'repayment_interval':
+        top_values = df_kiva_loans_cleaned.groupby('repayment_interval')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
+        x_column = 'repayment_interval'
+        count_column = 'count'
+        description = f"This chart shows the top {num_columns} repayment intervals by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans."
+    else:  # sector
+        top_values = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg(['sum', 'count']).nlargest(num_columns, 'sum').reset_index()
+        x_column = 'sector'
+        count_column = 'count'
+        description = f"This chart illustrates the top {num_columns} sectors by total funded amount. The blue bars represent the total funded amount, while the red line indicates the count of loans."
+    # Display description
+    st.write(description)
+    # Create a dual-axis bar plot using Matplotlib
+    fig, ax1 = plt.subplots(figsize=(12, 9))
+    plt.xticks(rotation=90)
+    # Bar plot for funded_amount
+    color = 'tab:blue'
+    ax1.set_xlabel(x_column.replace("_", " ").title())
+    ax1.set_ylabel('Funded Amount', color=color)
+    ax1.bar(top_values[x_column], top_values['sum'], color=color, alpha=0.6, label='Funded Amount')
+    ax1.tick_params(axis='y', labelcolor=color)
+    # Create a second y-axis for count
+    ax2 = ax1.twinx()
+    color = 'tab:red'
+    ax2.set_ylabel('Count', color=color)
+    ax2.plot(top_values[x_column], top_values[count_column], color=color, marker='o', linestyle='-', linewidth=2, label='Count')
+    ax2.tick_params(axis='y', labelcolor=color)
+    # Add titles and labels
+    plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
+    fig.tight_layout()
+    st.pyplot(fig)
+    # Boxplot after the dual-axis plot
+    st.subheader('Funded Amount vs. Selected Variable')
+    # Filter the data based on the selected variable and number of top values
+    if plot_type == 'sector':
+        top_values_boxplot = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg('sum').nlargest(num_columns).index
+        filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_boxplot)]
+    elif plot_type == 'country':
+        top_values_boxplot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('sum').nlargest(num_columns).index
+        filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_boxplot)]
+    else:  # repayment_interval
+        filtered_df_boxplot = df_kiva_loans_cleaned
+    # Create a boxplot
+    fig, ax = plt.subplots(figsize=(12, 6))
+    if plot_type != 'repayment_interval':
+        top_values_sorted = df_kiva_loans_cleaned.groupby(plot_type)['funded_amount'].agg('sum').nlargest(num_columns).index
+        sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, order=top_values_sorted, ax=ax)
+    else:
+        sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, ax=ax)
+    plt.title('Funded Amount by Selected Variable')
+    plt.xlabel(plot_type)
+    plt.ylabel('Funded Amount')
+    plt.xticks(rotation=45)
+    st.pyplot(fig)
+    # Display description for boxplot
+    st.write(f"This boxplot shows the distribution of funded amounts for the top {num_columns} {plot_type.replace('_', ' ')}. It provides insights into the spread and outliers of funded amounts.")
+# Page 4: Repayment Interval by Selected Variable
+elif page == "Repayment Interval by Selected Variable":
+    st.subheader('Repayment Interval by Selected Variable')
+    # Dropdown for selecting variable for Seaborn countplot
+    plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])
+    # Slider to select the number of top values to display for Seaborn countplot
+    num_top_values = st.slider(
+        "Select Number of Top Values to Display",
+        min_value=5,
+        max_value=50,
+        value=10,  # default value
+        step=1
+    )
+    # Filter the data based on the selected variable and number of top values
+    if plot_var == 'sector':
+        top_values_plot = df_kiva_loans_cleaned.groupby('sector')['funded_amount'].agg('count').nlargest(num_top_values).index
+        filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_plot)]
+        description = f"This countplot shows the distribution of repayment intervals for the top {num_top_values} sectors based on the number of loans."
+    elif plot_var == 'country':
+        top_values_plot = df_kiva_loans_cleaned.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
+        filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
+        description = f"This countplot illustrates the distribution of repayment intervals for the top {num_top_values} countries based on the number of loans."
+    # Display description
+    st.write(description)
+    # Create a count plot
+    fig, ax = plt.subplots(figsize=(10, 6))
+    # Count the occurrences of repayment intervals for the filtered data
+    count_data = filtered_df_plot.groupby('repayment_interval')[plot_var].value_counts().unstack(fill_value=0)
+    # Calculate total counts for sorting
+    total_counts = count_data.sum(axis=1)
+    # Sort the repayment intervals based on the total count of loans in descending order
+    sorted_index = total_counts.sort_values(ascending=False).index
+    count_data = count_data.loc[sorted_index]
+    # Create a grouped bar plot
+    count_data.plot(kind='bar', ax=ax, position=0, width=0.8)
+    plt.title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
+    plt.xlabel('Repayment Interval')
+    plt.ylabel('Count of Loans')
+    plt.xticks(rotation=45)
+    plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
+    st.pyplot(fig)
+# Page 5: Country Comparison
+elif page == "Country Comparison Deepdive":
+    st.subheader("Country Comparison Deepdive")
+    # Multi-select for countries
+    selected_countries = st.multiselect("Select Countries to Compare", options=df_kiva_loans_cleaned['country'].unique())
+    # Option to choose between count or sum of funded amounts
+    aggregation_option = st.radio("Select Aggregation Type:", ("Count", "Sum"))
+    if selected_countries:
+        # Filter the data based on selected countries
+        filtered_data = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(selected_countries)]
+        # Create a combined bar plot for sector summary
+        st.subheader("Total Funded Amounts by Sector for Selected Countries")
+        if aggregation_option == "Sum":
+            sector_summary = filtered_data.groupby(['country', 'sector']).agg(
+                total_funded_amount=('funded_amount', 'sum')
+            ).reset_index()
+        else:  # Count
+            sector_summary = filtered_data.groupby(['country', 'sector']).agg(
+                total_funded_amount=('funded_amount', 'count')
+            ).reset_index()
+        fig, ax = plt.subplots(figsize=(12, 6))
+        sns.barplot(x='sector', y='total_funded_amount', hue='country', data=sector_summary, ax=ax)
+        plt.title(f'Total Funded Amount by Sector for Selected Countries ({aggregation_option})')
+        plt.xlabel('Sector')
+        plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
+        plt.xticks(rotation=45)
+        st.pyplot(fig)
+        # Create a combined bar plot for repayment summary
+        st.subheader("Total Funded Amounts by Repayment Interval for Selected Countries")
+        if aggregation_option == "Sum":
+            repayment_summary = filtered_data.groupby(['country', 'repayment_interval']).agg(
+                total_funded_amount=('funded_amount', 'sum')
+            ).reset_index()
+        else:  # Count
+            repayment_summary = filtered_data.groupby(['country', 'repayment_interval']).agg(
+                total_funded_amount=('funded_amount', 'count')
+            ).reset_index()
+        fig, ax = plt.subplots(figsize=(12, 6))
+        sns.barplot(x='repayment_interval', y='total_funded_amount', hue='country', data=repayment_summary, ax=ax)
+        plt.title(f'Total Funded Amount by Repayment Interval for Selected Countries ({aggregation_option})')
+        plt.xlabel('Repayment Interval')
+        plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
+        plt.xticks(rotation=45)
+        st.pyplot(fig)
+    else:
+        st.write("Please select one or more countries to compare.")
+# Page 6: Sector Comparison
+elif page == "Sector Comparison Deepdive":
+    st.subheader("Sector Comparison Deepdive")
+    # Multi-select for sectors
+    selected_sectors = st.multiselect("Select Sectors to Compare", options=df_kiva_loans_cleaned['sector'].unique())
+    # Option to choose between count or sum of funded amounts
+    aggregation_option = st.radio("Select Aggregation Type:", ("Count", "Sum"))
+    if selected_sectors:
+        # Filter the data based on selected sectors
+        filtered_data = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(selected_sectors)]
+        # Create a combined bar plot for sector summary by country
+        st.subheader("Total Funded Amounts by Country for Selected Sectors")
+        if aggregation_option == "Sum":
+            country_summary = filtered_data.groupby(['country', 'sector']).agg(
+                total_funded_amount=('funded_amount', 'sum')
+            ).reset_index()
+        else:  # Count
+            country_summary = filtered_data.groupby(['country', 'sector']).agg(
+                total_funded_amount=('funded_amount', 'count')
+            ).reset_index()
+        fig, ax = plt.subplots(figsize=(12, 6))
+        sns.barplot(x='country', y='total_funded_amount', hue='sector', data=country_summary, ax=ax)
+        plt.title(f'Total Funded Amount by Country for Selected Sectors ({aggregation_option})')
+        plt.xlabel('Country')
+        plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
+        plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.xticks(rotation=90)
+        st.pyplot(fig)
+        # Create a combined bar plot for repayment summary
+        st.subheader("Total Funded Amounts by Repayment Interval for Selected Sectors")
+        if aggregation_option == "Sum":
+            repayment_summary = filtered_data.groupby(['repayment_interval', 'sector']).agg(
+                total_funded_amount=('funded_amount', 'sum')
+            ).reset_index()
+        else:  # Count
+            repayment_summary = filtered_data.groupby(['repayment_interval', 'sector']).agg(
+                total_funded_amount=('funded_amount', 'count')
+            ).reset_index()
+        fig, ax = plt.subplots(figsize=(12, 6))
+        sns.barplot(x='repayment_interval', y='total_funded_amount', hue='sector', data=repayment_summary, ax=ax)
+        plt.title(f'Total Funded Amount by Repayment Interval for Selected Sectors ({aggregation_option})')
+        plt.xlabel('Repayment Interval')
+        plt.ylabel('Total Funded Amount' if aggregation_option == "Sum" else 'Count of Loans')
+        plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')
+        plt.xticks(rotation=90)
+        st.pyplot(fig)
+    else:
+        st.write("Please select one or more sectors to compare.")