Spaces:

Tryfonas
/

WeeklyAssignment_Part2

Sleeping

App Files Community

Tryfonas committed on Sep 12, 2024

Commit

fb89c2f

verified ·

1 Parent(s): 00549a0

Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

KIVA___BDS24_Assignment_Karmiris_Tryfonas.ipynb +1 -1
__pycache__/dual_axis_and_boxplot.cpython-312.pyc +0 -0
__pycache__/repayment_interval_chart.cpython-312.pyc +0 -0
app.py +93 -19
data/kiva_mpi_region_locations.csv +0 -0
data/loan_theme_ids.csv +3 -0
data/loan_themes_by_region.csv +0 -0

KIVA___BDS24_Assignment_Karmiris_Tryfonas.ipynb CHANGED Viewed

@@ -4894,7 +4894,7 @@
         "plt.xlabel('Sector')\n",
         "plt.xticks(rotation=90)\n",
         "plt.ylabel('Count')\n",
-        "plt.legend(title='Repayment Interval', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
         "plt.show()"
       ]
     },

         "plt.xlabel('Sector')\n",
         "plt.xticks(rotation=90)\n",
         "plt.ylabel('Count')\n",
+        "plt.legend(title='Sector', bbox_to_anchor=(1.05, 1), loc='upper left')\n",
         "plt.show()"
       ]
     },

__pycache__/dual_axis_and_boxplot.cpython-312.pyc ADDED Viewed

Binary file (4.82 kB). View file

__pycache__/repayment_interval_chart.cpython-312.pyc ADDED Viewed

Binary file (2.97 kB). View file

app.py CHANGED Viewed

@@ -3,35 +3,50 @@ import streamlit as st
 import pandas as pd
 import altair as alt
 import matplotlib.pyplot as plt
-from scipy.stats import zscore
 import seaborn as sns
-file_path= 'kiva_loans.csv'
 df_kiva_loans = pd.read_csv(file_path)
-df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time','funded_time','posted_time','tags'], axis=1)
-#drop nas on specific columns not all of them, it doesnt affect the task we actually want to do now, but might need for later use
-df_kiva_loans.dropna(subset=['partner_id','borrower_genders'], inplace=True)
 # Calculate Z-scores
 z_scores = zscore(df_kiva_loans['funded_amount'])
-# Get boolean array indicating the presence of outliers
 df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
 df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
-st.title('BDS24_Weekly_Assignment_Week 2| Tryfonas Karmiris')
-# Sidebar selection for the type of plot
-plot_type = st.sidebar.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
 # Slider to select the number of top values to display
-num_columns = st.sidebar.slider(
     "Select Number of Columns to Display",
     min_value=5,
-    max_value=20,
     value=10,  # default value
     step=1
 )
@@ -50,7 +65,7 @@ else:  # sector
     x_column = 'sector'
     count_column = 'count'
-# Create a bar plot with dual axes
 fig, ax1 = plt.subplots(figsize=(12, 9))
 plt.xticks(rotation=90)
@@ -71,7 +86,66 @@ ax2.tick_params(axis='y', labelcolor=color)
 # Add titles and labels
 plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
 fig.tight_layout()
-# Display the plot in Streamlit
-st.pyplot(fig)

 import pandas as pd
 import altair as alt
 import matplotlib.pyplot as plt
 import seaborn as sns
+from scipy.stats import zscore
+# Load data
+file_path = 'kiva_loans.csv'
 df_kiva_loans = pd.read_csv(file_path)
+# Clean data
+df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1)
+df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True)
 # Calculate Z-scores
 z_scores = zscore(df_kiva_loans['funded_amount'])
 df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)
 df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]
+# Streamlit App Title
+st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')
+# Display the cleaned data table
+st.table(df_kiva_loans_cleaned.head())
+# Dropdown and slider for Altair chart
+st.subheader('Distribution of Funded Amounts')
+# Altair chart: simple distribution of funded amounts
+chart = alt.Chart(df_kiva_loans_cleaned).mark_bar().encode(
+    alt.X('funded_amount', bin=alt.Bin(maxbins=50)),  # Use funded_amount for distribution
+    y='count()',
+).properties(
+    title='Distribution of Funded Amounts'
+)
+st.altair_chart(chart, use_container_width=True)
+# Dropdown and slider for Matplotlib dual-axis plot
+st.subheader('Top Values by Selected Variable')
+# Dropdown for plot type
+plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])
 # Slider to select the number of top values to display
+num_columns = st.slider(
     "Select Number of Columns to Display",
     min_value=5,
+    max_value=50,
     value=10,  # default value
     step=1
 )
     x_column = 'sector'
     count_column = 'count'
+# Create a dual-axis bar plot using Matplotlib
 fig, ax1 = plt.subplots(figsize=(12, 9))
 plt.xticks(rotation=90)
 # Add titles and labels
 plt.title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
 fig.tight_layout()
+st.pyplot(fig)
+# Boxplot (or Violin Plot) after the dual-axis plot
+st.subheader('Funded Amount vs. Selected Variable')
+# Filter the data based on the selected variable and number of top values
+if plot_type == 'sector':
+    top_values_boxplot = df_kiva_loans.groupby('sector')['funded_amount'].agg('sum').nlargest(num_columns).index
+    filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_boxplot)]
+elif plot_type == 'country':
+    top_values_boxplot = df_kiva_loans.groupby('country')['funded_amount'].agg('sum').nlargest(num_columns).index
+    filtered_df_boxplot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_boxplot)]
+else:  # repayment_interval
+    filtered_df_boxplot = df_kiva_loans_cleaned
+# Create a boxplot
+fig, ax = plt.subplots(figsize=(12, 6))
+if plot_type != 'repayment_interval':
+    # Use sorted values for 'sector' and 'country'
+    top_values_sorted = df_kiva_loans.groupby(plot_type)['funded_amount'].agg('sum').nlargest(num_columns).index
+    sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, order=top_values_sorted, ax=ax)
+else:
+    # No specific sorting needed for 'repayment_interval'
+    sns.boxplot(x=plot_type, y='funded_amount', data=filtered_df_boxplot, ax=ax)
+plt.title('Funded Amount by Selected Variable')
+plt.xlabel(plot_type)
+plt.ylabel('Funded Amount')
+plt.xticks(rotation=45)
+st.pyplot(fig)
+# Dropdown for Seaborn countplot
+st.subheader('Repayment Interval by Selected Variable')
+# Dropdown for selecting variable for Seaborn countplot
+plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])
+# Slider to select the number of top values to display for Seaborn countplot
+num_top_values = st.slider(
+    "Select Number of Top Values to Display",
+    min_value=5,
+    max_value=50,
+    value=10,  # default value
+    step=1
+)
+# Filter the data based on the selected variable and number of top values
+if plot_var == 'sector':
+    top_values_plot = df_kiva_loans.groupby('sector')['funded_amount'].agg('count').nlargest(num_top_values).index
+    filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['sector'].isin(top_values_plot)]
+elif plot_var == 'country':
+    top_values_plot = df_kiva_loans.groupby('country')['funded_amount'].agg('count').nlargest(num_top_values).index
+    filtered_df_plot = df_kiva_loans_cleaned[df_kiva_loans_cleaned['country'].isin(top_values_plot)]
+# Create Seaborn countplot
+fig, ax = plt.subplots(figsize=(10, 6))
+sns.countplot(x='repayment_interval', hue=plot_var, data=filtered_df_plot, ax=ax)
+plt.title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
+plt.xlabel('Repayment Interval')
+plt.xticks(rotation=90)
+plt.ylabel('Count')
+plt.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
+st.pyplot(fig)

data/kiva_mpi_region_locations.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/loan_theme_ids.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:48f3d922eef1d329ba913d0ec3c1b88714014f45ec5940a4084c311f4a455baa
+size 31641314

data/loan_themes_by_region.csv ADDED Viewed

The diff for this file is too large to render. See raw diff