Spaces:

Tryfonas
/

WeeklyAssignment_Part2

Sleeping

App Files Files Community

WeeklyAssignment_Part2 / app.py

Tryfonas

Upload folder using huggingface_hub

fb89c2f verified over 1 year ago

raw

history blame

6.21 kB

	# Import necessary libraries
	import streamlit as st
	import pandas as pd
	import altair as alt
	import matplotlib.pyplot as plt
	import seaborn as sns
	from scipy.stats import zscore

	# Load the Kiva loans dataset; the CSV is expected next to this script.
	file_path = 'kiva_loans.csv'
	df_kiva_loans = pd.read_csv(file_path)

	# Clean data: drop columns the app never uses, then discard rows that lack
	# a partner id or borrower gender information.
	df_kiva_loans = df_kiva_loans.drop(['use', 'disbursed_time', 'funded_time', 'posted_time', 'tags'], axis=1)
	df_kiva_loans.dropna(subset=['partner_id', 'borrower_genders'], inplace=True)

	# Flag outliers: a loan whose funded amount lies more than three standard
	# deviations from the mean (|z| > 3) is marked in 'outlier_funded_amount'.
	# The two masks are combined with the element-wise OR operator `|`.
	z_scores = zscore(df_kiva_loans['funded_amount'])
	df_kiva_loans['outlier_funded_amount'] = (z_scores > 3) | (z_scores < -3)

	# df_kiva_loans keeps every row (with the flag column); the *_cleaned frame
	# contains only the rows that were not flagged as outliers.
	df_kiva_loans_cleaned = df_kiva_loans[~df_kiva_loans['outlier_funded_amount']]

	# Streamlit app title. A plain '|' is used as the separator: '\|' is not a
	# valid Python escape sequence (it triggers a SyntaxWarning on recent
	# interpreters), and st.title renders Markdown, so the visible title is the same.
	st.title('BDS24_Weekly_Assignment_Week 2 | Tryfonas Karmiris')

	# Show the first rows of the outlier-free data as a static table.
	st.table(df_kiva_loans_cleaned.head())

	# Histogram of funded amounts (outlier-free data), rendered with Altair.
	st.subheader('Distribution of Funded Amounts')

	# X channel: funded_amount grouped into at most 50 bins.
	funded_amount_bins = alt.X('funded_amount', bin=alt.Bin(maxbins=50))

	# Bar height is the number of loans that fall into each bin.
	histogram = (
	    alt.Chart(df_kiva_loans_cleaned)
	    .mark_bar()
	    .encode(funded_amount_bins, y='count()')
	    .properties(title='Distribution of Funded Amounts')
	)
	st.altair_chart(histogram, use_container_width=True)

	# Dual-axis Matplotlib chart: total funded amount (bars, left axis) and
	# number of loans (line, right axis) for the top categories of a variable.
	st.subheader('Top Values by Selected Variable')

	# Dropdown: the column to group by. Later sections of the script reuse
	# `plot_type` and `num_columns`, so these names must stay as they are.
	plot_type = st.selectbox("Select Variable to Display", ['country', 'repayment_interval', 'sector'])

	# Slider: how many top categories to display.
	num_columns = st.slider(
	    "Select Number of Columns to Display",
	    min_value=5,
	    max_value=50,
	    value=10,  # default value
	    step=1,
	)

	# Every selectable variable is aggregated the same way, so group directly
	# on the selected column: sum and count of funded_amount per category,
	# keeping the `num_columns` categories with the largest total.
	top_values = (
	    df_kiva_loans.groupby(plot_type)['funded_amount']
	    .agg(['sum', 'count'])
	    .nlargest(num_columns, 'sum')
	    .reset_index()
	)

	# Build the figure through explicit Axes methods rather than pyplot's
	# implicit "current axes" state.
	fig, ax1 = plt.subplots(figsize=(12, 9))

	# Left axis: bars for the total funded amount.
	bar_color = 'tab:blue'
	ax1.set_xlabel(plot_type.replace("_", " ").title())
	ax1.set_ylabel('Funded Amount', color=bar_color)
	ax1.bar(top_values[plot_type], top_values['sum'], color=bar_color, alpha=0.6, label='Funded Amount')
	ax1.tick_params(axis='y', labelcolor=bar_color)
	ax1.tick_params(axis='x', labelrotation=90)

	# Right axis (shares the x axis): line for the number of loans.
	ax2 = ax1.twinx()
	line_color = 'tab:red'
	ax2.set_ylabel('Count', color=line_color)
	ax2.plot(top_values[plot_type], top_values['count'], color=line_color, marker='o', linestyle='-', linewidth=2, label='Count')
	ax2.tick_params(axis='y', labelcolor=line_color)

	# Title, layout and rendering.
	ax1.set_title(f'Top {num_columns} by {plot_type.replace("_", " ").title()}')
	fig.tight_layout()
	st.pyplot(fig)

	# Streamlit reruns the script on every interaction; close the figure so
	# pyplot does not keep accumulating open figures.
	plt.close(fig)

	# Boxplot of funded amount per category of the variable selected above
	# (`plot_type`), limited to the top `num_columns` categories where relevant.
	st.subheader('Funded Amount vs. Selected Variable')

	if plot_type == 'repayment_interval':
	    # Repayment intervals are shown in full, with no explicit ordering.
	    top_values_boxplot = None
	    filtered_df_boxplot = df_kiva_loans_cleaned
	else:
	    # 'sector' / 'country': rank categories by total funded amount over the
	    # full data set, then plot only those categories from the outlier-free
	    # data. The same index also fixes the left-to-right order of the boxes.
	    top_values_boxplot = (
	        df_kiva_loans.groupby(plot_type)['funded_amount']
	        .agg('sum')
	        .nlargest(num_columns)
	        .index
	    )
	    filtered_df_boxplot = df_kiva_loans_cleaned[
	        df_kiva_loans_cleaned[plot_type].isin(top_values_boxplot)
	    ]

	# Draw the boxplot; order=None is seaborn's default.
	fig, ax = plt.subplots(figsize=(12, 6))
	sns.boxplot(
	    x=plot_type,
	    y='funded_amount',
	    data=filtered_df_boxplot,
	    order=top_values_boxplot,
	    ax=ax,
	)

	# Label through the Axes object rather than pyplot's implicit state.
	ax.set_title('Funded Amount by Selected Variable')
	ax.set_xlabel(plot_type)
	ax.set_ylabel('Funded Amount')
	ax.tick_params(axis='x', labelrotation=45)
	st.pyplot(fig)

	# Close the figure so repeated script reruns do not accumulate open figures.
	plt.close(fig)

	# Seaborn countplot: number of loans per repayment interval, split (hue)
	# by the top categories of a user-selected variable.
	st.subheader('Repayment Interval by Selected Variable')

	# Dropdown: the variable used for the hue split.
	plot_var = st.selectbox("Select Variable for Countplot", ['sector', 'country'])

	# Slider: how many top categories of that variable to include.
	num_top_values = st.slider(
	    "Select Number of Top Values to Display",
	    min_value=5,
	    max_value=50,
	    value=10,  # default value
	    step=1,
	)

	# Rank categories by number of loans over the full data set, then keep
	# only those categories from the outlier-free data. Both selectable
	# variables are handled the same way, so group directly on `plot_var`.
	top_values_plot = (
	    df_kiva_loans.groupby(plot_var)['funded_amount']
	    .agg('count')
	    .nlargest(num_top_values)
	    .index
	)
	filtered_df_plot = df_kiva_loans_cleaned[
	    df_kiva_loans_cleaned[plot_var].isin(top_values_plot)
	]

	# Draw and label the countplot through the Axes object.
	fig, ax = plt.subplots(figsize=(10, 6))
	sns.countplot(x='repayment_interval', hue=plot_var, data=filtered_df_plot, ax=ax)
	ax.set_title(f'Repayment Interval by {plot_var.replace("_", " ").title()}')
	ax.set_xlabel('Repayment Interval')
	ax.tick_params(axis='x', labelrotation=90)
	ax.set_ylabel('Count')

	# Anchor the legend just outside the right edge of the axes.
	ax.legend(title=plot_var.replace("_", " ").title(), bbox_to_anchor=(1.05, 1), loc='upper left')
	st.pyplot(fig)

	# Close the figure so repeated script reruns do not accumulate open figures.
	plt.close(fig)