Spaces:

reysarms
/

all_about_kpop

Sleeping

App Files Files Community

all_about_kpop / app.py

reysarms

updated files

dfaa369 about 1 year ago

raw

history blame contribute delete

3.1 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
	from sklearn.preprocessing import StandardScaler

	# Load dataset
	@st.cache_data
	def load_data():
	    """Read the K-pop idol dataset into a DataFrame.

	    The file ``kpopidolsv3.csv`` is resolved relative to the current
	    working directory. The result is wrapped by ``st.cache_data``.

	    Returns:
	        The parsed CSV contents as a ``pandas.DataFrame``.
	    """
	    return pd.read_csv("kpopidolsv3.csv")

	# Module-level dataset handle; main() reads this global when rendering
	# the sample table and when preprocessing for clustering.
	data = load_data()

	# Preprocess data
	def preprocess_data(data):
	    """Select complete rows and standardize the clustering features.

	    Rows missing either ``Height`` or ``Weight`` are dropped, then those two
	    columns are passed through a freshly fitted ``StandardScaler``.

	    Args:
	        data: DataFrame containing at least ``Height`` and ``Weight`` columns.

	    Returns:
	        A ``(scaled_features, df)`` tuple: the scaler output for the two
	        feature columns, and the filtered DataFrame the rows came from
	        (same row order).
	    """
	    feature_columns = ['Height', 'Weight']
	    complete_rows = data.dropna(subset=feature_columns)
	    standardized = StandardScaler().fit_transform(complete_rows[feature_columns])
	    return standardized, complete_rows

	# Perform hierarchical clustering
	def apply_hierarchical_clustering(scaled_features, method='ward'):
	    """Build the hierarchical-clustering linkage matrix for the observations.

	    Args:
	        scaled_features: Observation matrix forwarded unchanged to
	            ``scipy.cluster.hierarchy.linkage``.
	        method: Linkage method name passed to ``linkage``; defaults to
	            ``'ward'``.

	    Returns:
	        The linkage matrix produced by ``linkage``.
	    """
	    return linkage(scaled_features, method=method)

	# Sidebar widgets: the chosen cluster count is stored in a module-level
	# name that main() reads when cutting the dendrogram.
	st.sidebar.header("Clustering Parameters")
	num_clusters = st.sidebar.slider(
	    "Number of Clusters", min_value=2, max_value=10, value=3
	)

	def _render_about_tab():
	    """Render the static description shown on the "About the App" tab."""
	    st.header("📚 About the App")
	    st.markdown(
	        "This app groups K-pop idols based on their physical features (height, weight), company, and debut information using Hierarchical Clustering with Ward's Method."
	    )
	    # Adjacent literals with explicit newlines: a single-quoted string cannot
	    # span several physical lines, and keeping the text unindented stops the
	    # Markdown bullets from being interpreted as an indented code block.
	    st.markdown(
	        "### How It Works:\n"
	        "- Dendrogram Visualization: Explore hierarchical clusters.\n"
	        "- Dynamic Cluster Cutting: Set the number of clusters dynamically.\n"
	        "- Idol Comparison: Analyze clusters by different features."
	    )


	def _render_results_tab(data, num_clusters):
	    """Render the data sample, dendrogram and cluster assignments.

	    Args:
	        data: The raw idol DataFrame.
	        num_clusters: Maximum number of flat clusters to cut the tree into.

	    Returns:
	        The preprocessed DataFrame with a ``Cluster`` column added.
	    """
	    st.header("📊 Dataset Overview and Results")
	    st.write("### Sample Data")
	    st.dataframe(data.head())

	    # Preprocess and cluster
	    scaled_features, df_processed = preprocess_data(data)
	    Z = apply_hierarchical_clustering(scaled_features)

	    # Dendrogram: draw on an explicit Figure and hand that to Streamlit
	    # instead of the pyplot module (which relies on matplotlib's global
	    # state), then close it so figures do not accumulate across reruns.
	    st.write("### Dendrogram")
	    fig, ax = plt.subplots(figsize=(12, 6))
	    dendrogram(
	        Z,
	        labels=df_processed['Stage Name'].values,
	        leaf_rotation=90,
	        ax=ax,
	    )
	    st.pyplot(fig)
	    plt.close(fig)

	    # Cut the dendrogram into at most ``num_clusters`` flat clusters.
	    cluster_labels = fcluster(Z, num_clusters, criterion='maxclust')
	    df_processed['Cluster'] = cluster_labels
	    st.write("### Clustered Data Sample")
	    st.dataframe(df_processed[['Stage Name', 'Company', 'Nationality', 'Cluster']].head(10))
	    return df_processed


	def _render_explorer_tab(df_processed):
	    """Render the company/nationality filter over the clustered idols.

	    Args:
	        df_processed: Clustered DataFrame as returned by
	            ``_render_results_tab``.
	    """
	    st.header("🔎 Explore Idols by Company or Nationality")
	    option = st.selectbox("Filter idols by:", ["Company", "Nationality"])
	    selected_value = st.text_input(f"Enter {option} name:")

	    if not selected_value:
	        return

	    # regex=False: the text box holds a literal search term typed by the
	    # user; treating it as a pattern would raise on input such as "(".
	    matches = df_processed[option].str.contains(
	        selected_value, na=False, case=False, regex=False
	    )
	    filtered_data = df_processed[matches]
	    if filtered_data.empty:
	        st.warning(f"No idols found for {option}: {selected_value}")
	    else:
	        st.dataframe(filtered_data[['Stage Name', 'Company', 'Nationality', 'Cluster']])


	def main():
	    """Build the three-tab Streamlit page.

	    Reads the module-level ``data`` frame and ``num_clusters`` slider value.
	    The clustered frame produced while rendering the results tab is reused
	    by the explorer tab.
	    """
	    st.title("🎤 K-Pop Idol Clustering using Hierarchical Clustering")

	    # Tabs for Navigation
	    tab1, tab2, tab3 = st.tabs(["📚 About the App", "📊 Dataset & Results", "🔎 Explore Idols"])

	    with tab1:
	        _render_about_tab()

	    with tab2:
	        df_processed = _render_results_tab(data, num_clusters)

	    with tab3:
	        _render_explorer_tab(df_processed)

	# Script entry point: build the page only when this file is executed as
	# the main module, not when it is imported.
	if __name__ == "__main__":
	    main()