# all_about_kpop / app.py
# reysarms's picture
# updated files
# dfaa369
import streamlit as st
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.preprocessing import StandardScaler
# Load dataset
@st.cache_data
def load_data():
    """Read ``kpopidolsv3.csv`` into a DataFrame and return it.

    The function is wrapped in ``st.cache_data``; the CSV path is relative,
    so it is resolved against the process's working directory.
    """
    return pd.read_csv("kpopidolsv3.csv")


# Module-level dataset; main() reads this global directly.
data = load_data()
# Preprocess data
def preprocess_data(data):
    """Keep rows with both height and weight, and standardize those columns.

    Args:
        data: DataFrame with at least the 'Height' and 'Weight' columns.

    Returns:
        A ``(scaled_features, df)`` tuple. ``scaled_features`` is the result
        of ``StandardScaler().fit_transform`` on the two feature columns.
        ``df`` is an independent copy of the rows of ``data`` where both
        features are present; the original index is preserved, and its row
        order matches ``scaled_features``.
    """
    features = ['Height', 'Weight']
    # Copy explicitly: the caller adds a 'Cluster' column to the returned
    # frame, and assigning into the raw dropna() result can raise pandas'
    # SettingWithCopyWarning because it is still tracked as derived from
    # `data`.
    df = data.dropna(subset=features).copy()
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df[features])
    return scaled_features, df
# Perform hierarchical clustering
def apply_hierarchical_clustering(scaled_features, method='ward'):
    """Compute the hierarchical-clustering linkage matrix.

    Args:
        scaled_features: Observations, forwarded unchanged to
            ``scipy.cluster.hierarchy.linkage``.
        method: Linkage method name forwarded to ``linkage``
            (defaults to ``'ward'``).

    Returns:
        The linkage matrix returned by ``linkage``.
    """
    return linkage(scaled_features, method=method)
# Sidebar controls
# Sidebar widgets are created at module level, so `num_clusters` is a global
# that main() reads when it cuts the dendrogram.
st.sidebar.header("Clustering Parameters")
num_clusters = st.sidebar.slider(
    "Number of Clusters",
    min_value=2,
    max_value=10,
    value=3,
)
def _render_about_tab():
    """Render the static description of the app."""
    st.header("📚 About the App")
    # NOTE(review): the blurb mentions company and debut information, but
    # preprocess_data() only clusters on 'Height' and 'Weight'.
    st.markdown(
        "This app groups K-pop idols based on their physical features (height, weight), company, and debut information using **Hierarchical Clustering with Ward's Method**."
    )
    # Adjacent literals with explicit "\n" keep this a valid string; a
    # single-quoted literal cannot span several source lines.
    st.markdown(
        "### How It Works:\n"
        "- **Dendrogram Visualization:** Explore hierarchical clusters.\n"
        "- **Dynamic Cluster Cutting:** Set the number of clusters dynamically.\n"
        "- **Idol Comparison:** Analyze clusters by different features."
    )


def _render_results_tab():
    """Show a data sample, the dendrogram and clustered rows; return the clustered frame."""
    st.header("📊 Dataset Overview and Results")
    st.write("### Sample Data")
    st.dataframe(data.head())

    # Preprocess and cluster
    scaled_features, df_processed = preprocess_data(data)
    Z = apply_hierarchical_clustering(scaled_features)

    # Dendrogram: draw on an explicit Figure/Axes instead of handing the
    # pyplot module to Streamlit, then close the figure so it does not stay
    # registered with pyplot after this run.
    st.write("### Dendrogram")
    fig, ax = plt.subplots(figsize=(12, 6))
    dendrogram(
        Z,
        labels=df_processed['Stage Name'].values,
        leaf_rotation=90,
        ax=ax,
    )
    st.pyplot(fig)
    plt.close(fig)

    # Cut the dendrogram into at most `num_clusters` flat clusters.
    cluster_labels = fcluster(Z, num_clusters, criterion='maxclust')
    # assign() returns a new frame, so no column is written into an object
    # pandas may still treat as derived from `data`.
    df_processed = df_processed.assign(Cluster=cluster_labels)

    st.write("### Clustered Data Sample")
    st.dataframe(
        df_processed[['Stage Name', 'Company', 'Nationality', 'Cluster']].head(10)
    )
    return df_processed


def _render_explore_tab(df_processed):
    """Filter the clustered idols by a company or nationality substring."""
    st.header("🔎 Explore Idols by Company or Nationality")
    option = st.selectbox("Filter idols by:", ["Company", "Nationality"])
    selected_value = st.text_input(f"Enter {option} name:")
    if not selected_value:
        return

    # regex=False: the user types a plain name, and characters such as "("
    # or "*" would otherwise be parsed as a regular expression and can raise.
    matches = df_processed[option].str.contains(
        selected_value, na=False, case=False, regex=False
    )
    filtered_data = df_processed[matches]
    if filtered_data.empty:
        st.warning(f"No idols found for {option}: {selected_value}")
    else:
        st.dataframe(
            filtered_data[['Stage Name', 'Company', 'Nationality', 'Cluster']]
        )


def main():
    """Build the page: title plus the About, Results and Explore tabs.

    Reads the module-level ``data`` and ``num_clusters`` values. The clustered
    frame produced in the results tab is passed to the explore tab.
    """
    st.title("🎀 K-Pop Idol Clustering using Hierarchical Clustering")

    # Tabs for Navigation
    tab1, tab2, tab3 = st.tabs(
        ["📚 About the App", "📊 Dataset & Results", "🔎 Explore Idols"]
    )

    with tab1:
        _render_about_tab()

    with tab2:
        df_processed = _render_results_tab()

    with tab3:
        _render_explore_tab(df_processed)


# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()