Spaces:

aug6th
/

streamlit-sample-space

Sleeping

App Files Files Community

aug6th committed on Feb 25

Commit

a80ba92

verified ·

1 Parent(s): 2ee1c2a

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +97 -40

src/streamlit_app.py CHANGED Viewed

@@ -1,63 +1,120 @@
 import streamlit as st
-from datasets import load_dataset
 import pandas as pd
-st.set_page_config(page_title="HF Dataset Dashboard", layout="wide")
-st.title("📊 Streamlit Sample Dataset Dashboard")
-# Load Hugging Face dataset
-@st.cache_resource
-def load_data():
-    ds = load_dataset("aug6th/streamlit_sample", split="train")
-    return ds
-dataset = load_data()
-df = dataset.to_pandas()
-# --- Basic Stats ---
-st.header("📌 기본 통계")
-col1, col2, col3 = st.columns(3)
-col1.metric("총 샘플 수", len(df))
-col2.metric("칼럼 수", len(df.columns))
-col3.metric("칼럼 이름", ", ".join(df.columns))
 st.markdown("---")
-# --- Numeric Stats ---
-st.subheader("📊 숫자값 통계 요약")
-numeric_df = df.select_dtypes(include=["int64", "float64"])
-if not numeric_df.empty:
-    st.dataframe(numeric_df.describe())
-else:
-    st.write("숫자형 칼럼 없음")
 st.markdown("---")
-# --- Text Length Features ---
-st.subheader("📝 문자열 길이 통계")
-df["question_length"] = df["question"].str.len()
-df["answer_length"] = df["answer"].str.len()
-col1, col2 = st.columns(2)
-col1.metric("질문 평균 길이", round(df["question_length"].mean(), 1))
-col2.metric("답변 평균 길이", round(df["answer_length"].mean(), 1))
-# Length distributions
-st.bar_chart(df[["question_length", "answer_length"]])
 st.markdown("---")
-# --- Search & Filter ---
-st.subheader("🔍 검색 및 필터")
-search = st.text_input("검색어 입력 (질문/답변)")
-if search:
-    filtered_df = df[df["question"].str.contains(search, na=False) | df["answer"].str.contains(search, na=False)]
-else:
-    filtered_df = df
-st.dataframe(filtered_df, use_container_width=True)

 import streamlit as st
 import pandas as pd
+import plotly.express as px
+from datasets import load_dataset
+from collections import Counter
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
+# -----------------------------
+# Page Config: page title and wide layout, followed by the on-page heading
+# -----------------------------
+st.set_page_config(
+    page_title="Korean Hate Speech Dashboard",
+    layout="wide",
+)
+st.title("🇰🇷 Korean Hate Speech Analytics Dashboard")
+# -----------------------------
+# Load Dataset (cached): only the "train" split is loaded into a DataFrame
+# -----------------------------
+@st.cache_data  # cache the returned DataFrame so the dataset is not reloaded on every rerun
+def load_data():  # takes no arguments; returns the "train" split as a pandas DataFrame
+    ds = load_dataset("jeanlee/kmhas_korean_hate_speech")
+    df = pd.DataFrame(ds["train"])
+    return df
+df = load_data()
+# -----------------------------
+# Preprocessing: derive a per-row text length and a one-row-per-label view
+# -----------------------------
+df["length"] = df["text"].apply(len)  # len() of each "text" value; presumably a character count of a str -- TODO confirm
+# Multi-label explode: each element of a list-like "label" cell becomes its own row
+df_exploded = df.explode("label")
+# -----------------------------
+# KPI Section: four headline metrics shown side by side
+# -----------------------------
+col1, col2, col3, col4 = st.columns(4)
+total_samples = len(df)  # counts original rows, not exploded rows
+avg_length = df["length"].mean()
+label_counts = df_exploded["label"].value_counts()  # occurrences per individual label
+top_label = label_counts.idxmax()  # most frequent label value
+hate_ratio = 1 - (label_counts.get("clean", 0) / total_samples)  # NOTE(review): depends on a label literally equal to "clean"; if labels are stored differently (e.g. integer class ids) .get() returns 0 and this is always 100% -- confirm against the dataset schema. Also divides by total_samples with no guard for an empty frame.
+col1.metric("총 샘플 수", f"{total_samples:,}")
+col2.metric("혐오 비율", f"{hate_ratio:.2%}")
+col3.metric("평균 텍스트 길이", f"{avg_length:.1f}")
+col4.metric("최다 라벨", top_label)
 st.markdown("---")
+# -----------------------------
+# Charts Section: label-count bar chart (left) and text-length histogram (right)
+# -----------------------------
+left, right = st.columns(2)
+with left:
+    fig1 = px.bar(
+        label_counts,
+        x=label_counts.index,
+        y=label_counts.values,
+        title="라벨 분포",
+    )
+    st.plotly_chart(fig1, use_container_width=True)
+with right:
+    fig2 = px.histogram(
+        df,
+        x="length",
+        nbins=50,
+        title="텍스트 길이 분포",
+    )
+    st.plotly_chart(fig2, use_container_width=True)
 st.markdown("---")
+# -----------------------------
+# Label Filter Section: pick one label and list the rows that carry it
+# -----------------------------
+st.subheader("🔎 라벨 필터링")
+selected_label = st.selectbox(
+    "라벨 선택",
+    sorted(df_exploded["label"].unique())
+)
+filtered_df = df[df["label"].apply(lambda x: selected_label in x)]  # membership test against each row's un-exploded "label" cell; assumes every cell is a container -- TODO confirm
+st.write(f"선택된 라벨 샘플 수: {len(filtered_df):,}")
+st.dataframe(filtered_df[["text", "label"]].head(100), use_container_width=True)  # only the first 100 matching rows are displayed
 st.markdown("---")
+# -----------------------------
+# WordCloud Section: word cloud over the text of the currently filtered rows
+# -----------------------------
+st.subheader("☁️ Word Cloud")
+text_data = " ".join(filtered_df["text"].tolist())  # joins every filtered row, not just the 100 rows displayed above
+if len(text_data) > 0:
+    wordcloud = WordCloud(
+        font_path="/usr/share/fonts/truetype/nanum/NanumGothic.ttf",  # NOTE(review): hard-coded absolute path; presumably this font must be installed on the host -- confirm
+        background_color="white",
+        width=800,
+        height=400
+    ).generate(text_data)
+    fig_wc, ax = plt.subplots()
+    ax.imshow(wordcloud, interpolation="bilinear")
+    ax.axis("off")  # hide axes so only the image is shown
+    st.pyplot(fig_wc)
+else:
+    st.info("해당 라벨에 대한 텍스트가 없습니다.")