# Hugging Face Space: aug6th/streamlit-sample-space
# Space status at capture time: Sleeping
# File size: 4,023 bytes

import streamlit as st
import pathlib
import pandas as pd
import plotly.express as px
from datasets import load_dataset
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# -----------------------------
# Page Config
# -----------------------------
# Set the browser-tab title and switch to the wide layout, then render the
# dashboard heading at the top of the page.
st.set_page_config(page_title="K-MHaS Korean Hate Speech Dashboard", layout="wide")
st.title("🇰🇷 K-MHaS Korean Hate Speech Analytics Dashboard")

# -----------------------------
# Label Mapping
# -----------------------------
# Integer class id -> display name. The position of each name in the tuple is
# its class id, so the order below must not be changed.
LABEL_MAP = dict(
    enumerate(
        (
            "Origin",
            "Physical",
            "Politics",
            "Profanity",
            "Age",
            "Gender",
            "Race",
            "Religion",
            "Not Hate",
        )
    )
)

# -----------------------------
# Load Dataset (Parquet Revision)
# -----------------------------
@st.cache_data
def load_data() -> pd.DataFrame:
    """Load the K-MHaS train split and return it as a pandas DataFrame.

    The split is read from the auto-converted Parquet revision of the
    ``jeanlee/kmhas_korean_hate_speech`` dataset. The function is wrapped in
    ``st.cache_data``.
    """
    train_split = load_dataset(
        "jeanlee/kmhas_korean_hate_speech",
        revision="refs/convert/parquet",
        split="train",
    )
    return pd.DataFrame(train_split)

df = load_data()

# -----------------------------
# Preprocessing
# -----------------------------
# Character count of every text, used for the length KPI and histogram.
df["length"] = df["text"].map(len)

# Translate each sample's list of numeric label ids into readable names.
df["label_name"] = df["label"].map(
    lambda label_ids: [LABEL_MAP[label_id] for label_id in label_ids]
)

# One row per (sample, label) pair so labels can be counted individually.
df_exploded = df.explode("label_name")

# -----------------------------
# KPI Section
# -----------------------------
# Compute the headline figures first, then render them as four metric cards.
total_samples = len(df)
avg_length = df["length"].mean()

# Counts are taken on the exploded frame, so a multi-label sample contributes
# once per label it carries.
label_counts = df_exploded["label_name"].value_counts()
top_label = label_counts.idxmax()

# Hate ratio = share of samples not tagged "Not Hate".
# NOTE(review): assumes a "Not Hate" sample carries no other label - confirm.
not_hate_count = label_counts.get("Not Hate", 0)
hate_ratio = 1 - (not_hate_count / total_samples)

kpi_cards = (
    ("총 샘플 수", f"{total_samples:,}"),
    ("혐오 비율", f"{hate_ratio:.2%}"),
    ("평균 텍스트 길이", f"{avg_length:.1f}"),
    ("최다 라벨", top_label),
)
for kpi_column, (kpi_caption, kpi_value) in zip(st.columns(4), kpi_cards):
    kpi_column.metric(kpi_caption, kpi_value)

st.markdown("---")

# -----------------------------
# Charts Section
# -----------------------------
# Two side-by-side panels: label frequencies on the left, text lengths on the
# right.
left, right = st.columns(2)

# Bar chart of how often each label occurs.
fig1 = px.bar(
    label_counts,
    x=label_counts.index,
    y=label_counts.values,
    title="라벨 분포",
)
left.plotly_chart(fig1, use_container_width=True)

# Histogram of text lengths in 50 bins.
fig2 = px.histogram(df, x="length", nbins=50, title="텍스트 길이 분포")
right.plotly_chart(fig2, use_container_width=True)

st.markdown("---")

# -----------------------------
# Label Filter Section
# -----------------------------
st.subheader("🔎 라벨 필터")

# Offer every label that occurs in the data, sorted alphabetically.
label_options = sorted(df_exploded["label_name"].unique())
selected_label = st.selectbox("라벨 선택", label_options)

# Keep the samples whose list of label names contains the chosen label.
has_selected_label = df["label_name"].map(lambda names: selected_label in names)
filtered_df = df[has_selected_label]

st.write(f"선택된 라벨 샘플 수: {len(filtered_df):,}")

# Only the first 100 matching rows are displayed.
preview_rows = filtered_df[["text", "label_name"]].head(100)
st.dataframe(preview_rows, use_container_width=True)

st.markdown("---")

# -----------------------------
# WordCloud Section
# -----------------------------

# The Korean-capable font file is looked up next to this script.
BASE_DIR = pathlib.Path(__file__).resolve().parent
FONT_PATH = BASE_DIR / "NanumGothic.ttf"

# Upper bound on the number of texts fed into the word cloud; larger
# selections are down-sampled to keep rendering time stable.
MAX_WORDCLOUD_SAMPLES = 3000

st.subheader("☁️ Word Cloud")

# Down-sample *before* joining so a large selection is never concatenated in
# full only to be thrown away. The fixed seed keeps the cloud stable across
# reruns for the same selection.
if len(filtered_df) > MAX_WORDCLOUD_SAMPLES:
    wordcloud_source = filtered_df.sample(MAX_WORDCLOUD_SAMPLES, random_state=42)
else:
    wordcloud_source = filtered_df
text_data = " ".join(wordcloud_source["text"].tolist())

wordcloud = None
# Whitespace-only input contains no words, so skip generation altogether.
if text_data.strip():
    wordcloud_builder = WordCloud(
        font_path=str(FONT_PATH),
        background_color=None,
        mode="RGBA",
        colormap="viridis",
        width=1200,
        height=600,
        max_words=200,
        max_font_size=150,
        min_font_size=10,
        relative_scaling=0.5,
        collocations=False,
    )
    try:
        wordcloud = wordcloud_builder.generate(text_data)
    except ValueError:
        # generate() raises ValueError when tokenisation leaves no word to
        # plot; treat that the same as an empty selection.
        wordcloud = None

if wordcloud is not None:
    fig_wc, ax = plt.subplots(figsize=(14, 7))
    try:
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.axis("off")

        # Transparent figure background so the cloud blends with the page.
        fig_wc.patch.set_alpha(0)
        st.pyplot(fig_wc, use_container_width=True)
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each Streamlit rerun would leave another open figure behind.
        plt.close(fig_wc)
else:
    st.info("해당 라벨에 대한 텍스트가 없습니다.")