aug6th committed on
Commit
a80ba92
·
verified ·
1 Parent(s): 2ee1c2a

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +97 -40
src/streamlit_app.py CHANGED
@@ -1,63 +1,120 @@
1
  import streamlit as st
2
- from datasets import load_dataset
3
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
- st.set_page_config(page_title="HF Dataset Dashboard", layout="wide")
6
 
7
- st.title("📊 Streamlit Sample Dataset Dashboard")
 
 
 
8
 
9
- # Load Hugging Face dataset
10
- @st.cache_resource
11
- def load_data():
12
- ds = load_dataset("aug6th/streamlit_sample", split="train")
13
- return ds
14
 
15
- dataset = load_data()
16
- df = dataset.to_pandas()
 
 
17
 
18
- # --- Basic Stats ---
19
- st.header("📌 기본 통계")
20
 
21
- col1, col2, col3 = st.columns(3)
 
 
22
 
23
- col1.metric("총 샘플 수", len(df))
24
- col2.metric("칼럼 수", len(df.columns))
25
- col3.metric("칼럼 이름", ", ".join(df.columns))
 
26
 
27
  st.markdown("---")
28
 
29
- # --- Numeric Stats ---
30
- st.subheader("📊 숫자값 통계 요약")
31
- numeric_df = df.select_dtypes(include=["int64", "float64"])
32
- if not numeric_df.empty:
33
- st.dataframe(numeric_df.describe())
34
- else:
35
- st.write("숫자형 칼럼 없음")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  st.markdown("---")
38
 
39
- # --- Text Length Features ---
40
- st.subheader("๐Ÿ“ ๋ฌธ์ž์—ด ๊ธธ์ด ํ†ต๊ณ„")
41
- df["question_length"] = df["question"].str.len()
42
- df["answer_length"] = df["answer"].str.len()
43
 
44
- col1, col2 = st.columns(2)
45
- col1.metric("질문 평균 길이", round(df["question_length"].mean(), 1))
46
- col2.metric("답변 평균 길이", round(df["answer_length"].mean(), 1))
 
47
 
48
- # Length distributions
49
- st.bar_chart(df[["question_length", "answer_length"]])
 
 
50
 
51
  st.markdown("---")
52
 
53
- # --- Search & Filter ---
54
- st.subheader("🔍 검색 및 필터")
 
 
55
 
56
- search = st.text_input("검색어 입력 (질문/답변)")
57
 
58
- if search:
59
- filtered_df = df[df["question"].str.contains(search, na=False) | df["answer"].str.contains(search, na=False)]
60
- else:
61
- filtered_df = df
 
 
 
62
 
63
- st.dataframe(filtered_df, use_container_width=True)
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  import pandas as pd
3
+ import plotly.express as px
4
+ from datasets import load_dataset
5
+ from collections import Counter
6
+ from wordcloud import WordCloud
7
+ import matplotlib.pyplot as plt
8
+
9
+ # -----------------------------
10
+ # Page Config
11
+ # -----------------------------
12
+ st.set_page_config(
13
+ page_title="Korean Hate Speech Dashboard",
14
+ layout="wide",
15
+ )
16
+
17
+ st.title("🇰🇷 Korean Hate Speech Analytics Dashboard")
18
+
19
+ # -----------------------------
20
+ # Load Dataset (cached)
21
+ # -----------------------------
22
+ @st.cache_data
23
+ def load_data():
24
+ ds = load_dataset("jeanlee/kmhas_korean_hate_speech")
25
+ df = pd.DataFrame(ds["train"])
26
+ return df
27
 
28
+ df = load_data()
29
 
30
+ # -----------------------------
31
+ # Preprocessing
32
+ # -----------------------------
33
+ df["length"] = df["text"].apply(len)
34
 
35
+ # Multi-label explode
36
+ df_exploded = df.explode("label")
 
 
 
37
 
38
+ # -----------------------------
39
+ # KPI Section
40
+ # -----------------------------
41
+ col1, col2, col3, col4 = st.columns(4)
42
 
43
+ total_samples = len(df)
44
+ avg_length = df["length"].mean()
45
 
46
+ label_counts = df_exploded["label"].value_counts()
47
+ top_label = label_counts.idxmax()
48
+ hate_ratio = 1 - (label_counts.get("clean", 0) / total_samples)
49
 
50
+ col1.metric("총 샘플 수", f"{total_samples:,}")
51
+ col2.metric("혐오 비율", f"{hate_ratio:.2%}")
52
+ col3.metric("평균 텍스트 길이", f"{avg_length:.1f}")
53
+ col4.metric("최다 라벨", top_label)
54
 
55
  st.markdown("---")
56
 
57
+ # -----------------------------
58
+ # Charts Section
59
+ # -----------------------------
60
+ left, right = st.columns(2)
61
+
62
+ with left:
63
+ fig1 = px.bar(
64
+ label_counts,
65
+ x=label_counts.index,
66
+ y=label_counts.values,
67
+ title="라벨 분포",
68
+ )
69
+ st.plotly_chart(fig1, use_container_width=True)
70
+
71
+ with right:
72
+ fig2 = px.histogram(
73
+ df,
74
+ x="length",
75
+ nbins=50,
76
+ title="텍스트 길이 분포",
77
+ )
78
+ st.plotly_chart(fig2, use_container_width=True)
79
 
80
  st.markdown("---")
81
 
82
+ # -----------------------------
83
+ # Label Filter Section
84
+ # -----------------------------
85
+ st.subheader("🔎 라벨 필터링")
86
 
87
+ selected_label = st.selectbox(
88
+ "라벨 선택",
89
+ sorted(df_exploded["label"].unique())
90
+ )
91
 
92
+ filtered_df = df[df["label"].apply(lambda x: selected_label in x)]
93
+
94
+ st.write(f"선택된 라벨 샘플 수: {len(filtered_df):,}")
95
+ st.dataframe(filtered_df[["text", "label"]].head(100), use_container_width=True)
96
 
97
  st.markdown("---")
98
 
99
+ # -----------------------------
100
+ # WordCloud Section
101
+ # -----------------------------
102
+ st.subheader("☁️ Word Cloud")
103
 
104
+ text_data = " ".join(filtered_df["text"].tolist())
105
 
106
+ if len(text_data) > 0:
107
+ wordcloud = WordCloud(
108
+ font_path="/usr/share/fonts/truetype/nanum/NanumGothic.ttf",
109
+ background_color="white",
110
+ width=800,
111
+ height=400
112
+ ).generate(text_data)
113
 
114
+ fig_wc, ax = plt.subplots()
115
+ ax.imshow(wordcloud, interpolation="bilinear")
116
+ ax.axis("off")
117
+
118
+ st.pyplot(fig_wc)
119
+ else:
120
+ st.info("해당 라벨에 대한 텍스트가 없습니다.")