Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
import pandas as pd
|
| 3 |
from wordcloud import WordCloud
|
| 4 |
import matplotlib.pyplot as plt
|
| 5 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
@@ -13,17 +12,22 @@ import warnings
|
|
| 13 |
# Ignore warning messages
|
| 14 |
warnings.filterwarnings("ignore")
|
| 15 |
|
| 16 |
-
# Stopword list
|
| 17 |
-
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
okt = Okt()
|
| 21 |
text = re.sub(r'[^\w\s]', '', text) # ํน์ ๋ฌธ์ ์ ๊ฑฐ
|
| 22 |
nouns = okt.nouns(text) # ๋ช
์ฌ ์ถ์ถ
|
| 23 |
-
nouns = [noun for noun in nouns if len(noun) > 1 and noun not in
|
| 24 |
return ' '.join(nouns)
|
| 25 |
|
| 26 |
-
def topic_modeling(texts, n_components
|
| 27 |
vectorizer = CountVectorizer()
|
| 28 |
data_vectorized = vectorizer.fit_transform(texts)
|
| 29 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
|
|
@@ -31,29 +35,66 @@ def topic_modeling(texts, n_components=5):
|
|
| 31 |
features = vectorizer.get_feature_names()
|
| 32 |
topics = {}
|
| 33 |
for topic_idx, topic in enumerate(lda.components_):
|
| 34 |
-
topics[f"Topic {topic_idx}"] = [features[i] for i in topic.argsort()[:-21:-1]]
|
| 35 |
return topics
|
| 36 |
|
| 37 |
-
def generate_wordcloud(text):
|
| 38 |
-
wordcloud = WordCloud(width=800, height=400).generate(text)
|
| 39 |
-
plt.
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
|
| 44 |
# Streamlit UI setup
|
| 45 |
st.title("ํ
์คํธ ๋ถ์ ๋๊ตฌ")
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
uploaded_file = st.file_uploader("ํ
์คํธ ํ์ผ ์
๋ก๋", type=['txt'])
|
| 48 |
-
if uploaded_file is not None:
|
| 49 |
-
with st.spinner('ํ์ผ์ ์ฒ๋ฆฌ ์ค์
๋๋ค...'):
|
| 50 |
-
text = str(uploaded_file.read(), 'utf-8')
|
| 51 |
-
preprocessed_text = preprocess_text(text) # ์ ์ฒ๋ฆฌ๋ ๋ช
์ฌ ์ถ์ถ
|
| 52 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
st.subheader("ํ ํฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
|
| 54 |
-
|
|
|
|
| 55 |
for topic, words in topics.items():
|
| 56 |
st.write(f"{topic}: {', '.join(words)}")
|
| 57 |
-
|
|
|
|
| 58 |
st.subheader("์๋ ํด๋ผ์ฐ๋")
|
| 59 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
| 2 |
from wordcloud import WordCloud
|
| 3 |
import matplotlib.pyplot as plt
|
| 4 |
from sklearn.feature_extraction.text import CountVectorizer
|
|
|
|
| 12 |
# Ignore warning messages
|
| 13 |
warnings.filterwarnings("ignore")
|
| 14 |
|
| 15 |
+
# Initial stopword list
|
| 16 |
+
default_stopwords = set(["์๋ค", "์๋ค", "๊ฒ", "๊ทธ", "์ด", "ํ๋", "ํ๊ธฐ", "ํ ", "๋", "์", "์ด๋ค", "์ํค๋ค"])
|
| 17 |
|
| 18 |
+
@st.cache_data
def preprocess_text(text, user_stopwords):
    """Clean a piece of text and reduce it to a space-separated string of nouns.

    Steps, in order: spacing correction, spell checking, removal of every
    character that is neither a word character nor whitespace, noun
    extraction with Okt, and filtering of the extracted nouns.

    Args:
        text: Raw text to process.
        user_stopwords: Collection of words to exclude from the result.

    Returns:
        The surviving nouns joined by single spaces.
    """
    # NOTE(review): Spacing and spell_checker are imported outside the visible
    # part of the file (presumably pykospacing and py-hanspell) - confirm.
    respaced = Spacing()(text)  # spacing correction
    corrected = spell_checker.check(respaced).checked  # spell check

    # Remove special characters (anything that is not \w or \s).
    cleaned = re.sub(r'[^\w\s]', '', corrected)

    tagger = Okt()
    kept = []
    for candidate in tagger.nouns(cleaned):  # noun extraction
        # Drop stopwords and single-character nouns.
        if len(candidate) <= 1 or candidate in user_stopwords:
            continue
        kept.append(candidate)
    return ' '.join(kept)
|
| 29 |
|
| 30 |
+
def topic_modeling(texts, n_components):
|
| 31 |
vectorizer = CountVectorizer()
|
| 32 |
data_vectorized = vectorizer.fit_transform(texts)
|
| 33 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
|
|
|
|
| 35 |
features = vectorizer.get_feature_names()
|
| 36 |
topics = {}
|
| 37 |
for topic_idx, topic in enumerate(lda.components_):
|
| 38 |
+
topics[f"Topic {topic_idx + 1}"] = [features[i] for i in topic.argsort()[:-21:-1]]
|
| 39 |
return topics
|
| 40 |
|
| 41 |
+
def generate_wordcloud(text, color):
    """Render a word cloud of *text* and return it as a Matplotlib figure.

    Args:
        text: Space-separated words to draw.
        color: Background colour passed to WordCloud as background_color.

    Returns:
        A 10x5 inch matplotlib Figure showing the 800x400 word cloud with the
        axes hidden.
    """
    # Local import keeps this block self-contained; matplotlib is already a
    # dependency of the file through matplotlib.pyplot.
    from matplotlib.figure import Figure

    # NOTE(review): no font_path is passed; WordCloud's bundled default font
    # may not contain Hangul glyphs - confirm Korean words render correctly.
    wordcloud = WordCloud(width=800, height=400, background_color=color).generate(text)

    # Build the figure directly instead of via plt.subplots(): pyplot keeps
    # every figure it creates alive in a global registry until plt.close() is
    # called, so a script that reruns on each widget interaction would
    # accumulate figures. A bare Figure is released once it is unreferenced.
    fig = Figure(figsize=(10, 5))
    ax = fig.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    return fig
|
| 47 |
|
| 48 |
# Streamlit UI setup
|
| 49 |
st.title("ํ
์คํธ ๋ถ์ ๋๊ตฌ")
|
| 50 |
|
| 51 |
+
# Add a stopword input field to the sidebar
|
| 52 |
+
user_stopwords = st.sidebar.text_area("๋ถ์ฉ์ด๋ฅผ ์
๋ ฅํ์ธ์ (์ผํ๋ก ๊ตฌ๋ถ)",
|
| 53 |
+
value=", ".join(default_stopwords))
|
| 54 |
+
# Split on bare commas and trim each entry so "a,b", "a, b" and entries with
# stray spaces or newlines all parse the same way; empty entries are dropped.
# The default stopwords are always unioned back in.
user_stopwords = {word.strip() for word in user_stopwords.split(",") if word.strip()} | default_stopwords
|
| 55 |
+
|
| 56 |
+
# File upload
|
| 57 |
uploaded_file = st.file_uploader("ํ
์คํธ ํ์ผ ์
๋ก๋", type=['txt'])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
if uploaded_file is not None:
|
| 60 |
+
try:
|
| 61 |
+
with st.spinner('ํ์ผ์ ์ฒ๋ฆฌ ์ค์
๋๋ค...'):
|
| 62 |
+
text = str(uploaded_file.read(), 'utf-8')
|
| 63 |
+
|
| 64 |
+
# Preprocess the text in roughly 100 chunks so a progress bar can be updated
# as the work advances.
progress_bar = st.progress(0)
total_chars = len(text)
chunk_size = max(1, total_chars // 100)  # at least one character per chunk
preprocessed_chunks = []

# NOTE(review): chunks are cut at fixed character offsets, so a word that
# straddles a boundary is split between two chunks - confirm this is acceptable.
for start in range(0, total_chars, chunk_size):
    chunk = text[start:start + chunk_size]
    preprocessed_chunks.append(preprocess_text(chunk, user_stopwords))
    # The last chunk is usually shorter than chunk_size, so start + chunk_size
    # can overshoot the text length. st.progress rejects floats above 1.0,
    # so clamp the fraction.
    progress_bar.progress(min(1.0, (start + chunk_size) / total_chars))

preprocessed_text = " ".join(preprocessed_chunks)
|
| 76 |
+
|
| 77 |
+
# Topic modeling
|
| 78 |
st.subheader("ํ ํฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
|
| 79 |
+
n_topics = st.slider("ํ ํฝ ์ ์ ํ", min_value=2, max_value=10, value=5)
|
| 80 |
+
topics = topic_modeling([preprocessed_text], n_topics)
|
| 81 |
for topic, words in topics.items():
|
| 82 |
st.write(f"{topic}: {', '.join(words)}")
|
| 83 |
+
|
| 84 |
+
# Word cloud
|
| 85 |
st.subheader("์๋ ํด๋ผ์ฐ๋")
|
| 86 |
+
color = st.color_picker("๋ฐฐ๊ฒฝ์ ์ ํ", "#ffffff")
|
| 87 |
+
fig = generate_wordcloud(preprocessed_text, color)
|
| 88 |
+
st.pyplot(fig)
|
| 89 |
+
|
| 90 |
+
except Exception as e:
|
| 91 |
+
st.error(f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}")
|
| 92 |
+
|
| 93 |
+
# Add usage instructions
|
| 94 |
+
st.sidebar.markdown("""
|
| 95 |
+
## ์ฌ์ฉ ๋ฐฉ๋ฒ
|
| 96 |
+
1. ์ฌ์ด๋๋ฐ์์ ๋ถ์ฉ์ด๋ฅผ ์ถ๊ฐํ๊ฑฐ๋ ์์ ํ ์ ์์ต๋๋ค.
|
| 97 |
+
2. ํ
์คํธ ํ์ผ(.txt)์ ์
๋ก๋ํ์ธ์.
|
| 98 |
+
3. ํ ํฝ ๋ชจ๋ธ๋ง์ ํ ํฝ ์๋ฅผ ์ ํํ์ธ์.
|
| 99 |
+
4. ์๋ํด๋ผ์ฐ๋์ ๋ฐฐ๊ฒฝ์์ ์ ํํ ์ ์์ต๋๋ค.
|
| 100 |
+
""")
|