Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -13,36 +13,32 @@ warnings.filterwarnings("ignore")
|
|
| 13 |
plt.rcParams['font.family'] = 'NanumGothic'
|
| 14 |
|
| 15 |
# μ΄κΈ° λΆμ©μ΄ λͺ©λ‘
|
| 16 |
-
default_stopwords = set(["μλ€", "μλ€", "λλ€", "μ΄λ€", "νλ€", "κ°λ€", "μνλ€", "μ
|
| 17 |
|
| 18 |
def simple_tokenize(text):
|
| 19 |
-
#
|
| 20 |
-
words = re.findall(r'
|
| 21 |
-
|
|
|
|
| 22 |
|
| 23 |
@st.cache_data
|
| 24 |
def preprocess_text(text, user_stopwords):
|
| 25 |
-
# νκΈκ³Ό κ³΅λ°±λ§ λ¨κΈ°κ³ λͺ¨λ μ κ±°
|
| 26 |
-
text = re.sub(r'[^γ±-γ
γ
-γ
£κ°-ν£\s]', '', text)
|
| 27 |
-
# ν ν°ν λ° λΆμ©μ΄ μ κ±°
|
| 28 |
words = simple_tokenize(text)
|
| 29 |
words = [word for word in words if word not in user_stopwords]
|
| 30 |
return ' '.join(words)
|
| 31 |
|
| 32 |
def topic_modeling(texts, n_components):
|
| 33 |
-
vectorizer = CountVectorizer(tokenizer=simple_tokenize)
|
| 34 |
data_vectorized = vectorizer.fit_transform(texts)
|
| 35 |
-
lda = LatentDirichletAllocation(n_components=n_components, random_state=
|
| 36 |
lda.fit(data_vectorized)
|
| 37 |
|
| 38 |
-
|
| 39 |
-
features = vectorizer.get_feature_names_out()
|
| 40 |
-
except AttributeError:
|
| 41 |
-
features = vectorizer.get_feature_names()
|
| 42 |
|
| 43 |
topics = {}
|
| 44 |
for topic_idx, topic in enumerate(lda.components_):
|
| 45 |
-
|
|
|
|
| 46 |
return topics
|
| 47 |
|
| 48 |
def generate_word_frequency_chart(text, color, n=20):
|
|
@@ -52,7 +48,7 @@ def generate_word_frequency_chart(text, color, n=20):
|
|
| 52 |
|
| 53 |
fig, ax = plt.subplots(figsize=(12, 6))
|
| 54 |
ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
|
| 55 |
-
ax.invert_yaxis()
|
| 56 |
ax.set_title("μμ {} λ¨μ΄".format(n))
|
| 57 |
plt.tight_layout()
|
| 58 |
return fig
|
|
|
|
| 13 |
# Use a Korean-capable font so Hangul labels in the charts do not render
# as missing-glyph boxes.
plt.rcParams["font.family"] = "NanumGothic"
| 15 |
# Initial stopword list of common Korean function words.
# NOTE(review): these literals appear mojibake-garbled in this copy of the
# file — verify them against the original UTF-8 source before relying on
# exact membership.
default_stopwords = {
    "μλ€", "μλ€", "λλ€", "μ΄λ€", "νλ€", "κ°λ€", "μνλ€", "μλ",
    "νλ", "κ·Έλ¦¬κ³ ", "κ·Έλ°", "μ΄λ°", "μ λ°", "μ΄λ κ²", "μ λ κ²", "κ·Έλ κ²",
}
|
| 17 |
|
| 18 |
def simple_tokenize(text):
    """Split *text* into Hangul-only tokens of two or more syllables.

    Any non-Hangul character acts as a delimiter; tokens of a single
    syllable are discarded as noise.

    Args:
        text: Input string (may be empty).

    Returns:
        List of Hangul tokens, each at least two characters long, in
        order of appearance.
    """
    # [가-힣] is the precomposed Hangul Syllables block (U+AC00..U+D7A3).
    # The character class was mojibake-garbled in the reviewed copy; this
    # restores the range the original comment ("extract Hangul words only")
    # describes.
    words = re.findall(r'[가-힣]+', text)
    # Single-syllable tokens carry little signal for frequency/topic work.
    return [word for word in words if len(word) > 1]
|
| 23 |
|
| 24 |
@st.cache_data
def preprocess_text(text, user_stopwords):
    """Tokenize *text* and drop every token found in *user_stopwords*.

    Args:
        text: Raw input string.
        user_stopwords: Collection of tokens to exclude (membership test).

    Returns:
        The surviving tokens joined into one space-separated string.
    """
    tokens = simple_tokenize(text)
    kept = (tok for tok in tokens if tok not in user_stopwords)
    return ' '.join(kept)
|
| 29 |
|
| 30 |
def topic_modeling(texts, n_components):
    """Fit an LDA topic model over *texts* and summarize each topic.

    Args:
        texts: Iterable of documents (strings) to vectorize.
        n_components: Number of topics to extract.

    Returns:
        Dict mapping "Topic 1".."Topic n" to that topic's ten
        highest-weighted vocabulary words, in descending weight order.
    """
    # max_df/min_df prune near-ubiquitous and near-unique terms before LDA.
    vectorizer = CountVectorizer(tokenizer=simple_tokenize, max_df=0.95, min_df=2)
    doc_term_matrix = vectorizer.fit_transform(texts)

    model = LatentDirichletAllocation(
        n_components=n_components, random_state=42, max_iter=10
    )
    model.fit(doc_term_matrix)

    vocab = vectorizer.get_feature_names_out()

    topics = {}
    for topic_num, weights in enumerate(model.components_, start=1):
        # Top 10 indices by descending weight (same as argsort()[:-11:-1]).
        ranked = weights.argsort()[::-1][:10]
        topics[f"Topic {topic_num}"] = [vocab[i] for i in ranked]
    return topics
|
| 43 |
|
| 44 |
def generate_word_frequency_chart(text, color, n=20):
|
|
|
|
| 48 |
|
| 49 |
fig, ax = plt.subplots(figsize=(12, 6))
|
| 50 |
ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
|
| 51 |
+
ax.invert_yaxis()
|
| 52 |
ax.set_title("μμ {} λ¨μ΄".format(n))
|
| 53 |
plt.tight_layout()
|
| 54 |
return fig
|