Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -50,19 +50,19 @@ def preprocess_text(text, user_stopwords):
|
|
| 50 |
return ""
|
| 51 |
|
| 52 |
def topic_modeling(texts, n_components):
|
| 53 |
-
doc_count = len(texts)
|
| 54 |
-
min_df = max(2, int(doc_count * 0.01))
|
| 55 |
-
max_df = min(0.95, int(doc_count * 0.95))
|
| 56 |
-
|
| 57 |
# default_stopwords를 리스트로 변환
|
| 58 |
stop_words_list = list(default_stopwords)
|
| 59 |
|
| 60 |
-
vectorizer = CountVectorizer(
|
| 61 |
data_vectorized = vectorizer.fit_transform(texts)
|
| 62 |
|
| 63 |
n_tokens = data_vectorized.shape[1]
|
| 64 |
n_components = min(n_components, n_tokens)
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=20)
|
| 67 |
lda.fit(data_vectorized)
|
| 68 |
|
|
@@ -117,8 +117,11 @@ if uploaded_file is not None:
|
|
| 117 |
st.subheader("토픽 모델링 결과")
|
| 118 |
n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
|
| 119 |
topics = topic_modeling([preprocessed_text], n_topics)
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
st.subheader("상위 10개 Trigram")
|
| 124 |
top_trigrams = get_top_trigrams(preprocessed_text)
|
|
|
|
| 50 |
return ""
|
| 51 |
|
| 52 |
def topic_modeling(texts, n_components):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
# default_stopwords를 리스트로 변환
|
| 54 |
stop_words_list = list(default_stopwords)
|
| 55 |
|
| 56 |
+
vectorizer = CountVectorizer(stop_words=stop_words_list)
|
| 57 |
data_vectorized = vectorizer.fit_transform(texts)
|
| 58 |
|
| 59 |
n_tokens = data_vectorized.shape[1]
|
| 60 |
n_components = min(n_components, n_tokens)
|
| 61 |
|
| 62 |
+
if n_components < 2:
|
| 63 |
+
st.warning("추출된 고유 단어가 너무 적습니다. 더 긴 텍스트를 사용해 주세요.")
|
| 64 |
+
return {}
|
| 65 |
+
|
| 66 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=20)
|
| 67 |
lda.fit(data_vectorized)
|
| 68 |
|
|
|
|
| 117 |
st.subheader("토픽 모델링 결과")
|
| 118 |
n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
|
| 119 |
topics = topic_modeling([preprocessed_text], n_topics)
|
| 120 |
+
if topics:
|
| 121 |
+
for topic, words in topics.items():
|
| 122 |
+
st.write(f"{topic}: {', '.join(words)}")
|
| 123 |
+
else:
|
| 124 |
+
st.warning("토픽 모델링을 수행할 수 없습니다. 더 긴 텍스트를 사용해 주세요.")
|
| 125 |
|
| 126 |
st.subheader("상위 10개 Trigram")
|
| 127 |
top_trigrams = get_top_trigrams(preprocessed_text)
|