soojeongcrystal committed on
Commit
3e6e0f4
·
verified ·
1 Parent(s): 326f364

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -15
app.py CHANGED
@@ -13,36 +13,32 @@ warnings.filterwarnings("ignore")
13
  plt.rcParams['font.family'] = 'NanumGothic'
14
 
15
  # 초기 λΆˆμš©μ–΄ λͺ©λ‘
16
- default_stopwords = set(["μžˆλ‹€", "μ—†λ‹€", "λ˜λ‹€", "이닀", "ν•˜λ‹€", "κ°™λ‹€", "μœ„ν•˜λ‹€", "μžˆλ‹€", "λ˜μ–΄λ‹€", "톡해", "μœ„ν•΄", "λŒ€ν•œ", "μžˆλŠ”", "ν•˜λŠ”"])
17
 
18
  def simple_tokenize(text):
19
- # κ°„λ‹¨ν•œ ν˜•νƒœμ†Œ 뢄석: λͺ…사 μΆ”μΆœ
20
- words = re.findall(r'\w+', text)
21
- return [word for word in words if len(word) > 1 and not word.endswith(('λ‹€', 'μš”', '까', 'λ„€'))]
 
22
 
23
  @st.cache_data
24
  def preprocess_text(text, user_stopwords):
25
- # ν•œκΈ€κ³Ό 곡백만 남기고 λͺ¨λ‘ 제거
26
- text = re.sub(r'[^γ„±-γ…Žγ…-γ…£κ°€-힣\s]', '', text)
27
- # 토큰화 및 λΆˆμš©μ–΄ 제거
28
  words = simple_tokenize(text)
29
  words = [word for word in words if word not in user_stopwords]
30
  return ' '.join(words)
31
 
32
  def topic_modeling(texts, n_components):
33
- vectorizer = CountVectorizer(tokenizer=simple_tokenize)
34
  data_vectorized = vectorizer.fit_transform(texts)
35
- lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
36
  lda.fit(data_vectorized)
37
 
38
- try:
39
- features = vectorizer.get_feature_names_out()
40
- except AttributeError:
41
- features = vectorizer.get_feature_names()
42
 
43
  topics = {}
44
  for topic_idx, topic in enumerate(lda.components_):
45
- topics[f"Topic {topic_idx + 1}"] = [features[i] for i in topic.argsort()[:-21:-1]]
 
46
  return topics
47
 
48
  def generate_word_frequency_chart(text, color, n=20):
@@ -52,7 +48,7 @@ def generate_word_frequency_chart(text, color, n=20):
52
 
53
  fig, ax = plt.subplots(figsize=(12, 6))
54
  ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
55
- ax.invert_yaxis() # κ°€μž₯ λΉˆλ„κ°€ 높은 단어λ₯Ό μœ„μͺ½μ— ν‘œμ‹œ
56
  ax.set_title("μƒμœ„ {} 단어".format(n))
57
  plt.tight_layout()
58
  return fig
 
13
  plt.rcParams['font.family'] = 'NanumGothic'
14
 
15
  # 초기 λΆˆμš©μ–΄ λͺ©λ‘
16
+ default_stopwords = set(["μžˆλ‹€", "μ—†λ‹€", "λ˜λ‹€", "이닀", "ν•˜λ‹€", "κ°™λ‹€", "μœ„ν•˜λ‹€", "μžˆλŠ”", "ν•˜λŠ”", "그리고", "그런", "이런", "μ €λŸ°", "μ΄λ ‡κ²Œ", "μ €λ ‡κ²Œ", "κ·Έλ ‡κ²Œ"])
17
 
18
  def simple_tokenize(text):
19
+ # ν•œκΈ€ λ‹¨μ–΄λ§Œ μΆ”μΆœ
20
+ words = re.findall(r'[κ°€-힣]+', text)
21
+ # 2음절 μ΄μƒμ˜ λ‹¨μ–΄λ§Œ 선택
22
+ return [word for word in words if len(word) > 1]
23
 
24
  @st.cache_data
25
  def preprocess_text(text, user_stopwords):
 
 
 
26
  words = simple_tokenize(text)
27
  words = [word for word in words if word not in user_stopwords]
28
  return ' '.join(words)
29
 
30
  def topic_modeling(texts, n_components):
31
+ vectorizer = CountVectorizer(tokenizer=simple_tokenize, max_df=0.95, min_df=2)
32
  data_vectorized = vectorizer.fit_transform(texts)
33
+ lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=10)
34
  lda.fit(data_vectorized)
35
 
36
+ features = vectorizer.get_feature_names_out()
 
 
 
37
 
38
  topics = {}
39
  for topic_idx, topic in enumerate(lda.components_):
40
+ top_words = [features[i] for i in topic.argsort()[:-11:-1]]
41
+ topics[f"Topic {topic_idx + 1}"] = top_words
42
  return topics
43
 
44
  def generate_word_frequency_chart(text, color, n=20):
 
48
 
49
  fig, ax = plt.subplots(figsize=(12, 6))
50
  ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
51
+ ax.invert_yaxis()
52
  ax.set_title("μƒμœ„ {} 단어".format(n))
53
  plt.tight_layout()
54
  return fig