soojeongcrystal committed on
Commit
326f364
·
verified ·
1 Parent(s): 35bdcd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -23
app.py CHANGED
@@ -5,28 +5,32 @@ from sklearn.decomposition import LatentDirichletAllocation
5
  import re
6
  import warnings
7
  from collections import Counter
 
8
 
9
  warnings.filterwarnings("ignore")
10
 
 
 
 
11
  # ์ดˆ๊ธฐ ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
12
- default_stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๋˜๋‹ค", "์ด๋‹ค", "ํ•˜๋‹ค", "๊ฐ™๋‹ค", "์œ„ํ•˜๋‹ค", "์žˆ๋‹ค", "๋˜์–ด๋‹ค"])
13
 
14
- def is_noun(word):
15
- # ๊ฐ„๋‹จํ•œ ๊ทœ์น™: 2์Œ์ ˆ ์ด์ƒ์ด๊ณ  '๋‹ค'๋กœ ๋๋‚˜์ง€ ์•Š์œผ๋ฉด ๋ช…์‚ฌ๋กœ ๊ฐ„์ฃผ
16
- return len(word) >= 2 and not word.endswith('๋‹ค')
 
17
 
18
  @st.cache_data
19
  def preprocess_text(text, user_stopwords):
20
  # ํ•œ๊ธ€๊ณผ ๊ณต๋ฐฑ๋งŒ ๋‚จ๊ธฐ๊ณ  ๋ชจ๋‘ ์ œ๊ฑฐ
21
  text = re.sub(r'[^ใ„ฑ-ใ…Žใ…-ใ…ฃ๊ฐ€-ํžฃ\s]', '', text)
22
- # ๋‹จ์–ด ๋ถ„๋ฆฌ (๊ณต๋ฐฑ ๊ธฐ์ค€)
23
- words = text.split()
24
- # ๋ช…์‚ฌ ์ถ”์ถœ, ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ, ๋‘ ๊ธ€์ž ์ด์ƒ์˜ ๋‹จ์–ด๋งŒ ์„ ํƒ
25
- words = [word for word in words if is_noun(word) and len(word) > 1 and word not in user_stopwords]
26
  return ' '.join(words)
27
 
28
  def topic_modeling(texts, n_components):
29
- vectorizer = CountVectorizer()
30
  data_vectorized = vectorizer.fit_transform(texts)
31
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
32
  lda.fit(data_vectorized)
@@ -42,29 +46,22 @@ def topic_modeling(texts, n_components):
42
  return topics
43
 
44
  def generate_word_frequency_chart(text, color, n=20):
45
- words = text.split()
46
  word_freq = Counter(words)
47
  top_words = dict(word_freq.most_common(n))
48
 
49
  fig, ax = plt.subplots(figsize=(12, 6))
50
  ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
51
  ax.invert_yaxis() # ๊ฐ€์žฅ ๋นˆ๋„๊ฐ€ ๋†’์€ ๋‹จ์–ด๋ฅผ ์œ„์ชฝ์— ํ‘œ์‹œ
52
- ax.set_title("Top {} Words".format(n))
53
  plt.tight_layout()
54
  return fig
55
 
56
  def get_top_trigrams(text, n=10):
57
- trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=10000)
58
- trigrams = trigram_vectorizer.fit_transform([text])
59
-
60
- try:
61
- trigram_features = trigram_vectorizer.get_feature_names_out()
62
- except AttributeError:
63
- trigram_features = trigram_vectorizer.get_feature_names()
64
-
65
- trigram_counts = trigrams.sum(axis=0).A1
66
- top_trigrams = sorted(zip(trigram_features, trigram_counts), key=lambda x: x[1], reverse=True)[:n]
67
- return top_trigrams
68
 
69
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
70
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
@@ -104,7 +101,7 @@ if uploaded_file is not None:
104
  st.subheader("์ƒ์œ„ 10๊ฐœ Trigram")
105
  top_trigrams = get_top_trigrams(preprocessed_text)
106
  for trigram, count in top_trigrams:
107
- st.write(f"{trigram}: {count}")
108
 
109
  st.subheader("๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ")
110
  color = st.color_picker("๋ง‰๋Œ€ ์ƒ‰์ƒ ์„ ํƒ", "#1f77b4")
 
5
  import re
6
  import warnings
7
  from collections import Counter
8
+ import matplotlib.font_manager as fm
9
 
10
  warnings.filterwarnings("ignore")
11
 
12
+ # ํ•œ๊ธ€ ํฐํŠธ ์„ค์ •
13
+ plt.rcParams['font.family'] = 'NanumGothic'
14
+
15
  # ์ดˆ๊ธฐ ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
16
+ default_stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๋˜๋‹ค", "์ด๋‹ค", "ํ•˜๋‹ค", "๊ฐ™๋‹ค", "์œ„ํ•˜๋‹ค", "์žˆ๋‹ค", "๋˜์–ด๋‹ค", "ํ†ตํ•ด", "์œ„ํ•ด", "๋Œ€ํ•œ", "์žˆ๋Š”", "ํ•˜๋Š”"])
17
 
18
def simple_tokenize(text):
    """Crude noun-extraction tokenizer for Korean text.

    Splits *text* on word characters and keeps a token only when it is at
    least two characters long and does not end in a common verb/sentence-final
    ending ('다', '요', '까', '네'). This is a heuristic stand-in for real
    morphological analysis, not an actual POS tagger.
    """
    kept = []
    for match in re.finditer(r'\w+', text):
        token = match.group()
        # Heuristic: short tokens and predicate-like endings are unlikely nouns.
        if len(token) > 1 and not token.endswith(('다', '요', '까', '네')):
            kept.append(token)
    return kept
22
 
23
@st.cache_data
def preprocess_text(text, user_stopwords):
    """Clean raw text down to a space-joined string of content tokens.

    Strips every character that is not Hangul (jamo or syllables) or
    whitespace, tokenizes with simple_tokenize, drops any token present in
    *user_stopwords*, and re-joins the survivors with single spaces.
    """
    # Keep only Hangul and whitespace; everything else is removed outright.
    hangul_text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
    kept = [token for token in simple_tokenize(hangul_text)
            if token not in user_stopwords]
    return ' '.join(kept)
31
 
32
  def topic_modeling(texts, n_components):
33
+ vectorizer = CountVectorizer(tokenizer=simple_tokenize)
34
  data_vectorized = vectorizer.fit_transform(texts)
35
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
36
  lda.fit(data_vectorized)
 
46
  return topics
47
 
48
def generate_word_frequency_chart(text, color, n=20):
    """Build a horizontal bar chart of the *n* most frequent tokens in *text*.

    Parameters:
        text: string to tokenize (via simple_tokenize) and count.
        color: matplotlib bar color (e.g. a hex string from a color picker).
        n: how many top words to display.

    Returns the matplotlib Figure; the caller is responsible for rendering it.
    """
    frequencies = Counter(simple_tokenize(text)).most_common(n)
    labels = [word for word, _ in frequencies]
    counts = [count for _, count in frequencies]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(labels, counts, color=color)
    ax.invert_yaxis()  # most frequent word at the top of the chart
    ax.set_title("상위 {} 단어".format(n))
    plt.tight_layout()
    return fig
59
 
60
def get_top_trigrams(text, n=10):
    """Return the *n* most frequent consecutive word triples in *text*.

    Tokenizes with simple_tokenize, slides a 3-word window over the token
    stream, and returns ((w1, w2, w3), count) pairs, most frequent first.
    Fewer than three tokens yields an empty list.
    """
    tokens = simple_tokenize(text)
    # zip of three staggered views produces every consecutive triple once.
    window_counts = Counter(zip(tokens, tokens[1:], tokens[2:]))
    return window_counts.most_common(n)
 
 
 
 
 
 
 
65
 
66
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
67
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
 
101
  st.subheader("์ƒ์œ„ 10๊ฐœ Trigram")
102
  top_trigrams = get_top_trigrams(preprocessed_text)
103
  for trigram, count in top_trigrams:
104
+ st.write(f"{' '.join(trigram)}: {count}")
105
 
106
  st.subheader("๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ")
107
  color = st.color_picker("๋ง‰๋Œ€ ์ƒ‰์ƒ ์„ ํƒ", "#1f77b4")