soojeongcrystal committed on
Commit
0d12815
·
verified ·
1 Parent(s): c50998c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -5
app.py CHANGED
@@ -6,6 +6,7 @@ import re
6
  import warnings
7
  from collections import Counter
8
  import traceback
 
9
 
10
  warnings.filterwarnings("ignore")
11
 
@@ -21,11 +22,20 @@ default_stopwords = set([
21
  "๋˜ํ•œ", "๊ทธ๋ž˜์„œ", "๊ทธ๋ฆฌ๊ณ ", "ํ•˜์ง€๋งŒ", "๊ทธ๋Ÿฌ๋‚˜", "๋”ฐ๋ผ์„œ", "๋•Œ๋ฌธ์—"
22
  ])
23
 
 
 
 
 
 
 
24
def extract_nouns(text):
    """Extract candidate noun tokens from *text* with a regex.

    Keeps words of two or more characters matching the pattern's
    character range, then drops any word found in the module-level
    ``default_stopwords`` set.
    """
    matches = re.findall(r'\b[๊ฐ€-ํžฃ]{2,}\b', text)
    return [word for word in matches if word not in default_stopwords]
 
 
 
29
 
30
  @st.cache_data
31
  def preprocess_text(text, user_stopwords):
@@ -44,7 +54,7 @@ def preprocess_text(text, user_stopwords):
44
  def topic_modeling(texts, n_components):
45
  stop_words_list = list(default_stopwords)
46
 
47
- vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000, max_df=0.9, min_df=2)
48
  data_vectorized = vectorizer.fit_transform(texts)
49
 
50
  n_tokens = data_vectorized.shape[1]
 
6
  import warnings
7
  from collections import Counter
8
  import traceback
9
+ import MeCab
10
 
11
  warnings.filterwarnings("ignore")
12
 
 
22
  "๋˜ํ•œ", "๊ทธ๋ž˜์„œ", "๊ทธ๋ฆฌ๊ณ ", "ํ•˜์ง€๋งŒ", "๊ทธ๋Ÿฌ๋‚˜", "๋”ฐ๋ผ์„œ", "๋•Œ๋ฌธ์—"
23
  ])
24
 
25
@st.cache_resource
def load_mecab():
    """Build the MeCab tagger once per app session.

    ``st.cache_resource`` memoizes the tagger so repeated Streamlit
    reruns reuse a single native MeCab instance.
    """
    return MeCab.Tagger()


# Single shared tagger used by extract_nouns().
mecab = load_mecab()
30
+
31
def extract_nouns(text):
    """Extract nouns from *text* using the module-level MeCab tagger.

    Walks the parsed node chain, collecting surfaces whose first
    feature field equals '๋ช…์‚ฌ', then filters out one-character
    tokens and entries in ``default_stopwords``.

    NOTE(review): the POS label compared here is dictionary-dependent
    (e.g. mecab-ko-dic emits tags such as NNG/NNP in the first feature
    field) — confirm the installed dictionary actually produces this
    label, otherwise no nouns will ever match.
    """
    collected = []
    node = mecab.parseToNode(text)
    while node:
        # First comma-separated feature field is the POS tag.
        if node.feature.split(',')[0] == '๋ช…์‚ฌ':
            collected.append(node.surface)
        node = node.next
    return [w for w in collected if len(w) > 1 and w not in default_stopwords]
39
 
40
  @st.cache_data
41
  def preprocess_text(text, user_stopwords):
 
54
  def topic_modeling(texts, n_components):
55
  stop_words_list = list(default_stopwords)
56
 
57
+ vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000)
58
  data_vectorized = vectorizer.fit_transform(texts)
59
 
60
  n_tokens = data_vectorized.shape[1]