soojeongcrystal committed on
Commit
0d12815
·
verified ·
1 Parent(s): c50998c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -5
app.py CHANGED
@@ -6,6 +6,7 @@ import re
6
  import warnings
7
  from collections import Counter
8
  import traceback
 
9
 
10
  warnings.filterwarnings("ignore")
11
 
@@ -21,11 +22,20 @@ default_stopwords = set([
21
  "๋˜ํ•œ", "๊ทธ๋ž˜์„œ", "๊ทธ๋ฆฌ๊ณ ", "ํ•˜์ง€๋งŒ", "๊ทธ๋Ÿฌ๋‚˜", "๋”ฐ๋ผ์„œ", "๋•Œ๋ฌธ์—"
22
  ])
23
 
 
 
 
 
 
 
24
def extract_nouns(text):
    """Extract candidate noun tokens from *text* with a regex.

    Keeps words of two or more characters matching the pattern's
    character range, then drops any word found in the module-level
    ``default_stopwords`` set.
    """
    matches = re.findall(r'\b[๊ฐ€-ํžฃ]{2,}\b', text)
    return [word for word in matches if word not in default_stopwords]
 
 
 
29
 
30
  @st.cache_data
31
  def preprocess_text(text, user_stopwords):
@@ -44,7 +54,7 @@ def preprocess_text(text, user_stopwords):
44
  def topic_modeling(texts, n_components):
45
  stop_words_list = list(default_stopwords)
46
 
47
- vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000, max_df=0.9, min_df=2)
48
  data_vectorized = vectorizer.fit_transform(texts)
49
 
50
  n_tokens = data_vectorized.shape[1]
 
6
  import warnings
7
  from collections import Counter
8
  import traceback
9
+ import MeCab
10
 
11
  warnings.filterwarnings("ignore")
12
 
 
22
  "๋˜ํ•œ", "๊ทธ๋ž˜์„œ", "๊ทธ๋ฆฌ๊ณ ", "ํ•˜์ง€๋งŒ", "๊ทธ๋Ÿฌ๋‚˜", "๋”ฐ๋ผ์„œ", "๋•Œ๋ฌธ์—"
23
  ])
24
 
25
@st.cache_resource
def load_mecab():
    """Build the MeCab tagger once per app session.

    ``st.cache_resource`` memoizes the tagger so repeated Streamlit
    reruns reuse a single native MeCab instance.
    """
    return MeCab.Tagger()


# Single shared tagger used by extract_nouns().
mecab = load_mecab()
30
+
31
def extract_nouns(text):
    """Extract nouns from *text* using the module-level MeCab tagger.

    Walks the parsed node chain, collecting surfaces whose first
    feature field equals '๋ช…์‚ฌ', then filters out one-character
    tokens and entries in ``default_stopwords``.

    NOTE(review): the POS label compared here is dictionary-dependent
    (e.g. mecab-ko-dic emits tags such as NNG/NNP in the first feature
    field) — confirm the installed dictionary actually produces this
    label, otherwise no nouns will ever match.
    """
    collected = []
    node = mecab.parseToNode(text)
    while node:
        # First comma-separated feature field is the POS tag.
        if node.feature.split(',')[0] == '๋ช…์‚ฌ':
            collected.append(node.surface)
        node = node.next
    return [w for w in collected if len(w) > 1 and w not in default_stopwords]
39
 
40
  @st.cache_data
41
  def preprocess_text(text, user_stopwords):
 
54
  def topic_modeling(texts, n_components):
55
  stop_words_list = list(default_stopwords)
56
 
57
+ vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000)
58
  data_vectorized = vectorizer.fit_transform(texts)
59
 
60
  n_tokens = data_vectorized.shape[1]