soojeongcrystal committed on
Commit
3e6e0f4
·
verified ·
1 Parent(s): 326f364

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -15
app.py CHANGED
@@ -13,36 +13,32 @@ warnings.filterwarnings("ignore")
13
  plt.rcParams['font.family'] = 'NanumGothic'
14
 
15
  # 초기 λΆˆμš©μ–΄ λͺ©λ‘
16
- default_stopwords = set(["μžˆλ‹€", "μ—†λ‹€", "λ˜λ‹€", "이닀", "ν•˜λ‹€", "κ°™λ‹€", "μœ„ν•˜λ‹€", "μžˆλ‹€", "λ˜μ–΄λ‹€", "톡해", "μœ„ν•΄", "λŒ€ν•œ", "μžˆλŠ”", "ν•˜λŠ”"])
17
 
18
  def simple_tokenize(text):
19
- # κ°„λ‹¨ν•œ ν˜•νƒœμ†Œ 뢄석: λͺ…사 μΆ”μΆœ
20
- words = re.findall(r'\w+', text)
21
- return [word for word in words if len(word) > 1 and not word.endswith(('λ‹€', 'μš”', '까', 'λ„€'))]
 
22
 
23
  @st.cache_data
24
  def preprocess_text(text, user_stopwords):
25
- # ν•œκΈ€κ³Ό 곡백만 남기고 λͺ¨λ‘ 제거
26
- text = re.sub(r'[^γ„±-γ…Žγ…-γ…£κ°€-힣\s]', '', text)
27
- # 토큰화 및 λΆˆμš©μ–΄ 제거
28
  words = simple_tokenize(text)
29
  words = [word for word in words if word not in user_stopwords]
30
  return ' '.join(words)
31
 
32
  def topic_modeling(texts, n_components):
33
- vectorizer = CountVectorizer(tokenizer=simple_tokenize)
34
  data_vectorized = vectorizer.fit_transform(texts)
35
- lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
36
  lda.fit(data_vectorized)
37
 
38
- try:
39
- features = vectorizer.get_feature_names_out()
40
- except AttributeError:
41
- features = vectorizer.get_feature_names()
42
 
43
  topics = {}
44
  for topic_idx, topic in enumerate(lda.components_):
45
- topics[f"Topic {topic_idx + 1}"] = [features[i] for i in topic.argsort()[:-21:-1]]
 
46
  return topics
47
 
48
  def generate_word_frequency_chart(text, color, n=20):
@@ -52,7 +48,7 @@ def generate_word_frequency_chart(text, color, n=20):
52
 
53
  fig, ax = plt.subplots(figsize=(12, 6))
54
  ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
55
- ax.invert_yaxis() # κ°€μž₯ λΉˆλ„κ°€ 높은 단어λ₯Ό μœ„μͺ½μ— ν‘œμ‹œ
56
  ax.set_title("μƒμœ„ {} 단어".format(n))
57
  plt.tight_layout()
58
  return fig
 
13
  plt.rcParams['font.family'] = 'NanumGothic'
14
 
15
  # 초기 λΆˆμš©μ–΄ λͺ©λ‘
16
+ default_stopwords = set(["μžˆλ‹€", "μ—†λ‹€", "λ˜λ‹€", "이닀", "ν•˜λ‹€", "κ°™λ‹€", "μœ„ν•˜λ‹€", "μžˆλŠ”", "ν•˜λŠ”", "그리고", "그런", "이런", "μ €λŸ°", "μ΄λ ‡κ²Œ", "μ €λ ‡κ²Œ", "κ·Έλ ‡κ²Œ"])
17
 
18
  def simple_tokenize(text):
19
+ # ν•œκΈ€ λ‹¨μ–΄λ§Œ μΆ”μΆœ
20
+ words = re.findall(r'[κ°€-힣]+', text)
21
+ # 2음절 μ΄μƒμ˜ λ‹¨μ–΄λ§Œ 선택
22
+ return [word for word in words if len(word) > 1]
23
 
24
  @st.cache_data
25
  def preprocess_text(text, user_stopwords):
 
 
 
26
  words = simple_tokenize(text)
27
  words = [word for word in words if word not in user_stopwords]
28
  return ' '.join(words)
29
 
30
  def topic_modeling(texts, n_components):
31
+ vectorizer = CountVectorizer(tokenizer=simple_tokenize, max_df=0.95, min_df=2)
32
  data_vectorized = vectorizer.fit_transform(texts)
33
+ lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=10)
34
  lda.fit(data_vectorized)
35
 
36
+ features = vectorizer.get_feature_names_out()
 
 
 
37
 
38
  topics = {}
39
  for topic_idx, topic in enumerate(lda.components_):
40
+ top_words = [features[i] for i in topic.argsort()[:-11:-1]]
41
+ topics[f"Topic {topic_idx + 1}"] = top_words
42
  return topics
43
 
44
  def generate_word_frequency_chart(text, color, n=20):
 
48
 
49
  fig, ax = plt.subplots(figsize=(12, 6))
50
  ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
51
+ ax.invert_yaxis()
52
  ax.set_title("μƒμœ„ {} 단어".format(n))
53
  plt.tight_layout()
54
  return fig