soojeongcrystal committed on
Commit
326f364
·
verified ·
1 Parent(s): 35bdcd6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -23
app.py CHANGED
@@ -5,28 +5,32 @@ from sklearn.decomposition import LatentDirichletAllocation
5
  import re
6
  import warnings
7
  from collections import Counter
 
8
 
9
  warnings.filterwarnings("ignore")
10
 
 
 
 
11
  # ์ดˆ๊ธฐ ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
12
- default_stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๋˜๋‹ค", "์ด๋‹ค", "ํ•˜๋‹ค", "๊ฐ™๋‹ค", "์œ„ํ•˜๋‹ค", "์žˆ๋‹ค", "๋˜์–ด๋‹ค"])
13
 
14
- def is_noun(word):
15
- # ๊ฐ„๋‹จํ•œ ๊ทœ์น™: 2์Œ์ ˆ ์ด์ƒ์ด๊ณ  '๋‹ค'๋กœ ๋๋‚˜์ง€ ์•Š์œผ๋ฉด ๋ช…์‚ฌ๋กœ ๊ฐ„์ฃผ
16
- return len(word) >= 2 and not word.endswith('๋‹ค')
 
17
 
18
  @st.cache_data
19
  def preprocess_text(text, user_stopwords):
20
  # ํ•œ๊ธ€๊ณผ ๊ณต๋ฐฑ๋งŒ ๋‚จ๊ธฐ๊ณ  ๋ชจ๋‘ ์ œ๊ฑฐ
21
  text = re.sub(r'[^ใ„ฑ-ใ…Žใ…-ใ…ฃ๊ฐ€-ํžฃ\s]', '', text)
22
- # ๋‹จ์–ด ๋ถ„๋ฆฌ (๊ณต๋ฐฑ ๊ธฐ์ค€)
23
- words = text.split()
24
- # ๋ช…์‚ฌ ์ถ”์ถœ, ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ, ๋‘ ๊ธ€์ž ์ด์ƒ์˜ ๋‹จ์–ด๋งŒ ์„ ํƒ
25
- words = [word for word in words if is_noun(word) and len(word) > 1 and word not in user_stopwords]
26
  return ' '.join(words)
27
 
28
  def topic_modeling(texts, n_components):
29
- vectorizer = CountVectorizer()
30
  data_vectorized = vectorizer.fit_transform(texts)
31
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
32
  lda.fit(data_vectorized)
@@ -42,29 +46,22 @@ def topic_modeling(texts, n_components):
42
  return topics
43
 
44
  def generate_word_frequency_chart(text, color, n=20):
45
- words = text.split()
46
  word_freq = Counter(words)
47
  top_words = dict(word_freq.most_common(n))
48
 
49
  fig, ax = plt.subplots(figsize=(12, 6))
50
  ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
51
  ax.invert_yaxis() # ๊ฐ€์žฅ ๋นˆ๋„๊ฐ€ ๋†’์€ ๋‹จ์–ด๋ฅผ ์œ„์ชฝ์— ํ‘œ์‹œ
52
- ax.set_title("Top {} Words".format(n))
53
  plt.tight_layout()
54
  return fig
55
 
56
  def get_top_trigrams(text, n=10):
57
- trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=10000)
58
- trigrams = trigram_vectorizer.fit_transform([text])
59
-
60
- try:
61
- trigram_features = trigram_vectorizer.get_feature_names_out()
62
- except AttributeError:
63
- trigram_features = trigram_vectorizer.get_feature_names()
64
-
65
- trigram_counts = trigrams.sum(axis=0).A1
66
- top_trigrams = sorted(zip(trigram_features, trigram_counts), key=lambda x: x[1], reverse=True)[:n]
67
- return top_trigrams
68
 
69
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
70
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
@@ -104,7 +101,7 @@ if uploaded_file is not None:
104
  st.subheader("์ƒ์œ„ 10๊ฐœ Trigram")
105
  top_trigrams = get_top_trigrams(preprocessed_text)
106
  for trigram, count in top_trigrams:
107
- st.write(f"{trigram}: {count}")
108
 
109
  st.subheader("๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ")
110
  color = st.color_picker("๋ง‰๋Œ€ ์ƒ‰์ƒ ์„ ํƒ", "#1f77b4")
 
5
  import re
6
  import warnings
7
  from collections import Counter
8
+ import matplotlib.font_manager as fm
9
 
10
  warnings.filterwarnings("ignore")
11
 
12
+ # ํ•œ๊ธ€ ํฐํŠธ ์„ค์ •
13
+ plt.rcParams['font.family'] = 'NanumGothic'
14
+
15
  # ์ดˆ๊ธฐ ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
16
+ default_stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๋˜๋‹ค", "์ด๋‹ค", "ํ•˜๋‹ค", "๊ฐ™๋‹ค", "์œ„ํ•˜๋‹ค", "์žˆ๋‹ค", "๋˜์–ด๋‹ค", "ํ†ตํ•ด", "์œ„ํ•ด", "๋Œ€ํ•œ", "์žˆ๋Š”", "ํ•˜๋Š”"])
17
 
18
def simple_tokenize(text):
    """Crude noun-extraction tokenizer for Korean text.

    Splits *text* on word characters and keeps a token only when it is at
    least two characters long and does not end in a common verb/sentence-final
    ending ('다', '요', '까', '네'). This is a heuristic stand-in for real
    morphological analysis, not an actual POS tagger.
    """
    kept = []
    for match in re.finditer(r'\w+', text):
        token = match.group()
        # Heuristic: short tokens and predicate-like endings are unlikely nouns.
        if len(token) > 1 and not token.endswith(('다', '요', '까', '네')):
            kept.append(token)
    return kept
22
 
23
@st.cache_data
def preprocess_text(text, user_stopwords):
    """Clean raw text down to a space-joined string of content tokens.

    Strips every character that is not Hangul (jamo or syllables) or
    whitespace, tokenizes with simple_tokenize, drops any token present in
    *user_stopwords*, and re-joins the survivors with single spaces.
    """
    # Keep only Hangul and whitespace; everything else is removed outright.
    hangul_text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
    kept = [token for token in simple_tokenize(hangul_text)
            if token not in user_stopwords]
    return ' '.join(kept)
31
 
32
  def topic_modeling(texts, n_components):
33
+ vectorizer = CountVectorizer(tokenizer=simple_tokenize)
34
  data_vectorized = vectorizer.fit_transform(texts)
35
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
36
  lda.fit(data_vectorized)
 
46
  return topics
47
 
48
def generate_word_frequency_chart(text, color, n=20):
    """Build a horizontal bar chart of the *n* most frequent tokens in *text*.

    Parameters:
        text: string to tokenize (via simple_tokenize) and count.
        color: matplotlib bar color (e.g. a hex string from a color picker).
        n: how many top words to display.

    Returns the matplotlib Figure; the caller is responsible for rendering it.
    """
    frequencies = Counter(simple_tokenize(text)).most_common(n)
    labels = [word for word, _ in frequencies]
    counts = [count for _, count in frequencies]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(labels, counts, color=color)
    ax.invert_yaxis()  # most frequent word at the top of the chart
    ax.set_title("상위 {} 단어".format(n))
    plt.tight_layout()
    return fig
59
 
60
def get_top_trigrams(text, n=10):
    """Return the *n* most frequent consecutive word triples in *text*.

    Tokenizes with simple_tokenize, slides a 3-word window over the token
    stream, and returns ((w1, w2, w3), count) pairs, most frequent first.
    Fewer than three tokens yields an empty list.
    """
    tokens = simple_tokenize(text)
    # zip of three staggered views produces every consecutive triple once.
    window_counts = Counter(zip(tokens, tokens[1:], tokens[2:]))
    return window_counts.most_common(n)
 
 
 
 
 
 
 
65
 
66
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
67
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
 
101
  st.subheader("์ƒ์œ„ 10๊ฐœ Trigram")
102
  top_trigrams = get_top_trigrams(preprocessed_text)
103
  for trigram, count in top_trigrams:
104
+ st.write(f"{' '.join(trigram)}: {count}")
105
 
106
  st.subheader("๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ")
107
  color = st.color_picker("๋ง‰๋Œ€ ์ƒ‰์ƒ ์„ ํƒ", "#1f77b4")