soojeongcrystal committed on
Commit
de8d584
·
verified ·
1 Parent(s): f45d210

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -13
app.py CHANGED
@@ -5,12 +5,16 @@ from sklearn.feature_extraction.text import CountVectorizer
5
  from sklearn.decomposition import LatentDirichletAllocation
6
  import re
7
  import warnings
 
8
 
9
- # κ²½κ³  λ©”μ‹œμ§€ λ¬΄μ‹œ
10
  warnings.filterwarnings("ignore")
11
 
12
  # 초기 λΆˆμš©μ–΄ λͺ©λ‘
13
- default_stopwords = set(["μžˆλ‹€", "μ—†λ‹€", "것", "κ·Έ", "이", "ν•˜λŠ”", "ν•˜κΈ°", "ν• ", "되", "수", "이닀", "μ‹œν‚€λ‹€"])
 
 
 
 
14
 
15
  @st.cache_data
16
  def preprocess_text(text, user_stopwords):
@@ -18,8 +22,8 @@ def preprocess_text(text, user_stopwords):
18
  text = re.sub(r'[^γ„±-γ…Žγ…-γ…£κ°€-힣\s]', '', text)
19
  # 단어 뢄리 (곡백 κΈ°μ€€)
20
  words = text.split()
21
- # λΆˆμš©μ–΄ 제거 및 두 κΈ€μž μ΄μƒμ˜ λ‹¨μ–΄λ§Œ 선택
22
- words = [word for word in words if len(word) > 1 and word not in user_stopwords]
23
  return ' '.join(words)
24
 
25
  def topic_modeling(texts, n_components):
@@ -28,11 +32,9 @@ def topic_modeling(texts, n_components):
28
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
29
  lda.fit(data_vectorized)
30
 
31
- # μ—¬κΈ°μ„œ get_feature_names λŒ€μ‹  get_feature_names_out을 μ‚¬μš©ν•©λ‹ˆλ‹€
32
  try:
33
  features = vectorizer.get_feature_names_out()
34
  except AttributeError:
35
- # 이전 λ²„μ „κ³Όμ˜ ν˜Έν™˜μ„±μ„ μœ„ν•΄ μ˜ˆμ™Έ 처리
36
  features = vectorizer.get_feature_names()
37
 
38
  topics = {}
@@ -48,15 +50,26 @@ def generate_wordcloud(text, color):
48
  ax.axis("off")
49
  return fig
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # 슀트림릿 UI μ„€μ •
52
  st.title("ν…μŠ€νŠΈ 뢄석 도ꡬ")
53
 
54
- # μ‚¬μ΄λ“œλ°”μ— λΆˆμš©μ–΄ μž…λ ₯ ν•„λ“œ μΆ”κ°€
55
  user_stopwords = st.sidebar.text_area("λΆˆμš©μ–΄λ₯Ό μž…λ ₯ν•˜μ„Έμš” (μ‰Όν‘œλ‘œ ꡬ뢄)",
56
  value=", ".join(default_stopwords))
57
  user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
58
 
59
- # 파일 μ—…λ‘œλ”
60
  uploaded_file = st.file_uploader("ν…μŠ€νŠΈ 파일 μ—…λ‘œλ“œ", type=['txt'])
61
 
62
  if uploaded_file is not None:
@@ -64,7 +77,6 @@ if uploaded_file is not None:
64
  with st.spinner('νŒŒμΌμ„ 처리 μ€‘μž…λ‹ˆλ‹€...'):
65
  text = str(uploaded_file.read(), 'utf-8')
66
 
67
- # ν…μŠ€νŠΈ 크기에 λ”°λ₯Έ ν”„λ‘œκ·Έλ ˆμŠ€ λ°” μΆ”κ°€
68
  progress_bar = st.progress(0)
69
  total_chunks = 100
70
  chunk_size = max(1, len(text) // total_chunks)
@@ -80,14 +92,17 @@ if uploaded_file is not None:
80
 
81
  preprocessed_text = " ".join(preprocessed_chunks)
82
 
83
- # ν† ν”½ λͺ¨λΈλ§
84
  st.subheader("ν† ν”½ λͺ¨λΈλ§ κ²°κ³Ό")
85
  n_topics = st.slider("ν† ν”½ 수 선택", min_value=2, max_value=10, value=5)
86
  topics = topic_modeling([preprocessed_text], n_topics)
87
  for topic, words in topics.items():
88
  st.write(f"{topic}: {', '.join(words)}")
89
 
90
- # μ›Œλ“œ ν΄λΌμš°λ“œ
 
 
 
 
91
  st.subheader("μ›Œλ“œ ν΄λΌμš°λ“œ")
92
  color = st.color_picker("배경색 선택", "#ffffff")
93
  fig = generate_wordcloud(preprocessed_text, color)
@@ -96,11 +111,11 @@ if uploaded_file is not None:
96
  except Exception as e:
97
  st.error(f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}")
98
 
99
- # μ‚¬μš© μ„€λͺ… μΆ”κ°€
100
  st.sidebar.markdown("""
101
  ## μ‚¬μš© 방법
102
  1. μ‚¬μ΄λ“œλ°”μ—μ„œ λΆˆμš©μ–΄λ₯Ό μΆ”κ°€ν•˜κ±°λ‚˜ μˆ˜μ •ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
103
  2. ν…μŠ€νŠΈ 파일(.txt)을 μ—…λ‘œλ“œν•˜μ„Έμš”.
104
  3. ν† ν”½ λͺ¨λΈλ§μ˜ ν† ν”½ 수λ₯Ό μ„ νƒν•˜μ„Έμš”.
105
- 4. μ›Œλ“œν΄λΌμš°λ“œμ˜ 배경색을 선택할 수 μžˆμŠ΅λ‹ˆλ‹€.
 
106
  """)
 
5
  from sklearn.decomposition import LatentDirichletAllocation
6
  import re
7
  import warnings
8
+ from collections import Counter
9
 
 
10
  warnings.filterwarnings("ignore")
11
 
12
  # 초기 λΆˆμš©μ–΄ λͺ©λ‘
13
# Initial stopword list: common Korean function verbs/adjectives (all in the
# dictionary form ending in 'λ‹€') to drop before topic modeling.
# NOTE: written as a set literal; the original set([...]) call contained a
# duplicate "μžˆλ‹€" entry, which a set silently deduplicates anyway.
default_stopwords = {"μžˆλ‹€", "μ—†λ‹€", "λ˜λ‹€", "이닀", "ν•˜λ‹€", "κ°™λ‹€", "μœ„ν•˜λ‹€", "λ˜μ–΄λ‹€"}
14
+
15
+ def is_noun(word):
16
+ # κ°„λ‹¨ν•œ κ·œμΉ™: 2음절 이상이고 'λ‹€'둜 λλ‚˜μ§€ μ•ŠμœΌλ©΄ λͺ…μ‚¬λ‘œ κ°„μ£Ό
17
+ return len(word) >= 2 and not word.endswith('λ‹€')
18
 
19
  @st.cache_data
20
  def preprocess_text(text, user_stopwords):
 
22
  text = re.sub(r'[^γ„±-γ…Žγ…-γ…£κ°€-힣\s]', '', text)
23
  # 단어 뢄리 (곡백 κΈ°μ€€)
24
  words = text.split()
25
+ # λͺ…사 μΆ”μΆœ, λΆˆμš©μ–΄ 제거, 두 κΈ€μž μ΄μƒμ˜ λ‹¨μ–΄λ§Œ 선택
26
+ words = [word for word in words if is_noun(word) and len(word) > 1 and word not in user_stopwords]
27
  return ' '.join(words)
28
 
29
  def topic_modeling(texts, n_components):
 
32
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
33
  lda.fit(data_vectorized)
34
 
 
35
  try:
36
  features = vectorizer.get_feature_names_out()
37
  except AttributeError:
 
38
  features = vectorizer.get_feature_names()
39
 
40
  topics = {}
 
50
  ax.axis("off")
51
  return fig
52
 
53
def get_top_trigrams(text, n=10, max_features=10000):
    """Return the *n* most frequent word trigrams in *text*.

    Args:
        text: Whitespace-separated (preprocessed) text to scan.
        n: Number of top trigrams to return.
        max_features: Cap on the trigram vocabulary size considered
            (previously hard-coded; kept as the default for compatibility).

    Returns:
        List of ``(trigram, count)`` tuples, most frequent first. Empty
        list when the text contains fewer than three words, in which case
        no trigram can be formed.
    """
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=max_features)
    try:
        trigrams = trigram_vectorizer.fit_transform([text])
    except ValueError:
        # CountVectorizer raises "empty vocabulary" when no trigram can be
        # built (empty or too-short text); report "no trigrams" instead of
        # crashing the whole analysis page.
        return []

    # get_feature_names() was replaced by get_feature_names_out() in
    # scikit-learn 1.0; fall back for compatibility with older versions.
    try:
        trigram_features = trigram_vectorizer.get_feature_names_out()
    except AttributeError:
        trigram_features = trigram_vectorizer.get_feature_names()

    # Collapse the (1, vocab) sparse count matrix to a flat 1-D array,
    # then rank trigrams by frequency, descending.
    trigram_counts = trigrams.sum(axis=0).A1
    top_trigrams = sorted(zip(trigram_features, trigram_counts), key=lambda x: x[1], reverse=True)[:n]
    return top_trigrams
65
+
66
  # 슀트림릿 UI μ„€μ •
67
  st.title("ν…μŠ€νŠΈ 뢄석 도ꡬ")
68
 
 
69
  user_stopwords = st.sidebar.text_area("λΆˆμš©μ–΄λ₯Ό μž…λ ₯ν•˜μ„Έμš” (μ‰Όν‘œλ‘œ ꡬ뢄)",
70
  value=", ".join(default_stopwords))
71
  user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
72
 
 
73
  uploaded_file = st.file_uploader("ν…μŠ€νŠΈ 파일 μ—…λ‘œλ“œ", type=['txt'])
74
 
75
  if uploaded_file is not None:
 
77
  with st.spinner('νŒŒμΌμ„ 처리 μ€‘μž…λ‹ˆλ‹€...'):
78
  text = str(uploaded_file.read(), 'utf-8')
79
 
 
80
  progress_bar = st.progress(0)
81
  total_chunks = 100
82
  chunk_size = max(1, len(text) // total_chunks)
 
92
 
93
  preprocessed_text = " ".join(preprocessed_chunks)
94
 
 
95
  st.subheader("ν† ν”½ λͺ¨λΈλ§ κ²°κ³Ό")
96
  n_topics = st.slider("ν† ν”½ 수 선택", min_value=2, max_value=10, value=5)
97
  topics = topic_modeling([preprocessed_text], n_topics)
98
  for topic, words in topics.items():
99
  st.write(f"{topic}: {', '.join(words)}")
100
 
101
+ st.subheader("μƒμœ„ 10개 Trigram")
102
+ top_trigrams = get_top_trigrams(preprocessed_text)
103
+ for trigram, count in top_trigrams:
104
+ st.write(f"{trigram}: {count}")
105
+
106
  st.subheader("μ›Œλ“œ ν΄λΌμš°λ“œ")
107
  color = st.color_picker("배경색 선택", "#ffffff")
108
  fig = generate_wordcloud(preprocessed_text, color)
 
111
  except Exception as e:
112
  st.error(f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}")
113
 
 
114
  st.sidebar.markdown("""
115
  ## μ‚¬μš© 방법
116
  1. μ‚¬μ΄λ“œλ°”μ—μ„œ λΆˆμš©μ–΄λ₯Ό μΆ”κ°€ν•˜κ±°λ‚˜ μˆ˜μ •ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
117
  2. ν…μŠ€νŠΈ 파일(.txt)을 μ—…λ‘œλ“œν•˜μ„Έμš”.
118
  3. ν† ν”½ λͺ¨λΈλ§μ˜ ν† ν”½ 수λ₯Ό μ„ νƒν•˜μ„Έμš”.
119
+ 4. μƒμœ„ 10개 Trigram을 ν™•μΈν•˜μ„Έμš”.
120
+ 5. μ›Œλ“œν΄λΌμš°λ“œμ˜ 배경색을 선택할 수 μžˆμŠ΅λ‹ˆλ‹€.
121
  """)