soojeongcrystal committed on
Commit
5b8f710
·
verified ·
1 Parent(s): 76b2f6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -31
app.py CHANGED
@@ -32,15 +32,14 @@ tokenizer = RegexTokenizer()
32
 
33
  def extract_nouns(text):
34
  try:
35
- sentences = re.split('[.!?]+', text)
36
- nouns = []
37
- for sentence in sentences:
38
- if sentence.strip(): # 빈 문장 건너뛰기
39
- extracted = noun_extractor.extract(sentence)
40
- if extracted is None:
41
- st.warning(f"다음 문장에서 명사 추출 실패: {sentence[:50]}...")
42
- continue
43
- nouns.extend([word for word, score in extracted.items() if score > 0])
44
  return [noun for noun in nouns if len(noun) > 1]
45
  except Exception as e:
46
  st.error(f"명사 추출 중 오류 발생: {str(e)}")
@@ -116,28 +115,8 @@ if uploaded_file is not None:
116
  text = str(uploaded_file.read(), 'utf-8')
117
  st.text(f"파일 길이: {len(text)} 문자")
118
 
119
- progress_bar = st.progress(0)
120
- total_chunks = 100
121
- chunk_size = max(1, len(text) // total_chunks)
122
- preprocessed_chunks = []
123
 
124
- for i in range(total_chunks):
125
- start = i * chunk_size
126
- end = start + chunk_size if i < total_chunks - 1 else len(text)
127
- chunk = text[start:end]
128
- st.text(f"청크 {i+1} 처리 중: 길이 {len(chunk)} 문자")
129
- preprocessed_chunk = preprocess_text(chunk, user_stopwords)
130
- if preprocessed_chunk:
131
- preprocessed_chunks.append(preprocessed_chunk)
132
- else:
133
- st.warning(f"청크 {i+1}에서 유효한 텍스트가 추출되지 않았습니다.")
134
- progress_bar.progress(min(1.0, (i + 1) / total_chunks))
135
-
136
- if i % 10 == 0 or i == total_chunks - 1: # 매 10번째 청크와 마지막 청크에 대해 정보 출력
137
- st.text(f"처리된 청크: {i+1}/{total_chunks}, 현재 청크 길이: {len(preprocessed_chunk)}")
138
-
139
- st.text(f"처리된 청크 수: {len(preprocessed_chunks)}")
140
- preprocessed_text = " ".join(preprocessed_chunks)
141
  st.text(f"처리된 텍스트 길이: {len(preprocessed_text)} 문자")
142
 
143
  if not preprocessed_text:
@@ -145,7 +124,7 @@ if uploaded_file is not None:
145
  else:
146
  st.subheader("토픽 모델링 결과")
147
  n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
148
- topics = topic_modeling(preprocessed_chunks, n_topics)
149
  for topic, words in topics.items():
150
  st.write(f"{topic}: {', '.join(words)}")
151
 
 
32
 
33
  def extract_nouns(text):
34
  try:
35
+ # 전체 텍스트에 대해 한 번만 extract 메서드 호출
36
+ extracted = noun_extractor.extract(text)
37
+ if extracted is None:
38
+ st.warning("명사 추출에 실패했습니다.")
39
+ return []
40
+ # score가 0보다 큰 단어만 선택
41
+ nouns = [word for word, score in extracted.items() if score > 0]
42
+ # 2음절 이상의 명사만 선택
 
43
  return [noun for noun in nouns if len(noun) > 1]
44
  except Exception as e:
45
  st.error(f"명사 추출 중 오류 발생: {str(e)}")
 
115
  text = str(uploaded_file.read(), 'utf-8')
116
  st.text(f"파일 길이: {len(text)} 문자")
117
 
118
+ preprocessed_text = preprocess_text(text, user_stopwords)
 
 
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  st.text(f"처리된 텍스트 길이: {len(preprocessed_text)} 문자")
121
 
122
  if not preprocessed_text:
 
124
  else:
125
  st.subheader("토픽 모델링 결과")
126
  n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
127
+ topics = topic_modeling([preprocessed_text], n_topics)
128
  for topic, words in topics.items():
129
  st.write(f"{topic}: {', '.join(words)}")
130