soojeongcrystal committed on
Commit
5b8f710
·
verified ·
1 Parent(s): 76b2f6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -31
app.py CHANGED
@@ -32,15 +32,14 @@ tokenizer = RegexTokenizer()
32
 
33
  def extract_nouns(text):
34
  try:
35
- sentences = re.split('[.!?]+', text)
36
- nouns = []
37
- for sentence in sentences:
38
- if sentence.strip(): # 빈 문장 건너뛰기
39
- extracted = noun_extractor.extract(sentence)
40
- if extracted is None:
41
- st.warning(f"다음 문장에서 명사 추출 실패: {sentence[:50]}...")
42
- continue
43
- nouns.extend([word for word, score in extracted.items() if score > 0])
44
  return [noun for noun in nouns if len(noun) > 1]
45
  except Exception as e:
46
  st.error(f"명사 추출 중 오류 발생: {str(e)}")
@@ -116,28 +115,8 @@ if uploaded_file is not None:
116
  text = str(uploaded_file.read(), 'utf-8')
117
  st.text(f"파일 길이: {len(text)} 문자")
118
 
119
- progress_bar = st.progress(0)
120
- total_chunks = 100
121
- chunk_size = max(1, len(text) // total_chunks)
122
- preprocessed_chunks = []
123
 
124
- for i in range(total_chunks):
125
- start = i * chunk_size
126
- end = start + chunk_size if i < total_chunks - 1 else len(text)
127
- chunk = text[start:end]
128
- st.text(f"청크 {i+1} 처리 중: 길이 {len(chunk)} 문자")
129
- preprocessed_chunk = preprocess_text(chunk, user_stopwords)
130
- if preprocessed_chunk:
131
- preprocessed_chunks.append(preprocessed_chunk)
132
- else:
133
- st.warning(f"청크 {i+1}에서 유효한 텍스트가 추출되지 않았습니다.")
134
- progress_bar.progress(min(1.0, (i + 1) / total_chunks))
135
-
136
- if i % 10 == 0 or i == total_chunks - 1: # 매 10번째 청크와 마지막 청크에 대해 정보 출력
137
- st.text(f"처리된 청크: {i+1}/{total_chunks}, 현재 청크 길이: {len(preprocessed_chunk)}")
138
-
139
- st.text(f"처리된 청크 수: {len(preprocessed_chunks)}")
140
- preprocessed_text = " ".join(preprocessed_chunks)
141
  st.text(f"처리된 텍스트 길이: {len(preprocessed_text)} 문자")
142
 
143
  if not preprocessed_text:
@@ -145,7 +124,7 @@ if uploaded_file is not None:
145
  else:
146
  st.subheader("토픽 모델링 결과")
147
  n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
148
- topics = topic_modeling(preprocessed_chunks, n_topics)
149
  for topic, words in topics.items():
150
  st.write(f"{topic}: {', '.join(words)}")
151
 
 
32
 
33
  def extract_nouns(text):
34
  try:
35
+ # 전체 텍스트에 대해 한 번만 extract 메서드 호출
36
+ extracted = noun_extractor.extract(text)
37
+ if extracted is None:
38
+ st.warning("명사 추출에 실패했습니다.")
39
+ return []
40
+ # score가 0보다 큰 단어만 선택
41
+ nouns = [word for word, score in extracted.items() if score > 0]
42
+ # 2음절 이상의 명사만 선택
 
43
  return [noun for noun in nouns if len(noun) > 1]
44
  except Exception as e:
45
  st.error(f"명사 추출 중 오류 발생: {str(e)}")
 
115
  text = str(uploaded_file.read(), 'utf-8')
116
  st.text(f"파일 길이: {len(text)} 문자")
117
 
118
+ preprocessed_text = preprocess_text(text, user_stopwords)
 
 
 
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  st.text(f"처리된 텍스트 길이: {len(preprocessed_text)} 문자")
121
 
122
  if not preprocessed_text:
 
124
  else:
125
  st.subheader("토픽 모델링 결과")
126
  n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
127
+ topics = topic_modeling([preprocessed_text], n_topics)
128
  for topic, words in topics.items():
129
  st.write(f"{topic}: {', '.join(words)}")
130