Spaces:

soojeongcrystal
/

text

Sleeping

App Files Community

soojeongcrystal committed on Jul 26, 2024

Commit

b5a82df

verified ·

1 Parent(s): 4096d83

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -18

app.py CHANGED Viewed

@@ -37,15 +37,18 @@ def extract_nouns(text):
     nouns = []
     for sentence in sentences:
         extracted = noun_extractor.extract(sentence)
-        nouns.extend(extracted.keys())
     # 2음절 이상의 명사만 선택
     return [noun for noun in nouns if len(noun) > 1]
 @st.cache_data
 def preprocess_text(text, user_stopwords):
     nouns = extract_nouns(text)
-    nouns = [noun for noun in nouns if noun not in user_stopwords]
     return ' '.join(nouns)
 def topic_modeling(texts, n_components):
@@ -112,26 +115,30 @@ if uploaded_file is not None:
                 end = start + chunk_size if i < total_chunks - 1 else len(text)
                 chunk = text[start:end]
                 preprocessed_chunk = preprocess_text(chunk, user_stopwords)
-                preprocessed_chunks.append(preprocessed_chunk)
                 progress_bar.progress(min(1.0, (i + 1) / total_chunks))
             preprocessed_text = " ".join(preprocessed_chunks)
-        st.subheader("토픽 모델링 결과")
-        n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
-        topics = topic_modeling(preprocessed_chunks, n_topics)
-        for topic, words in topics.items():
-            st.write(f"{topic}: {', '.join(words)}")
-        st.subheader("상위 10개 Trigram")
-        top_trigrams = get_top_trigrams(preprocessed_text)
-        for trigram, count in top_trigrams:
-            st.write(f"{' '.join(trigram)}: {count}")
-        st.subheader("단어 빈도 차트")
-        color = st.color_picker("막대 색상 선택", "#1f77b4")
-        fig = generate_word_frequency_chart(preprocessed_text, color)
-        st.pyplot(fig)
     except Exception as e:
         st.error(f"오류가 발생했습니다: {str(e)}")

     nouns = []
     for sentence in sentences:
         extracted = noun_extractor.extract(sentence)
+        if extracted:  # None이 아닌 경우에만 처리
+            nouns.extend(extracted.keys())
     # 2음절 이상의 명사만 선택
     return [noun for noun in nouns if len(noun) > 1]
 @st.cache_data
 def preprocess_text(text, user_stopwords):
+    if not text:  # 빈 문자열 체크
+        return ""
     nouns = extract_nouns(text)
+    nouns = [noun for noun in nouns if noun and noun not in user_stopwords]
     return ' '.join(nouns)
 def topic_modeling(texts, n_components):
                 end = start + chunk_size if i < total_chunks - 1 else len(text)
                 chunk = text[start:end]
                 preprocessed_chunk = preprocess_text(chunk, user_stopwords)
+                if preprocessed_chunk:  # 빈 문자열이 아닌 경우에만 추가
+                    preprocessed_chunks.append(preprocessed_chunk)
                 progress_bar.progress(min(1.0, (i + 1) / total_chunks))
             preprocessed_text = " ".join(preprocessed_chunks)
+        if not preprocessed_text:
+            st.warning("처리된 텍스트가 없습니다. 다른 파일을 업로드해 주세요.")
+        else:
+            st.subheader("토픽 모델링 결과")
+            n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
+            topics = topic_modeling(preprocessed_chunks, n_topics)
+            for topic, words in topics.items():
+                st.write(f"{topic}: {', '.join(words)}")
+            st.subheader("상위 10개 Trigram")
+            top_trigrams = get_top_trigrams(preprocessed_text)
+            for trigram, count in top_trigrams:
+                st.write(f"{' '.join(trigram)}: {count}")
+            st.subheader("단어 빈도 차트")
+            color = st.color_picker("막대 색상 선택", "#1f77b4")
+            fig = generate_word_frequency_chart(preprocessed_text, color)
+            st.pyplot(fig)
     except Exception as e:
         st.error(f"오류가 발생했습니다: {str(e)}")