Spaces:

soojeongcrystal
/

text

Sleeping

App Files Files Community

soojeongcrystal committed on Jul 26, 2024

Commit

de8d584

verified ·

1 Parent(s): f45d210

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -13

app.py CHANGED Viewed

@@ -5,12 +5,16 @@ from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 import re
 import warnings
-# 경고 메시지 무시
 warnings.filterwarnings("ignore")
 # 초기 불용어 목록
-default_stopwords = set(["있다", "없다", "것", "그", "이", "하는", "하기", "할", "되", "수", "이다", "시키다"])
 @st.cache_data
 def preprocess_text(text, user_stopwords):
@@ -18,8 +22,8 @@ def preprocess_text(text, user_stopwords):
     text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
     # 단어 분리 (공백 기준)
     words = text.split()
-    # 불용어 제거 및 두 글자 이상의 단어만 선택
-    words = [word for word in words if len(word) > 1 and word not in user_stopwords]
     return ' '.join(words)
 def topic_modeling(texts, n_components):
@@ -28,11 +32,9 @@ def topic_modeling(texts, n_components):
     lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
     lda.fit(data_vectorized)
-    # 여기서 get_feature_names 대신 get_feature_names_out을 사용합니다
     try:
         features = vectorizer.get_feature_names_out()
     except AttributeError:
-        # 이전 버전과의 호환성을 위해 예외 처리
         features = vectorizer.get_feature_names()
     topics = {}
@@ -48,15 +50,26 @@ def generate_wordcloud(text, color):
     ax.axis("off")
     return fig
 # 스트림릿 UI 설정
 st.title("텍스트 분석 도구")
-# 사이드바에 불용어 입력 필드 추가
 user_stopwords = st.sidebar.text_area("불용어를 입력하세요 (쉼표로 구분)",
                                       value=", ".join(default_stopwords))
 user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
-# 파일 업로더
 uploaded_file = st.file_uploader("텍스트 파일 업로드", type=['txt'])
 if uploaded_file is not None:
@@ -64,7 +77,6 @@ if uploaded_file is not None:
         with st.spinner('파일을 처리 중입니다...'):
             text = str(uploaded_file.read(), 'utf-8')
-            # 텍스트 크기에 따른 프로그레스 바 추가
             progress_bar = st.progress(0)
             total_chunks = 100
             chunk_size = max(1, len(text) // total_chunks)
@@ -80,14 +92,17 @@ if uploaded_file is not None:
             preprocessed_text = " ".join(preprocessed_chunks)
-        # 토픽 모델링
         st.subheader("토픽 모델링 결과")
         n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
         topics = topic_modeling([preprocessed_text], n_topics)
         for topic, words in topics.items():
             st.write(f"{topic}: {', '.join(words)}")
-        # 워드 클라우드
         st.subheader("워드 클라우드")
         color = st.color_picker("배경색 선택", "#ffffff")
         fig = generate_wordcloud(preprocessed_text, color)
@@ -96,11 +111,11 @@ if uploaded_file is not None:
     except Exception as e:
         st.error(f"오류가 발생했습니다: {str(e)}")
-# 사용 설명 추가
 st.sidebar.markdown("""
 ## 사용 방법
 1. 사이드바에서 불용어를 추가하거나 수정할 수 있습니다.
 2. 텍스트 파일(.txt)을 업로드하세요.
 3. 토픽 모델링의 토픽 수를 선택하세요.
-4. 워드클라우드의 배경색을 선택할 수 있습니다.
 """)

 from sklearn.decomposition import LatentDirichletAllocation
 import re
 import warnings
+from collections import Counter
 warnings.filterwarnings("ignore")
 # 초기 불용어 목록
+default_stopwords = {
+    "있다", "없다", "되다", "이다", "하다", "같다", "위하다", "되어다",
+}
+def is_noun(word):
+    """Heuristically decide whether ``word`` should be treated as a noun.
+
+    Simple rule: a word counts as a noun when it has two or more
+    characters and does not end with '다'.
+    """
+    # Anything shorter than two characters is rejected outright.
+    if len(word) < 2:
+        return False
+    has_da_ending = word.endswith('다')
+    return not has_da_ending
 @st.cache_data
 def preprocess_text(text, user_stopwords):
     text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
     # 단어 분리 (공백 기준)
     words = text.split()
+    # 명사 추출, 불용어 제거, 두 글자 이상의 단어만 선택
+    words = [word for word in words if is_noun(word) and len(word) > 1 and word not in user_stopwords]
     return ' '.join(words)
 def topic_modeling(texts, n_components):
     lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
     lda.fit(data_vectorized)
     try:
         features = vectorizer.get_feature_names_out()
     except AttributeError:
         features = vectorizer.get_feature_names()
     topics = {}
     ax.axis("off")
     return fig
+def get_top_trigrams(text, n=10):
+    """Return the ``n`` most frequent word trigrams found in ``text``.
+
+    Args:
+        text: The text to analyse, passed to the vectorizer as a single
+            document.
+        n: Maximum number of trigrams to return.
+
+    Returns:
+        A list of ``(trigram, count)`` pairs sorted by descending count.
+        The list is empty when ``text`` yields no trigram at all.
+    """
+    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=10000)
+    try:
+        trigrams = trigram_vectorizer.fit_transform([text])
+    except ValueError:
+        # CountVectorizer raises ValueError ("empty vocabulary") when the
+        # text does not contain a single trigram, e.g. fewer than three
+        # usable tokens. Report "no trigrams" instead of an error.
+        return []
+    try:
+        trigram_features = trigram_vectorizer.get_feature_names_out()
+    except AttributeError:
+        # Fall back for vectorizers that lack get_feature_names_out().
+        trigram_features = trigram_vectorizer.get_feature_names()
+    # Column sums over the one-document matrix give a total per trigram.
+    trigram_counts = trigrams.sum(axis=0).A1
+    ranked = sorted(zip(trigram_features, trigram_counts), key=lambda pair: pair[1], reverse=True)
+    return ranked[:n]
 # 스트림릿 UI 설정
 st.title("텍스트 분석 도구")
 user_stopwords = st.sidebar.text_area("불용어를 입력하세요 (쉼표로 구분)",
                                       value=", ".join(default_stopwords))
 user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
 uploaded_file = st.file_uploader("텍스트 파일 업로드", type=['txt'])
 if uploaded_file is not None:
         with st.spinner('파일을 처리 중입니다...'):
             text = str(uploaded_file.read(), 'utf-8')
             progress_bar = st.progress(0)
             total_chunks = 100
             chunk_size = max(1, len(text) // total_chunks)
             preprocessed_text = " ".join(preprocessed_chunks)
         st.subheader("토픽 모델링 결과")
         n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
         topics = topic_modeling([preprocessed_text], n_topics)
         for topic, words in topics.items():
             st.write(f"{topic}: {', '.join(words)}")
+        st.subheader("상위 10개 Trigram")
+        top_trigrams = get_top_trigrams(preprocessed_text)
+        for trigram, count in top_trigrams:
+            st.write(f"{trigram}: {count}")
         st.subheader("워드 클라우드")
         color = st.color_picker("배경색 선택", "#ffffff")
         fig = generate_wordcloud(preprocessed_text, color)
     except Exception as e:
         st.error(f"오류가 발생했습니다: {str(e)}")
 st.sidebar.markdown("""
 ## 사용 방법
 1. 사이드바에서 불용어를 추가하거나 수정할 수 있습니다.
 2. 텍스트 파일(.txt)을 업로드하세요.
 3. 토픽 모델링의 토픽 수를 선택하세요.
+4. 상위 10개 Trigram을 확인하세요.
+5. 워드클라우드의 배경색을 선택할 수 있습니다.
 """)