Spaces:

soojeongcrystal
/

topicmodeling

Sleeping

App Files Files Community

soojeongcrystal committed on Aug 23, 2024

Commit

05ce311

verified ·

1 Parent(s): 9ab0e71

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -15

app.py CHANGED Viewed

@@ -16,19 +16,20 @@ def load_okt():
 okt = load_okt()
-# 불용어 목록 (예시, 필요에 따라 확장 가능)
-stop_words = ['이', '그', '저', '것', '수', '등', '들', '및', '에서', '그리고', '그래서', '또는', '그런데']
 @st.cache_data
-def preprocess_text(text):
-    # 특수 문자 제거
-    text = re.sub(r'[^\w\s]', '', text)
-    # 숫자 제거
-    text = re.sub(r'\d+', '', text)
-    # 형태소 분석 및 품사 태깅
-    tagged = okt.pos(text, stem=True)
-    # 명사, 동사, 형용사만 선택하고 불용어 제거
-    processed = [word for word, pos in tagged if pos in ['Noun', 'Verb', 'Adjective'] and word not in stop_words]
     return ' '.join(processed)
 # Streamlit 앱 설정
@@ -42,6 +43,10 @@ api_key = st.sidebar.text_input("Claude API 키를 입력하세요", type="passw
 if not api_key:
     api_key = os.environ.get("ANTHROPIC_API_KEY")
 # 파일 업로드
 uploaded_file = st.sidebar.file_uploader("CSV 파일을 업로드하세요", type="csv")
@@ -60,7 +65,7 @@ if uploaded_file is not None:
     if st.sidebar.button("토픽 모델링 실행"):
         # 텍스트 전처리
         with st.spinner("텍스트 전처리 중..."):
-            df['processed_text'] = df[text_column].apply(preprocess_text)
         # 토픽 모델링
         with st.spinner("토픽 모델링 실행 중..."):
@@ -81,10 +86,18 @@ if uploaded_file is not None:
         for idx, topic in enumerate(lda.components_):
             st.subheader(f"토픽 {idx + 1}")
             top_features_ind = topic.argsort()[:-11:-1]
-            top_features = [(feature_names[i], topic[i], tfidf_matrix[:, tfidf_vectorizer.vocabulary_[feature_names[i]]].mean()) for i in top_features_ind]
-            df_topic = pd.DataFrame(top_features, columns=['단어', 'LDA 점수', 'TF-IDF'])
-            st.table(df_topic.style.format({'LDA 점수': '{:.4f}', 'TF-IDF': '{:.4f}'}))
         # 토픽 비중 그래프
         st.header("토픽 비중 그래프")
@@ -119,3 +132,32 @@ if uploaded_file is not None:
                     st.write(response.completion)
         else:
             st.warning("Claude API 키가 설정되지 않았습니다. 토픽 해석을 제공받을 수 없습니다.")

 okt = load_okt()
+# Default Korean stop words; pre-filled into the editable stop-word list in the sidebar
+default_stop_words = ['이', '그', '저', '것', '수', '등', '들', '및', '에서', '그리고', '그래서', '또는', '그런데']
 @st.cache_data
+def preprocess_text(text, stop_words):
+    # Keep only Hangul syllables and whitespace; digits, punctuation and Latin letters are all removed
+    text = re.sub(r'[^가-힣\s]', '', text)  # NOTE(review): raises TypeError when text is not a str (e.g. NaN from an empty CSV cell)
+    # Morphological analysis: extract nouns only, via okt.nouns
+    nouns = okt.nouns(text)
+    # Drop stop words and single-character words
+    processed = [word for word in nouns if word not in stop_words and len(word) > 1]
     return ' '.join(processed)  # space-joined tokens, one string per document
 # Streamlit app setup
 if not api_key:
     api_key = os.environ.get("ANTHROPIC_API_KEY")  # fall back to the environment variable when no key was entered
+# Stop-word configuration: comma-separated sidebar input, split into a list with blank entries dropped
+stop_words_input = st.sidebar.text_area("불용어 목록 (쉼표로 구분)", ', '.join(default_stop_words))
+stop_words = [word.strip() for word in stop_words_input.split(',') if word.strip()]
 # File upload (CSV only)
 uploaded_file = st.sidebar.file_uploader("CSV 파일을 업로드하세요", type="csv")
     if st.sidebar.button("토픽 모델링 실행"):
         # 텍스트 전처리
         with st.spinner("텍스트 전처리 중..."):
+            df['processed_text'] = df[text_column].apply(lambda x: preprocess_text(x, stop_words))
         # 토픽 모델링
         with st.spinner("토픽 모델링 실행 중..."):
         for idx, topic in enumerate(lda.components_):
             st.subheader(f"토픽 {idx + 1}")
             top_features_ind = topic.argsort()[:-11:-1]  # indices of the 10 largest weights, highest first (assumes ascending argsort, the NumPy default)
+            # LDA top-words table: (word, topic weight) pairs for this topic
+            lda_top_words = [(feature_names[i], topic[i]) for i in top_features_ind]
+            df_lda = pd.DataFrame(lda_top_words, columns=['단어', 'LDA 점수'])
+            st.subheader("LDA 상위 단어")
+            st.table(df_lda.style.format({'LDA 점수': '{:.4f}'}))
+            # TF-IDF table: mean TF-IDF weight (column mean of tfidf_matrix) for the same top LDA words, not a separate TF-IDF ranking
+            tfidf_top_words = [(feature_names[i], tfidf_matrix[:, tfidf_vectorizer.vocabulary_[feature_names[i]]].mean()) for i in top_features_ind]
+            df_tfidf = pd.DataFrame(tfidf_top_words, columns=['단어', 'TF-IDF'])
+            st.subheader("TF-IDF 상위 단어")
+            st.table(df_tfidf.style.format({'TF-IDF': '{:.4f}'}))
         # 토픽 비중 그래프
         st.header("토픽 비중 그래프")
                     st.write(response.completion)
         else:
             st.warning("Claude API 키가 설정되지 않았습니다. 토픽 해석을 제공받을 수 없습니다.")
+# Appended at the very end of the existing code
+# Footer styling defined with CSS (fixed to the bottom edge, full width, centered text)
+footer_style = """
+<style>
+.footer {
+    position: fixed;
+    left: 0;
+    bottom: 0;
+    width: 100%;
+    background-color: #f1f1f1;
+    color: black;
+    text-align: center;
+    padding: 10px;
+    font-size: 14px;
+}
+</style>
+"""
+# Footer HTML: the style block followed by the footer div
+footer_html = footer_style + """
+<div class="footer">
+    mySUNI 행복 College
+</div>
+"""
+# Render the footer; unsafe_allow_html=True lets the raw HTML/CSS through instead of being escaped
+st.markdown(footer_html, unsafe_allow_html=True)