topicmodelingadv

Sleeping

App Files Community

soojeongcrystal committed on Sep 4, 2024

Commit

75fbae2

verified ·

1 Parent(s): 11382a5

Update app.py

Browse files

Files changed (1) hide show

app.py +133 -123

app.py CHANGED Viewed

@@ -286,137 +286,147 @@ st.markdown("""
 st.markdown('<h1 class="small-title">📊토픽모델링 for SK</h1>', unsafe_allow_html=True)
-# 3개의 탭 생성
-tab1, tab2, tab3 = st.tabs(["토픽 수 결정", "전체 분석", "조건부 분석"])
-# 탭 1: 토픽 수 결정
-with tab1:
-    st.header("토픽 수 결정")
-    if 'df' not in st.session_state:
-        uploaded_file = st.file_uploader("CSV 파일을 업로드하세요", type="csv")
-        if uploaded_file:
-            df = pd.read_csv(uploaded_file)
-            st.session_state['df'] = df
-            st.write(df.head())
-    if 'df' in st.session_state:
-        df = st.session_state['df']
-        text_column = st.selectbox("텍스트 컬럼을 선택하세요", df.columns)
-        topic_range = st.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10))
-        if st.button("Perplexity 및 Coherence 계산"):
-            perplexities, coherences = calculate_perplexity_coherence(df, text_column, default_stop_words, range(topic_range[0], topic_range[1] + 1))
-            plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
-# 탭 2: 전체 분석
-with tab2:
-    st.header("전체 분석")
-    if 'df' in st.session_state:
-        df = st.session_state['df']
-        text_column = st.selectbox("텍스트 컬럼을 선택하세요", df.columns, key="전체분석")
-        num_topics = st.slider("토픽 수를 선택하세요", 2, 20, 5)
-        if st.button("토픽 모델링 실행"):
-            topic_results, lda, lda_output, tfidf_matrix, feature_names = perform_topic_modeling(df, text_column, num_topics, default_stop_words)
-            # 토픽 요약
-            display_topic_summary(topic_results)
-            # 토픽 비중 그래프
-            st.header("토픽 비중 그래프")
-            df_weights = pd.DataFrame({
-                '토픽': [f'토픽 {i+1}' for i in range(num_topics)],
-                '비중': [result['weight'] for result in topic_results]
-            })
-            colors = generate_colors(num_topics)
-            chart = alt.Chart(df_weights).mark_bar().encode(
-                x=alt.X('토픽:N', axis=alt.Axis(labelAngle=0)),
-                y=alt.Y('비중:Q', axis=alt.Axis(format=',.1f')),
-                color=alt.Color('토픽:N', scale=alt.Scale(range=colors))
-            ).properties(
-                width=600,
-                height=400,
-                title='문서 내 토픽 비중 (%)'
-            )
-            text = chart.mark_text(
-                align='center',
-                baseline='bottom',
-                dy=-5
-            ).encode(
-                text=alt.Text('비중:Q', format='.1f')
-            )
-            st.altair_chart(chart + text, use_container_width=True)
-            # 네트워크 그래프
-            G = create_network_graph(topic_results, num_words=20)
-            img_bytes = plot_network_graph(G)
-            st.image(img_bytes, caption="토픽별 상위 20개 단어 네트워크", use_column_width=True)
-            # 토픽 할당 데이터 다운로드
-            df['topic'] = lda_output.argmax(axis=1) + 1
-            download_topic_assignment(df)
-            # 종합 해석
-            api_key = st.text_input("Claude API 키를 입력하세요", type="password")
-            if api_key:
-                st.subheader("토픽 종합 해석")
-                with st.spinner("Claude AI로 토픽 해석 중..."):
-                    interpretation = interpret_topics_full(api_key, topic_results)
-                    st.text_area("해석 결과", value=interpretation, height=300)
-# 탭 3: 조건부 분석
-with tab3:
-    st.header("조건부 분석")
-    if 'df' in st.session_state:
-        df = st.session_state['df']
-        text_column = st.selectbox("텍스트 컬럼을 선택하세요", df.columns, key="조건부분석")
-        condition_column = st.selectbox("조건부 분석에 사용할 변수를 선택하세요", df.columns)
-        if pd.api.types.is_numeric_dtype(df[condition_column]):
-            analysis_method = st.radio("분석 방법 선택", ["범위 선택", "임계값 기준"])
-            if analysis_method == "범위 선택":
-                min_val, max_val = df[condition_column].min(), df[condition_column].max()
-                condition = st.slider(f"{condition_column} 범위 선택", float(min_val), float(max_val), (float(min_val), float(max_val)))
-            else:
-                threshold = st.number_input(f"{condition_column} 임계값 설정", min_value=float(df[condition_column].min()), max_value=float(df[condition_column].max()), value=float((df[condition_column].min() + df[condition_column].max()) / 2))
-                comparison = st.radio("비교 기준", ["이상", "이하"])
-                condition = (comparison, threshold)
-        else:
-            unique_values = df[condition_column].unique()
-            condition = st.multiselect(f"{condition_column} 값 선택", unique_values, default=unique_values)
-        num_topics = st.slider("토픽 수를 선택하세요", 2, 20, 5, key="조건부토픽수")
-        if st.button("조건부 토픽 모델링 실행"):
-            if isinstance(condition, tuple):
-                if condition[0] == "이상":
-                    filtered_df = df[df[condition_column] >= condition[1]]
-                else:
-                    filtered_df = df[df[condition_column] <= condition[1]]
-            else:
-                filtered_df = df[df[condition_column].between(*condition)] if isinstance(condition, tuple) else df[df[condition_column].isin(condition)]
-            topic_results, lda, lda_output, tfidf_matrix, feature_names = perform_topic_modeling(filtered_df, text_column, num_topics, default_stop_words)
-            # 토픽 요약
-            display_topic_summary(topic_results)
-            # 네트워크 그래프
-            G = create_network_graph(topic_results, num_words=20)
-            img_bytes = plot_network_graph(G)
-            st.image(img_bytes, caption="토픽별 상위 20개 단어 네트워크", use_column_width=True)
-            # 토픽 할당 데이터 다운로드
-            filtered_df['topic'] = lda_output.argmax(axis=1) + 1
-            download_topic_assignment(filtered_df)
 # 푸터 추가
 st.markdown("""

 st.markdown('<h1 class="small-title">📊토픽모델링 for SK</h1>', unsafe_allow_html=True)
+# 사이드바 설정
+with st.sidebar:
+    st.header('설정하기')
+    api_key = st.text_input("Claude API 키를 입력하세요", type="password")
+    if not api_key:
+        api_key = os.environ.get("ANTHROPIC_API_KEY")
+    st.caption("Claude API가 있으면 토픽 종합 해석까지 가능합니다. 공백으로 두면 기본적인 결과만 나옵니다.")
+    stop_words_input = st.text_area("불용어 목록 (쉼표로 구분)", ', '.join(default_stop_words))
+    stop_words = [word.strip() for word in stop_words_input.split(',') if word.strip()]
+    st.caption("결과를 보고 업데이트해주세요.")
+    uploaded_file = st.file_uploader("CSV 파일을 업로드하세요", type="csv")
+    st.caption("csv-UTF 형식을 사용해주세요!")
+# 데이터 로드 및 초기 설정
+if uploaded_file is not None:
+    try:
+        df = pd.read_csv(uploaded_file)
+        if df.empty:
+            st.error("CSV 파일에 데이터가 없습니다.")
+        else:
+            st.success("파일이 성공적으로 업로드되었습니다.")
+            st.subheader("데이터 미리보기")
+            st.write(df.head())
+            text_column = st.selectbox("텍스트 컬럼을 선택하세요", df.columns)
+            num_topics = st.slider("토픽 수를 선택하세요", 2, 20, 5)
+            # 분석 방법 선택
+            analysis_type = st.radio("분석 방법 선택", ["전체 분석", "조건부 분석"])
+            # 토픽 수 검토 탭
+            st.sidebar.header('토픽 수 검토')
+            topic_range = st.sidebar.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10))
+            if st.sidebar.button("Perplexity 및 Coherence 계산"):
+                perplexities, coherences = calculate_perplexity_coherence(df, text_column, stop_words, range(topic_range[0], topic_range[1] + 1))
+                plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
+            if analysis_type == "전체 분석":
+                st.header("전체 데이터 분석 결과")
+                with st.spinner("토픽 모델링 실행 중..."):
+                    topic_results, lda, lda_output, tfidf_matrix, feature_names = perform_topic_modeling(df, text_column, num_topics, stop_words)
+                # 토픽 요약
+                display_topic_summary(topic_results)
+                # 토픽 비중 그래프
+                st.header("토픽 비중 그래프")
+                df_weights = pd.DataFrame({
+                    '토픽': [f'토픽 {i+1}' for i in range(num_topics)],
+                    '비중': [result['weight'] for result in topic_results]
+                })
+                colors = generate_colors(num_topics)
+                chart = alt.Chart(df_weights).mark_bar().encode(
+                    x=alt.X('토픽:N', axis=alt.Axis(labelAngle=0)),
+                    y=alt.Y('비중:Q', axis=alt.Axis(format=',.1f')),
+                    color=alt.Color('토픽:N', scale=alt.Scale(range=colors))
+                ).properties(
+                    width=600,
+                    height=400,
+                    title='문서 내 토픽 비중 (%)'
+                )
+                text = chart.mark_text(
+                    align='center',
+                    baseline='bottom',
+                    dy=-5
+                ).encode(
+                    text=alt.Text('비중:Q', format='.1f')
+                )
+                st.altair_chart(chart + text, use_container_width=True)
+                # 네트워크 그래프
+                st.header("토픽 단어 네트워크 그래프")
+                G = create_network_graph(topic_results, num_words=20)
+                img_bytes = plot_network_graph(G)
+                st.image(img_bytes, caption="토픽별 상위 20개 단어 네트워크", use_column_width=True)
+                # 토픽 할당 데이터 다운로드
+                df['topic'] = lda_output.argmax(axis=1) + 1
+                download_topic_assignment(df)
+                # 종합 해석
+                if api_key:
+                    st.subheader("토픽 종합 해석")
+                    with st.spinner("Claude AI로 토픽 해석 중..."):
+                        interpretation = interpret_topics_full(api_key, topic_results)
+                        st.text_area("해석 결과", value=interpretation, height=300)
+            elif analysis_type == "조건부 분석":
+                st.header("조건부 분석 결과")
+                condition_column = st.selectbox("조건부 분석에 사용할 변수를 선택하세요", df.columns)
+                if pd.api.types.is_numeric_dtype(df[condition_column]):
+                    min_val, max_val = df[condition_column].min(), df[condition_column].max()
+                    st.write(f"{condition_column}의 범위: {min_val:.2f} ~ {max_val:.2f}")
+                    analysis_method = st.radio("분석 방법 선택", ["범위 선택", "임계값 기준"])
+                    if analysis_method == "범위 선택":
+                        condition = st.slider(f"{condition_column} 범위 선택", float(min_val), float(max_val), (float(min_val), float(max_val)))
+                    else:  # 임계값 기준
+                        threshold = st.number_input(f"{condition_column} 임계값 설정", min_value=float(min_val), max_value=float(max_val), value=float((min_val + max_val) / 2))
+                        comparison = st.radio("비교 기준", ["이상", "이하"])
+                        condition = (comparison, threshold)
+                    is_numeric = True
+                else:
+                    unique_values = df[condition_column].unique()
+                    condition = st.multiselect(f"{condition_column} 값 선택", unique_values, default=unique_values)
+                    is_numeric = False
+                if st.button("토픽 모델링 실행"):
+                    st.session_state.run_analysis = True
+                    st.session_state.text_column = text_column
+                    st.session_state.num_topics = num_topics
+                    st.session_state.analysis_type = analysis_type
+                    if analysis_type == "조건부 분석":
+                        st.session_state.condition_column = condition_column
+                        st.session_state.condition = condition
+                        st.session_state.is_numeric = is_numeric
+                else:
+                    st.session_state.run_analysis = False
+    except pd.errors.EmptyDataError:
+        st.error("업로드된 CSV 파일이 비어있습니다. 다시 확인해주세요.")
+    except UnicodeDecodeError:
+        st.error("파일 인코딩에 문제가 있습니다. UTF-8 인코딩으로 저장된 CSV 파일을 사용해주세요.")
+    except Exception as e:
+        st.error(f"파일을 읽는 중 오류가 발생했습니다: {str(e)}")
+else:
+    st.info("CSV 파일을 업로드해주세요.")
 # 푸터 추가
 st.markdown("""