topicmodelingadv

Sleeping

App Files Community

soojeongcrystal committed on Sep 4, 2024

Commit

d7e6c74

verified ·

1 Parent(s): 943bd5f

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -17

app.py CHANGED Viewed

@@ -46,7 +46,6 @@ def calculate_perplexity_coherence(df, text_column, stop_words, topic_range):
     perplexities = []
     coherences = []
-    combined_scores = []
     dictionary = corpora.Dictionary([text.split() for text in df['processed_text']])
     corpus = [dictionary.doc2bow(text.split()) for text in df['processed_text']]
@@ -64,11 +63,25 @@ def calculate_perplexity_coherence(df, text_column, stop_words, topic_range):
         coherence = coherence_model_lda.get_coherence()
         coherences.append(coherence)
-        # Combined Score 계산 (예: Perplexity의 역수와 Coherence의 가중 평균)
-        combined_score = (1 / perplexity) * 0.3 + coherence * 0.7
         combined_scores.append(combined_score)
-    return perplexities, coherences, combined_scores
 # Perplexity 및 Coherence 그래프 그리기
 def plot_perplexity_coherence(topic_range, perplexities, coherences):
@@ -342,7 +355,7 @@ if uploaded_file is not None:
                 topic_range = st.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10), step=1)
                 if st.button("Perplexity 및 Coherence 계산"):
-                    perplexities, coherences, combined_scores = calculate_perplexity_coherence(df, text_column, stop_words, range(topic_range[0], topic_range[1] + 1))
                     # Plot Perplexity와 Coherence
                     plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
@@ -352,9 +365,9 @@ if uploaded_file is not None:
                         '토픽 수': list(range(topic_range[0], topic_range[1] + 1)),
                         'Combined Score': combined_scores
                     })
                     chart3 = alt.Chart(df_metrics_combined).mark_line(color='green').encode(
-                        x=alt.X('토픽 수:Q', title='토픽 수', scale=alt.Scale(domain=[min(topic_range), max(topic_range)])),
                         y=alt.Y('Combined Score:Q', title='Combined Score'),
                         tooltip=['토픽 수', 'Combined Score']
                     ).properties(
@@ -362,19 +375,20 @@ if uploaded_file is not None:
                         height=400,
                         title='Combined Score vs 토픽 수'
                     )
                     st.altair_chart(chart3, use_container_width=True)
-                    # 최적의 토픽 수 추천 (Combined Score 사용)
                     best_combined_index = np.argmax(combined_scores)
                     best_topic_count = topic_range[0] + best_combined_index
-                    st.markdown(f"**추천 토픽 수**: Perplexity와 Coherence를 종합적으로 고려하면 토픽 개수는 **{best_topic_count}개를 추천합니다**")
-                    # 추가 정보 제공
-                    st.info(f"Combined Score가 가장 높은 토픽 수: {best_topic_count}")
-                    st.info(f"Perplexity가 가장 낮은 토픽 수: {topic_range[0] + np.argmin(perplexities)}")
-                    st.info(f"Coherence가 가장 높은 토픽 수: {topic_range[0] + np.argmax(coherences)}")
             # 탭 2: 전체 분석
             with tab2:

     perplexities = []
     coherences = []
     dictionary = corpora.Dictionary([text.split() for text in df['processed_text']])
     corpus = [dictionary.doc2bow(text.split()) for text in df['processed_text']]
         coherence = coherence_model_lda.get_coherence()
         coherences.append(coherence)
+    # Perplexity 변화율 계산 (감소율)
+    perplexity_diffs = [-((perplexities[i] - perplexities[i-1]) / perplexities[i-1])
+                        if i > 0 else 0 for i in range(len(perplexities))]
+    # Coherence 변화율 계산 (증가율)
+    coherence_diffs = [(coherences[i] - coherences[i-1]) / coherences[i-1]
+                       if i > 0 else 0 for i in range(len(coherences))]
+    # Combined Score 계산
+    combined_scores = []
+    for p_diff, c_diff in zip(perplexity_diffs, coherence_diffs):
+        # Perplexity 감소와 Coherence 증가의 조화 평균
+        if p_diff > 0 and c_diff > 0:
+            combined_score = 2 / ((1/p_diff) + (1/c_diff))
+        else:
+            combined_score = 0
         combined_scores.append(combined_score)
+    return perplexities, coherences, combined_scores, perplexity_diffs, coherence_diffs
 # Perplexity 및 Coherence 그래프 그리기
 def plot_perplexity_coherence(topic_range, perplexities, coherences):
                 topic_range = st.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10), step=1)
                 if st.button("Perplexity 및 Coherence 계산"):
+                    perplexities, coherences, combined_scores, perplexity_diffs, coherence_diffs = calculate_perplexity_coherence(df, text_column, stop_words, range(topic_range[0], topic_range[1] + 1))
                     # Plot Perplexity와 Coherence
                     plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
                         '토픽 수': list(range(topic_range[0], topic_range[1] + 1)),
                         'Combined Score': combined_scores
                     })
                     chart3 = alt.Chart(df_metrics_combined).mark_line(color='green').encode(
+                        x=alt.X('토픽 수:Q', title='토픽 수'),
                         y=alt.Y('Combined Score:Q', title='Combined Score'),
                         tooltip=['토픽 수', 'Combined Score']
                     ).properties(
                         height=400,
                         title='Combined Score vs 토픽 수'
                     )
                     st.altair_chart(chart3, use_container_width=True)
+                    # 최적의 토픽 수 추천
                     best_combined_index = np.argmax(combined_scores)
                     best_topic_count = topic_range[0] + best_combined_index
+                    st.markdown(f"""
+                    **추천 토픽 수**: Perplexity와 Coherence의 변화율을 종합적으로 고려하면 **{best_topic_count}개의 토픽**을 추천합니다.
+                    - Perplexity 감소율이 가장 큰 토픽 수: {topic_range[0] + np.argmax(perplexity_diffs)}
+                    - Coherence 증가율이 가장 큰 토픽 수: {topic_range[0] + np.argmax(coherence_diffs)}
+                    - Combined Score가 가장 높은 토픽 수: {best_topic_count}
+                    """)
             # 탭 2: 전체 분석
             with tab2: