Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -46,7 +46,6 @@ def calculate_perplexity_coherence(df, text_column, stop_words, topic_range):
|
|
| 46 |
|
| 47 |
perplexities = []
|
| 48 |
coherences = []
|
| 49 |
-
combined_scores = []
|
| 50 |
dictionary = corpora.Dictionary([text.split() for text in df['processed_text']])
|
| 51 |
corpus = [dictionary.doc2bow(text.split()) for text in df['processed_text']]
|
| 52 |
|
|
@@ -64,11 +63,25 @@ def calculate_perplexity_coherence(df, text_column, stop_words, topic_range):
|
|
| 64 |
coherence = coherence_model_lda.get_coherence()
|
| 65 |
coherences.append(coherence)
|
| 66 |
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
combined_scores.append(combined_score)
|
| 70 |
|
| 71 |
-
return perplexities, coherences, combined_scores
|
| 72 |
|
| 73 |
# Perplexity 및 Coherence 그래프 그리기
|
| 74 |
def plot_perplexity_coherence(topic_range, perplexities, coherences):
|
|
@@ -342,7 +355,7 @@ if uploaded_file is not None:
|
|
| 342 |
topic_range = st.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10), step=1)
|
| 343 |
|
| 344 |
if st.button("Perplexity 및 Coherence 계산"):
|
| 345 |
-
perplexities, coherences, combined_scores = calculate_perplexity_coherence(df, text_column, stop_words, range(topic_range[0], topic_range[1] + 1))
|
| 346 |
|
| 347 |
# Plot Perplexity와 Coherence
|
| 348 |
plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
|
|
@@ -352,9 +365,9 @@ if uploaded_file is not None:
|
|
| 352 |
'토픽 수': list(range(topic_range[0], topic_range[1] + 1)),
|
| 353 |
'Combined Score': combined_scores
|
| 354 |
})
|
| 355 |
-
|
| 356 |
chart3 = alt.Chart(df_metrics_combined).mark_line(color='green').encode(
|
| 357 |
-
x=alt.X('토픽 수:Q', title='토픽 수'
|
| 358 |
y=alt.Y('Combined Score:Q', title='Combined Score'),
|
| 359 |
tooltip=['토픽 수', 'Combined Score']
|
| 360 |
).properties(
|
|
@@ -362,19 +375,20 @@ if uploaded_file is not None:
|
|
| 362 |
height=400,
|
| 363 |
title='Combined Score vs 토픽 수'
|
| 364 |
)
|
| 365 |
-
|
| 366 |
st.altair_chart(chart3, use_container_width=True)
|
| 367 |
-
|
| 368 |
-
# 최적의 토픽 수 추천
|
| 369 |
best_combined_index = np.argmax(combined_scores)
|
| 370 |
best_topic_count = topic_range[0] + best_combined_index
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
|
|
|
| 378 |
|
| 379 |
# 탭 2: 전체 분석
|
| 380 |
with tab2:
|
|
|
|
| 46 |
|
| 47 |
perplexities = []
|
| 48 |
coherences = []
|
|
|
|
| 49 |
dictionary = corpora.Dictionary([text.split() for text in df['processed_text']])
|
| 50 |
corpus = [dictionary.doc2bow(text.split()) for text in df['processed_text']]
|
| 51 |
|
|
|
|
| 63 |
coherence = coherence_model_lda.get_coherence()
|
| 64 |
coherences.append(coherence)
|
| 65 |
|
| 66 |
+
# Perplexity 변화율 계산 (감소율)
|
| 67 |
+
perplexity_diffs = [-((perplexities[i] - perplexities[i-1]) / perplexities[i-1])
|
| 68 |
+
if i > 0 else 0 for i in range(len(perplexities))]
|
| 69 |
+
|
| 70 |
+
# Coherence 변화율 계산 (증가율)
|
| 71 |
+
coherence_diffs = [(coherences[i] - coherences[i-1]) / coherences[i-1]
|
| 72 |
+
if i > 0 else 0 for i in range(len(coherences))]
|
| 73 |
+
|
| 74 |
+
# Combined Score 계산
|
| 75 |
+
combined_scores = []
|
| 76 |
+
for p_diff, c_diff in zip(perplexity_diffs, coherence_diffs):
|
| 77 |
+
# Perplexity 감소와 Coherence 증가의 조화 평균
|
| 78 |
+
if p_diff > 0 and c_diff > 0:
|
| 79 |
+
combined_score = 2 / ((1/p_diff) + (1/c_diff))
|
| 80 |
+
else:
|
| 81 |
+
combined_score = 0
|
| 82 |
combined_scores.append(combined_score)
|
| 83 |
|
| 84 |
+
return perplexities, coherences, combined_scores, perplexity_diffs, coherence_diffs
|
| 85 |
|
| 86 |
# Perplexity 및 Coherence 그래프 그리기
|
| 87 |
def plot_perplexity_coherence(topic_range, perplexities, coherences):
|
|
|
|
| 355 |
topic_range = st.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10), step=1)
|
| 356 |
|
| 357 |
if st.button("Perplexity 및 Coherence 계산"):
|
| 358 |
+
perplexities, coherences, combined_scores, perplexity_diffs, coherence_diffs = calculate_perplexity_coherence(df, text_column, stop_words, range(topic_range[0], topic_range[1] + 1))
|
| 359 |
|
| 360 |
# Plot Perplexity와 Coherence
|
| 361 |
plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
|
|
|
|
| 365 |
'토픽 수': list(range(topic_range[0], topic_range[1] + 1)),
|
| 366 |
'Combined Score': combined_scores
|
| 367 |
})
|
| 368 |
+
|
| 369 |
chart3 = alt.Chart(df_metrics_combined).mark_line(color='green').encode(
|
| 370 |
+
x=alt.X('토픽 수:Q', title='토픽 수'),
|
| 371 |
y=alt.Y('Combined Score:Q', title='Combined Score'),
|
| 372 |
tooltip=['토픽 수', 'Combined Score']
|
| 373 |
).properties(
|
|
|
|
| 375 |
height=400,
|
| 376 |
title='Combined Score vs 토픽 수'
|
| 377 |
)
|
| 378 |
+
|
| 379 |
st.altair_chart(chart3, use_container_width=True)
|
| 380 |
+
|
| 381 |
+
# 최적의 토픽 수 추천
|
| 382 |
best_combined_index = np.argmax(combined_scores)
|
| 383 |
best_topic_count = topic_range[0] + best_combined_index
|
| 384 |
+
|
| 385 |
+
st.markdown(f"""
|
| 386 |
+
**추천 토픽 수**: Perplexity와 Coherence의 변화율을 종합적으로 고려하면 **{best_topic_count}개의 토픽**을 추천합니다.
|
| 387 |
+
|
| 388 |
+
- Perplexity 감소율이 가장 큰 토픽 수: {topic_range[0] + np.argmax(perplexity_diffs)}
|
| 389 |
+
- Coherence 증가율이 가장 큰 토픽 수: {topic_range[0] + np.argmax(coherence_diffs)}
|
| 390 |
+
- Combined Score가 가장 높은 토픽 수: {best_topic_count}
|
| 391 |
+
""")
|
| 392 |
|
| 393 |
# 탭 2: 전체 분석
|
| 394 |
with tab2:
|