soojeongcrystal committed on
Commit
d7e6c74
·
verified ·
1 Parent(s): 943bd5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -17
app.py CHANGED
@@ -46,7 +46,6 @@ def calculate_perplexity_coherence(df, text_column, stop_words, topic_range):
46
 
47
  perplexities = []
48
  coherences = []
49
- combined_scores = []
50
  dictionary = corpora.Dictionary([text.split() for text in df['processed_text']])
51
  corpus = [dictionary.doc2bow(text.split()) for text in df['processed_text']]
52
 
@@ -64,11 +63,25 @@ def calculate_perplexity_coherence(df, text_column, stop_words, topic_range):
64
  coherence = coherence_model_lda.get_coherence()
65
  coherences.append(coherence)
66
 
67
- # Combined Score 계산 (예: Perplexity의 역수와 Coherence의 가중 평균)
68
- combined_score = (1 / perplexity) * 0.3 + coherence * 0.7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  combined_scores.append(combined_score)
70
 
71
- return perplexities, coherences, combined_scores
72
 
73
  # Perplexity 및 Coherence 그래프 그리기
74
  def plot_perplexity_coherence(topic_range, perplexities, coherences):
@@ -342,7 +355,7 @@ if uploaded_file is not None:
342
  topic_range = st.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10), step=1)
343
 
344
  if st.button("Perplexity 및 Coherence 계산"):
345
- perplexities, coherences, combined_scores = calculate_perplexity_coherence(df, text_column, stop_words, range(topic_range[0], topic_range[1] + 1))
346
 
347
  # Plot Perplexity와 Coherence
348
  plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
@@ -352,9 +365,9 @@ if uploaded_file is not None:
352
  '토픽 수': list(range(topic_range[0], topic_range[1] + 1)),
353
  'Combined Score': combined_scores
354
  })
355
-
356
  chart3 = alt.Chart(df_metrics_combined).mark_line(color='green').encode(
357
- x=alt.X('토픽 수:Q', title='토픽 수', scale=alt.Scale(domain=[min(topic_range), max(topic_range)])),
358
  y=alt.Y('Combined Score:Q', title='Combined Score'),
359
  tooltip=['토픽 수', 'Combined Score']
360
  ).properties(
@@ -362,19 +375,20 @@ if uploaded_file is not None:
362
  height=400,
363
  title='Combined Score vs 토픽 수'
364
  )
365
-
366
  st.altair_chart(chart3, use_container_width=True)
367
-
368
- # 최적의 토픽 수 추천 (Combined Score 사용)
369
  best_combined_index = np.argmax(combined_scores)
370
  best_topic_count = topic_range[0] + best_combined_index
371
- st.markdown(f"**추천 토픽 수**: Perplexity와 Coherence를 종합적으로 고려하면 토픽 개수는 **{best_topic_count}개를 추천합니다**")
372
-
373
- # 정보 제공
374
- st.info(f"Combined Score가 가장 높은 토픽 수: {best_topic_count}")
375
- st.info(f"Perplexity 가장 낮은 토픽 수: {topic_range[0] + np.argmin(perplexities)}")
376
- st.info(f"Coherence가 가장 높은 토픽 수: {topic_range[0] + np.argmax(coherences)}")
377
-
 
378
 
379
  # 탭 2: 전체 분석
380
  with tab2:
 
46
 
47
  perplexities = []
48
  coherences = []
 
49
  dictionary = corpora.Dictionary([text.split() for text in df['processed_text']])
50
  corpus = [dictionary.doc2bow(text.split()) for text in df['processed_text']]
51
 
 
63
  coherence = coherence_model_lda.get_coherence()
64
  coherences.append(coherence)
65
 
66
+ # Perplexity 변화율 계산 (감소율)
67
+ perplexity_diffs = [-((perplexities[i] - perplexities[i-1]) / perplexities[i-1])
68
+ if i > 0 else 0 for i in range(len(perplexities))]
69
+
70
+ # Coherence 변화율 계산 (증가율)
71
+ coherence_diffs = [(coherences[i] - coherences[i-1]) / coherences[i-1]
72
+ if i > 0 else 0 for i in range(len(coherences))]
73
+
74
+ # Combined Score 계산
75
+ combined_scores = []
76
+ for p_diff, c_diff in zip(perplexity_diffs, coherence_diffs):
77
+ # Perplexity 감소와 Coherence 증가의 조화 평균
78
+ if p_diff > 0 and c_diff > 0:
79
+ combined_score = 2 / ((1/p_diff) + (1/c_diff))
80
+ else:
81
+ combined_score = 0
82
  combined_scores.append(combined_score)
83
 
84
+ return perplexities, coherences, combined_scores, perplexity_diffs, coherence_diffs
85
 
86
  # Perplexity 및 Coherence 그래프 그리기
87
  def plot_perplexity_coherence(topic_range, perplexities, coherences):
 
355
  topic_range = st.slider("토픽 수 범위 선택", min_value=2, max_value=20, value=(2, 10), step=1)
356
 
357
  if st.button("Perplexity 및 Coherence 계산"):
358
+ perplexities, coherences, combined_scores, perplexity_diffs, coherence_diffs = calculate_perplexity_coherence(df, text_column, stop_words, range(topic_range[0], topic_range[1] + 1))
359
 
360
  # Plot Perplexity와 Coherence
361
  plot_perplexity_coherence(range(topic_range[0], topic_range[1] + 1), perplexities, coherences)
 
365
  '토픽 수': list(range(topic_range[0], topic_range[1] + 1)),
366
  'Combined Score': combined_scores
367
  })
368
+
369
  chart3 = alt.Chart(df_metrics_combined).mark_line(color='green').encode(
370
+ x=alt.X('토픽 수:Q', title='토픽 수'),
371
  y=alt.Y('Combined Score:Q', title='Combined Score'),
372
  tooltip=['토픽 수', 'Combined Score']
373
  ).properties(
 
375
  height=400,
376
  title='Combined Score vs 토픽 수'
377
  )
378
+
379
  st.altair_chart(chart3, use_container_width=True)
380
+
381
+ # 최적의 토픽 수 추천
382
  best_combined_index = np.argmax(combined_scores)
383
  best_topic_count = topic_range[0] + best_combined_index
384
+
385
+ st.markdown(f"""
386
+ **추천 토픽 수**: Perplexity와 Coherence의 변화율을 종합적으로 고려하면 **{best_topic_count}개의 토픽**을 추천합니다.
387
+
388
+ - Perplexity 감소율이 가장 큰 토픽 수: {topic_range[0] + np.argmax(perplexity_diffs)}
389
+ - Coherence 증가율이 가장 큰 토픽 수: {topic_range[0] + np.argmax(coherence_diffs)}
390
+ - Combined Score가 가장 높은 토픽 수: {best_topic_count}
391
+ """)
392
 
393
  # 탭 2: 전체 분석
394
  with tab2: