Spaces:

soojeongcrystal
/

topicmodeling

Sleeping

App Files Files Community

soojeongcrystal committed on Aug 23, 2024

Commit

83d2bc1

verified ·

1 Parent(s): 9b4047d

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -11

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from konlpy.tag import Okt
 import re
 import os
 import altair as alt
 # KoNLPy 형태소 분석기 초기화
 @st.cache_resource
@@ -17,7 +18,7 @@ def load_okt():
 okt = load_okt()
 # 기본 불용어 목록
-default_stop_words = ['이', '그', '저', '것', '수', '등', '들', '및', '에서', '그리고', '그래서', '또는', '그런데']
 @st.cache_data
 def preprocess_text(text, stop_words):
@@ -32,6 +33,11 @@ def preprocess_text(text, stop_words):
     return ' '.join(processed)
 # Streamlit 앱 설정
 st.title('한국어 토픽 모델링 앱')
@@ -126,16 +132,48 @@ if uploaded_file is not None:
         # 토픽 비중 그래프
         st.header("토픽 비중 그래프")
         topic_weights = lda_output.mean(axis=0)
-        df_weights = pd.DataFrame({'토픽': range(1, num_topics + 1), '비중': topic_weights})
-        chart = alt.Chart(df_weights).mark_bar().encode(
-            x='토픽:O',
-            y='비중:Q'
-        ).properties(
             width=600,
             height=400,
-            title='문서 내 토픽 비중'
         )
         st.altair_chart(chart, use_container_width=True)
         # Claude API를 사용하여 토픽 해석
@@ -145,11 +183,27 @@ if uploaded_file is not None:
             st.header("Claude의 토픽 해석")
             for idx, topic in enumerate(lda.components_):
                 with st.spinner(f"토픽 {idx + 1} 해석 중..."):
-                    top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
-                    prompt = f"{HUMAN_PROMPT} 다음 한국어 단어들로 구성된 토픽을 해석해주세요: {', '.join(top_words)}"
                     response = anthropic.completions.create(
                         model="claude-2.1",
-                        max_tokens_to_sample=300,
                         prompt=f"{prompt}\n\n{AI_PROMPT}",
                     )
                     st.subheader(f"토픽 {idx + 1} 해석:")

 import re
 import os
 import altair as alt
+import colorsys
 # KoNLPy 형태소 분석기 초기화
 @st.cache_resource
 okt = load_okt()
 # 기본 불용어 목록
+default_stop_words = ['이', '그', '저', '것', '수', '등', '들', '및', '에서', '그리고', '그래서', '또는', '그런데', '의', '대한', '간의']
 @st.cache_data
 def preprocess_text(text, stop_words):
     return ' '.join(processed)
+# Generate n hex color strings with evenly spaced hues.
+# Note: the colors are built in HSV space (colorsys.hsv_to_rgb), not HSL.
+def generate_colors(n):
+    # One (hue, saturation, value) tuple per color: hue = x / n, saturation fixed at 0.5, value fixed at 0.9.
+    # For n == 0 the range is empty, so no division happens and the result is an empty list.
+    HSV_tuples = [(x * 1.0 / n, 0.5, 0.9) for x in range(n)]
+    # Convert each tuple to RGB floats, scale by 255 (int() truncates), and format as '#rrggbb'.
+    return ['#%02x%02x%02x' % tuple(int(x*255) for x in colorsys.hsv_to_rgb(*hsv)) for hsv in HSV_tuples]
 # Streamlit 앱 설정
 st.title('한국어 토픽 모델링 앱')
         # 토픽 비중 그래프
         st.header("토픽 비중 그래프")
         topic_weights = lda_output.mean(axis=0)
+        df_weights = pd.DataFrame({
+            '토픽': [f'토픽 {i+1}' for i in range(num_topics)],
+            '비중': topic_weights
+        })
+        # 퍼센트로 변환
+        df_weights['퍼센트'] = df_weights['비중'] / df_weights['비중'].sum() * 100
+        colors = generate_colors(num_topics)
+        # 차트 생성
+        base = alt.Chart(df_weights).encode(
+            x=alt.X('토픽:N', axis=alt.Axis(labelAngle=0)),
+            y=alt.Y('퍼센트:Q', axis=alt.Axis(format=',.1f'))
+        )
+        bars = base.mark_bar().encode(
+            color=alt.Color('토픽:N', scale=alt.Scale(range=colors))
+        )
+        text = base.mark_text(
+            align='center',
+            baseline='middle',
+            dy=-10  # 텍스트를 약간 위로 이동
+        ).encode(
+            text=alt.Text('퍼센트:Q', format='.1f')
+        )
+        chart = (bars + text).properties(
             width=600,
             height=400,
+            title='문서 내 토픽 비중 (%)'
+        ).configure_axis(
+            labelFontSize=12,
+            titleFontSize=14
+        ).configure_title(
+            fontSize=16,
+            font='Arial',
+            anchor='middle',
+            color='gray'
         )
         st.altair_chart(chart, use_container_width=True)
         # Claude API를 사용하여 토픽 해석
             st.header("Claude의 토픽 해석")
             for idx, topic in enumerate(lda.components_):
                 with st.spinner(f"토픽 {idx + 1} 해석 중..."):
+                    lda_top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
+                    topic_docs = lda_output[:, idx].argsort()[::-1][:100]
+                    topic_tfidf = tfidf_matrix[topic_docs].mean(axis=0).A1
+                    tfidf_top_words = [feature_names[i] for i in topic_tfidf.argsort()[:-11:-1]]
+                    prompt = f"{HUMAN_PROMPT} 다음은 토픽 모델링 결과로 나온 단어들입니다. 이를 바탕으로 아래 형식에 맞춰 토픽을 해석해주세요:
+{{LDA 상위 단어}}: {', '.join(lda_top_words)}
+{{TF-IDF 상위 단어}}: {', '.join(tfidf_top_words)}
+1. 토픽 의미 해석: (위 단어들을 바탕으로 이 토픽이 어떤 의미를 나타내는지 2-3문장으로 설명해주세요)
+2. 토픽명 제안: (이 토픽을 잘 나타낼 수 있는 간단한 제목을 제안해주세요)
+3. 대표적인 예시 응답: (이 토픽과 관련된 대표적인 발언이나 문장 예시를 3가지 제시해주세요)
+위 형식에 맞춰 답변해주세요."
                     response = anthropic.completions.create(
                         model="claude-2.1",
+                        max_tokens_to_sample=1000,
                         prompt=f"{prompt}\n\n{AI_PROMPT}",
                     )
                     st.subheader(f"토픽 {idx + 1} 해석:")