Spaces:

soojeongcrystal
/

topicmodeling

Sleeping

App Files Files Community

soojeongcrystal committed on Aug 24, 2024

Commit

0f3d973

verified ·

1 Parent(s): 2d8ab82

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -83

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 import numpy as np
-from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from konlpy.tag import Okt
@@ -11,8 +11,6 @@ import altair as alt
 import colorsys
 import networkx as nx
 import streamlit.components.v1 as components
-import anthropic
 # Streamlit 페이지 설정
 st.set_page_config(layout="wide", page_title="📊 토픽모델링 for SK", page_icon="📊")
@@ -60,24 +58,44 @@ def create_network_graph(topic_results, num_words=30):
 # HTML 네트워크 그래프 생성 함수
 def create_custom_network_html(G):
-    nodes = [{"id": n, "label": n} for n in G.nodes()]
     edges = [{"from": u, "to": v} for u, v in G.edges()]
     html_content = f"""
     <html>
     <head>
-      <script type="text/javascript" src="https://unpkg.com/vis-network@standalone/umd/vis-network.min.js"></script>
     </head>
     <body>
-      <div id="mynetwork" style="width: 600px; height: 400px; border: 1px solid lightgray;"></div>
       <script type="text/javascript">
-        var nodes = {nodes};
-        var edges = {edges};
         var container = document.getElementById('mynetwork');
         var data = {{
-            nodes: new vis.DataSet(nodes),
-            edges: new vis.DataSet(edges)
         }};
-        var options = {{}};
         var network = new vis.Network(container, data, options);
       </script>
     </body>
@@ -85,27 +103,6 @@ def create_custom_network_html(G):
     """
     return html_content
-def visualize_network(G):
-    if G is None or G.number_of_nodes() == 0:
-        st.error("Graph is empty or not properly initialized.")
-        return
-    try:
-        nt = Network(height="500px", width="100%", bgcolor="#222222", font_color="white")
-        if G.number_of_nodes() > 0:  # G에 노드가 존재하는지 확인
-            nt.from_nx(G)
-        else:
-            st.error("No nodes in the graph to render.")
-            return
-        nt.show("network.html")
-        with open("network.html", 'r', encoding='utf-8') as f:
-            html_string = f.read()
-        components.html(html_string, height=500)
-    except Exception as e:
-        st.error(f"Failed to render the network graph: {str(e)}")
 # 헤더 스타일 변경
 st.markdown("""
 <style>
@@ -116,6 +113,12 @@ st.markdown("""
         background-color: #f1f1f1;
         padding: 10px;
     }
 </style>
 <div style="background-color: #f1f1f1; padding: 10px; color: #707070; text-align: right; width: 100%;">
     mySUNI 행복 College 행복담당조직 Meet-Up
@@ -143,31 +146,25 @@ with st.sidebar:
     # 파일 업로드
     uploaded_file = st.file_uploader("CSV 파일을 업로드하세요", type="csv")
-    if uploaded_file is not None:
-        try:
-            # 파일 내용 확인
-            file_contents = uploaded_file.getvalue().decode('utf-8')
-            if not file_contents.strip():
-                st.error("업로드된 CSV 파일이 비어 있습니다.")
             else:
-                df = pd.read_csv(uploaded_file)
-                if df.empty:
-                    st.error("CSV 파일에 데이터가 없습니다.")
-                else:
-                    st.success("파일이 성공적으로 업로드되었습니다.")
-                    st.write("데이터 미리보기:")
-                    st.write(df.head())
-                    text_column = st.selectbox("텍스트 컬럼을 선택하세요", df.columns)
-                    num_topics = st.slider("토픽 수를 선택하세요", 2, 20, 5)
-                    if st.button("토픽 모델링 실행"):
-                        st.session_state.run_analysis = True
-                    else:
-                        st.session_state.run_analysis = False
-        except pd.errors.EmptyDataError:
-            st.error("업로드된 CSV 파일이 비어있거나 올바르지 않습니다. 다시 확인해주세요.")
-        except Exception as e:
-            st.error(f"파일을 읽는 중 오류가 발생했습니다: {str(e)}")
 # 메인 컨텐츠
 if 'run_analysis' in st.session_state and st.session_state.run_analysis:
@@ -195,6 +192,23 @@ if 'run_analysis' in st.session_state and st.session_state.run_analysis:
             topic_results = []
             for idx, topic in enumerate(lda.components_):
                 st.subheader(f"토픽 {idx + 1}")
@@ -206,7 +220,7 @@ if 'run_analysis' in st.session_state and st.session_state.run_analysis:
                     lda_top_words = [(feature_names[i], topic[i]) for i in topic.argsort()[:-11:-1]]
                     df_lda = pd.DataFrame(lda_top_words, columns=['단어', 'LDA 점수'])
                     st.subheader("LDA 상위 단어")
-                    st.dataframe(df_lda.style.format({'LDA 점수': '{:.4f}'}), height=400)
                 with col2:
                     # 토픽별 TF-IDF 계산
@@ -215,14 +229,7 @@ if 'run_analysis' in st.session_state and st.session_state.run_analysis:
                     tfidf_top_words = [(feature_names[i], topic_tfidf[i]) for i in topic_tfidf.argsort()[:-11:-1]]
                     df_tfidf = pd.DataFrame(tfidf_top_words, columns=['단어', 'TF-IDF'])
                     st.subheader("TF-IDF 상위 단어")
-                    st.dataframe(df_tfidf.style.format({'TF-IDF': '{:.4f}'}), height=400)
-                topic_results.append({
-                    'topic_num': idx + 1,
-                    'lda_words': [word for word, _ in lda_top_words],
-                    'tfidf_words': [word for word, _ in tfidf_top_words],
-                    'weight': lda_output[:, idx].mean() * 100  # 퍼센트로 변환
-                })
             # 토픽 비중 그래프
             st.header("토픽 비중 그래프")
@@ -267,20 +274,22 @@ if 'run_analysis' in st.session_state and st.session_state.run_analysis:
             st.altair_chart(chart, use_container_width=True)
-            # 새로운 시각화 함수 호출
             st.header("토픽 단어 네트워크 그래프")
-            G = create_network_graph(topic_results)
-            html_content = create_custom_network_html(G)
-            components.html(html_content, height=500)
             # Claude API를 사용하여 토픽 해석
             if api_key:
-                client = anthropic.Anthropic(api_key=api_key)
                 st.header("Claude의 토픽 해석")
                 with st.spinner("토픽 해석 중..."):
-                    prompt = f"""{HUMAN_PROMPT} 다음은 LDA 토픽 모델링 결과로 나온 각 토픽의 정보입니다. 이를 바탕으로 전체 토픽을 종합적으로 해석해주세요:
             {", ".join([f"토픽 {info['topic_num']} (비중: {info['weight']:.1f}%)" for info in topic_results])}
@@ -313,16 +322,16 @@ if 'run_analysis' in st.session_state and st.session_state.run_analysis:
             위 형식에 맞춰 답변해주세요. 사용자가 쉽게 복사하여 사용할 수 있도록 간결하고 명확하게 작성해주세요."""
-                    response = client.messages.create(
-                        model="claude-3-5-sonnet-20240620",
-                        max_tokens=3000,
-                        messages=[
-                            {"role": "user", "content": f"{prompt}\n\n{AI_PROMPT}"}
-                        ]
-                    )
-                    st.subheader("토픽 모델링 종합 결과")
-                    st.text_area("결과를 복사하여 사용하세요:", value=response.completion, height=500)
             else:
                 st.warning("Claude API 키가 설정되지 않았습니다. https://console.anthropic.com/settings/keys 에 접속하여 API 키를 발급받으시면 토픽명과 해석을 제공받으실 수 있습니다.")

 import streamlit as st
 import pandas as pd
 import numpy as np
+from anthropic import Anthropic
 from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
 from sklearn.decomposition import LatentDirichletAllocation
 from konlpy.tag import Okt
 import colorsys
 import networkx as nx
 import streamlit.components.v1 as components
 # Streamlit 페이지 설정
 st.set_page_config(layout="wide", page_title="📊 토픽모델링 for SK", page_icon="📊")
 # HTML 네트워크 그래프 생성 함수
 def create_custom_network_html(G):
     """Build a standalone HTML page that renders ``G`` with vis-network.

     Args:
         G: Graph-like object exposing ``nodes()``, ``nodes[n]`` (a mapping of
             node attributes) and ``edges()``. A node's optional ``color``
             attribute sets its colour; nodes without one are drawn in
             ``#000000``.

     Returns:
         The HTML source as a string. The page loads vis-network from the
         unpkg CDN and draws the graph into a ``#mynetwork`` div.
     """
     import json  # function-scope import keeps this block self-contained

     def to_js(value):
         # JSON is valid JavaScript literal syntax, whereas Python's repr()
         # is not in general (None/True/False, non-plain-str reprs).
         # Escaping "</" stops a node label such as "</script>" from closing
         # the inline <script> element early.
         return json.dumps(value).replace("</", "<\\/")

     nodes = [
         {"id": n, "label": n, "color": G.nodes[n].get("color", "#000000")}
         for n in G.nodes()
     ]
     edges = [{"from": u, "to": v} for u, v in G.edges()]
     nodes_js = to_js(nodes)
     edges_js = to_js(edges)

     # Doubled braces below are literal braces in the emitted CSS/JavaScript.
     html_content = f"""
     <html>
     <head>
       <script type="text/javascript" src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
       <style type="text/css">
         #mynetwork {{
           width: 100%;
           height: 500px;
           border: 1px solid lightgray;
         }}
       </style>
     </head>
     <body>
       <div id="mynetwork"></div>
       <script type="text/javascript">
         var nodes = new vis.DataSet({nodes_js});
         var edges = new vis.DataSet({edges_js});
         var container = document.getElementById('mynetwork');
         var data = {{
             nodes: nodes,
             edges: edges
         }};
         var options = {{
             nodes: {{
                 shape: 'dot',
                 size: 20,
                 font: {{
                     size: 15,
                     color: '#000000'
                 }},
                 borderWidth: 2
             }},
             edges: {{
                 width: 1
             }}
         }};
         var network = new vis.Network(container, data, options);
       </script>
     </body>
     </html>
     """
     return html_content
 # 헤더 스타일 변경
 st.markdown("""
 <style>
         background-color: #f1f1f1;
         padding: 10px;
     }
+    .topic-summary {
+        background-color: #f0f2f6;
+        border-left: 5px solid #4e8098;
+        padding: 10px;
+        margin-bottom: 10px;
+    }
 </style>
 <div style="background-color: #f1f1f1; padding: 10px; color: #707070; text-align: right; width: 100%;">
     mySUNI 행복 College 행복담당조직 Meet-Up
     # 파일 업로드
     uploaded_file = st.file_uploader("CSV 파일을 업로드하세요", type="csv")
+# 파일 미리보기 및 분석 실행 (본문에서)
+if uploaded_file is not None:
+    try:
+        df = pd.read_csv(uploaded_file)
+        if df.empty:
+            st.error("CSV 파일에 데이터가 없습니다.")
+        else:
+            st.success("파일이 성공적으로 업로드되었습니다.")
+            st.subheader("데이터 미리보기")
+            st.write(df.head())
+            text_column = st.selectbox("텍스트 컬럼을 선택하세요", df.columns)
+            num_topics = st.slider("토픽 수를 선택하세요", 2, 20, 5)
+            if st.button("토픽 모델링 실행"):
+                st.session_state.run_analysis = True
             else:
+                st.session_state.run_analysis = False
+    except Exception as e:
+        st.error(f"파일을 읽는 중 오류가 발생했습니다: {str(e)}")
 # 메인 컨텐츠
 if 'run_analysis' in st.session_state and st.session_state.run_analysis:
             topic_results = []
+            # 토픽 요약을 callout 스타일로 표시
+            for idx, topic in enumerate(lda.components_):
+                lda_top_words = [(feature_names[i], topic[i]) for i in topic.argsort()[:-11:-1]]
+                topic_docs = lda_output[:, idx].argsort()[::-1][:100]
+                topic_tfidf = tfidf_matrix[topic_docs].mean(axis=0).A1
+                tfidf_top_words = [(feature_names[i], topic_tfidf[i]) for i in topic_tfidf.argsort()[:-11:-1]]
+                weight = lda_output[:, idx].mean() * 100
+                topic_results.append({
+                    'topic_num': idx + 1,
+                    'lda_words': [word for word, _ in lda_top_words],
+                    'tfidf_words': [word for word, _ in tfidf_top_words],
+                    'weight': weight
+                })
+            topic_summary = ", ".join([f"토픽 {info['topic_num']} (비중: {info['weight']:.1f}%)" for info in topic_results])
+            st.markdown(f'<div class="topic-summary">{topic_summary}</div>', unsafe_allow_html=True)
             for idx, topic in enumerate(lda.components_):
                 st.subheader(f"토픽 {idx + 1}")
                     lda_top_words = [(feature_names[i], topic[i]) for i in topic.argsort()[:-11:-1]]
                     df_lda = pd.DataFrame(lda_top_words, columns=['단어', 'LDA 점수'])
                     st.subheader("LDA 상위 단어")
+                    st.table(df_lda.style.format({'LDA 점수': '{:.4f}'}))
                 with col2:
                     # 토픽별 TF-IDF 계산
                     tfidf_top_words = [(feature_names[i], topic_tfidf[i]) for i in topic_tfidf.argsort()[:-11:-1]]
                     df_tfidf = pd.DataFrame(tfidf_top_words, columns=['단어', 'TF-IDF'])
                     st.subheader("TF-IDF 상위 단어")
+                    st.table(df_tfidf.style.format({'TF-IDF': '{:.4f}'}))
             # 토픽 비중 그래프
             st.header("토픽 비중 그래프")
             st.altair_chart(chart, use_container_width=True)
+            # 네트워크 그래프 생성 및 시각화
             st.header("토픽 단어 네트워크 그래프")
+            try:
+                G = create_network_graph(topic_results)
+                html_content = create_custom_network_html(G)
+                components.html(html_content, height=500)
+            except Exception as e:
+                st.error(f"네트워크 그래프 생성 중 오류가 발생했습니다: {str(e)}")
             # Claude API를 사용하여 토픽 해석
             if api_key:
+                client = Anthropic(api_key=api_key)
                 st.header("Claude의 토픽 해석")
                 with st.spinner("토픽 해석 중..."):
+                    prompt = f"""다음은 LDA 토픽 모델링 결과로 나온 각 토픽의 정보입니다. 이를 바탕으로 전체 토픽을 종합적으로 해석해주세요:
             {", ".join([f"토픽 {info['topic_num']} (비중: {info['weight']:.1f}%)" for info in topic_results])}
             위 형식에 맞춰 답변해주세요. 사용자가 쉽게 복사하여 사용할 수 있도록 간결하고 명확하게 작성해주세요."""
+                    try:
+                        response = client.completions.create(
+                            model="claude-3-sonnet-20240229",
+                            max_tokens=3000,
+                            prompt=prompt
+                        )
+                        st.subheader("토픽 모델링 종합 결과")
+                        st.text_area("결과를 복사하여 사용하세요:", value=response.completion, height=500)
+                    except Exception as e:
+                        st.error(f"Claude API 호출 중 오류가 발생했습니다: {str(e)}")
             else:
                 st.warning("Claude API 키가 설정되지 않았습니다. https://console.anthropic.com/settings/keys 에 접속하여 API 키를 발급받으시면 토픽명과 해석을 제공받으실 수 있습니다.")