soojeongcrystal commited on
Commit
a9bea46
·
verified ·
1 Parent(s): 9aff819

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -20
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import streamlit as st
2
- import pandas as pd
3
  from wordcloud import WordCloud
4
  import matplotlib.pyplot as plt
5
  from sklearn.feature_extraction.text import CountVectorizer
@@ -13,17 +12,22 @@ import warnings
13
  # ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€ ๋ฌด์‹œ
14
  warnings.filterwarnings("ignore")
15
 
16
- # ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
17
- stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๊ฒƒ", "๊ทธ", "์ด", "ํ•˜๋Š”", "ํ•˜๊ธฐ", "ํ• ", "๋˜", "์ˆ˜", "์ด๋‹ค", "์‹œํ‚ค๋‹ค"])
18
 
19
- def preprocess_text(text):
 
 
 
 
 
20
  okt = Okt()
21
  text = re.sub(r'[^\w\s]', '', text) # ํŠน์ˆ˜ ๋ฌธ์ž ์ œ๊ฑฐ
22
  nouns = okt.nouns(text) # ๋ช…์‚ฌ ์ถ”์ถœ
23
- nouns = [noun for noun in nouns if len(noun) > 1 and noun not in stopwords] # ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ ๋ฐ ํ•œ ๊ธ€์ž ์ด์ƒ์˜ ๋ช…์‚ฌ๋งŒ ์„ ํƒ
24
  return ' '.join(nouns)
25
 
26
- def topic_modeling(texts, n_components=5):
27
  vectorizer = CountVectorizer()
28
  data_vectorized = vectorizer.fit_transform(texts)
29
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
@@ -31,29 +35,66 @@ def topic_modeling(texts, n_components=5):
31
  features = vectorizer.get_feature_names()
32
  topics = {}
33
  for topic_idx, topic in enumerate(lda.components_):
34
- topics[f"Topic {topic_idx}"] = [features[i] for i in topic.argsort()[:-21:-1]]
35
  return topics
36
 
37
- def generate_wordcloud(text):
38
- wordcloud = WordCloud(width=800, height=400).generate(text)
39
- plt.figure(figsize=(10, 5))
40
- plt.imshow(wordcloud, interpolation='bilinear')
41
- plt.axis("off")
42
- plt.show()
43
 
44
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
45
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
46
 
 
 
 
 
 
 
47
  uploaded_file = st.file_uploader("ํ…์ŠคํŠธ ํŒŒ์ผ ์—…๋กœ๋“œ", type=['txt'])
48
- if uploaded_file is not None:
49
- with st.spinner('ํŒŒ์ผ์„ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค...'):
50
- text = str(uploaded_file.read(), 'utf-8')
51
- preprocessed_text = preprocess_text(text) # ์ „์ฒ˜๋ฆฌ๋œ ๋ช…์‚ฌ ์ถ”์ถœ
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  st.subheader("ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
54
- topics = topic_modeling([preprocessed_text])
 
55
  for topic, words in topics.items():
56
  st.write(f"{topic}: {', '.join(words)}")
57
-
 
58
  st.subheader("์›Œ๋“œ ํด๋ผ์šฐ๋“œ")
59
- st.pyplot(generate_wordcloud(preprocessed_text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  from wordcloud import WordCloud
3
  import matplotlib.pyplot as plt
4
  from sklearn.feature_extraction.text import CountVectorizer
 
12
  # ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€ ๋ฌด์‹œ
13
  warnings.filterwarnings("ignore")
14
 
15
+ # ์ดˆ๊ธฐ ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
16
+ default_stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๊ฒƒ", "๊ทธ", "์ด", "ํ•˜๋Š”", "ํ•˜๊ธฐ", "ํ• ", "๋˜", "์ˆ˜", "์ด๋‹ค", "์‹œํ‚ค๋‹ค"])
17
 
18
+ @st.cache_data
19
+ def preprocess_text(text, user_stopwords):
20
+ spacing = Spacing()
21
+ text = spacing(text) # ๋„์–ด์“ฐ๊ธฐ ๊ต์ •
22
+ text = spell_checker.check(text).checked # ๋งž์ถค๋ฒ• ๊ฒ€์‚ฌ
23
+
24
  okt = Okt()
25
  text = re.sub(r'[^\w\s]', '', text) # ํŠน์ˆ˜ ๋ฌธ์ž ์ œ๊ฑฐ
26
  nouns = okt.nouns(text) # ๋ช…์‚ฌ ์ถ”์ถœ
27
+ nouns = [noun for noun in nouns if len(noun) > 1 and noun not in user_stopwords] # ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ ๋ฐ ํ•œ ๊ธ€์ž ์ด์ƒ์˜ ๋ช…์‚ฌ๋งŒ ์„ ํƒ
28
  return ' '.join(nouns)
29
 
30
+ def topic_modeling(texts, n_components):
31
  vectorizer = CountVectorizer()
32
  data_vectorized = vectorizer.fit_transform(texts)
33
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
 
35
  features = vectorizer.get_feature_names()
36
  topics = {}
37
  for topic_idx, topic in enumerate(lda.components_):
38
+ topics[f"Topic {topic_idx + 1}"] = [features[i] for i in topic.argsort()[:-21:-1]]
39
  return topics
40
 
41
+ def generate_wordcloud(text, color):
42
+ wordcloud = WordCloud(width=800, height=400, background_color=color).generate(text)
43
+ fig, ax = plt.subplots(figsize=(10, 5))
44
+ ax.imshow(wordcloud, interpolation='bilinear')
45
+ ax.axis("off")
46
+ return fig
47
 
48
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
49
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
50
 
51
+ # ์‚ฌ์ด๋“œ๋ฐ”์— ๋ถˆ์šฉ์–ด ์ž…๋ ฅ ํ•„๋“œ ์ถ”๊ฐ€
52
+ user_stopwords = st.sidebar.text_area("๋ถˆ์šฉ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š” (์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„)",
53
+ value=", ".join(default_stopwords))
54
+ user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
55
+
56
+ # ํŒŒ์ผ ์—…๋กœ๋”
57
  uploaded_file = st.file_uploader("ํ…์ŠคํŠธ ํŒŒ์ผ ์—…๋กœ๋“œ", type=['txt'])
 
 
 
 
58
 
59
+ if uploaded_file is not None:
60
+ try:
61
+ with st.spinner('ํŒŒ์ผ์„ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค...'):
62
+ text = str(uploaded_file.read(), 'utf-8')
63
+
64
+ # ํ…์ŠคํŠธ ํฌ๊ธฐ์— ๋”ฐ๋ฅธ ํ”„๋กœ๊ทธ๋ ˆ์Šค ๋ฐ” ์ถ”๊ฐ€
65
+ progress_bar = st.progress(0)
66
+ chunk_size = max(1, len(text) // 100) # ํ…์ŠคํŠธ๋ฅผ 100๊ฐœ์˜ ์ฒญํฌ๋กœ ๋‚˜๋ˆ”
67
+ preprocessed_chunks = []
68
+
69
+ for i in range(0, len(text), chunk_size):
70
+ chunk = text[i:i+chunk_size]
71
+ preprocessed_chunk = preprocess_text(chunk, user_stopwords)
72
+ preprocessed_chunks.append(preprocessed_chunk)
73
+ progress_bar.progress((i + chunk_size) / len(text))
74
+
75
+ preprocessed_text = " ".join(preprocessed_chunks)
76
+
77
+ # ํ† ํ”ฝ ๋ชจ๋ธ๋ง
78
  st.subheader("ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
79
+ n_topics = st.slider("ํ† ํ”ฝ ์ˆ˜ ์„ ํƒ", min_value=2, max_value=10, value=5)
80
+ topics = topic_modeling([preprocessed_text], n_topics)
81
  for topic, words in topics.items():
82
  st.write(f"{topic}: {', '.join(words)}")
83
+
84
+ # ์›Œ๋“œ ํด๋ผ์šฐ๋“œ
85
  st.subheader("์›Œ๋“œ ํด๋ผ์šฐ๋“œ")
86
+ color = st.color_picker("๋ฐฐ๊ฒฝ์ƒ‰ ์„ ํƒ", "#ffffff")
87
+ fig = generate_wordcloud(preprocessed_text, color)
88
+ st.pyplot(fig)
89
+
90
+ except Exception as e:
91
+ st.error(f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}")
92
+
93
+ # ์‚ฌ์šฉ ์„ค๋ช… ์ถ”๊ฐ€
94
+ st.sidebar.markdown("""
95
+ ## ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
96
+ 1. ์‚ฌ์ด๋“œ๋ฐ”์—์„œ ๋ถˆ์šฉ์–ด๋ฅผ ์ถ”๊ฐ€ํ•˜๊ฑฐ๋‚˜ ์ˆ˜์ •ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
97
+ 2. ํ…์ŠคํŠธ ํŒŒ์ผ(.txt)์„ ์—…๋กœ๋“œํ•˜์„ธ์š”.
98
+ 3. ํ† ํ”ฝ ๋ชจ๋ธ๋ง์˜ ํ† ํ”ฝ ์ˆ˜๋ฅผ ์„ ํƒํ•˜์„ธ์š”.
99
+ 4. ์›Œ๋“œํด๋ผ์šฐ๋“œ์˜ ๋ฐฐ๊ฒฝ์ƒ‰์„ ์„ ํƒํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
100
+ """)