File size: 6,157 Bytes
41bf78c
 
 
 
e765d87
de8d584
6bdbb72
90768ea
 
e765d87
 
 
90768ea
a6f4c85
 
 
90768ea
76b2f6d
326f364
 
a9bea46
0f24738
 
 
 
 
 
 
de8d584
0d12815
46e44d6
90768ea
46e44d6
0d12815
46e44d6
0d12815
c50998c
46e44d6
 
 
 
 
 
 
41bf78c
a9bea46
 
6bdbb72
4ea47f1
76b2f6d
6bdbb72
 
 
 
 
 
76b2f6d
b5a82df
41bf78c
a9bea46
bec772f
 
0d12815
41bf78c
8723070
 
 
 
7235700
 
 
 
c50998c
 
41bf78c
f45d210
3e6e0f4
f45d210
41bf78c
 
3e6e0f4
 
41bf78c
 
35bdcd6
0f24738
 
35bdcd6
 
 
 
3e6e0f4
326f364
35bdcd6
a9bea46
41bf78c
de8d584
0f24738
 
326f364
 
de8d584
41bf78c
 
 
a9bea46
 
bec772f
a9bea46
41bf78c
 
a9bea46
 
 
 
6bdbb72
a9bea46
5b8f710
a9bea46
6bdbb72
c50998c
6bdbb72
b5a82df
 
 
 
 
5b8f710
7235700
 
 
 
 
b5a82df
 
 
 
 
 
 
 
 
 
6bdbb72
a9bea46
6bdbb72
 
 
a9bea46
 
 
 
 
 
de8d584
35bdcd6
a9bea46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import streamlit as st
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
from collections import Counter
import traceback
import os
import jpype

warnings.filterwarnings("ignore")

# JPype setup: start the JVM once so konlpy's Java-backed tagger (Okt) can run.
# The wildcard classpath points at konlpy's bundled jars.
# NOTE(review): path is hard-coded for a python3.10 site-packages layout —
# confirm it matches the deployment image's Python version.
if not jpype.isJVMStarted():
    jvm_path = jpype.getDefaultJVMPath()
    jpype.startJVM(jvm_path, "-Djava.class.path=/usr/local/lib/python3.10/site-packages/konlpy/java/*")

# Korean font for matplotlib labels (may need adjustment for the Hugging Face
# Spaces environment; NanumGothic must be installed on the host).
plt.rcParams['font.family'] = 'NanumGothic'

# Initial stopword list: common Korean function words, auxiliaries, and
# conversational fillers that carry no topical signal. Kept as a set for
# O(1) membership tests during preprocessing.
# Fix: "그리고" was listed twice in the original literal; duplicate removed
# (the resulting set is unchanged).
default_stopwords = set([
    "μžˆλ‹€", "μ—†λ‹€", "λ˜λ‹€", "이닀", "ν•˜λ‹€", "κ°™λ‹€", "μœ„ν•˜λ‹€", "μžˆλŠ”", "ν•˜λŠ”",
    "그리고", "그런", "이런", "μ €λŸ°", "μ΄λ ‡κ²Œ", "μ €λ ‡κ²Œ", "κ·Έλ ‡κ²Œ",
    "우리", "μ €λŠ”", "μ œκ°€", "λ‚΄κ°€", "λ‚˜λŠ”", "μžˆμŠ΅λ‹ˆλ‹€", "μžˆμ—ˆμŠ΅λ‹ˆλ‹€", "λ˜μ—ˆμŠ΅λ‹ˆλ‹€",
    "ν•˜μ˜€μŠ΅λ‹ˆλ‹€", "톡해", "μœ„ν•΄", "λŒ€ν•œ", "ν•¨κ»˜", "λ§Žμ€", "μ–΄λ–€", "저희", "이λ₯Ό",
    "λ˜ν•œ", "κ·Έλž˜μ„œ", "ν•˜μ§€λ§Œ", "κ·ΈλŸ¬λ‚˜", "λ”°λΌμ„œ", "λ•Œλ¬Έμ—"
])

@st.cache_resource
def load_okt():
    """Create the konlpy Okt morphological analyzer once per server process.

    Cached with st.cache_resource so the JVM-backed tagger (expensive to
    construct) is built a single time and shared across Streamlit reruns.
    """
    from konlpy.tag import Okt
    return Okt()

# Module-level tagger instance shared by the helpers below.
okt = load_okt()

def extract_nouns(text):
    """Extract Korean nouns longer than one character from *text*.

    On tagger failure the error is surfaced in the Streamlit UI and an
    empty list is returned instead of raising.
    """
    try:
        candidates = okt.nouns(text)
        return [word for word in candidates if len(word) > 1]
    except Exception as e:
        st.error(f"λͺ…사 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        st.error(f"λ¬Έμ œκ°€ λ°œμƒν•œ ν…μŠ€νŠΈ: {text[:100]}...")
        return []

@st.cache_data
def preprocess_text(text, user_stopwords):
    """Reduce *text* to a space-joined string of nouns with stopwords removed.

    Non-string or empty input yields "" (with a UI warning); any other
    failure is reported via st.error and also yields "".
    """
    try:
        if not isinstance(text, str) or not text:
            st.warning(f"μœ νš¨ν•˜μ§€ μ•Šμ€ μž…λ ₯: {type(text)}")
            return ""
        kept = [word for word in extract_nouns(text)
                if word and word not in user_stopwords]
        return ' '.join(kept)
    except Exception as e:
        st.error(f"ν…μŠ€νŠΈ μ „μ²˜λ¦¬ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        st.error(f"λ¬Έμ œκ°€ λ°œμƒν•œ ν…μŠ€νŠΈ: {text[:100]}...")
        return ""

def topic_modeling(texts, n_components):
    """Run LDA topic modeling over *texts* and return the top-10 words per topic.

    Parameters
    ----------
    texts : list[str]
        Preprocessed documents (space-joined nouns).
    n_components : int
        Requested number of topics; clamped to the vocabulary size.

    Returns
    -------
    dict[str, list[str]]
        Mapping like {"Topic 1": [word, ...]}; empty dict when the
        vocabulary is too small to model topics.
    """
    stop_words_list = list(default_stopwords)

    vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000)
    try:
        data_vectorized = vectorizer.fit_transform(texts)
    except ValueError:
        # Fix: CountVectorizer raises ValueError on an empty vocabulary
        # (e.g. every token was a stopword or too short). Previously this
        # crashed through to the generic file-error handler; surface the
        # existing friendly message instead.
        st.warning("μΆ”μΆœλœ 고유 단어가 λ„ˆλ¬΄ μ μŠ΅λ‹ˆλ‹€. 더 κΈ΄ ν…μŠ€νŠΈλ₯Ό μ‚¬μš©ν•΄ μ£Όμ„Έμš”.")
        return {}

    # LDA cannot have more topics than vocabulary terms.
    n_tokens = data_vectorized.shape[1]
    n_components = min(n_components, n_tokens)

    if n_components < 2:
        st.warning("μΆ”μΆœλœ 고유 단어가 λ„ˆλ¬΄ μ μŠ΅λ‹ˆλ‹€. 더 κΈ΄ ν…μŠ€νŠΈλ₯Ό μ‚¬μš©ν•΄ μ£Όμ„Έμš”.")
        return {}

    lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=50,
                                    learning_method='online', learning_offset=50., doc_topic_prior=0.1, topic_word_prior=0.01)
    lda.fit(data_vectorized)

    features = vectorizer.get_feature_names_out()

    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending; the last 10 indices reversed give the
        # 10 highest-weight words for this topic.
        top_words = [features[i] for i in topic.argsort()[:-11:-1]]
        topics[f"Topic {topic_idx + 1}"] = top_words
    return topics

def generate_word_frequency_chart(text, color, n=20):
    """Build a horizontal bar chart of the *n* most frequent nouns in *text*.

    Returns the matplotlib Figure so the caller can hand it to st.pyplot.
    """
    ranked = Counter(extract_nouns(text)).most_common(n)
    labels = [word for word, _ in ranked]
    counts = [count for _, count in ranked]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(labels, counts, color=color)
    ax.invert_yaxis()  # most frequent word at the top
    ax.set_title("μƒμœ„ {} 단어".format(n))
    plt.tight_layout()
    return fig

def get_top_trigrams(text, n=10):
    """Return the *n* most common consecutive noun triples as (trigram, count) pairs."""
    words = extract_nouns(text)
    windows = zip(words, words[1:], words[2:])
    return Counter(windows).most_common(n)

# Streamlit UI setup
st.title("ν…μŠ€νŠΈ 뢄석 도ꡬ")

user_stopwords = st.sidebar.text_area("λΆˆμš©μ–΄λ₯Ό μž…λ ₯ν•˜μ„Έμš” (μ‰Όν‘œλ‘œ ꡬ뢄)", 
                                      value=", ".join(default_stopwords))
# Fix: the original split on ", " exactly, so "a,b" or stray whitespace left
# malformed tokens. Split on "," and strip each fragment; drop empties and
# always merge in the defaults.
user_stopwords = list({w.strip() for w in user_stopwords.split(",") if w.strip()} | default_stopwords)

uploaded_file = st.file_uploader("ν…μŠ€νŠΈ 파일 μ—…λ‘œλ“œ", type=['txt'])

# Main analysis flow: runs on every rerun once a file has been uploaded.
if uploaded_file is not None:
    try:
        with st.spinner('νŒŒμΌμ„ 처리 μ€‘μž…λ‹ˆλ‹€...'):
            # Decode the uploaded bytes as UTF-8 (a decode failure is caught
            # by the generic handler at the bottom of this block).
            text = str(uploaded_file.read(), 'utf-8')
            st.text(f"파일 길이: {len(text)} 문자")
            
            # Noun extraction + stopword removal (cached via st.cache_data).
            preprocessed_text = preprocess_text(text, user_stopwords)
            
            st.text(f"처리된 ν…μŠ€νŠΈ 길이: {len(preprocessed_text)} 문자")
            st.text(f"고유 단어 수: {len(set(preprocessed_text.split()))}")
        
        if not preprocessed_text:
            st.warning("처리된 ν…μŠ€νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€. λ‹€λ₯Έ νŒŒμΌμ„ μ—…λ‘œλ“œν•΄ μ£Όμ„Έμš”.")
        else:
            # Topic modeling: the whole upload is treated as a single document.
            st.subheader("ν† ν”½ λͺ¨λΈλ§ κ²°κ³Ό")
            n_topics = st.slider("ν† ν”½ 수 선택", min_value=2, max_value=10, value=5)
            topics = topic_modeling([preprocessed_text], n_topics)
            if topics:
                for topic, words in topics.items():
                    st.write(f"{topic}: {', '.join(words)}")
            else:
                st.warning("ν† ν”½ λͺ¨λΈλ§μ„ μˆ˜ν–‰ν•  수 μ—†μŠ΅λ‹ˆλ‹€. 더 κΈ΄ ν…μŠ€νŠΈλ₯Ό μ‚¬μš©ν•΄ μ£Όμ„Έμš”.")
            
            # Top trigrams over the preprocessed (noun-only) text.
            st.subheader("μƒμœ„ 10개 Trigram")
            top_trigrams = get_top_trigrams(preprocessed_text)
            for trigram, count in top_trigrams:
                st.write(f"{' '.join(trigram)}: {count}")
            
            # Word-frequency bar chart with a user-selectable color.
            st.subheader("단어 λΉˆλ„ 차트")
            color = st.color_picker("λ§‰λŒ€ 색상 선택", "#1f77b4")
            fig = generate_word_frequency_chart(preprocessed_text, color)
            st.pyplot(fig)

    except Exception as e:
        # Catch-all boundary for the upload pipeline; show the traceback in
        # the UI to aid debugging on the hosted environment.
        st.error(f"파일 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        st.text("였λ₯˜ λ°œμƒ μœ„μΉ˜:")
        st.text(traceback.format_exc())

# Sidebar usage instructions (user-facing Korean text; rendered as Markdown).
st.sidebar.markdown("""
## μ‚¬μš© 방법
1. μ‚¬μ΄λ“œλ°”μ—μ„œ λΆˆμš©μ–΄λ₯Ό μΆ”κ°€ν•˜κ±°λ‚˜ μˆ˜μ •ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
2. ν…μŠ€νŠΈ 파일(.txt)을 μ—…λ‘œλ“œν•˜μ„Έμš”.
3. ν† ν”½ λͺ¨λΈλ§μ˜ ν† ν”½ 수λ₯Ό μ„ νƒν•˜μ„Έμš”.
4. μƒμœ„ 10개 Trigram을 ν™•μΈν•˜μ„Έμš”.
5. 단어 λΉˆλ„ 차트의 λ§‰λŒ€ 색상을 선택할 수 μžˆμŠ΅λ‹ˆλ‹€.
""")