File size: 6,157 Bytes
41bf78c
 
 
 
e765d87
de8d584
6bdbb72
90768ea
 
e765d87
 
 
90768ea
a6f4c85
 
 
90768ea
76b2f6d
326f364
 
a9bea46
0f24738
 
 
 
 
 
 
de8d584
0d12815
46e44d6
90768ea
46e44d6
0d12815
46e44d6
0d12815
c50998c
46e44d6
 
 
 
 
 
 
41bf78c
a9bea46
 
6bdbb72
4ea47f1
76b2f6d
6bdbb72
 
 
 
 
 
76b2f6d
b5a82df
41bf78c
a9bea46
bec772f
 
0d12815
41bf78c
8723070
 
 
 
7235700
 
 
 
c50998c
 
41bf78c
f45d210
3e6e0f4
f45d210
41bf78c
 
3e6e0f4
 
41bf78c
 
35bdcd6
0f24738
 
35bdcd6
 
 
 
3e6e0f4
326f364
35bdcd6
a9bea46
41bf78c
de8d584
0f24738
 
326f364
 
de8d584
41bf78c
 
 
a9bea46
 
bec772f
a9bea46
41bf78c
 
a9bea46
 
 
 
6bdbb72
a9bea46
5b8f710
a9bea46
6bdbb72
c50998c
6bdbb72
b5a82df
 
 
 
 
5b8f710
7235700
 
 
 
 
b5a82df
 
 
 
 
 
 
 
 
 
6bdbb72
a9bea46
6bdbb72
 
 
a9bea46
 
 
 
 
 
de8d584
35bdcd6
a9bea46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import streamlit as st
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
from collections import Counter
import traceback
import os
import jpype

warnings.filterwarnings("ignore")

# JPype setup: start the JVM once so konlpy's Java-backed tagger (Okt) can run.
# The wildcard classpath points at konlpy's bundled jars.
# NOTE(review): path is hard-coded for a python3.10 site-packages layout —
# confirm it matches the deployment image's Python version.
if not jpype.isJVMStarted():
    jvm_path = jpype.getDefaultJVMPath()
    jpype.startJVM(jvm_path, "-Djava.class.path=/usr/local/lib/python3.10/site-packages/konlpy/java/*")

# Korean font for matplotlib labels (may need adjustment for the Hugging Face
# Spaces environment; NanumGothic must be installed on the host).
plt.rcParams['font.family'] = 'NanumGothic'

# Initial stopword list: common Korean function words, auxiliaries, and
# conversational fillers that carry no topical signal. Kept as a set for
# O(1) membership tests during preprocessing.
# Fix: "그리고" was listed twice in the original literal; duplicate removed
# (the resulting set is unchanged).
default_stopwords = set([
    "μžˆλ‹€", "μ—†λ‹€", "λ˜λ‹€", "이닀", "ν•˜λ‹€", "κ°™λ‹€", "μœ„ν•˜λ‹€", "μžˆλŠ”", "ν•˜λŠ”",
    "그리고", "그런", "이런", "μ €λŸ°", "μ΄λ ‡κ²Œ", "μ €λ ‡κ²Œ", "κ·Έλ ‡κ²Œ",
    "우리", "μ €λŠ”", "μ œκ°€", "λ‚΄κ°€", "λ‚˜λŠ”", "μžˆμŠ΅λ‹ˆλ‹€", "μžˆμ—ˆμŠ΅λ‹ˆλ‹€", "λ˜μ—ˆμŠ΅λ‹ˆλ‹€",
    "ν•˜μ˜€μŠ΅λ‹ˆλ‹€", "톡해", "μœ„ν•΄", "λŒ€ν•œ", "ν•¨κ»˜", "λ§Žμ€", "μ–΄λ–€", "저희", "이λ₯Ό",
    "λ˜ν•œ", "κ·Έλž˜μ„œ", "ν•˜μ§€λ§Œ", "κ·ΈλŸ¬λ‚˜", "λ”°λΌμ„œ", "λ•Œλ¬Έμ—"
])

@st.cache_resource
def load_okt():
    """Create the konlpy Okt morphological analyzer once per server process.

    Cached with st.cache_resource so the JVM-backed tagger (expensive to
    construct) is built a single time and shared across Streamlit reruns.
    """
    from konlpy.tag import Okt
    return Okt()

# Module-level tagger instance shared by the helpers below.
okt = load_okt()

def extract_nouns(text):
    """Extract Korean nouns longer than one character from *text*.

    On tagger failure the error is surfaced in the Streamlit UI and an
    empty list is returned instead of raising.
    """
    try:
        candidates = okt.nouns(text)
        return [word for word in candidates if len(word) > 1]
    except Exception as e:
        st.error(f"λͺ…사 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        st.error(f"λ¬Έμ œκ°€ λ°œμƒν•œ ν…μŠ€νŠΈ: {text[:100]}...")
        return []

@st.cache_data
def preprocess_text(text, user_stopwords):
    """Reduce *text* to a space-joined string of nouns with stopwords removed.

    Non-string or empty input yields "" (with a UI warning); any other
    failure is reported via st.error and also yields "".
    """
    try:
        if not isinstance(text, str) or not text:
            st.warning(f"μœ νš¨ν•˜μ§€ μ•Šμ€ μž…λ ₯: {type(text)}")
            return ""
        kept = [word for word in extract_nouns(text)
                if word and word not in user_stopwords]
        return ' '.join(kept)
    except Exception as e:
        st.error(f"ν…μŠ€νŠΈ μ „μ²˜λ¦¬ 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        st.error(f"λ¬Έμ œκ°€ λ°œμƒν•œ ν…μŠ€νŠΈ: {text[:100]}...")
        return ""

def topic_modeling(texts, n_components):
    """Run LDA topic modeling over *texts* and return the top-10 words per topic.

    Parameters
    ----------
    texts : list[str]
        Preprocessed documents (space-joined nouns).
    n_components : int
        Requested number of topics; clamped to the vocabulary size.

    Returns
    -------
    dict[str, list[str]]
        Mapping like {"Topic 1": [word, ...]}; empty dict when the
        vocabulary is too small to model topics.
    """
    stop_words_list = list(default_stopwords)

    vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000)
    try:
        data_vectorized = vectorizer.fit_transform(texts)
    except ValueError:
        # Fix: CountVectorizer raises ValueError on an empty vocabulary
        # (e.g. every token was a stopword or too short). Previously this
        # crashed through to the generic file-error handler; surface the
        # existing friendly message instead.
        st.warning("μΆ”μΆœλœ 고유 단어가 λ„ˆλ¬΄ μ μŠ΅λ‹ˆλ‹€. 더 κΈ΄ ν…μŠ€νŠΈλ₯Ό μ‚¬μš©ν•΄ μ£Όμ„Έμš”.")
        return {}

    # LDA cannot have more topics than vocabulary terms.
    n_tokens = data_vectorized.shape[1]
    n_components = min(n_components, n_tokens)

    if n_components < 2:
        st.warning("μΆ”μΆœλœ 고유 단어가 λ„ˆλ¬΄ μ μŠ΅λ‹ˆλ‹€. 더 κΈ΄ ν…μŠ€νŠΈλ₯Ό μ‚¬μš©ν•΄ μ£Όμ„Έμš”.")
        return {}

    lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=50,
                                    learning_method='online', learning_offset=50., doc_topic_prior=0.1, topic_word_prior=0.01)
    lda.fit(data_vectorized)

    features = vectorizer.get_feature_names_out()

    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending; the last 10 indices reversed give the
        # 10 highest-weight words for this topic.
        top_words = [features[i] for i in topic.argsort()[:-11:-1]]
        topics[f"Topic {topic_idx + 1}"] = top_words
    return topics

def generate_word_frequency_chart(text, color, n=20):
    """Build a horizontal bar chart of the *n* most frequent nouns in *text*.

    Returns the matplotlib Figure so the caller can hand it to st.pyplot.
    """
    ranked = Counter(extract_nouns(text)).most_common(n)
    labels = [word for word, _ in ranked]
    counts = [count for _, count in ranked]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(labels, counts, color=color)
    ax.invert_yaxis()  # most frequent word at the top
    ax.set_title("μƒμœ„ {} 단어".format(n))
    plt.tight_layout()
    return fig

def get_top_trigrams(text, n=10):
    """Return the *n* most common consecutive noun triples as (trigram, count) pairs."""
    words = extract_nouns(text)
    windows = zip(words, words[1:], words[2:])
    return Counter(windows).most_common(n)

# Streamlit UI setup
st.title("ν…μŠ€νŠΈ 뢄석 도ꡬ")

user_stopwords = st.sidebar.text_area("λΆˆμš©μ–΄λ₯Ό μž…λ ₯ν•˜μ„Έμš” (μ‰Όν‘œλ‘œ ꡬ뢄)", 
                                      value=", ".join(default_stopwords))
# Fix: the original split on ", " exactly, so "a,b" or stray whitespace left
# malformed tokens. Split on "," and strip each fragment; drop empties and
# always merge in the defaults.
user_stopwords = list({w.strip() for w in user_stopwords.split(",") if w.strip()} | default_stopwords)

uploaded_file = st.file_uploader("ν…μŠ€νŠΈ 파일 μ—…λ‘œλ“œ", type=['txt'])

# Main analysis flow: runs on every rerun once a file has been uploaded.
if uploaded_file is not None:
    try:
        with st.spinner('νŒŒμΌμ„ 처리 μ€‘μž…λ‹ˆλ‹€...'):
            # Decode the uploaded bytes as UTF-8 (a decode failure is caught
            # by the generic handler at the bottom of this block).
            text = str(uploaded_file.read(), 'utf-8')
            st.text(f"파일 길이: {len(text)} 문자")
            
            # Noun extraction + stopword removal (cached via st.cache_data).
            preprocessed_text = preprocess_text(text, user_stopwords)
            
            st.text(f"처리된 ν…μŠ€νŠΈ 길이: {len(preprocessed_text)} 문자")
            st.text(f"고유 단어 수: {len(set(preprocessed_text.split()))}")
        
        if not preprocessed_text:
            st.warning("처리된 ν…μŠ€νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€. λ‹€λ₯Έ νŒŒμΌμ„ μ—…λ‘œλ“œν•΄ μ£Όμ„Έμš”.")
        else:
            # Topic modeling: the whole upload is treated as a single document.
            st.subheader("ν† ν”½ λͺ¨λΈλ§ κ²°κ³Ό")
            n_topics = st.slider("ν† ν”½ 수 선택", min_value=2, max_value=10, value=5)
            topics = topic_modeling([preprocessed_text], n_topics)
            if topics:
                for topic, words in topics.items():
                    st.write(f"{topic}: {', '.join(words)}")
            else:
                st.warning("ν† ν”½ λͺ¨λΈλ§μ„ μˆ˜ν–‰ν•  수 μ—†μŠ΅λ‹ˆλ‹€. 더 κΈ΄ ν…μŠ€νŠΈλ₯Ό μ‚¬μš©ν•΄ μ£Όμ„Έμš”.")
            
            # Top trigrams over the preprocessed (noun-only) text.
            st.subheader("μƒμœ„ 10개 Trigram")
            top_trigrams = get_top_trigrams(preprocessed_text)
            for trigram, count in top_trigrams:
                st.write(f"{' '.join(trigram)}: {count}")
            
            # Word-frequency bar chart with a user-selectable color.
            st.subheader("단어 λΉˆλ„ 차트")
            color = st.color_picker("λ§‰λŒ€ 색상 선택", "#1f77b4")
            fig = generate_word_frequency_chart(preprocessed_text, color)
            st.pyplot(fig)

    except Exception as e:
        # Catch-all boundary for the upload pipeline; show the traceback in
        # the UI to aid debugging on the hosted environment.
        st.error(f"파일 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        st.text("였λ₯˜ λ°œμƒ μœ„μΉ˜:")
        st.text(traceback.format_exc())

# Sidebar usage instructions (user-facing Korean text; rendered as Markdown).
st.sidebar.markdown("""
## μ‚¬μš© 방법
1. μ‚¬μ΄λ“œλ°”μ—μ„œ λΆˆμš©μ–΄λ₯Ό μΆ”κ°€ν•˜κ±°λ‚˜ μˆ˜μ •ν•  수 μžˆμŠ΅λ‹ˆλ‹€.
2. ν…μŠ€νŠΈ 파일(.txt)을 μ—…λ‘œλ“œν•˜μ„Έμš”.
3. ν† ν”½ λͺ¨λΈλ§μ˜ ν† ν”½ 수λ₯Ό μ„ νƒν•˜μ„Έμš”.
4. μƒμœ„ 10개 Trigram을 ν™•μΈν•˜μ„Έμš”.
5. 단어 λΉˆλ„ 차트의 λ§‰λŒ€ 색상을 선택할 수 μžˆμŠ΅λ‹ˆλ‹€.
""")