import streamlit as st
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
from collections import Counter
import traceback
import os
import jpype
# Silence library warnings so they don't clutter the Streamlit UI
warnings.filterwarnings("ignore")

# JPype setup: start the JVM that KoNLPy's Okt tagger requires, once per process
if not jpype.isJVMStarted():
    jvm_path = jpype.getDefaultJVMPath()
    # NOTE(review): hard-coded classpath assumes a Python 3.10 site-packages
    # layout (Hugging Face Spaces image) — verify against the deployment env
    jpype.startJVM(jvm_path, "-Djava.class.path=/usr/local/lib/python3.10/site-packages/konlpy/java/*")

# Korean-capable font for matplotlib charts (may need adjustment per environment)
plt.rcParams['font.family'] = 'NanumGothic'
# Default Korean stopword list: common function words, auxiliaries and
# conjunctions that carry no topical meaning for noun-based analysis.
# Used both as the CountVectorizer stop-word list and as the baseline the
# user-supplied sidebar stopwords are merged into.
default_stopwords = set([
    "μλ€", "μλ€", "λλ€", "μ΄λ€", "νλ€", "κ°λ€", "μνλ€", "μλ", "νλ",
    "κ·Έλ¦¬κ³ ", "κ·Έλ°", "μ΄λ°", "μ λ°", "μ΄λ κ²", "μ λ κ²", "κ·Έλ κ²",
    "μ°λ¦¬", "μ λ", "μ κ°", "λ΄κ°", "λλ", "μμ΅λλ€", "μμμ΅λλ€", "λμμ΅λλ€",
    "νμμ΅λλ€", "ν΅ν΄", "μν΄", "λν", "ν¨κ»", "λ§μ", "μ΄λ€", "μ ν¬", "μ΄λ₯Ό",
    "λν", "κ·Έλμ", "κ·Έλ¦¬κ³ ", "νμ§λ§", "κ·Έλ¬λ", "λ°λΌμ", "λλ¬Έμ"
])
@st.cache_resource
def load_okt():
    """Create the KoNLPy Okt morphological analyzer once and cache it for the app.

    The konlpy import is local to the function — presumably so it only runs
    after the JVM has been started above; confirm before moving it to the top.
    """
    from konlpy.tag import Okt
    return Okt()

# Module-level tagger shared by all analysis functions below
okt = load_okt()
def extract_nouns(text):
    """Extract Korean nouns from *text* using the shared Okt tagger.

    Only nouns longer than one character are kept. If tagging fails, the
    error and a preview of the offending text are shown in the UI and an
    empty list is returned so callers can continue.
    """
    try:
        nouns = okt.nouns(text)
        # Filter out single-character nouns
        return [noun for noun in nouns if len(noun) > 1]
    except Exception as e:
        st.error(f"λͺμ¬ μΆμΆ μ€ μ€λ₯ λ°μ: {str(e)}")
        st.error(f"λ¬Έμ κ° λ°μν νμ€νΈ: {text[:100]}...")
        return []
@st.cache_data
def preprocess_text(text, user_stopwords):
    """Tokenize *text* into nouns, drop stopwords, and return a space-joined string.

    Args:
        text: Raw input text; must be a non-empty str.
        user_stopwords: Iterable of stopwords to exclude from the result.

    Returns:
        Space-joined noun tokens, or "" for invalid input or on error
        (errors are also surfaced in the Streamlit UI).
    """
    try:
        if not text or not isinstance(text, str):
            st.warning(f"μ ν¨νμ§ μμ μλ ₯: {type(text)}")
            return ""
        # Build the stopword set once: O(1) membership instead of scanning
        # the user_stopwords list for every extracted noun.
        stopword_set = set(user_stopwords)
        nouns = extract_nouns(text)
        nouns = [noun for noun in nouns if noun and noun not in stopword_set]
        return ' '.join(nouns)
    except Exception as e:
        st.error(f"νμ€νΈ μ μ²λ¦¬ μ€ μ€λ₯ λ°μ: {str(e)}")
        st.error(f"λ¬Έμ κ° λ°μν νμ€νΈ: {text[:100]}...")
        return ""
def topic_modeling(texts, n_components):
    """Fit an LDA topic model on *texts* and return the top words per topic.

    Args:
        texts: List of preprocessed documents (space-joined noun strings).
        n_components: Requested number of topics; capped at the vocabulary size.

    Returns:
        Dict mapping "Topic k" -> list of that topic's 10 highest-weight
        words, or {} (with a UI warning) when fewer than 2 unique terms exist.
    """
    stop_words_list = list(default_stopwords)
    vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000)
    data_vectorized = vectorizer.fit_transform(texts)
    # LDA cannot fit more topics than there are vocabulary terms
    n_tokens = data_vectorized.shape[1]
    n_components = min(n_components, n_tokens)
    if n_components < 2:
        st.warning("μΆμΆλ κ³ μ λ¨μ΄κ° λ무 μ μ΅λλ€. λ κΈ΄ νμ€νΈλ₯Ό μ¬μ©ν΄ μ£ΌμΈμ.")
        return {}
    # Online variational Bayes with mildly sparse priors; fixed seed for
    # reproducible topics across reruns
    lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=50,
                                    learning_method='online', learning_offset=50., doc_topic_prior=0.1, topic_word_prior=0.01)
    lda.fit(data_vectorized)
    features = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # argsort()[:-11:-1] = indices of the 10 largest weights, descending
        top_words = [features[i] for i in topic.argsort()[:-11:-1]]
        topics[f"Topic {topic_idx + 1}"] = top_words
    return topics
def generate_word_frequency_chart(text, color, n=20):
    """Build a horizontal bar chart of the *n* most frequent nouns in *text*.

    Args:
        text: Input text to analyze (nouns are extracted via Okt).
        color: Bar color, any value matplotlib accepts.
        n: How many top words to display (default 20).

    Returns:
        The matplotlib Figure containing the chart.
    """
    frequencies = Counter(extract_nouns(text)).most_common(n)
    labels = [word for word, _ in frequencies]
    counts = [count for _, count in frequencies]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(labels, counts, color=color)
    # Flip so the most frequent word appears at the top of the chart
    ax.invert_yaxis()
    ax.set_title(f"μμ {n} λ¨μ΄")
    plt.tight_layout()
    return fig
def get_top_trigrams(text, n=10):
    """Return the *n* most common noun trigrams in *text* as (trigram, count) pairs."""
    nouns = extract_nouns(text)
    # Sliding window of three consecutive nouns
    windows = zip(nouns, nouns[1:], nouns[2:])
    return Counter(windows).most_common(n)
# ---- Streamlit UI ----
st.title("νμ€νΈ λΆμ λꡬ")

# Sidebar stopword editor, pre-filled with the defaults (comma-separated).
user_stopwords = st.sidebar.text_area(
    "λΆμ©μ΄λ₯Ό μλ ₯νμΈμ (μΌνλ‘ κ΅¬λΆ)",
    value=", ".join(default_stopwords),
)
# Robustness fix: the original split on ", " only, so entries typed as "a,b"
# (no space) were silently kept as one token. Split on bare commas, strip
# whitespace, drop empties, and always merge with the defaults.
user_stopwords = list(
    {word.strip() for word in user_stopwords.split(",") if word.strip()} | default_stopwords
)
# --- Main flow: upload a .txt file and run the full analysis pipeline ---
uploaded_file = st.file_uploader("νμ€νΈ νμΌ μλ‘λ", type=['txt'])
if uploaded_file is not None:
    try:
        with st.spinner('νμΌμ μ²λ¦¬ μ€μλλ€...'):
            # NOTE(review): assumes the uploaded file is UTF-8 encoded — a
            # non-UTF-8 file will raise and land in the except block below
            text = str(uploaded_file.read(), 'utf-8')
            st.text(f"νμΌ κΈΈμ΄: {len(text)} λ¬Έμ")
            preprocessed_text = preprocess_text(text, user_stopwords)
            st.text(f"μ²λ¦¬λ νμ€νΈ κΈΈμ΄: {len(preprocessed_text)} λ¬Έμ")
            st.text(f"κ³ μ λ¨μ΄ μ: {len(set(preprocessed_text.split()))}")
            if not preprocessed_text:
                st.warning("μ²λ¦¬λ νμ€νΈκ° μμ΅λλ€. λ€λ₯Έ νμΌμ μλ‘λν΄ μ£ΌμΈμ.")
            else:
                # Topic modeling on the single uploaded document
                st.subheader("ν ν½ λͺ¨λΈλ§ κ²°κ³Ό")
                n_topics = st.slider("ν ν½ μ μ ν", min_value=2, max_value=10, value=5)
                topics = topic_modeling([preprocessed_text], n_topics)
                if topics:
                    for topic, words in topics.items():
                        st.write(f"{topic}: {', '.join(words)}")
                else:
                    st.warning("ν ν½ λͺ¨λΈλ§μ μνν μ μμ΅λλ€. λ κΈ΄ νμ€νΈλ₯Ό μ¬μ©ν΄ μ£ΌμΈμ.")
                # Top-10 noun trigrams
                st.subheader("μμ 10κ° Trigram")
                top_trigrams = get_top_trigrams(preprocessed_text)
                for trigram, count in top_trigrams:
                    st.write(f"{' '.join(trigram)}: {count}")
                # Word-frequency bar chart with user-selected color
                st.subheader("λ¨μ΄ λΉλ μ°¨νΈ")
                color = st.color_picker("λ§λ μμ μ ν", "#1f77b4")
                fig = generate_word_frequency_chart(preprocessed_text, color)
                st.pyplot(fig)
    except Exception as e:
        # Top-level boundary: surface the error and full traceback in the UI
        st.error(f"νμΌ μ²λ¦¬ μ€ μ€λ₯ λ°μ: {str(e)}")
        st.text("μ€λ₯ λ°μ μμΉ:")
        st.text(traceback.format_exc())
# Sidebar usage instructions (Korean)
st.sidebar.markdown("""
## μ¬μ© λ°©λ²
1. μ¬μ΄λλ°μμ λΆμ©μ΄λ₯Ό μΆκ°νκ±°λ μμ ν μ μμ΅λλ€.
2. νμ€νΈ νμΌ(.txt)μ μλ‘λνμΈμ.
3. ν ν½ λͺ¨λΈλ§μ ν ν½ μλ₯Ό μ ννμΈμ.
4. μμ 10κ° Trigramμ νμΈνμΈμ.
5. λ¨μ΄ λΉλ μ°¨νΈμ λ§λ μμμ μ νν μ μμ΅λλ€.
""")