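"""Streamlit app for Korean text analysis.

Reads an uploaded .txt file, extracts nouns with KoNLPy's Okt tagger, and
displays LDA topic-modeling results, the top trigrams, and a word-frequency
chart.
"""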
import streamlit as st
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
from collections import Counter
import traceback
import jpype
warnings.filterwarnings("ignore")
# JPype setup: start the JVM manually so KoNLPy can use it
if not jpype.isJVMStarted():
    jvm_path = jpype.getDefaultJVMPath()
    jpype.startJVM(jvm_path, "-Djava.class.path=/usr/local/lib/python3.10/site-packages/konlpy/java/*")
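# Note: this assumes konlpy's bundled jars live at the path above; konlpy can
# also start the JVM itself on first use if none is running.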
# ํ•œ๊ธ€ ํฐํŠธ ์„ค์ • (ํ—ˆ๊น…ํŽ˜์ด์Šค ํ™˜๊ฒฝ์— ๋งž๊ฒŒ ์ˆ˜์ • ํ•„์š”ํ•  ์ˆ˜ ์žˆ์Œ)
plt.rcParams['font.family'] = 'NanumGothic'
# Default stopword list (common Korean function words and fillers)
default_stopwords = set([
    "์žˆ๋‹ค", "์—†๋‹ค", "๋˜๋‹ค", "์ด๋‹ค", "ํ•˜๋‹ค", "๊ฐ™๋‹ค", "์œ„ํ•˜๋‹ค", "์žˆ๋Š”", "ํ•˜๋Š”",
    "๊ทธ๋ฆฌ๊ณ ", "๊ทธ๋Ÿฐ", "์ด๋Ÿฐ", "์ €๋Ÿฐ", "์ด๋ ‡๊ฒŒ", "์ €๋ ‡๊ฒŒ", "๊ทธ๋ ‡๊ฒŒ",
    "์šฐ๋ฆฌ", "์ €๋Š”", "์ œ๊ฐ€", "๋‚ด๊ฐ€", "๋‚˜๋Š”", "์žˆ์Šต๋‹ˆ๋‹ค", "์žˆ์—ˆ์Šต๋‹ˆ๋‹ค", "๋˜์—ˆ์Šต๋‹ˆ๋‹ค",
    "ํ•˜์˜€์Šต๋‹ˆ๋‹ค", "ํ†ตํ•ด", "์œ„ํ•ด", "๋Œ€ํ•œ", "ํ•จ๊ป˜", "๋งŽ์€", "์–ด๋–ค", "์ €ํฌ", "์ด๋ฅผ",
    "๋˜ํ•œ", "๊ทธ๋ž˜์„œ", "ํ•˜์ง€๋งŒ", "๊ทธ๋Ÿฌ๋‚˜", "๋”ฐ๋ผ์„œ", "๋•Œ๋ฌธ์—"
])
@st.cache_resource
def load_okt():
    # Lazy-load the Okt morphological analyzer once per session
    from konlpy.tag import Okt
    return Okt()

okt = load_okt()
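# Illustrative example (not from the original source):
# okt.nouns("ํ•œ๊ตญ์–ด ํ˜•ํƒœ์†Œ ๋ถ„์„") would return something like
# ['ํ•œ๊ตญ์–ด', 'ํ˜•ํƒœ์†Œ', '๋ถ„์„'].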
def extract_nouns(text):
    try:
        nouns = okt.nouns(text)
        # Keep only nouns longer than one character
        return [noun for noun in nouns if len(noun) > 1]
    except Exception as e:
        st.error(f"๋ช…์‚ฌ ์ถ”์ถœ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        st.error(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ํ…์ŠคํŠธ: {text[:100]}...")
        return []
@st.cache_data
def preprocess_text(text, user_stopwords):
    try:
        if not text or not isinstance(text, str):
            st.warning(f"์œ ํšจํ•˜์ง€ ์•Š์€ ์ž…๋ ฅ: {type(text)}")
            return ""
        nouns = extract_nouns(text)
        # Drop user-supplied and default stopwords
        nouns = [noun for noun in nouns if noun and noun not in user_stopwords]
        return ' '.join(nouns)
    except Exception as e:
        st.error(f"ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        st.error(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ํ…์ŠคํŠธ: {text[:100]}...")
        return ""
def topic_modeling(texts, n_components):
    stop_words_list = list(default_stopwords)
    vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000)
    data_vectorized = vectorizer.fit_transform(texts)
    # Cap the topic count at the vocabulary size
    n_tokens = data_vectorized.shape[1]
    n_components = min(n_components, n_tokens)
    if n_components < 2:
        st.warning("์ถ”์ถœ๋œ ๊ณ ์œ  ๋‹จ์–ด๊ฐ€ ๋„ˆ๋ฌด ์ ์Šต๋‹ˆ๋‹ค. ๋” ๊ธด ํ…์ŠคํŠธ๋ฅผ ์‚ฌ์šฉํ•ด ์ฃผ์„ธ์š”.")
        return {}
    lda = LatentDirichletAllocation(
        n_components=n_components, random_state=42, max_iter=50,
        learning_method='online', learning_offset=50.0,
        doc_topic_prior=0.1, topic_word_prior=0.01)
    lda.fit(data_vectorized)
    features = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # Top 10 words per topic, highest weight first
        top_words = [features[i] for i in topic.argsort()[:-11:-1]]
        topics[f"Topic {topic_idx + 1}"] = top_words
    return topics
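# Hypothetical usage: topic_modeling([doc1, doc2], 2) returns a dict such as
# {"Topic 1": [up to 10 words], "Topic 2": [...]}, or {} if the extracted
# vocabulary is too small.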
def generate_word_frequency_chart(text, color, n=20):
    nouns = extract_nouns(text)
    word_freq = Counter(nouns)
    top_words = dict(word_freq.most_common(n))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
    ax.invert_yaxis()  # most frequent word at the top
    ax.set_title("์ƒ์œ„ {} ๋‹จ์–ด".format(n))
    plt.tight_layout()
    return fig
def get_top_trigrams(text, n=10):
    nouns = extract_nouns(text)
    # Consecutive noun triples via three staggered views of the list
    trigrams = zip(nouns, nouns[1:], nouns[2:])
    trigram_freq = Counter(trigrams)
    return trigram_freq.most_common(n)
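# Returns pairs of (trigram tuple, count), e.g.
# [(("๋‹จ์–ด", "๋นˆ๋„", "๋ถ„์„"), 2), ...] (counts illustrative).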
# Streamlit UI
st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")

user_stopwords = st.sidebar.text_area("๋ถˆ์šฉ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š” (์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„)",
                                      value=", ".join(default_stopwords))
# Split on commas (tolerating missing spaces) and merge with the defaults;
# sorted so st.cache_data sees a stable argument
user_stopwords = sorted(set(w.strip() for w in user_stopwords.split(",") if w.strip()) | default_stopwords)
uploaded_file = st.file_uploader("ํ…์ŠคํŠธ ํŒŒ์ผ ์—…๋กœ๋“œ", type=['txt'])

if uploaded_file is not None:
    try:
        with st.spinner('ํŒŒ์ผ์„ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค...'):
            text = str(uploaded_file.read(), 'utf-8')
            st.text(f"ํŒŒ์ผ ๊ธธ์ด: {len(text)} ๋ฌธ์ž")
            preprocessed_text = preprocess_text(text, user_stopwords)
            st.text(f"์ฒ˜๋ฆฌ๋œ ํ…์ŠคํŠธ ๊ธธ์ด: {len(preprocessed_text)} ๋ฌธ์ž")
            st.text(f"๊ณ ์œ  ๋‹จ์–ด ์ˆ˜: {len(set(preprocessed_text.split()))}")

        if not preprocessed_text:
            st.warning("์ฒ˜๋ฆฌ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ๋‹ค๋ฅธ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด ์ฃผ์„ธ์š”.")
        else:
            st.subheader("ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
            n_topics = st.slider("ํ† ํ”ฝ ์ˆ˜ ์„ ํƒ", min_value=2, max_value=10, value=5)
            topics = topic_modeling([preprocessed_text], n_topics)
            if topics:
                for topic, words in topics.items():
                    st.write(f"{topic}: {', '.join(words)}")
            else:
                st.warning("ํ† ํ”ฝ ๋ชจ๋ธ๋ง์„ ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๋” ๊ธด ํ…์ŠคํŠธ๋ฅผ ์‚ฌ์šฉํ•ด ์ฃผ์„ธ์š”.")

            st.subheader("์ƒ์œ„ 10๊ฐœ Trigram")
            top_trigrams = get_top_trigrams(preprocessed_text)
            for trigram, count in top_trigrams:
                st.write(f"{' '.join(trigram)}: {count}")

            st.subheader("๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ")
            color = st.color_picker("๋ง‰๋Œ€ ์ƒ‰์ƒ ์„ ํƒ", "#1f77b4")
            fig = generate_word_frequency_chart(preprocessed_text, color)
            st.pyplot(fig)
    except Exception as e:
        st.error(f"ํŒŒ์ผ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        st.text("์˜ค๋ฅ˜ ๋ฐœ์ƒ ์œ„์น˜:")
        st.text(traceback.format_exc())
st.sidebar.markdown("""
## ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
1. ์‚ฌ์ด๋“œ๋ฐ”์—์„œ ๋ถˆ์šฉ์–ด๋ฅผ ์ถ”๊ฐ€ํ•˜๊ฑฐ๋‚˜ ์ˆ˜์ •ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
2. ํ…์ŠคํŠธ ํŒŒ์ผ(.txt)์„ ์—…๋กœ๋“œํ•˜์„ธ์š”.
3. ํ† ํ”ฝ ๋ชจ๋ธ๋ง์˜ ํ† ํ”ฝ ์ˆ˜๋ฅผ ์„ ํƒํ•˜์„ธ์š”.
4. ์ƒ์œ„ 10๊ฐœ Trigram์„ ํ™•์ธํ•˜์„ธ์š”.
5. ๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ์˜ ๋ง‰๋Œ€ ์ƒ‰์ƒ์„ ์„ ํƒํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
""")