Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import matplotlib.pyplot as plt | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.decomposition import LatentDirichletAllocation | |
| import warnings | |
| from collections import Counter | |
| import traceback | |
| import os | |
| import jpype | |
| warnings.filterwarnings("ignore") | |
# JPype setup: start the JVM once so konlpy's Java-backed tagger can run.
# startJVM may only be called once per process, hence the guard.
if not jpype.isJVMStarted():
    jvm_path = jpype.getDefaultJVMPath()
    # NOTE(review): classpath is hard-coded for a Python 3.10 site-packages
    # layout — confirm it matches the deployment image.
    jpype.startJVM(jvm_path, "-Djava.class.path=/usr/local/lib/python3.10/site-packages/konlpy/java/*")
# Korean-capable font for matplotlib (may need adjusting per environment,
# e.g. on Hugging Face Spaces).
plt.rcParams['font.family'] = 'NanumGothic'
| # ์ด๊ธฐ ๋ถ์ฉ์ด ๋ชฉ๋ก | |
| default_stopwords = set([ | |
| "์๋ค", "์๋ค", "๋๋ค", "์ด๋ค", "ํ๋ค", "๊ฐ๋ค", "์ํ๋ค", "์๋", "ํ๋", | |
| "๊ทธ๋ฆฌ๊ณ ", "๊ทธ๋ฐ", "์ด๋ฐ", "์ ๋ฐ", "์ด๋ ๊ฒ", "์ ๋ ๊ฒ", "๊ทธ๋ ๊ฒ", | |
| "์ฐ๋ฆฌ", "์ ๋", "์ ๊ฐ", "๋ด๊ฐ", "๋๋", "์์ต๋๋ค", "์์์ต๋๋ค", "๋์์ต๋๋ค", | |
| "ํ์์ต๋๋ค", "ํตํด", "์ํด", "๋ํ", "ํจ๊ป", "๋ง์", "์ด๋ค", "์ ํฌ", "์ด๋ฅผ", | |
| "๋ํ", "๊ทธ๋์", "๊ทธ๋ฆฌ๊ณ ", "ํ์ง๋ง", "๊ทธ๋ฌ๋", "๋ฐ๋ผ์", "๋๋ฌธ์" | |
| ]) | |
def load_okt():
    """Create and return a konlpy Okt morphological analyzer.

    The import is deferred to call time so the module can be loaded
    before the JVM-dependent tagger is constructed.
    """
    from konlpy.tag import Okt
    tagger = Okt()
    return tagger


# Single shared tagger instance used by the extraction helpers below.
okt = load_okt()
def extract_nouns(text):
    """Return the nouns in *text* (length >= 2) extracted with Okt.

    Any tagging failure is reported through the Streamlit UI and an
    empty list is returned instead of raising.
    """
    try:
        tagged = okt.nouns(text)
    except Exception as e:
        st.error(f"명사 추출 중 오류 발생: {str(e)}")
        st.error(f"문제가 발생한 텍스트: {text[:100]}...")
        return []
    # Drop single-character nouns, which are mostly noise in Korean text.
    return [word for word in tagged if len(word) > 1]
def preprocess_text(text, user_stopwords):
    """Extract nouns from *text*, drop stopwords, and return them space-joined.

    Parameters
    ----------
    text : str
        Raw input text. Empty or non-string input yields "".
    user_stopwords : iterable of str
        Stopwords to exclude. Any iterable is accepted; it is converted
        to a set once so per-noun filtering is O(1) even when the caller
        passes a large list (the original did O(n) list membership per noun).

    Returns
    -------
    str
        Space-separated filtered nouns, or "" on invalid input / error.
    """
    try:
        if not text or not isinstance(text, str):
            st.warning(f"유효하지 않은 입력: {type(text)}")
            return ""
        # Build the lookup set once, outside the filtering comprehension.
        stopword_set = set(user_stopwords)
        nouns = extract_nouns(text)
        nouns = [noun for noun in nouns if noun and noun not in stopword_set]
        return ' '.join(nouns)
    except Exception as e:
        st.error(f"텍스트 전처리 중 오류 발생: {str(e)}")
        st.error(f"문제가 발생한 텍스트: {text[:100]}...")
        return ""
def topic_modeling(texts, n_components, stopwords=None):
    """Fit an LDA topic model over *texts* and return the top words per topic.

    Parameters
    ----------
    texts : list of str
        Documents as space-separated token strings.
    n_components : int
        Requested number of topics; clamped to the vocabulary size.
    stopwords : iterable of str, optional
        Stopwords for the vectorizer. Defaults to the module-level
        ``default_stopwords`` (generalized from the previously hard-coded
        constant; omitting it is fully backward compatible).

    Returns
    -------
    dict
        ``{"Topic 1": [top 10 words], ...}``, or an empty dict when fewer
        than 2 unique terms were extracted.
    """
    if stopwords is None:
        stopwords = default_stopwords
    vectorizer = CountVectorizer(stop_words=list(stopwords), max_features=1000)
    data_vectorized = vectorizer.fit_transform(texts)
    # LDA cannot have more topics than vocabulary terms, and needs >= 2.
    n_tokens = data_vectorized.shape[1]
    n_components = min(n_components, n_tokens)
    if n_components < 2:
        st.warning("추출된 고유 단어가 너무 적습니다. 더 긴 텍스트를 사용해 주세요.")
        return {}
    lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=50,
                                    learning_method='online', learning_offset=50.,
                                    doc_topic_prior=0.1, topic_word_prior=0.01)
    lda.fit(data_vectorized)
    features = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending; the reversed last-10 slice gives the top-10 words.
        top_words = [features[i] for i in topic.argsort()[:-11:-1]]
        topics[f"Topic {topic_idx + 1}"] = top_words
    return topics
def generate_word_frequency_chart(text, color, n=20):
    """Build a horizontal bar chart of the *n* most frequent nouns in *text*.

    Returns the matplotlib Figure so the caller can render it
    (e.g. via st.pyplot).
    """
    frequency = Counter(extract_nouns(text))
    words = []
    counts = []
    for word, count in frequency.most_common(n):
        words.append(word)
        counts.append(count)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(words, counts, color=color)
    # Flip the y-axis so the most frequent word sits at the top.
    ax.invert_yaxis()
    ax.set_title("상위 {} 단어".format(n))
    plt.tight_layout()
    return fig
def get_top_trigrams(text, n=10):
    """Return the *n* most common consecutive noun triples in *text*.

    Each result is a ((w1, w2, w3), count) pair; fewer than three
    extracted nouns yields an empty list.
    """
    nouns = extract_nouns(text)
    triples = zip(nouns, nouns[1:], nouns[2:])
    return Counter(triples).most_common(n)
| # ์คํธ๋ฆผ๋ฆฟ UI ์ค์ | |
| st.title("ํ ์คํธ ๋ถ์ ๋๊ตฌ") | |
| user_stopwords = st.sidebar.text_area("๋ถ์ฉ์ด๋ฅผ ์ ๋ ฅํ์ธ์ (์ผํ๋ก ๊ตฌ๋ถ)", | |
| value=", ".join(default_stopwords)) | |
| user_stopwords = list(set(user_stopwords.split(", ")) | default_stopwords) | |
| uploaded_file = st.file_uploader("ํ ์คํธ ํ์ผ ์ ๋ก๋", type=['txt']) | |
| if uploaded_file is not None: | |
| try: | |
| with st.spinner('ํ์ผ์ ์ฒ๋ฆฌ ์ค์ ๋๋ค...'): | |
| text = str(uploaded_file.read(), 'utf-8') | |
| st.text(f"ํ์ผ ๊ธธ์ด: {len(text)} ๋ฌธ์") | |
| preprocessed_text = preprocess_text(text, user_stopwords) | |
| st.text(f"์ฒ๋ฆฌ๋ ํ ์คํธ ๊ธธ์ด: {len(preprocessed_text)} ๋ฌธ์") | |
| st.text(f"๊ณ ์ ๋จ์ด ์: {len(set(preprocessed_text.split()))}") | |
| if not preprocessed_text: | |
| st.warning("์ฒ๋ฆฌ๋ ํ ์คํธ๊ฐ ์์ต๋๋ค. ๋ค๋ฅธ ํ์ผ์ ์ ๋ก๋ํด ์ฃผ์ธ์.") | |
| else: | |
| st.subheader("ํ ํฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ") | |
| n_topics = st.slider("ํ ํฝ ์ ์ ํ", min_value=2, max_value=10, value=5) | |
| topics = topic_modeling([preprocessed_text], n_topics) | |
| if topics: | |
| for topic, words in topics.items(): | |
| st.write(f"{topic}: {', '.join(words)}") | |
| else: | |
| st.warning("ํ ํฝ ๋ชจ๋ธ๋ง์ ์ํํ ์ ์์ต๋๋ค. ๋ ๊ธด ํ ์คํธ๋ฅผ ์ฌ์ฉํด ์ฃผ์ธ์.") | |
| st.subheader("์์ 10๊ฐ Trigram") | |
| top_trigrams = get_top_trigrams(preprocessed_text) | |
| for trigram, count in top_trigrams: | |
| st.write(f"{' '.join(trigram)}: {count}") | |
| st.subheader("๋จ์ด ๋น๋ ์ฐจํธ") | |
| color = st.color_picker("๋ง๋ ์์ ์ ํ", "#1f77b4") | |
| fig = generate_word_frequency_chart(preprocessed_text, color) | |
| st.pyplot(fig) | |
| except Exception as e: | |
| st.error(f"ํ์ผ ์ฒ๋ฆฌ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}") | |
| st.text("์ค๋ฅ ๋ฐ์ ์์น:") | |
| st.text(traceback.format_exc()) | |
# Sidebar usage instructions (user-facing Korean markdown — runtime string,
# intentionally left untranslated).
st.sidebar.markdown("""
## 사용 방법
1. 사이드바에서 불용어를 추가하거나 수정할 수 있습니다.
2. 텍스트 파일(.txt)을 업로드하세요.
3. 토픽 모델링의 토픽 수를 선택하세요.
4. 상위 10개 Trigram을 확인하세요.
5. 단어 빈도 차트의 막대 색상을 선택할 수 있습니다.
""")