Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,12 +5,16 @@ from sklearn.feature_extraction.text import CountVectorizer
|
|
| 5 |
from sklearn.decomposition import LatentDirichletAllocation
|
| 6 |
import re
|
| 7 |
import warnings
|
|
|
|
| 8 |
|
| 9 |
-
# 경고 메시지 무시
|
| 10 |
warnings.filterwarnings("ignore")
|
| 11 |
|
| 12 |
# 초기 불용어 목록
|
| 13 |
-
default_stopwords = set(["μλ€", "μλ€", "
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
@st.cache_data
|
| 16 |
def preprocess_text(text, user_stopwords):
|
|
@@ -18,8 +22,8 @@ def preprocess_text(text, user_stopwords):
|
|
| 18 |
text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
|
| 19 |
# 단어 분리 (공백 기준)
|
| 20 |
words = text.split()
|
| 21 |
-
# 불용어 제거
|
| 22 |
-
words = [word for word in words if len(word) > 1 and word not in user_stopwords]
|
| 23 |
return ' '.join(words)
|
| 24 |
|
| 25 |
def topic_modeling(texts, n_components):
|
|
@@ -28,11 +32,9 @@ def topic_modeling(texts, n_components):
|
|
| 28 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
|
| 29 |
lda.fit(data_vectorized)
|
| 30 |
|
| 31 |
-
# 여기서 get_feature_names 대신 get_feature_names_out을 사용합니다
|
| 32 |
try:
|
| 33 |
features = vectorizer.get_feature_names_out()
|
| 34 |
except AttributeError:
|
| 35 |
-
# 이전 버전과의 호환성을 위해 예외 처리
|
| 36 |
features = vectorizer.get_feature_names()
|
| 37 |
|
| 38 |
topics = {}
|
|
@@ -48,15 +50,26 @@ def generate_wordcloud(text, color):
|
|
| 48 |
ax.axis("off")
|
| 49 |
return fig
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# 스트림릿 UI 설정
|
| 52 |
st.title("텍스트 분석 도구")
|
| 53 |
|
| 54 |
-
# 사이드바에 불용어 입력 필드 추가
|
| 55 |
user_stopwords = st.sidebar.text_area("불용어를 입력하세요 (쉼표로 구분)",
|
| 56 |
value=", ".join(default_stopwords))
|
| 57 |
user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
|
| 58 |
|
| 59 |
-
# 파일 업로드
|
| 60 |
uploaded_file = st.file_uploader("텍스트 파일 업로드", type=['txt'])
|
| 61 |
|
| 62 |
if uploaded_file is not None:
|
|
@@ -64,7 +77,6 @@ if uploaded_file is not None:
|
|
| 64 |
with st.spinner('파일을 처리 중입니다...'):
|
| 65 |
text = str(uploaded_file.read(), 'utf-8')
|
| 66 |
|
| 67 |
-
# 텍스트 크기에 따른 프로그레스 바 추가
|
| 68 |
progress_bar = st.progress(0)
|
| 69 |
total_chunks = 100
|
| 70 |
chunk_size = max(1, len(text) // total_chunks)
|
|
@@ -80,14 +92,17 @@ if uploaded_file is not None:
|
|
| 80 |
|
| 81 |
preprocessed_text = " ".join(preprocessed_chunks)
|
| 82 |
|
| 83 |
-
# 토픽 모델링
|
| 84 |
st.subheader("토픽 모델링 결과")
|
| 85 |
n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
|
| 86 |
topics = topic_modeling([preprocessed_text], n_topics)
|
| 87 |
for topic, words in topics.items():
|
| 88 |
st.write(f"{topic}: {', '.join(words)}")
|
| 89 |
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
st.subheader("워드 클라우드")
|
| 92 |
color = st.color_picker("배경색 선택", "#ffffff")
|
| 93 |
fig = generate_wordcloud(preprocessed_text, color)
|
|
@@ -96,11 +111,11 @@ if uploaded_file is not None:
|
|
| 96 |
except Exception as e:
|
| 97 |
st.error(f"오류가 발생했습니다: {str(e)}")
|
| 98 |
|
| 99 |
-
# 사용 설명 추가
|
| 100 |
st.sidebar.markdown("""
|
| 101 |
## 사용 방법
|
| 102 |
1. 사이드바에서 불용어를 추가하거나 수정할 수 있습니다.
|
| 103 |
2. 텍스트 파일(.txt)을 업로드하세요.
|
| 104 |
3. 토픽 모델링의 토픽 수를 선택하세요.
|
| 105 |
-
4.
|
|
|
|
| 106 |
""")
|
|
|
|
| 5 |
from sklearn.decomposition import LatentDirichletAllocation
|
| 6 |
import re
|
| 7 |
import warnings
|
| 8 |
+
from collections import Counter
|
| 9 |
|
|
|
|
| 10 |
warnings.filterwarnings("ignore")
|
| 11 |
|
| 12 |
# 초기 불용어 목록
|
| 13 |
+
default_stopwords = set(["μλ€", "μλ€", "λλ€", "μ΄λ€", "νλ€", "κ°λ€", "μνλ€", "μλ€", "λμ΄λ€"])
|
| 14 |
+
|
| 15 |
+
def is_noun(word):
    """Heuristically decide whether a whitespace-delimited token is a noun.

    This is a deliberately simple rule, not morphological analysis: a token
    is treated as a noun when it is at least two characters long and does
    not end with '다' (the ending the original comment uses to rule out
    non-noun words).

    Args:
        word: A single token (str).

    Returns:
        True if the token passes the heuristic, False otherwise.
    """
    return len(word) >= 2 and not word.endswith('다')
|
| 18 |
|
| 19 |
@st.cache_data
|
| 20 |
def preprocess_text(text, user_stopwords):
|
|
|
|
| 22 |
text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
|
| 23 |
# 단어 분리 (공백 기준)
|
| 24 |
words = text.split()
|
| 25 |
+
# 명사 추출, 불용어 제거, 두 글자 이상의 단어만 선택
|
| 26 |
+
words = [word for word in words if is_noun(word) and len(word) > 1 and word not in user_stopwords]
|
| 27 |
return ' '.join(words)
|
| 28 |
|
| 29 |
def topic_modeling(texts, n_components):
|
|
|
|
| 32 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
|
| 33 |
lda.fit(data_vectorized)
|
| 34 |
|
|
|
|
| 35 |
try:
|
| 36 |
features = vectorizer.get_feature_names_out()
|
| 37 |
except AttributeError:
|
|
|
|
| 38 |
features = vectorizer.get_feature_names()
|
| 39 |
|
| 40 |
topics = {}
|
|
|
|
| 50 |
ax.axis("off")
|
| 51 |
return fig
|
| 52 |
|
| 53 |
+
def get_top_trigrams(text, n=10):
    """Return the ``n`` most frequent word trigrams in ``text``.

    The text is treated as a single document and tokenized by
    ``CountVectorizer`` with ``ngram_range=(3, 3)``; at most 10000 distinct
    trigrams are kept by the vectorizer.

    Args:
        text: The (already preprocessed) text to analyze.
        n: Maximum number of trigrams to return. Defaults to 10.

    Returns:
        A list of ``(trigram, count)`` tuples sorted by descending count.
        An empty list is returned when the text yields no trigram at all
        (for example, when it has fewer than three tokens).
    """
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=10000)
    try:
        trigrams = trigram_vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no
        # trigram can be built; report "no trigrams" instead of failing.
        return []

    try:
        trigram_features = trigram_vectorizer.get_feature_names_out()
    except AttributeError:
        # Older scikit-learn releases only provide get_feature_names().
        trigram_features = trigram_vectorizer.get_feature_names()

    # Column sums over the single document, flattened to a 1-D array.
    trigram_counts = trigrams.sum(axis=0).A1
    ranked = sorted(
        zip(trigram_features, trigram_counts),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
|
| 65 |
+
|
| 66 |
# 스트림릿 UI 설정
|
| 67 |
st.title("텍스트 분석 도구")
|
| 68 |
|
|
|
|
| 69 |
user_stopwords = st.sidebar.text_area("불용어를 입력하세요 (쉼표로 구분)",
|
| 70 |
value=", ".join(default_stopwords))
|
| 71 |
user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
|
| 72 |
|
|
|
|
| 73 |
uploaded_file = st.file_uploader("텍스트 파일 업로드", type=['txt'])
|
| 74 |
|
| 75 |
if uploaded_file is not None:
|
|
|
|
| 77 |
with st.spinner('파일을 처리 중입니다...'):
|
| 78 |
text = str(uploaded_file.read(), 'utf-8')
|
| 79 |
|
|
|
|
| 80 |
progress_bar = st.progress(0)
|
| 81 |
total_chunks = 100
|
| 82 |
chunk_size = max(1, len(text) // total_chunks)
|
|
|
|
| 92 |
|
| 93 |
preprocessed_text = " ".join(preprocessed_chunks)
|
| 94 |
|
|
|
|
| 95 |
st.subheader("토픽 모델링 결과")
|
| 96 |
n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
|
| 97 |
topics = topic_modeling([preprocessed_text], n_topics)
|
| 98 |
for topic, words in topics.items():
|
| 99 |
st.write(f"{topic}: {', '.join(words)}")
|
| 100 |
|
| 101 |
+
st.subheader("상위 10개 Trigram")
|
| 102 |
+
top_trigrams = get_top_trigrams(preprocessed_text)
|
| 103 |
+
for trigram, count in top_trigrams:
|
| 104 |
+
st.write(f"{trigram}: {count}")
|
| 105 |
+
|
| 106 |
st.subheader("워드 클라우드")
|
| 107 |
color = st.color_picker("배경색 선택", "#ffffff")
|
| 108 |
fig = generate_wordcloud(preprocessed_text, color)
|
|
|
|
| 111 |
except Exception as e:
|
| 112 |
st.error(f"오류가 발생했습니다: {str(e)}")
|
| 113 |
|
|
|
|
| 114 |
st.sidebar.markdown("""
|
| 115 |
## 사용 방법
|
| 116 |
1. 사이드바에서 불용어를 추가하거나 수정할 수 있습니다.
|
| 117 |
2. 텍스트 파일(.txt)을 업로드하세요.
|
| 118 |
3. 토픽 모델링의 토픽 수를 선택하세요.
|
| 119 |
+
4. 상위 10개 Trigram을 확인하세요.
|
| 120 |
+
5. 워드클라우드의 배경색을 선택할 수 있습니다.
|
| 121 |
""")
|