Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,28 +5,32 @@ from sklearn.decomposition import LatentDirichletAllocation
|
|
| 5 |
import re
|
| 6 |
import warnings
|
| 7 |
from collections import Counter
|
|
|
|
| 8 |
|
| 9 |
warnings.filterwarnings("ignore")
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
# ์ด๊ธฐ ๋ถ์ฉ์ด ๋ชฉ๋ก
|
| 12 |
-
default_stopwords = set(["์๋ค", "์๋ค", "๋๋ค", "์ด๋ค", "ํ๋ค", "๊ฐ๋ค", "์ํ๋ค", "์๋ค", "๋์ด๋ค"])
|
| 13 |
|
| 14 |
-
def
|
| 15 |
-
# ๊ฐ๋จํ
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
@st.cache_data
|
| 19 |
def preprocess_text(text, user_stopwords):
|
| 20 |
# ํ๊ธ๊ณผ ๊ณต๋ฐฑ๋ง ๋จ๊ธฐ๊ณ ๋ชจ๋ ์ ๊ฑฐ
|
| 21 |
text = re.sub(r'[^ใฑ-ใ
ใ
-ใ
ฃ๊ฐ-ํฃ\s]', '', text)
|
| 22 |
-
#
|
| 23 |
-
words =
|
| 24 |
-
|
| 25 |
-
words = [word for word in words if is_noun(word) and len(word) > 1 and word not in user_stopwords]
|
| 26 |
return ' '.join(words)
|
| 27 |
|
| 28 |
def topic_modeling(texts, n_components):
|
| 29 |
-
vectorizer = CountVectorizer()
|
| 30 |
data_vectorized = vectorizer.fit_transform(texts)
|
| 31 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
|
| 32 |
lda.fit(data_vectorized)
|
|
@@ -42,29 +46,22 @@ def topic_modeling(texts, n_components):
|
|
| 42 |
return topics
|
| 43 |
|
| 44 |
def generate_word_frequency_chart(text, color, n=20):
|
| 45 |
-
words =
|
| 46 |
word_freq = Counter(words)
|
| 47 |
top_words = dict(word_freq.most_common(n))
|
| 48 |
|
| 49 |
fig, ax = plt.subplots(figsize=(12, 6))
|
| 50 |
ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
|
| 51 |
ax.invert_yaxis() # ๊ฐ์ฅ ๋น๋๊ฐ ๋์ ๋จ์ด๋ฅผ ์์ชฝ์ ํ์
|
| 52 |
-
ax.set_title("
|
| 53 |
plt.tight_layout()
|
| 54 |
return fig
|
| 55 |
|
| 56 |
def get_top_trigrams(text, n=10):
|
| 57 |
-
|
| 58 |
-
trigrams =
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
trigram_features = trigram_vectorizer.get_feature_names_out()
|
| 62 |
-
except AttributeError:
|
| 63 |
-
trigram_features = trigram_vectorizer.get_feature_names()
|
| 64 |
-
|
| 65 |
-
trigram_counts = trigrams.sum(axis=0).A1
|
| 66 |
-
top_trigrams = sorted(zip(trigram_features, trigram_counts), key=lambda x: x[1], reverse=True)[:n]
|
| 67 |
-
return top_trigrams
|
| 68 |
|
| 69 |
# ์คํธ๋ฆผ๋ฆฟ UI ์ค์
|
| 70 |
st.title("ํ
์คํธ ๋ถ์ ๋๊ตฌ")
|
|
@@ -104,7 +101,7 @@ if uploaded_file is not None:
|
|
| 104 |
st.subheader("์์ 10๊ฐ Trigram")
|
| 105 |
top_trigrams = get_top_trigrams(preprocessed_text)
|
| 106 |
for trigram, count in top_trigrams:
|
| 107 |
-
st.write(f"{trigram}: {count}")
|
| 108 |
|
| 109 |
st.subheader("๋จ์ด ๋น๋ ์ฐจํธ")
|
| 110 |
color = st.color_picker("๋ง๋ ์์ ์ ํ", "#1f77b4")
|
|
|
|
| 5 |
import re
|
| 6 |
import warnings
|
| 7 |
from collections import Counter
|
| 8 |
+
import matplotlib.font_manager as fm
|
| 9 |
|
| 10 |
warnings.filterwarnings("ignore")
|
| 11 |
|
| 12 |
+
# ํ๊ธ ํฐํธ ์ค์
|
| 13 |
+
plt.rcParams['font.family'] = 'NanumGothic'
|
| 14 |
+
|
| 15 |
# ์ด๊ธฐ ๋ถ์ฉ์ด ๋ชฉ๋ก
|
| 16 |
+
default_stopwords = set(["์๋ค", "์๋ค", "๋๋ค", "์ด๋ค", "ํ๋ค", "๊ฐ๋ค", "์ํ๋ค", "์๋ค", "๋์ด๋ค", "ํตํด", "์ํด", "๋ํ", "์๋", "ํ๋"])
|
| 17 |
|
| 18 |
+
def simple_tokenize(text):
    """Very rough tokenizer: extract word-like runs and drop short or suffixed tokens.

    NOTE(review): the `endswith` suffix filter is a crude stand-in for real Korean
    morphological analysis; the suffix characters appear mojibake-garbled in this
    view — confirm against the original file's encoding.
    """
    endings = ('๋ค', '์', '๊น', '๋ค')
    kept = []
    for token in re.findall(r'\w+', text):
        # Skip single-character tokens and tokens carrying a filtered ending.
        if len(token) <= 1 or token.endswith(endings):
            continue
        kept.append(token)
    return kept
|
| 22 |
|
| 23 |
@st.cache_data
def preprocess_text(text, user_stopwords):
    """Clean raw text down to Hangul, tokenize it, and strip stopwords.

    Parameters:
        text: raw input string.
        user_stopwords: collection of tokens to exclude (membership-tested).

    Returns:
        A single space-joined string of the surviving tokens.
    """
    # Keep only Hangul jamo (ㄱ-ㅎ, ㅏ-ㅣ), complete syllables (가-힣) and whitespace.
    # The character class was mojibake-garbled in the scraped source; this is the
    # canonical "Hangul and spaces only" pattern matching the original comment.
    text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
    # Tokenize, then drop any user-supplied stopwords.
    words = simple_tokenize(text)
    words = [word for word in words if word not in user_stopwords]
    return ' '.join(words)
|
| 31 |
|
| 32 |
def topic_modeling(texts, n_components):
|
| 33 |
+
vectorizer = CountVectorizer(tokenizer=simple_tokenize)
|
| 34 |
data_vectorized = vectorizer.fit_transform(texts)
|
| 35 |
lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
|
| 36 |
lda.fit(data_vectorized)
|
|
|
|
| 46 |
return topics
|
| 47 |
|
| 48 |
def generate_word_frequency_chart(text, color, n=20):
    """Build a horizontal bar chart of the n most frequent tokens in text.

    Parameters:
        text: input string to tokenize and count.
        color: bar color, passed straight through to matplotlib.
        n: number of top words to display (default 20).

    Returns:
        The matplotlib Figure containing the chart.
    """
    frequencies = Counter(simple_tokenize(text))
    top_words = dict(frequencies.most_common(n))

    fig, ax = plt.subplots(figsize=(12, 6))
    labels = list(top_words.keys())
    values = list(top_words.values())
    ax.barh(labels, values, color=color)
    # Put the highest-frequency word at the top of the chart.
    ax.invert_yaxis()
    ax.set_title("์์ {} ๋จ์ด".format(n))
    plt.tight_layout()
    return fig
|
| 59 |
|
| 60 |
def get_top_trigrams(text, n=10):
    """Return the n most common consecutive 3-token sequences with their counts.

    Parameters:
        text: input string to tokenize.
        n: number of trigrams to return (default 10).

    Returns:
        List of ((w1, w2, w3), count) pairs, most common first; fewer than n
        entries (possibly empty) when the text has under three tokens.
    """
    tokens = simple_tokenize(text)
    # Sliding window of width 3; empty when len(tokens) < 3, matching zip().
    triples = (tuple(tokens[i:i + 3]) for i in range(len(tokens) - 2))
    return Counter(triples).most_common(n)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# ์คํธ๋ฆผ๋ฆฟ UI ์ค์
|
| 67 |
st.title("ํ
์คํธ ๋ถ์ ๋๊ตฌ")
|
|
|
|
| 101 |
st.subheader("์์ 10๊ฐ Trigram")
|
| 102 |
top_trigrams = get_top_trigrams(preprocessed_text)
|
| 103 |
for trigram, count in top_trigrams:
|
| 104 |
+
st.write(f"{' '.join(trigram)}: {count}")
|
| 105 |
|
| 106 |
st.subheader("๋จ์ด ๋น๋ ์ฐจํธ")
|
| 107 |
color = st.color_picker("๋ง๋ ์์ ์ ํ", "#1f77b4")
|