Spaces:
Build error
Build error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
|
| 3 |
+
import re
|
| 4 |
+
from kiwipiepy import Kiwi
|
| 5 |
+
kiwi = Kiwi()
|
| 6 |
+
|
| 7 |
+
from collections import Counter
|
| 8 |
+
from wordcloud import WordCloud
|
| 9 |
+
import matplotlib.pyplot as plt
|
| 10 |
+
|
| 11 |
+
def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=None):
    """Extract morpheme tokens from ``txt`` using the module-level ``kiwi`` analyzer.

    Args:
        txt: Input string to analyze.
        nouns: If true, keep only morphemes whose tag is one of the first
            four selected tags (NNG, NNP, NP, NR). Otherwise keep every
            selected tag and append '다' to forms whose tag starts with 'V'.
        remove1: If true, drop tokens that are one character long.
        stopwords: Optional collection of tokens to exclude from the result.
            ``None`` (the default) excludes nothing.

    Returns:
        A list of token strings. If any step raises, an empty list is
        returned instead (best-effort behavior).
    """
    # Part-of-speech tags to keep: general nouns, proper nouns, predicates
    # (verbs, adjectives, etc.), determiners, general adverbs. The first four
    # entries form the noun-only selection.
    selected_tags = ['NNG', 'NNP', 'NP', 'NR', 'VV', 'VX', 'VCP', 'VCN',
                     'VA', 'VA-I', 'MM', 'MAG']
    excluded = () if stopwords is None else stopwords
    try:
        # Cleaning: replace every character that is not whitespace, a Hangul
        # syllable, an ASCII letter or a digit with a space. (To delete only
        # specific words, a plain str.replace call is enough.)
        cleaned = re.sub(r"[^\s가-힣a-zA-Z0-9]", " ", txt)

        # Tokenization: each morph is indexed as (form, tag, ...).
        morphs = kiwi.tokenize(cleaned)

        if nouns:
            tokens = [m[0] for m in morphs if m[1] in selected_tags[:4]]
        else:
            # Stemming: restore the dictionary form of predicates by
            # appending '다' to the stem.
            # NOTE(review): applied only in this branch, where items are
            # still (form, tag) morphs rather than plain strings; confirm
            # against the original file's indentation.
            tokens = [
                m[0] + '다' if m[1].startswith('V') else m[0]
                for m in morphs
                if m[1] in selected_tags
            ]

        if remove1:
            tokens = [t for t in tokens if len(t) > 1]

        # Stopwords: tokens that must be excluded from the result.
        tokens = [t for t in tokens if t not in excluded]
    except Exception:
        # Deliberate best-effort: any analysis failure yields no tokens, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        tokens = []
    return tokens
|
| 39 |
+
|
| 40 |
+
def generate_wordcloud(text):
    """Draw a word cloud of the most frequent tokens in ``text`` on the Streamlit page.

    Tokens come from ``kiwi_tokenize`` (nouns only, one-character tokens
    removed); the 100 most common are passed to ``WordCloud`` as
    frequencies and the resulting image is shown with ``st.pyplot``.

    Args:
        text: Raw input text.

    Returns:
        None. When no tokens are extracted, a Streamlit warning is shown
        instead of a figure.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    keywords_all = Counter(token_list).most_common(100)

    if not keywords_all:
        # WordCloud cannot build an image from an empty frequency table, so
        # report the situation instead of letting the app crash.
        st.warning("No keywords could be extracted from the text.")
        return

    mywordcloud = WordCloud(
        font_path='NanumGothic-Regular.ttf',  # path to the font file
        background_color='white',
        colormap="Accent_r",  # https://matplotlib.org/stable/tutorials/colors/colormaps.html
        width=1500, height=1000,  # image size in pixels
    ).generate_from_frequencies(dict(keywords_all))

    # Draw on the explicit Axes rather than pyplot's implicit current figure.
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    ax.imshow(mywordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)
|
| 54 |
+
|
| 55 |
+
def main():
    """Build the Streamlit page: a title, a text area and a button that renders the word cloud.

    When the button is pressed, non-blank input is passed to
    ``generate_wordcloud``; blank or whitespace-only input shows a warning.
    """
    # NOTE(review): the Korean UI strings were restored from mis-encoded
    # text; the button label in particular should be confirmed against the
    # original file.
    st.title("워드클라우드(Word Cloud) 만들기")
    st.write("가공할 텍스트를 입력하세요:")
    text_input = st.text_area("텍스트", "")

    if st.button("워드클라우드 시작"):
        # Whitespace-only input yields no tokens, so treat it as empty.
        if text_input.strip():
            generate_wordcloud(text_input)
        else:
            st.warning("Please enter some text.")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# Build the page only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|