jonghhhh committed on
Commit
9d8e008
·
verified ·
1 Parent(s): e127123

Delete mywordcloud.py

Browse files
Files changed (1) hide show
  1. mywordcloud.py +0 -68
mywordcloud.py DELETED
@@ -1,68 +0,0 @@
1
- import streamlit as st
2
-
3
- import re
4
- from kiwipiepy import Kiwi
5
- kiwi = Kiwi()
6
-
7
- from collections import Counter
8
- from wordcloud import WordCloud
9
- import matplotlib.pyplot as plt
10
-
11
def kiwi_tokenize(txt, nouns=True, remove1=False, stopwords=None):
    """Extract morpheme tokens from ``txt`` using the module-level Kiwi analyzer.

    Args:
        txt: Text to analyze.
        nouns: If true, keep only noun-like morphemes (tags NNG, NNP, NP, NR).
            Otherwise also keep predicates (V* tags), determiners (MM) and
            general adverbs (MAG); predicate stems get the dictionary-form
            ending appended.
        remove1: If true, drop tokens that are a single character long.
        stopwords: Container of tokens to exclude. ``None`` (the default)
            excludes nothing.

    Returns:
        The selected token strings in order of appearance, or an empty list
        when the text cannot be cleaned or analyzed.
    """
    noun_tags = ('NNG', 'NNP', 'NP', 'NR')
    # Predicates, determiners and general adverbs (see the Kiwi POS tag table).
    extra_tags = ('VV', 'VX', 'VCP', 'VCN', 'VA', 'VA-I', 'MM', 'MAG')
    # A fresh empty container per call; a list default would be shared.
    excluded = () if stopwords is None else stopwords

    try:
        # Cleaning: replace everything except whitespace, Hangul syllables,
        # ASCII letters and digits with a space.
        cleaned = re.sub(r"[^\s가-힣a-zA-Z0-9]", " ", txt)
        # Tokenization: each morph is indexable as (form, tag, ...).
        morphs = kiwi.tokenize(cleaned)
    except Exception:
        # Deliberate best-effort: unusable input (e.g. a non-string value)
        # yields no tokens instead of aborting the caller.
        return []

    if nouns:
        tokens = [m[0] for m in morphs if m[1] in noun_tags]
    else:
        selected_tags = noun_tags + extra_tags
        # Stemming: restore the dictionary form of predicates by appending
        # '다' to the stem. The tag is read from the morph itself, never from
        # an already-extracted string.
        tokens = [
            m[0] + '다' if m[1].startswith('V') else m[0]
            for m in morphs
            if m[1] in selected_tags
        ]

    if remove1:
        tokens = [t for t in tokens if len(t) > 1]

    return [t for t in tokens if t not in excluded]
39
-
40
def generate_wordcloud(text):
    """Render a word cloud of the most frequent nouns in ``text`` in Streamlit.

    Nouns longer than one character are extracted with ``kiwi_tokenize``, the
    100 most common are kept, and the resulting image is shown with
    ``st.pyplot``. When no keyword can be extracted, a warning is shown
    instead of a plot.

    Args:
        text: Raw input text.
    """
    token_list = kiwi_tokenize(text, nouns=True, remove1=True, stopwords=[])
    keywords_all = Counter(token_list).most_common(100)

    # WordCloud raises ValueError when given no words, so stop early.
    if not keywords_all:
        st.warning("No keywords could be extracted from the text.")
        return

    mywordcloud = WordCloud(
        font_path='NanumGothic-Regular.ttf',  # font file path (must contain Hangul glyphs)
        background_color='white',
        colormap="Accent_r",  # see https://matplotlib.org/stable/tutorials/colors/colormaps.html
        width=1500, height=1000,  # image size in pixels
    ).generate_from_frequencies(dict(keywords_all))

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
    # Draw on the axes created above rather than pyplot's implicit current axes.
    ax.imshow(mywordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    # Release the figure so repeated reruns do not accumulate open figures.
    plt.close(fig)
54
-
55
def main():
    """Build the Streamlit page: a text area and a button that renders a word cloud."""
    st.title("워드클라우드(Word Cloud) 만들기")
    st.write("가공할 텍스트를 입력하세요:")
    text_input = st.text_area("텍스트", "")

    if st.button("워드클라우드 시작"):
        # Whitespace-only input contains nothing to tokenize, so treat it as empty.
        if text_input.strip():
            generate_wordcloud(text_input)
        else:
            st.warning("Please enter some text.")
65
-
66
-
67
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()