soojeongcrystal commited on
Commit
a9bea46
·
verified ·
1 Parent(s): 9aff819

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -20
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import streamlit as st
2
- import pandas as pd
3
  from wordcloud import WordCloud
4
  import matplotlib.pyplot as plt
5
  from sklearn.feature_extraction.text import CountVectorizer
@@ -13,17 +12,22 @@ import warnings
13
  # ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€ ๋ฌด์‹œ
14
  warnings.filterwarnings("ignore")
15
 
16
- # ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
17
- stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๊ฒƒ", "๊ทธ", "์ด", "ํ•˜๋Š”", "ํ•˜๊ธฐ", "ํ• ", "๋˜", "์ˆ˜", "์ด๋‹ค", "์‹œํ‚ค๋‹ค"])
18
 
19
- def preprocess_text(text):
 
 
 
 
 
20
  okt = Okt()
21
  text = re.sub(r'[^\w\s]', '', text) # ํŠน์ˆ˜ ๋ฌธ์ž ์ œ๊ฑฐ
22
  nouns = okt.nouns(text) # ๋ช…์‚ฌ ์ถ”์ถœ
23
- nouns = [noun for noun in nouns if len(noun) > 1 and noun not in stopwords] # ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ ๋ฐ ํ•œ ๊ธ€์ž ์ด์ƒ์˜ ๋ช…์‚ฌ๋งŒ ์„ ํƒ
24
  return ' '.join(nouns)
25
 
26
- def topic_modeling(texts, n_components=5):
27
  vectorizer = CountVectorizer()
28
  data_vectorized = vectorizer.fit_transform(texts)
29
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
@@ -31,29 +35,66 @@ def topic_modeling(texts, n_components=5):
31
  features = vectorizer.get_feature_names()
32
  topics = {}
33
  for topic_idx, topic in enumerate(lda.components_):
34
- topics[f"Topic {topic_idx}"] = [features[i] for i in topic.argsort()[:-21:-1]]
35
  return topics
36
 
37
- def generate_wordcloud(text):
38
- wordcloud = WordCloud(width=800, height=400).generate(text)
39
- plt.figure(figsize=(10, 5))
40
- plt.imshow(wordcloud, interpolation='bilinear')
41
- plt.axis("off")
42
- plt.show()
43
 
44
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
45
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
46
 
 
 
 
 
 
 
47
  uploaded_file = st.file_uploader("ํ…์ŠคํŠธ ํŒŒ์ผ ์—…๋กœ๋“œ", type=['txt'])
48
- if uploaded_file is not None:
49
- with st.spinner('ํŒŒ์ผ์„ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค...'):
50
- text = str(uploaded_file.read(), 'utf-8')
51
- preprocessed_text = preprocess_text(text) # ์ „์ฒ˜๋ฆฌ๋œ ๋ช…์‚ฌ ์ถ”์ถœ
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  st.subheader("ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
54
- topics = topic_modeling([preprocessed_text])
 
55
  for topic, words in topics.items():
56
  st.write(f"{topic}: {', '.join(words)}")
57
-
 
58
  st.subheader("์›Œ๋“œ ํด๋ผ์šฐ๋“œ")
59
- st.pyplot(generate_wordcloud(preprocessed_text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
 
2
  from wordcloud import WordCloud
3
  import matplotlib.pyplot as plt
4
  from sklearn.feature_extraction.text import CountVectorizer
 
12
  # ๊ฒฝ๊ณ  ๋ฉ”์‹œ์ง€ ๋ฌด์‹œ
13
  warnings.filterwarnings("ignore")
14
 
15
+ # ์ดˆ๊ธฐ ๋ถˆ์šฉ์–ด ๋ชฉ๋ก
16
+ default_stopwords = set(["์žˆ๋‹ค", "์—†๋‹ค", "๊ฒƒ", "๊ทธ", "์ด", "ํ•˜๋Š”", "ํ•˜๊ธฐ", "ํ• ", "๋˜", "์ˆ˜", "์ด๋‹ค", "์‹œํ‚ค๋‹ค"])
17
 
18
+ @st.cache_data
19
+ def preprocess_text(text, user_stopwords):
20
+ spacing = Spacing()
21
+ text = spacing(text) # ๋„์–ด์“ฐ๊ธฐ ๊ต์ •
22
+ text = spell_checker.check(text).checked # ๋งž์ถค๋ฒ• ๊ฒ€์‚ฌ
23
+
24
  okt = Okt()
25
  text = re.sub(r'[^\w\s]', '', text) # ํŠน์ˆ˜ ๋ฌธ์ž ์ œ๊ฑฐ
26
  nouns = okt.nouns(text) # ๋ช…์‚ฌ ์ถ”์ถœ
27
+ nouns = [noun for noun in nouns if len(noun) > 1 and noun not in user_stopwords] # ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ ๋ฐ ํ•œ ๊ธ€์ž ์ด์ƒ์˜ ๋ช…์‚ฌ๋งŒ ์„ ํƒ
28
  return ' '.join(nouns)
29
 
30
+ def topic_modeling(texts, n_components):
31
  vectorizer = CountVectorizer()
32
  data_vectorized = vectorizer.fit_transform(texts)
33
  lda = LatentDirichletAllocation(n_components=n_components, random_state=0)
 
35
  features = vectorizer.get_feature_names()
36
  topics = {}
37
  for topic_idx, topic in enumerate(lda.components_):
38
+ topics[f"Topic {topic_idx + 1}"] = [features[i] for i in topic.argsort()[:-21:-1]]
39
  return topics
40
 
41
+ def generate_wordcloud(text, color):
42
+ wordcloud = WordCloud(width=800, height=400, background_color=color).generate(text)
43
+ fig, ax = plt.subplots(figsize=(10, 5))
44
+ ax.imshow(wordcloud, interpolation='bilinear')
45
+ ax.axis("off")
46
+ return fig
47
 
48
  # ์ŠคํŠธ๋ฆผ๋ฆฟ UI ์„ค์ •
49
  st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")
50
 
51
+ # ์‚ฌ์ด๋“œ๋ฐ”์— ๋ถˆ์šฉ์–ด ์ž…๋ ฅ ํ•„๋“œ ์ถ”๊ฐ€
52
+ user_stopwords = st.sidebar.text_area("๋ถˆ์šฉ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š” (์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„)",
53
+ value=", ".join(default_stopwords))
54
+ user_stopwords = set(user_stopwords.split(", ")) | default_stopwords
55
+
56
+ # ํŒŒ์ผ ์—…๋กœ๋”
57
  uploaded_file = st.file_uploader("ํ…์ŠคํŠธ ํŒŒ์ผ ์—…๋กœ๋“œ", type=['txt'])
 
 
 
 
58
 
59
+ if uploaded_file is not None:
60
+ try:
61
+ with st.spinner('ํŒŒ์ผ์„ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค...'):
62
+ text = str(uploaded_file.read(), 'utf-8')
63
+
64
+ # ํ…์ŠคํŠธ ํฌ๊ธฐ์— ๋”ฐ๋ฅธ ํ”„๋กœ๊ทธ๋ ˆ์Šค ๋ฐ” ์ถ”๊ฐ€
65
+ progress_bar = st.progress(0)
66
+ chunk_size = max(1, len(text) // 100) # ํ…์ŠคํŠธ๋ฅผ 100๊ฐœ์˜ ์ฒญํฌ๋กœ ๋‚˜๋ˆ”
67
+ preprocessed_chunks = []
68
+
69
+ for i in range(0, len(text), chunk_size):
70
+ chunk = text[i:i+chunk_size]
71
+ preprocessed_chunk = preprocess_text(chunk, user_stopwords)
72
+ preprocessed_chunks.append(preprocessed_chunk)
73
+ progress_bar.progress((i + chunk_size) / len(text))
74
+
75
+ preprocessed_text = " ".join(preprocessed_chunks)
76
+
77
+ # ํ† ํ”ฝ ๋ชจ๋ธ๋ง
78
  st.subheader("ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
79
+ n_topics = st.slider("ํ† ํ”ฝ ์ˆ˜ ์„ ํƒ", min_value=2, max_value=10, value=5)
80
+ topics = topic_modeling([preprocessed_text], n_topics)
81
  for topic, words in topics.items():
82
  st.write(f"{topic}: {', '.join(words)}")
83
+
84
+ # ์›Œ๋“œ ํด๋ผ์šฐ๋“œ
85
  st.subheader("์›Œ๋“œ ํด๋ผ์šฐ๋“œ")
86
+ color = st.color_picker("๋ฐฐ๊ฒฝ์ƒ‰ ์„ ํƒ", "#ffffff")
87
+ fig = generate_wordcloud(preprocessed_text, color)
88
+ st.pyplot(fig)
89
+
90
+ except Exception as e:
91
+ st.error(f"์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}")
92
+
93
+ # ์‚ฌ์šฉ ์„ค๋ช… ์ถ”๊ฐ€
94
+ st.sidebar.markdown("""
95
+ ## ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
96
+ 1. ์‚ฌ์ด๋“œ๋ฐ”์—์„œ ๋ถˆ์šฉ์–ด๋ฅผ ์ถ”๊ฐ€ํ•˜๊ฑฐ๋‚˜ ์ˆ˜์ •ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
97
+ 2. ํ…์ŠคํŠธ ํŒŒ์ผ(.txt)์„ ์—…๋กœ๋“œํ•˜์„ธ์š”.
98
+ 3. ํ† ํ”ฝ ๋ชจ๋ธ๋ง์˜ ํ† ํ”ฝ ์ˆ˜๋ฅผ ์„ ํƒํ•˜์„ธ์š”.
99
+ 4. ์›Œ๋“œํด๋ผ์šฐ๋“œ์˜ ๋ฐฐ๊ฒฝ์ƒ‰์„ ์„ ํƒํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
100
+ """)