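"""Streamlit app for Korean text analysis.

Reads an uploaded .txt file, extracts nouns with KoNLPy's Okt tagger, and
displays LDA topic-modeling results, the top trigrams, and a word-frequency
chart.
"""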
import streamlit as st
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import warnings
from collections import Counter
import traceback
import jpype
warnings.filterwarnings("ignore")
# JPype setup: start the JVM manually so KoNLPy can use it
if not jpype.isJVMStarted():
    jvm_path = jpype.getDefaultJVMPath()
    jpype.startJVM(jvm_path, "-Djava.class.path=/usr/local/lib/python3.10/site-packages/konlpy/java/*")
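# Note: this assumes konlpy's bundled jars live at the path above; konlpy can
# also start the JVM itself on first use if none is running.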
# ํ•œ๊ธ€ ํฐํŠธ ์„ค์ • (ํ—ˆ๊น…ํŽ˜์ด์Šค ํ™˜๊ฒฝ์— ๋งž๊ฒŒ ์ˆ˜์ • ํ•„์š”ํ•  ์ˆ˜ ์žˆ์Œ)
plt.rcParams['font.family'] = 'NanumGothic'
# Default stopword list (common Korean function words and fillers)
default_stopwords = set([
    "์žˆ๋‹ค", "์—†๋‹ค", "๋˜๋‹ค", "์ด๋‹ค", "ํ•˜๋‹ค", "๊ฐ™๋‹ค", "์œ„ํ•˜๋‹ค", "์žˆ๋Š”", "ํ•˜๋Š”",
    "๊ทธ๋ฆฌ๊ณ ", "๊ทธ๋Ÿฐ", "์ด๋Ÿฐ", "์ €๋Ÿฐ", "์ด๋ ‡๊ฒŒ", "์ €๋ ‡๊ฒŒ", "๊ทธ๋ ‡๊ฒŒ",
    "์šฐ๋ฆฌ", "์ €๋Š”", "์ œ๊ฐ€", "๋‚ด๊ฐ€", "๋‚˜๋Š”", "์žˆ์Šต๋‹ˆ๋‹ค", "์žˆ์—ˆ์Šต๋‹ˆ๋‹ค", "๋˜์—ˆ์Šต๋‹ˆ๋‹ค",
    "ํ•˜์˜€์Šต๋‹ˆ๋‹ค", "ํ†ตํ•ด", "์œ„ํ•ด", "๋Œ€ํ•œ", "ํ•จ๊ป˜", "๋งŽ์€", "์–ด๋–ค", "์ €ํฌ", "์ด๋ฅผ",
    "๋˜ํ•œ", "๊ทธ๋ž˜์„œ", "ํ•˜์ง€๋งŒ", "๊ทธ๋Ÿฌ๋‚˜", "๋”ฐ๋ผ์„œ", "๋•Œ๋ฌธ์—"
])
@st.cache_resource
def load_okt():
    # Lazy-load the Okt morphological analyzer once per session
    from konlpy.tag import Okt
    return Okt()

okt = load_okt()
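# Illustrative example (not from the original source):
# okt.nouns("ํ•œ๊ตญ์–ด ํ˜•ํƒœ์†Œ ๋ถ„์„") would return something like
# ['ํ•œ๊ตญ์–ด', 'ํ˜•ํƒœ์†Œ', '๋ถ„์„'].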
def extract_nouns(text):
    try:
        nouns = okt.nouns(text)
        # Keep only nouns longer than one character
        return [noun for noun in nouns if len(noun) > 1]
    except Exception as e:
        st.error(f"๋ช…์‚ฌ ์ถ”์ถœ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        st.error(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ํ…์ŠคํŠธ: {text[:100]}...")
        return []
@st.cache_data
def preprocess_text(text, user_stopwords):
    try:
        if not text or not isinstance(text, str):
            st.warning(f"์œ ํšจํ•˜์ง€ ์•Š์€ ์ž…๋ ฅ: {type(text)}")
            return ""
        nouns = extract_nouns(text)
        # Drop user-supplied and default stopwords
        nouns = [noun for noun in nouns if noun and noun not in user_stopwords]
        return ' '.join(nouns)
    except Exception as e:
        st.error(f"ํ…์ŠคํŠธ ์ „์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        st.error(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ํ…์ŠคํŠธ: {text[:100]}...")
        return ""
def topic_modeling(texts, n_components):
    stop_words_list = list(default_stopwords)
    vectorizer = CountVectorizer(stop_words=stop_words_list, max_features=1000)
    data_vectorized = vectorizer.fit_transform(texts)
    # Cap the topic count at the vocabulary size
    n_tokens = data_vectorized.shape[1]
    n_components = min(n_components, n_tokens)
    if n_components < 2:
        st.warning("์ถ”์ถœ๋œ ๊ณ ์œ  ๋‹จ์–ด๊ฐ€ ๋„ˆ๋ฌด ์ ์Šต๋‹ˆ๋‹ค. ๋” ๊ธด ํ…์ŠคํŠธ๋ฅผ ์‚ฌ์šฉํ•ด ์ฃผ์„ธ์š”.")
        return {}
    lda = LatentDirichletAllocation(
        n_components=n_components, random_state=42, max_iter=50,
        learning_method='online', learning_offset=50.0,
        doc_topic_prior=0.1, topic_word_prior=0.01)
    lda.fit(data_vectorized)
    features = vectorizer.get_feature_names_out()
    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # Top 10 words per topic, highest weight first
        top_words = [features[i] for i in topic.argsort()[:-11:-1]]
        topics[f"Topic {topic_idx + 1}"] = top_words
    return topics
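# Hypothetical usage: topic_modeling([doc1, doc2], 2) returns a dict such as
# {"Topic 1": [up to 10 words], "Topic 2": [...]}, or {} if the extracted
# vocabulary is too small.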
def generate_word_frequency_chart(text, color, n=20):
    nouns = extract_nouns(text)
    word_freq = Counter(nouns)
    top_words = dict(word_freq.most_common(n))
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(list(top_words.keys()), list(top_words.values()), color=color)
    ax.invert_yaxis()  # most frequent word at the top
    ax.set_title("์ƒ์œ„ {} ๋‹จ์–ด".format(n))
    plt.tight_layout()
    return fig
def get_top_trigrams(text, n=10):
    nouns = extract_nouns(text)
    # Consecutive noun triples via three staggered views of the list
    trigrams = zip(nouns, nouns[1:], nouns[2:])
    trigram_freq = Counter(trigrams)
    return trigram_freq.most_common(n)
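# Returns pairs of (trigram tuple, count), e.g.
# [(("๋‹จ์–ด", "๋นˆ๋„", "๋ถ„์„"), 2), ...] (counts illustrative).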
# Streamlit UI
st.title("ํ…์ŠคํŠธ ๋ถ„์„ ๋„๊ตฌ")

user_stopwords = st.sidebar.text_area("๋ถˆ์šฉ์–ด๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š” (์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„)",
                                      value=", ".join(default_stopwords))
# Split on commas (tolerating missing spaces) and merge with the defaults;
# sorted so st.cache_data sees a stable argument
user_stopwords = sorted(set(w.strip() for w in user_stopwords.split(",") if w.strip()) | default_stopwords)
uploaded_file = st.file_uploader("ํ…์ŠคํŠธ ํŒŒ์ผ ์—…๋กœ๋“œ", type=['txt'])

if uploaded_file is not None:
    try:
        with st.spinner('ํŒŒ์ผ์„ ์ฒ˜๋ฆฌ ์ค‘์ž…๋‹ˆ๋‹ค...'):
            text = str(uploaded_file.read(), 'utf-8')
            st.text(f"ํŒŒ์ผ ๊ธธ์ด: {len(text)} ๋ฌธ์ž")
            preprocessed_text = preprocess_text(text, user_stopwords)
            st.text(f"์ฒ˜๋ฆฌ๋œ ํ…์ŠคํŠธ ๊ธธ์ด: {len(preprocessed_text)} ๋ฌธ์ž")
            st.text(f"๊ณ ์œ  ๋‹จ์–ด ์ˆ˜: {len(set(preprocessed_text.split()))}")

        if not preprocessed_text:
            st.warning("์ฒ˜๋ฆฌ๋œ ํ…์ŠคํŠธ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค. ๋‹ค๋ฅธ ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด ์ฃผ์„ธ์š”.")
        else:
            st.subheader("ํ† ํ”ฝ ๋ชจ๋ธ๋ง ๊ฒฐ๊ณผ")
            n_topics = st.slider("ํ† ํ”ฝ ์ˆ˜ ์„ ํƒ", min_value=2, max_value=10, value=5)
            topics = topic_modeling([preprocessed_text], n_topics)
            if topics:
                for topic, words in topics.items():
                    st.write(f"{topic}: {', '.join(words)}")
            else:
                st.warning("ํ† ํ”ฝ ๋ชจ๋ธ๋ง์„ ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๋” ๊ธด ํ…์ŠคํŠธ๋ฅผ ์‚ฌ์šฉํ•ด ์ฃผ์„ธ์š”.")

            st.subheader("์ƒ์œ„ 10๊ฐœ Trigram")
            top_trigrams = get_top_trigrams(preprocessed_text)
            for trigram, count in top_trigrams:
                st.write(f"{' '.join(trigram)}: {count}")

            st.subheader("๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ")
            color = st.color_picker("๋ง‰๋Œ€ ์ƒ‰์ƒ ์„ ํƒ", "#1f77b4")
            fig = generate_word_frequency_chart(preprocessed_text, color)
            st.pyplot(fig)
    except Exception as e:
        st.error(f"ํŒŒ์ผ ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        st.text("์˜ค๋ฅ˜ ๋ฐœ์ƒ ์œ„์น˜:")
        st.text(traceback.format_exc())
st.sidebar.markdown("""
## ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
1. ์‚ฌ์ด๋“œ๋ฐ”์—์„œ ๋ถˆ์šฉ์–ด๋ฅผ ์ถ”๊ฐ€ํ•˜๊ฑฐ๋‚˜ ์ˆ˜์ •ํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
2. ํ…์ŠคํŠธ ํŒŒ์ผ(.txt)์„ ์—…๋กœ๋“œํ•˜์„ธ์š”.
3. ํ† ํ”ฝ ๋ชจ๋ธ๋ง์˜ ํ† ํ”ฝ ์ˆ˜๋ฅผ ์„ ํƒํ•˜์„ธ์š”.
4. ์ƒ์œ„ 10๊ฐœ Trigram์„ ํ™•์ธํ•˜์„ธ์š”.
5. ๋‹จ์–ด ๋นˆ๋„ ์ฐจํŠธ์˜ ๋ง‰๋Œ€ ์ƒ‰์ƒ์„ ์„ ํƒํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
""")