import importlib.util
import os
import traceback
import warnings
from collections import Counter

import jpype
import matplotlib.pyplot as plt
import streamlit as st
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

warnings.filterwarnings("ignore")

# JPype setup: start the JVM once per process with KoNLPy's bundled jars on the classpath.
# NOTE(review): KoNLPy presumably skips its own JVM initialisation when one is already
# running — confirm this manual start passes every option KoNLPy relies on.
if not jpype.isJVMStarted():
    jvm_path = jpype.getDefaultJVMPath()
    # Resolve the jar directory from the installed konlpy package rather than assuming one
    # interpreter's site-packages; find_spec locates the package without importing it.
    konlpy_spec = importlib.util.find_spec("konlpy")
    if konlpy_spec is not None and konlpy_spec.submodule_search_locations:
        konlpy_java_dir = os.path.join(list(konlpy_spec.submodule_search_locations)[0], "java")
    else:
        # Historical hard-coded location, kept as a fallback when konlpy cannot be located.
        konlpy_java_dir = "/usr/local/lib/python3.10/site-packages/konlpy/java"
    jpype.startJVM(jvm_path, "-Djava.class.path=" + os.path.join(konlpy_java_dir, "*"))

# Korean font setup (may need adjusting for the Hugging Face environment)
plt.rcParams['font.family'] = 'NanumGothic'

# Initial stopword list: common Korean function words, endings and connectives that are
# always excluded from the analysis. Each entry appears once.
default_stopwords = {
    "있다", "없다", "되다", "이다", "하다", "같다", "위하다", "있는", "하는",
    "그리고", "그런", "이런", "저런", "이렇게", "저렇게", "그렇게",
    "우리", "저는", "제가", "내가", "나는", "있습니다", "있었습니다", "되었습니다",
    "하였습니다", "통해", "위해", "대한", "함께", "많은", "어떤", "저희", "이를",
    "또한", "그래서", "하지만", "그러나", "따라서", "때문에",
}

@st.cache_resource
def load_okt():
    """Create the KoNLPy Okt tagger; Streamlit caches the instance as a resource.

    Returns:
        A ``konlpy.tag.Okt`` instance.
    """
    # konlpy is imported inside the function body rather than at module import time.
    import konlpy.tag

    return konlpy.tag.Okt()


# Module-level tagger used by extract_nouns.
okt = load_okt()

def extract_nouns(text):
    """Return the nouns Okt finds in *text*, dropping single-character ones.

    Args:
        text: The text handed to ``okt.nouns``.

    Returns:
        A list of nouns longer than one character. If anything raises, the
        error and the first 100 characters of *text* are reported through
        ``st.error`` and an empty list is returned.
    """
    try:
        return [word for word in okt.nouns(text) if len(word) > 1]
    except Exception as e:
        # Report the failure in the UI instead of propagating it.
        st.error(f"명사 추출 중 오류 발생: {str(e)}")
        st.error(f"문제가 발생한 텍스트: {text[:100]}...")
        return []

@st.cache_data
def preprocess_text(text, user_stopwords):
    """Reduce *text* to a space-separated string of its nouns minus stopwords.

    Args:
        text: Raw input text. Empty or non-string input yields ``""`` with a
            warning shown in the UI.
        user_stopwords: Iterable of words to drop from the extracted nouns.

    Returns:
        The remaining nouns joined by single spaces, or ``""`` when the input
        is invalid or an error occurs (the error is reported via ``st.error``).
    """
    try:
        if not text or not isinstance(text, str):
            st.warning(f"유효하지 않은 입력: {type(text)}")
            return ""
        # Build the lookup once so each membership test is O(1) instead of a
        # linear scan over the stopword list for every noun.
        blocked = frozenset(user_stopwords)
        kept = [noun for noun in extract_nouns(text) if noun and noun not in blocked]
        return ' '.join(kept)
    except Exception as e:
        st.error(f"텍스트 전처리 중 오류 발생: {str(e)}")
        # str() keeps the handler itself from raising when text cannot be sliced.
        st.error(f"문제가 발생한 텍스트: {str(text)[:100]}...")
        return ""

def topic_modeling(texts, n_components, n_top_words=10):
    """Fit an LDA model on *texts* and return the top words of each topic.

    Args:
        texts: Iterable of documents (strings) to vectorize.
        n_components: Requested number of topics; capped at the vocabulary size.
        n_top_words: How many of the highest-weighted words to keep per topic.

    Returns:
        A dict mapping ``"Topic <k>"`` (1-based) to a list of words ordered by
        descending topic weight. An empty dict, with a warning shown in the UI,
        when the vocabulary is empty or the effective topic count is below 2.
    """
    vectorizer = CountVectorizer(stop_words=list(default_stopwords), max_features=1000)
    try:
        data_vectorized = vectorizer.fit_transform(texts)
    except ValueError:
        # CountVectorizer raises ValueError when no token survives tokenization
        # and stopword removal; treat that like the "too few words" case below
        # instead of letting it surface as a traceback.
        st.warning("추출된 고유 단어가 너무 적습니다. 더 긴 텍스트를 사용해 주세요.")
        return {}

    # There cannot be more topics than distinct tokens.
    n_tokens = data_vectorized.shape[1]
    n_components = min(n_components, n_tokens)

    if n_components < 2:
        st.warning("추출된 고유 단어가 너무 적습니다. 더 긴 텍스트를 사용해 주세요.")
        return {}

    lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=50,
                                    learning_method='online', learning_offset=50., doc_topic_prior=0.1, topic_word_prior=0.01)
    lda.fit(data_vectorized)

    features = vectorizer.get_feature_names_out()

    topics = {}
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the leading n_top_words indices.
        top_indices = topic.argsort()[::-1][:n_top_words]
        topics[f"Topic {topic_idx + 1}"] = [features[i] for i in top_indices]
    return topics

def generate_word_frequency_chart(text, color, n=20):
    """Draw a horizontal bar chart of the *n* most frequent nouns in *text*.

    Args:
        text: Text whose nouns are counted (extracted via ``extract_nouns``).
        color: Bar colour passed straight to ``Axes.barh``.
        n: Maximum number of words to plot.

    Returns:
        The matplotlib ``Figure`` holding the chart.
    """
    ranked = Counter(extract_nouns(text)).most_common(n)
    labels = [word for word, _ in ranked]
    counts = [count for _, count in ranked]

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(labels, counts, color=color)
    # Flip the y axis so the first entry from most_common ends up at the top.
    ax.invert_yaxis()
    ax.set_title("상위 {} 단어".format(n))
    plt.tight_layout()
    return fig

def get_top_trigrams(text, n=10):
    """Return the *n* most frequent runs of three consecutive nouns in *text*.

    Args:
        text: Text whose nouns are extracted via ``extract_nouns``.
        n: Maximum number of trigrams to return.

    Returns:
        A list of ``((noun1, noun2, noun3), count)`` pairs as produced by
        ``Counter.most_common``.
    """
    words = extract_nouns(text)
    # Slide a width-3 window over the noun sequence.
    windows = (tuple(words[i:i + 3]) for i in range(len(words) - 2))
    return Counter(windows).most_common(n)

# Streamlit UI setup
st.title("텍스트 분석 도구")

# Pre-fill the box with the defaults in sorted order so the text is stable across restarts.
stopword_input = st.sidebar.text_area(
    "불용어를 입력하세요 (쉼표로 구분)",
    value=", ".join(sorted(default_stopwords)),
)
# Split on bare commas and strip each entry so "a,b", "a, b" and entries followed by a
# newline are all recognised; blank entries are dropped. The defaults are always included.
# Sorting gives a deterministic list to pass on to preprocess_text.
user_stopwords = sorted(
    {word.strip() for word in stopword_input.split(",") if word.strip()} | default_stopwords
)

uploaded_file = st.file_uploader("텍스트 파일 업로드", type=['txt'])

# Main flow: decode the upload, preprocess it, then render topics, trigrams and the chart.
if uploaded_file is not None:
    try:
        with st.spinner('파일을 처리 중입니다...'):
            raw_bytes = uploaded_file.read()
            try:
                # utf-8-sig reads plain UTF-8 as well and drops a leading BOM if present.
                text = raw_bytes.decode('utf-8-sig')
            except UnicodeDecodeError:
                # Fall back to CP949 for Korean text files that are not UTF-8; if this
                # also fails, the outer handler reports the error.
                text = raw_bytes.decode('cp949')
            st.text(f"파일 길이: {len(text)} 문자")

            preprocessed_text = preprocess_text(text, user_stopwords)

            st.text(f"처리된 텍스트 길이: {len(preprocessed_text)} 문자")
            st.text(f"고유 단어 수: {len(set(preprocessed_text.split()))}")

        if not preprocessed_text:
            st.warning("처리된 텍스트가 없습니다. 다른 파일을 업로드해 주세요.")
        else:
            st.subheader("토픽 모델링 결과")
            n_topics = st.slider("토픽 수 선택", min_value=2, max_value=10, value=5)
            # NOTE(review): the whole file is passed as a single document, so the model is
            # fitted on one row — consider splitting into several documents; confirm intent.
            topics = topic_modeling([preprocessed_text], n_topics)
            if topics:
                for topic, words in topics.items():
                    st.write(f"{topic}: {', '.join(words)}")
            else:
                st.warning("토픽 모델링을 수행할 수 없습니다. 더 긴 텍스트를 사용해 주세요.")

            st.subheader("상위 10개 Trigram")
            top_trigrams = get_top_trigrams(preprocessed_text)
            for trigram, count in top_trigrams:
                st.write(f"{' '.join(trigram)}: {count}")

            st.subheader("단어 빈도 차트")
            color = st.color_picker("막대 색상 선택", "#1f77b4")
            fig = generate_word_frequency_chart(preprocessed_text, color)
            try:
                st.pyplot(fig)
            finally:
                # Release the figure; otherwise pyplot keeps every figure created on
                # each rerun of the script.
                plt.close(fig)

    except Exception as e:
        # Top-level boundary: show the error and its traceback in the UI.
        st.error(f"파일 처리 중 오류 발생: {str(e)}")
        st.text("오류 발생 위치:")
        st.text(traceback.format_exc())

# Usage guide shown in the sidebar, whether or not a file has been uploaded.
usage_guide = """
## 사용 방법
1. 사이드바에서 불용어를 추가하거나 수정할 수 있습니다.
2. 텍스트 파일(.txt)을 업로드하세요.
3. 토픽 모델링의 토픽 수를 선택하세요.
4. 상위 10개 Trigram을 확인하세요.
5. 단어 빈도 차트의 막대 색상을 선택할 수 있습니다.
"""
st.sidebar.markdown(usage_guide)