import streamlit as st
import pandas as pd
import numpy as np
import os
import re
import MeCab
import ipadic
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline
from bertopic import BERTopic
import plotly.express as px
import plotly.graph_objects as go
from umap import UMAP

# --- 1. Font Configuration ---
# The Noto Sans JP font files are looked up in the same directory as this script.
BASE_DIR = os.path.dirname(__file__)
FONT_PATH_REG, FONT_PATH_BOLD = (
    os.path.join(BASE_DIR, font_file)
    for font_file in ("NotoSansJP-Regular.ttf", "NotoSansJP-Bold.ttf")
)

# Build FontProperties directly from the font files
# (the original note says this avoids findfont warnings).
jp_font, jp_font_bold = (
    FontProperties(fname=FONT_PATH_REG),
    FontProperties(fname=FONT_PATH_BOLD),
)

# Global Matplotlib settings.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'axes.unicode_minus': False,
})

# --- 2. Text Processing Logic ---
class JapaneseTextProcessor:
    """Normalise Japanese text and tokenize it with MeCab.

    Args:
        stopwords: Optional collection of words to drop from the token
            stream. Matching ignores case, because ``clean_text`` lower-cases
            the text before it is tokenized.
        cleaning_settings: Optional mapping of cleaning flags. The keys read
            are ``"remove_digits"`` and ``"remove_symbols"``; a missing key
            counts as disabled.
    """

    # Values of the first MeCab feature field (part of speech) whose tokens
    # are kept: noun, verb, adjective.
    _KEPT_POS = frozenset({'名詞', '動詞', '形容詞'})

    # Compiled once at class creation instead of being looked up per call.
    _DIGITS_RE = re.compile(r'\d+')
    _ASCII_SYMBOLS_RE = re.compile(r'[!-/:-@[-`{-~]')
    _FULLWIDTH_SYMBOLS_RE = re.compile(r'[！-／：-＠［-｀｛-～]')

    def __init__(self, stopwords=None, cleaning_settings=None):
        self.tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
        self.stopwords = stopwords if stopwords else []
        self.cleaning_settings = cleaning_settings if cleaning_settings else {}

    @property
    def stopwords(self):
        """Stopwords exactly as supplied by the caller."""
        return self._stopwords

    @stopwords.setter
    def stopwords(self, value):
        # Keep the caller's collection for inspection, plus a lower-cased
        # frozenset used for matching: tokens are always lower-case (see
        # clean_text), so a stopword such as "AI" must be folded to match,
        # and a set lookup is O(1) per token.
        # NOTE: mutating the collection in place afterwards does not refresh
        # the lookup; assign to ``stopwords`` again instead.
        self._stopwords = value if value else []
        self._stopword_lookup = frozenset(word.lower() for word in self._stopwords)

    def clean_text(self, text):
        """Return *text* stripped, lower-cased and optionally cleaned.

        Non-string input yields ``""``. Digits are removed when the
        ``"remove_digits"`` setting is truthy; ASCII and full-width
        punctuation are removed when ``"remove_symbols"`` is truthy.
        Whitespace is stripped before removal, so the result may still end
        in a space left behind by removed characters.
        """
        if not isinstance(text, str):
            return ""
        text = text.strip().lower()
        if self.cleaning_settings.get("remove_digits"):
            text = self._DIGITS_RE.sub('', text)
        if self.cleaning_settings.get("remove_symbols"):
            text = self._ASCII_SYMBOLS_RE.sub('', text)
            text = self._FULLWIDTH_SYMBOLS_RE.sub('', text)
        return text

    def tokenize(self, text):
        """Return the kept tokens of *text* joined by single spaces.

        The text is cleaned with ``clean_text`` and parsed with the MeCab
        tagger. A token is kept when its part of speech is a noun, verb or
        adjective, its surface form is longer than one character, and it is
        not a stopword (compared case-insensitively).
        """
        processed = self.clean_text(text)
        tokens = []
        node = self.tagger.parseToNode(processed)
        while node:
            # The part of speech is the first comma-separated feature field.
            pos = node.feature.split(',', 1)[0]
            if pos in self._KEPT_POS:
                surface = node.surface
                if len(surface) > 1 and surface not in self._stopword_lookup:
                    tokens.append(surface)
            node = node.next
        return " ".join(tokens)

# --- 3. Streamlit UI Config ---
st.set_page_config(page_title="Text Analytics Dashboard", layout="wide")
st.title("🇯🇵 日本語テキストアナリティクス ダッシュボード")

# Sidebar, section 1: data upload.
st.sidebar.header("1. データアップロード")
uploaded_file = st.sidebar.file_uploader(
    "CSVまたはExcelファイルを選択",
    type=['csv', 'xlsx'],
)

# Sidebar, section 2: preprocessing / vectorization options.
st.sidebar.header("2. 前処理・ベクトル化設定")
analyzer = st.sidebar.selectbox("Analyzer", ["word", "char"], index=0)
ngram_range = st.sidebar.slider("n-gram 範囲", min_value=1, max_value=6, value=(1, 3))
use_stopword = st.sidebar.checkbox("ストップワードを使用", value=True)
add_stopwords = st.sidebar.text_area("追加ストップワード（改行区切り）", "")

# Cleaning flags handed to the text processor; the two checkboxes are
# created in this order so the sidebar layout stays the same.
clean_options = dict(
    remove_digits=st.sidebar.checkbox("数字の除去", value=True),
    remove_symbols=st.sidebar.checkbox("記号の除去", value=True),
)

# --- State Management (Tab Persistence) ---
# Seed each session-state entry only when it does not exist yet, so values
# from earlier reruns are kept.
for _state_key, _state_default in (
    ("active_tab", "感情分析"),
    ("current_file", None),
    ("current_column", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

@st.cache_data
def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: Uploaded file object; must expose ``name`` and be readable by
            pandas (and seekable for the CSV fallback).

    Returns:
        pandas.DataFrame with the file contents. Files whose name ends in
        ``.csv`` (any letter case) are parsed as CSV; everything else is
        handed to ``pd.read_excel``.
    """
    # Compare case-insensitively so "DATA.CSV" is not sent to read_excel.
    if file.name.lower().endswith('.csv'):
        try:
            return pd.read_csv(file)
        except UnicodeDecodeError:
            # Not valid UTF-8: retry as cp932 (Shift_JIS), the usual encoding
            # of CSVs exported by Japanese Excel. Rewind first because the
            # failed attempt has already consumed part of the buffer.
            file.seek(0)
            return pd.read_csv(file, encoding="cp932")
    return pd.read_excel(file)

# Main flow: runs on every Streamlit rerun once a file has been uploaded.
if uploaded_file:
    df_raw = load_data(uploaded_file)
    str_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
    text_col = st.sidebar.selectbox("分析対象列", str_cols if str_cols else df_raw.columns)

    # Drop cached sentiment results when the file name or target column changes.
    if st.session_state.current_file != uploaded_file.name or st.session_state.current_column != text_col:
        st.session_state.current_file = uploaded_file.name
        st.session_state.current_column = text_col
        if "sentiment_df" in st.session_state:
            del st.session_state.sentiment_df

    # Stopwords: optional bundled list plus any user-supplied extras.
    stopword_list = []
    if use_stopword:
        sw_path = os.path.join(BASE_DIR, "Japanese_stopword_list.txt")
        if os.path.exists(sw_path):
            with open(sw_path, "r", encoding="utf-8") as f:
                stopword_list = [line.strip() for line in f if line.strip()]
        if add_stopwords:
            stopword_list.extend([w.strip() for w in add_stopwords.split('\n') if w.strip()])

    processor = JapaneseTextProcessor(stopwords=stopword_list, cleaning_settings=clean_options)

    with st.spinner("前処理中..."):
        df = df_raw.copy()
        # Normalise the target column to plain strings once, with missing
        # values as "". Otherwise NaN would be tokenized as the literal
        # "nan" while the raw NaN is still sliced ([:40], .str[:50]) and
        # passed to the models in the tabs below.
        raw_text = df[text_col]
        df[text_col] = raw_text.astype(str).where(raw_text.notna(), "")
        df['processed_text'] = df[text_col].apply(processor.tokenize)
        # Keep only rows that still have at least one token.
        df = df[df['processed_text'].str.strip() != ""].reset_index(drop=True)

    if df.empty:
        st.error("前処理の結果、有効なテキストがなくなりました。")
    else:
        # Vectorization
        vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, min_df=1, max_df=0.95)
        try:
            tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
            vocab = vectorizer.get_feature_names_out()
        except Exception as e:
            st.error(f"ベクトル化エラー: {e}")
            st.stop()

        # --- Horizontal Radio Navigation ---
        tabs = ["感情分析", "キーワード", "類似検索", "LDAトピック", "BERTopic"]
        st.session_state.active_tab = st.radio(
            "分析メニューを選択（他タブの操作中も状態を維持します）",
            tabs,
            index=tabs.index(st.session_state.active_tab),
            horizontal=True
        )
        st.markdown("---")

        # --- Content Routing ---
        if st.session_state.active_tab == "感情分析":
            st.header("1) 感情分析・分類")
            texts = df[text_col].tolist()

            # The cached predictions are positional, so they are only valid
            # for exactly this list of texts. Changing stopwords or cleaning
            # options changes which rows survive filtering; without this
            # check the cached labels would be mis-aligned or the wrong length.
            sentiment_key = (uploaded_file.name, text_col, hash(tuple(texts)))
            cache_is_stale = (
                "sentiment_key" not in st.session_state
                or st.session_state.sentiment_key != sentiment_key
            )
            if cache_is_stale and "sentiment_df" in st.session_state:
                del st.session_state.sentiment_df

            if "sentiment_df" not in st.session_state:
                @st.cache_resource
                def load_sentiment_model():
                    return pipeline("sentiment-analysis", model="koheiduck/bert-japanese-finetuned-sentiment")

                try:
                    classifier = load_sentiment_model()
                    batch_size = 16
                    all_results = []
                    status_text = st.empty()
                    progress_bar = st.progress(0)
                    for i in range(0, len(texts), batch_size):
                        batch = texts[i : i + batch_size]
                        batch_res = classifier(batch, truncation=True)
                        all_results.extend(batch_res)
                        percent = min(100, int((i + len(batch)) / len(texts) * 100))
                        progress_bar.progress(percent)
                        status_text.text(f"推論中: {i + len(batch)} / {len(texts)} 件")
                    status_text.success("推論完了")
                    st.session_state.sentiment_df = pd.DataFrame(all_results)
                    # Record which texts these predictions belong to.
                    st.session_state.sentiment_key = sentiment_key
                except Exception as e:
                    st.error(f"モデルエラー: {e}")

            if "sentiment_df" in st.session_state:
                df['sentiment'] = st.session_state.sentiment_df['label'].values
                df['score'] = st.session_state.sentiment_df['score'].values
                c1, c2 = st.columns(2)
                with c1:
                    fig, ax = plt.subplots()
                    df['sentiment'].value_counts().plot(kind='bar', ax=ax, color='skyblue')
                    ax.set_title("感情分布", fontproperties=jp_font_bold)
                    ax.set_xticklabels(ax.get_xticklabels(), fontproperties=jp_font, rotation=45)
                    st.pyplot(fig)
                    # pyplot keeps every figure alive until it is closed;
                    # close it so figures do not pile up across reruns.
                    plt.close(fig)
                with c2:
                    st.dataframe(df[[text_col, 'sentiment', 'score']].head(20))

        elif st.session_state.active_tab == "キーワード":
            st.header("2) TF-IDF 重要語")
            # Sum TF-IDF weights over all documents and show the 20 largest.
            tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
            top_idx = tfidf_sum.argsort()[-20:][::-1]
            top_words = [vocab[i] for i in top_idx]
            top_scores = tfidf_sum[top_idx]
            fig = px.bar(x=top_scores, y=top_words, orientation='h', title="TF-IDF上位語")
            fig.update_layout(yaxis={'categoryorder':'total ascending'})
            st.plotly_chart(fig, use_container_width=True)

        elif st.session_state.active_tab == "類似検索":
            st.header("3) 類似文検索")
            query = st.text_input("検索クエリを入力（Enterで実行）", key="search_box")
            if query:
                q_proc = processor.tokenize(query)
                if q_proc:
                    q_vec = vectorizer.transform([q_proc])
                    sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
                    # Five most similar documents, best first.
                    top_k = sims.argsort()[-5:][::-1]
                    for idx in top_k:
                        with st.expander(f"Score: {sims[idx]:.4f} | {df[text_col].iloc[idx][:40]}..."):
                            st.write(df[text_col].iloc[idx])
                else:
                    # The query lost every token during cleaning/filtering;
                    # say so instead of silently showing nothing.
                    st.warning("クエリから有効な語を抽出できませんでした。")

        elif st.session_state.active_tab == "LDAトピック":
            st.header("4) LDAトピックモデル")
            n_topics = st.slider("トピック数", 2, 10, 5)
            lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
            lda_dist = lda.fit_transform(tfidf_matrix)

            st.subheader("トピック別重要語")
            cols = st.columns(2)
            for i, topic in enumerate(lda.components_):
                # Indices of the ten heaviest terms, heaviest first (computed once).
                order = topic.argsort()[-10:][::-1]
                top_w = [vocab[j] for j in order]
                with cols[i % 2]:
                    fig, ax = plt.subplots(figsize=(6, 3))
                    ax.barh(top_w, topic[order], color='teal')
                    ax.set_title(f"Topic {i}", fontproperties=jp_font_bold)
                    ax.invert_yaxis()
                    for label in ax.get_yticklabels():
                        label.set_fontproperties(jp_font)
                    st.pyplot(fig)
                    # Release the figure; one is created per topic per rerun.
                    plt.close(fig)

            st.subheader("文書-トピック分布（ヒートマップ）")
            # Only the first 50 documents are drawn.
            st.plotly_chart(px.imshow(lda_dist[:50], x=[f"T{i}" for i in range(n_topics)]), use_container_width=True)

            st.subheader("文書配置 (UMAP)")
            if len(df) > n_topics:
                reducer = UMAP(n_components=2, random_state=42)
                embed = reducer.fit_transform(lda_dist)
                df_u = pd.DataFrame(embed, columns=['x', 'y'])
                df_u['topic'] = lda_dist.argmax(axis=1).astype(str)
                df_u['text'] = df[text_col].str[:50]
                st.plotly_chart(px.scatter(df_u, x='x', y='y', color='topic', hover_data=['text']), use_container_width=True)

        elif st.session_state.active_tab == "BERTopic":
            st.header("5) BERTopic 分析")
            @st.cache_resource
            def run_bertopic(docs):
                m = BERTopic(language="japanese", calculate_probabilities=True)
                t, p = m.fit_transform(docs)
                return m, t, p
            try:
                m, t, p = run_bertopic(df[text_col].tolist())
                st.dataframe(m.get_topic_info())
                st.plotly_chart(m.visualize_topics(), use_container_width=True)
                st.plotly_chart(m.visualize_barchart(), use_container_width=True)
                st.plotly_chart(m.visualize_heatmap(), use_container_width=True)
                st.plotly_chart(m.visualize_documents(df[text_col].tolist()), use_container_width=True)
            except Exception as e:
                st.error(f"BERTopicエラー: {e}")

else:
    st.info("サイドバーから分析ファイルをアップロードしてください。")