import streamlit as st
import pandas as pd
import numpy as np
import os
import re
import MeCab
import ipadic
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline
from bertopic import BERTopic
import plotly.express as px
import plotly.graph_objects as go
from umap import UMAP

# --- 1. Font Configuration ---
# Resolve bundled assets against the absolute location of this file so the
# paths do not depend on the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
FONT_PATH_REG = os.path.join(BASE_DIR, "NotoSansJP-Regular.ttf")
FONT_PATH_BOLD = os.path.join(BASE_DIR, "NotoSansJP-Bold.ttf")

# Initialise FontProperties from explicit font files (avoids findfont warnings).
jp_font = FontProperties(fname=FONT_PATH_REG)
jp_font_bold = FontProperties(fname=FONT_PATH_BOLD)

# Global Matplotlib settings.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False


# --- 2. Text Processing Logic ---
class JapaneseTextProcessor:
    """Clean Japanese text and reduce it to space-separated content words.

    Args:
        stopwords: Optional iterable of surface forms to drop during
            tokenisation. Falsy values mean "no stopwords".
        cleaning_settings: Optional mapping of cleaning flags. The keys read
            are ``"remove_digits"`` and ``"remove_symbols"``; missing keys
            are treated as disabled.
    """

    # Compiled once at class creation; reused for every document.
    _DIGITS_RE = re.compile(r'\d+')
    # ASCII punctuation plus the corresponding full-width forms
    # (U+FF01-FF0F, U+FF1A-FF20, U+FF3B-FF40, U+FF5B-FF5E).
    _SYMBOLS_RE = re.compile(
        r'[!-/:-@\[-`{-~'
        r'\uff01-\uff0f\uff1a-\uff20\uff3b-\uff40\uff5b-\uff5e]'
    )
    # Part-of-speech tags (first feature field) kept by tokenize().
    _CONTENT_POS = frozenset({'名詞', '動詞', '形容詞'})

    def __init__(self, stopwords=None, cleaning_settings=None):
        self.tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
        self.stopwords = stopwords if stopwords else []
        self.cleaning_settings = cleaning_settings if cleaning_settings else {}
        # Snapshot for O(1) membership tests in tokenize(); taken at
        # construction time, so later edits to self.stopwords are not seen.
        self._stopword_set = frozenset(self.stopwords)

    def clean_text(self, text):
        """Return *text* stripped, lower-cased and optionally de-noised.

        Non-string input yields an empty string. Digits are removed when the
        ``remove_digits`` setting is truthy; ASCII and full-width symbols are
        removed when ``remove_symbols`` is truthy.
        """
        if not isinstance(text, str):
            return ""
        text = text.strip().lower()
        if self.cleaning_settings.get("remove_digits"):
            text = self._DIGITS_RE.sub('', text)
        if self.cleaning_settings.get("remove_symbols"):
            text = self._SYMBOLS_RE.sub('', text)
        return text

    def tokenize(self, text):
        """Return the content words of *text* joined by single spaces.

        The text is cleaned with clean_text(), parsed with MeCab, and only
        nouns, verbs and adjectives are kept. Tokens of length one and
        tokens listed as stopwords are discarded.
        """
        processed = self.clean_text(text)
        node = self.tagger.parseToNode(processed)
        tokens = []
        while node:
            # The first comma-separated feature field is the part of speech.
            pos = node.feature.split(',', 1)[0]
            if pos in self._CONTENT_POS:
                token = node.surface
                if len(token) > 1 and token not in self._stopword_set:
                    tokens.append(token)
            node = node.next
        return " ".join(tokens)
# --- 3. Streamlit UI Config ---
st.set_page_config(page_title="Text Analytics Dashboard", layout="wide")
st.title("🇯🇵 日本語テキストアナリティクス ダッシュボード")

# Sidebar
st.sidebar.header("1. データアップロード")
uploaded_file = st.sidebar.file_uploader("CSVまたはExcelファイルを選択", type=['csv', 'xlsx'])

st.sidebar.header("2. 前処理・ベクトル化設定")
analyzer = st.sidebar.selectbox("Analyzer", ["word", "char"], index=0)
ngram_range = st.sidebar.slider("n-gram 範囲", 1, 6, (1, 3))
use_stopword = st.sidebar.checkbox("ストップワードを使用", value=True)
add_stopwords = st.sidebar.text_area("追加ストップワード(改行区切り)", "")
clean_options = {
    "remove_digits": st.sidebar.checkbox("数字の除去", value=True),
    "remove_symbols": st.sidebar.checkbox("記号の除去", value=True)
}

# --- State Management (Tab Persistence) ---
if "active_tab" not in st.session_state:
    st.session_state.active_tab = "感情分析"
if "current_file" not in st.session_state:
    st.session_state.current_file = None
if "current_column" not in st.session_state:
    st.session_state.current_column = None


@st.cache_data
def load_data(file):
    """Read the uploaded file into a DataFrame.

    Files whose name ends in ``.csv`` (case-insensitive) are parsed with
    ``pd.read_csv``; everything else is handed to ``pd.read_excel``.
    """
    if file.name.lower().endswith('.csv'):
        return pd.read_csv(file)
    return pd.read_excel(file)


@st.cache_resource
def load_sentiment_model():
    """Build the Japanese sentiment-analysis pipeline (cached as a resource)."""
    return pipeline("sentiment-analysis", model="koheiduck/bert-japanese-finetuned-sentiment")


@st.cache_resource
def run_bertopic(docs):
    """Fit BERTopic on *docs* and return ``(model, topics, probabilities)``."""
    m = BERTopic(language="japanese", calculate_probabilities=True)
    t, p = m.fit_transform(docs)
    return m, t, p


if uploaded_file:
    df_raw = load_data(uploaded_file)
    str_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
    text_col = st.sidebar.selectbox("分析対象列", str_cols if str_cols else df_raw.columns)

    # Reset cached results when the file or the analysed column changes.
    if (st.session_state.current_file != uploaded_file.name
            or st.session_state.current_column != text_col):
        st.session_state.current_file = uploaded_file.name
        st.session_state.current_column = text_col
        if "sentiment_df" in st.session_state:
            del st.session_state.sentiment_df

    # The checkbox controls the bundled stopword file; words typed into the
    # text area are always honoured so user input is never silently ignored.
    stopword_list = []
    if use_stopword:
        sw_path = os.path.join(BASE_DIR, "Japanese_stopword_list.txt")
        if os.path.exists(sw_path):
            with open(sw_path, "r", encoding="utf-8") as f:
                stopword_list = [line.strip() for line in f if line.strip()]
    if add_stopwords:
        stopword_list.extend([w.strip() for w in add_stopwords.split('\n') if w.strip()])

    processor = JapaneseTextProcessor(stopwords=stopword_list, cleaning_settings=clean_options)

    with st.spinner("前処理中..."):
        # Drop missing cells and coerce the column to str once, so that the
        # classifier, slicing (`[:40]`, `.str[:50]`) and BERTopic below all
        # receive real strings rather than NaN floats or other objects.
        df = df_raw.dropna(subset=[text_col]).copy()
        df[text_col] = df[text_col].astype(str)
        df['processed_text'] = df[text_col].apply(processor.tokenize)
        df = df[df['processed_text'].str.strip() != ""].reset_index(drop=True)

    if df.empty:
        st.error("前処理の結果、有効なテキストがなくなりました。")
    else:
        # Vectorisation
        vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, min_df=1, max_df=0.95)
        try:
            tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
            vocab = vectorizer.get_feature_names_out()
        except Exception as e:  # UI boundary: report and halt this run
            st.error(f"ベクトル化エラー: {e}")
            st.stop()

        # --- Horizontal Radio Navigation ---
        tabs = ["感情分析", "キーワード", "類似検索", "LDAトピック", "BERTopic"]
        st.session_state.active_tab = st.radio(
            "分析メニューを選択(他タブの操作中も状態を維持します)",
            tabs,
            index=tabs.index(st.session_state.active_tab),
            horizontal=True
        )
        st.markdown("---")

        # --- Content Routing ---
        if st.session_state.active_tab == "感情分析":
            st.header("1) 感情分析・分類")
            texts = df[text_col].tolist()

            # The surviving rows depend on the preprocessing options, not only
            # on file/column. Fingerprint the exact texts so a cached result
            # is reused only when it lines up row-for-row with `df`.
            sentiment_key = (uploaded_file.name, text_col, hash(tuple(texts)))
            cached_key = (
                st.session_state.sentiment_key
                if "sentiment_key" in st.session_state else None
            )
            if cached_key != sentiment_key and "sentiment_df" in st.session_state:
                del st.session_state.sentiment_df

            if "sentiment_df" not in st.session_state:
                try:
                    classifier = load_sentiment_model()
                    batch_size = 16
                    all_results = []
                    status_text = st.empty()
                    progress_bar = st.progress(0)
                    for i in range(0, len(texts), batch_size):
                        batch = texts[i : i + batch_size]
                        batch_res = classifier(batch, truncation=True)
                        all_results.extend(batch_res)
                        percent = min(100, int((i + len(batch)) / len(texts) * 100))
                        progress_bar.progress(percent)
                        status_text.text(f"推論中: {i + len(batch)} / {len(texts)} 件")
                    status_text.success("推論完了")
                    st.session_state.sentiment_df = pd.DataFrame(all_results)
                    st.session_state.sentiment_key = sentiment_key
                except Exception as e:  # UI boundary: model download/inference
                    st.error(f"モデルエラー: {e}")

            if "sentiment_df" in st.session_state:
                df['sentiment'] = st.session_state.sentiment_df['label'].values
                df['score'] = st.session_state.sentiment_df['score'].values
                c1, c2 = st.columns(2)
                with c1:
                    fig, ax = plt.subplots()
                    df['sentiment'].value_counts().plot(kind='bar', ax=ax, color='skyblue')
                    ax.set_title("感情分布", fontproperties=jp_font_bold)
                    ax.set_xticklabels(ax.get_xticklabels(), fontproperties=jp_font, rotation=45)
                    st.pyplot(fig)
                    # Release the figure; pyplot otherwise keeps it alive
                    # across Streamlit reruns.
                    plt.close(fig)
                with c2:
                    st.dataframe(df[[text_col, 'sentiment', 'score']].head(20))

        elif st.session_state.active_tab == "キーワード":
            st.header("2) TF-IDF 重要語")
            tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
            top_idx = tfidf_sum.argsort()[-20:][::-1]
            top_words = [vocab[i] for i in top_idx]
            top_scores = tfidf_sum[top_idx]
            fig = px.bar(x=top_scores, y=top_words, orientation='h', title="TF-IDF上位語")
            fig.update_layout(yaxis={'categoryorder':'total ascending'})
            st.plotly_chart(fig, use_container_width=True)

        elif st.session_state.active_tab == "類似検索":
            st.header("3) 類似文検索")
            query = st.text_input("検索クエリを入力(Enterで実行)", key="search_box")
            if query:
                q_proc = processor.tokenize(query)
                if q_proc:
                    q_vec = vectorizer.transform([q_proc])
                    sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
                    top_k = sims.argsort()[-5:][::-1]
                    for idx in top_k:
                        with st.expander(f"Score: {sims[idx]:.4f} | {df[text_col].iloc[idx][:40]}..."):
                            st.write(df[text_col].iloc[idx])

        elif st.session_state.active_tab == "LDAトピック":
            st.header("4) LDAトピックモデル")
            n_topics = st.slider("トピック数", 2, 10, 5)
            lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
            lda_dist = lda.fit_transform(tfidf_matrix)

            st.subheader("トピック別重要語")
            cols = st.columns(2)
            for i, topic in enumerate(lda.components_):
                # Indices of the ten highest-weighted terms, best first.
                order = topic.argsort()[-10:][::-1]
                top_w = [vocab[j] for j in order]
                with cols[i % 2]:
                    fig, ax = plt.subplots(figsize=(6, 3))
                    ax.barh(top_w, topic[order], color='teal')
                    ax.set_title(f"Topic {i}", fontproperties=jp_font_bold)
                    ax.invert_yaxis()
                    for label in ax.get_yticklabels():
                        label.set_fontproperties(jp_font)
                    st.pyplot(fig)
                    # One figure per topic per rerun: close to avoid build-up.
                    plt.close(fig)

            st.subheader("文書-トピック分布(ヒートマップ)")
            st.plotly_chart(px.imshow(lda_dist[:50], x=[f"T{i}" for i in range(n_topics)]), use_container_width=True)

            st.subheader("文書配置 (UMAP)")
            if len(df) > n_topics:
                reducer = UMAP(n_components=2, random_state=42)
                embed = reducer.fit_transform(lda_dist)
                df_u = pd.DataFrame(embed, columns=['x', 'y'])
                df_u['topic'] = lda_dist.argmax(axis=1).astype(str)
                df_u['text'] = df[text_col].str[:50]
                st.plotly_chart(px.scatter(df_u, x='x', y='y', color='topic', hover_data=['text']), use_container_width=True)

        elif st.session_state.active_tab == "BERTopic":
            st.header("5) BERTopic 分析")
            try:
                docs = df[text_col].tolist()
                m, _topics, _probs = run_bertopic(docs)
                st.dataframe(m.get_topic_info())
                st.plotly_chart(m.visualize_topics(), use_container_width=True)
                st.plotly_chart(m.visualize_barchart(), use_container_width=True)
                st.plotly_chart(m.visualize_heatmap(), use_container_width=True)
                st.plotly_chart(m.visualize_documents(docs), use_container_width=True)
            except Exception as e:  # UI boundary: fitting/visualisation errors
                st.error(f"BERTopicエラー: {e}")
else:
    st.info("サイドバーから分析ファイルをアップロードしてください。")