| import streamlit as st
|
| import pandas as pd
|
| import numpy as np
|
| import os
|
| import re
|
| import MeCab
|
| import ipadic
|
| import matplotlib.pyplot as plt
|
| from matplotlib.font_manager import FontProperties
|
| from sklearn.feature_extraction.text import TfidfVectorizer
|
| from sklearn.metrics.pairwise import cosine_similarity
|
| from sklearn.decomposition import LatentDirichletAllocation
|
| from transformers import pipeline
|
| from bertopic import BERTopic
|
| import plotly.express as px
|
| import plotly.graph_objects as go
|
| from umap import UMAP
|
|
|
|
|
# Bundled assets (fonts, stopword list) are resolved relative to this script.
BASE_DIR = os.path.dirname(__file__)
FONT_PATH_REG = os.path.join(BASE_DIR, "NotoSansJP-Regular.ttf")
FONT_PATH_BOLD = os.path.join(BASE_DIR, "NotoSansJP-Bold.ttf")

# Font handles that are passed explicitly to matplotlib text elements
# (titles, tick labels) that display Japanese.
jp_font, jp_font_bold = (
    FontProperties(fname=font_path)
    for font_path in (FONT_PATH_REG, FONT_PATH_BOLD)
)

# Global matplotlib defaults: generic sans-serif family, and an ASCII hyphen
# instead of the Unicode minus sign on axes.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'axes.unicode_minus': False,
})
|
|
|
|
|
class JapaneseTextProcessor:
    """Clean Japanese text and reduce it to space-separated content words.

    Text is lower-cased, optionally stripped of digits and symbols, tokenized
    with MeCab using the ipadic arguments, and filtered down to nouns, verbs
    and adjectives that are longer than one character and are not stopwords.
    """

    # Compiled once and shared by every instance / document.
    _DIGITS_RE = re.compile(r'\d+')
    # ASCII punctuation plus the same four ranges in the full-width block
    # (U+FF01-FF0F, U+FF1A-FF20, U+FF3B-FF40, U+FF5B-FF5E), so that full-width
    # marks are removed as well as their ASCII twins.
    _SYMBOLS_RE = re.compile(
        r'[!-/:-@\[-`{-~'
        r'\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF5E]'
    )
    # MeCab part-of-speech tags that are kept: noun, verb, adjective.
    _CONTENT_POS = frozenset({'名詞', '動詞', '形容詞'})

    def __init__(self, stopwords=None, cleaning_settings=None):
        """Create a processor.

        Args:
            stopwords: Optional iterable of tokens to drop during tokenization.
            cleaning_settings: Optional mapping; the keys ``"remove_digits"``
                and ``"remove_symbols"`` are read by ``clean_text``.
        """
        self.tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
        # Stored as a set: membership is tested once per token.
        self.stopwords = set(stopwords) if stopwords else set()
        self.cleaning_settings = cleaning_settings if cleaning_settings else {}

    def clean_text(self, text):
        """Return ``text`` stripped and lower-cased, with optional removals.

        Args:
            text: Raw input; anything that is not a ``str`` yields ``""``.

        Returns:
            The cleaned string. Digits are removed when the
            ``"remove_digits"`` setting is truthy, and ASCII / full-width
            symbols when ``"remove_symbols"`` is truthy.
        """
        if not isinstance(text, str):
            return ""
        text = text.strip().lower()
        if self.cleaning_settings.get("remove_digits"):
            text = self._DIGITS_RE.sub('', text)
        if self.cleaning_settings.get("remove_symbols"):
            text = self._SYMBOLS_RE.sub('', text)
        return text

    def tokenize(self, text):
        """Tokenize ``text`` and return the kept surface forms joined by spaces.

        Args:
            text: Raw input passed through ``clean_text`` first.

        Returns:
            A single string of space-separated tokens; ``""`` when nothing
            survives cleaning and filtering.
        """
        processed = self.clean_text(text)
        if not processed:
            # Nothing to parse; skip the tagger call.
            return ""

        tokens = []
        node = self.tagger.parseToNode(processed)
        while node:
            # The first comma-separated field of the feature string is the POS.
            pos = node.feature.split(',', 1)[0]
            if pos in self._CONTENT_POS:
                token = node.surface
                if len(token) > 1 and token not in self.stopwords:
                    tokens.append(token)
            node = node.next
        return " ".join(tokens)
|
|
|
|
|
# ---- Page chrome ----------------------------------------------------------
st.set_page_config(page_title="Text Analytics Dashboard", layout="wide")
st.title("🇯🇵 日本語テキストアナリティクス ダッシュボード")

# ---- Sidebar: widgets are created top to bottom in display order ----------
sidebar = st.sidebar

sidebar.header("1. データアップロード")
uploaded_file = sidebar.file_uploader("CSVまたはExcelファイルを選択", type=['csv', 'xlsx'])

sidebar.header("2. 前処理・ベクトル化設定")
analyzer = sidebar.selectbox("Analyzer", ["word", "char"], index=0)
ngram_range = sidebar.slider("n-gram 範囲", 1, 6, (1, 3))
use_stopword = sidebar.checkbox("ストップワードを使用", value=True)
add_stopwords = sidebar.text_area("追加ストップワード(改行区切り)", "")

# Cleaning switches, keyed by the names JapaneseTextProcessor.clean_text reads.
remove_digits = sidebar.checkbox("数字の除去", value=True)
remove_symbols = sidebar.checkbox("記号の除去", value=True)
clean_options = {
    "remove_digits": remove_digits,
    "remove_symbols": remove_symbols,
}

# ---- Session-state defaults (only set when the key is not present yet) -----
for state_key, initial_value in (
    ("active_tab", "感情分析"),
    ("current_file", None),
    ("current_column", None),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value
|
|
|
@st.cache_data
def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: Uploaded file object exposing ``name`` and a seekable binary
            stream (as returned by ``st.file_uploader``).

    Returns:
        The parsed ``pandas.DataFrame``. Names ending in ``.csv`` (any case)
        go through ``pd.read_csv``; everything else through ``pd.read_excel``.
    """
    # Compare case-insensitively so "DATA.CSV" is not sent to the Excel reader.
    if file.name.lower().endswith('.csv'):
        try:
            return pd.read_csv(file)
        except UnicodeDecodeError:
            # Not UTF-8: rewind and retry as cp932 (Shift-JIS), the usual
            # encoding of CSVs exported by Japanese-locale Excel.
            file.seek(0)
            return pd.read_csv(file, encoding='cp932')
    return pd.read_excel(file)
|
|
|
# Main flow: runs top to bottom on every Streamlit rerun. With a file uploaded
# it preprocesses the chosen text column, builds a TF-IDF matrix and renders
# the analysis selected in the radio menu; otherwise it shows an upload hint.
if uploaded_file:
    df_raw = load_data(uploaded_file)
    str_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
    # Fall back to every column when no object-dtype column exists.
    text_col = st.sidebar.selectbox("分析対象列", str_cols if str_cols else df_raw.columns)

    # A different file name or target column invalidates cached sentiment results.
    if st.session_state.current_file != uploaded_file.name or st.session_state.current_column != text_col:
        st.session_state.current_file = uploaded_file.name
        st.session_state.current_column = text_col
        if "sentiment_df" in st.session_state:
            del st.session_state.sentiment_df

    stopword_list = []
    if use_stopword:
        sw_path = os.path.join(BASE_DIR, "Japanese_stopword_list.txt")
        if os.path.exists(sw_path):
            with open(sw_path, "r", encoding="utf-8") as f:
                stopword_list = [line.strip() for line in f if line.strip()]
    # NOTE(review): user-entered stopwords are applied independently of the
    # checkbox above — confirm this is the intended precedence.
    if add_stopwords:
        stopword_list.extend(w.strip() for w in add_stopwords.split('\n') if w.strip())

    processor = JapaneseTextProcessor(stopwords=stopword_list, cleaning_settings=clean_options)

    with st.spinner("前処理中..."):
        # Drop rows whose text cell is missing so they are not analysed as the
        # literal string "nan", then cast once so every later string
        # operation (slicing, .str, model input) is safe.
        df = df_raw.dropna(subset=[text_col]).copy()
        df[text_col] = df[text_col].astype(str)
        df['processed_text'] = df[text_col].apply(processor.tokenize)
        # Keep only rows that still have at least one token.
        df = df[df['processed_text'].str.strip() != ""].reset_index(drop=True)

    if df.empty:
        st.error("前処理の結果、有効なテキストがなくなりました。")
    else:
        vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, min_df=1, max_df=0.95)
        try:
            tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
            vocab = vectorizer.get_feature_names_out()
        except Exception as e:
            # UI boundary: show the error and halt this rerun, since every
            # view below needs the matrix.
            st.error(f"ベクトル化エラー: {e}")
            st.stop()

        tabs = ["感情分析", "キーワード", "類似検索", "LDAトピック", "BERTopic"]
        st.session_state.active_tab = st.radio(
            "分析メニューを選択(他タブの操作中も状態を維持します)",
            tabs,
            index=tabs.index(st.session_state.active_tab),
            horizontal=True
        )
        st.markdown("---")

        if st.session_state.active_tab == "感情分析":
            st.header("1) 感情分析・分類")

            texts = df[text_col].tolist()
            # Fingerprint of exactly the rows being scored. Preprocessing
            # options change which rows survive, so results cached for a
            # different row set must not be reused (they would misalign or
            # fail with a length mismatch on assignment below).
            texts_key = hash(tuple(texts))
            if "sentiment_df" in st.session_state and st.session_state.get("sentiment_key") != texts_key:
                del st.session_state.sentiment_df

            if "sentiment_df" not in st.session_state:
                @st.cache_resource
                def load_sentiment_model():
                    """Build the Japanese sentiment-analysis pipeline (cached by Streamlit)."""
                    return pipeline("sentiment-analysis", model="koheiduck/bert-japanese-finetuned-sentiment")

                try:
                    classifier = load_sentiment_model()
                    batch_size = 16
                    all_results = []
                    status_text = st.empty()
                    progress_bar = st.progress(0)
                    for i in range(0, len(texts), batch_size):
                        batch = texts[i : i + batch_size]
                        batch_res = classifier(batch, truncation=True)
                        all_results.extend(batch_res)
                        percent = min(100, int((i + len(batch)) / len(texts) * 100))
                        progress_bar.progress(percent)
                        status_text.text(f"推論中: {i + len(batch)} / {len(texts)} 件")
                    status_text.success("推論完了")
                    st.session_state.sentiment_df = pd.DataFrame(all_results)
                    st.session_state.sentiment_key = texts_key
                except Exception as e:
                    st.error(f"モデルエラー: {e}")

            if "sentiment_df" in st.session_state:
                df['sentiment'] = st.session_state.sentiment_df['label'].values
                df['score'] = st.session_state.sentiment_df['score'].values
                c1, c2 = st.columns(2)
                with c1:
                    fig, ax = plt.subplots()
                    df['sentiment'].value_counts().plot(kind='bar', ax=ax, color='skyblue')
                    ax.set_title("感情分布", fontproperties=jp_font_bold)
                    ax.set_xticklabels(ax.get_xticklabels(), fontproperties=jp_font, rotation=45)
                    st.pyplot(fig)
                    # pyplot keeps figures alive until closed; release it so
                    # figures do not pile up across reruns.
                    plt.close(fig)
                with c2:
                    st.dataframe(df[[text_col, 'sentiment', 'score']].head(20))

        elif st.session_state.active_tab == "キーワード":
            st.header("2) TF-IDF 重要語")
            # Sum each term's TF-IDF weight over all documents; show the top 20.
            tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
            top_idx = tfidf_sum.argsort()[-20:][::-1]
            top_words = [vocab[i] for i in top_idx]
            top_scores = tfidf_sum[top_idx]
            fig = px.bar(x=top_scores, y=top_words, orientation='h', title="TF-IDF上位語")
            fig.update_layout(yaxis={'categoryorder':'total ascending'})
            st.plotly_chart(fig, use_container_width=True)

        elif st.session_state.active_tab == "類似検索":
            st.header("3) 類似文検索")
            query = st.text_input("検索クエリを入力(Enterで実行)", key="search_box")
            if query:
                # The query goes through the same tokenizer as the documents.
                q_proc = processor.tokenize(query)
                if q_proc:
                    q_vec = vectorizer.transform([q_proc])
                    sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
                    # Five most similar documents, best first.
                    top_k = sims.argsort()[-5:][::-1]
                    for idx in top_k:
                        with st.expander(f"Score: {sims[idx]:.4f} | {df[text_col].iloc[idx][:40]}..."):
                            st.write(df[text_col].iloc[idx])

        elif st.session_state.active_tab == "LDAトピック":
            st.header("4) LDAトピックモデル")
            n_topics = st.slider("トピック数", 2, 10, 5)
            lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
            lda_dist = lda.fit_transform(tfidf_matrix)

            st.subheader("トピック別重要語")
            cols = st.columns(2)
            for i, topic in enumerate(lda.components_):
                # Indices of the ten highest-weighted terms, best first.
                order = topic.argsort()[-10:][::-1]
                top_w = [vocab[j] for j in order]
                with cols[i % 2]:
                    fig, ax = plt.subplots(figsize=(6, 3))
                    ax.barh(top_w, topic[order], color='teal')
                    ax.set_title(f"Topic {i}", fontproperties=jp_font_bold)
                    ax.invert_yaxis()
                    for tick_label in ax.get_yticklabels():
                        tick_label.set_fontproperties(jp_font)
                    st.pyplot(fig)
                    # Release the figure; one is created per topic per rerun.
                    plt.close(fig)

            st.subheader("文書-トピック分布(ヒートマップ)")
            # Only the first 50 documents are drawn.
            st.plotly_chart(px.imshow(lda_dist[:50], x=[f"T{i}" for i in range(n_topics)]), use_container_width=True)

            st.subheader("文書配置 (UMAP)")
            if len(df) > n_topics:
                reducer = UMAP(n_components=2, random_state=42)
                embed = reducer.fit_transform(lda_dist)
                df_u = pd.DataFrame(embed, columns=['x', 'y'])
                df_u['topic'] = lda_dist.argmax(axis=1).astype(str)
                df_u['text'] = df[text_col].str[:50]
                st.plotly_chart(px.scatter(df_u, x='x', y='y', color='topic', hover_data=['text']), use_container_width=True)

        elif st.session_state.active_tab == "BERTopic":
            st.header("5) BERTopic 分析")

            @st.cache_resource
            def run_bertopic(docs):
                """Fit BERTopic on ``docs``; return (model, topics, probabilities)."""
                m = BERTopic(language="japanese", calculate_probabilities=True)
                t, p = m.fit_transform(docs)
                return m, t, p

            try:
                m, t, p = run_bertopic(df[text_col].tolist())
                st.dataframe(m.get_topic_info())
                st.plotly_chart(m.visualize_topics(), use_container_width=True)
                st.plotly_chart(m.visualize_barchart(), use_container_width=True)
                st.plotly_chart(m.visualize_heatmap(), use_container_width=True)
                st.plotly_chart(m.visualize_documents(df[text_col].tolist()), use_container_width=True)
            except Exception as e:
                st.error(f"BERTopicエラー: {e}")

else:
    st.info("サイドバーから分析ファイルをアップロードしてください。")