# text_analytics_dashboard_bert / text_analytics_dashboard_bert.py
# Msk7000's picture
# Upload 5 files
# c0aee5d verified
import streamlit as st
import pandas as pd
import numpy as np
import os
import re
import MeCab
import ipadic
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline
from bertopic import BERTopic
import plotly.express as px
import plotly.graph_objects as go
from umap import UMAP
# --- 1. Font Configuration ---
# Directory of this script; the bundled Noto Sans JP font files are resolved
# relative to it.
BASE_DIR = os.path.dirname(__file__)
FONT_PATH_REG = os.path.join(BASE_DIR, "NotoSansJP-Regular.ttf")
FONT_PATH_BOLD = os.path.join(BASE_DIR, "NotoSansJP-Bold.ttf")

# Initialise FontProperties straight from the font files (avoids findfont
# warnings).
jp_font, jp_font_bold = (
    FontProperties(fname=font_path)
    for font_path in (FONT_PATH_REG, FONT_PATH_BOLD)
)

# Global Matplotlib settings.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'axes.unicode_minus': False,
})
# --- 2. Text Processing Logic ---
class JapaneseTextProcessor:
    """Clean Japanese text and reduce it to space-separated content words.

    Args:
        stopwords: Optional list of tokens to drop during tokenization.
            A falsy value is treated as "no stopwords".
        cleaning_settings: Optional dict of boolean flags read by
            ``clean_text``; the keys used are ``"remove_digits"`` and
            ``"remove_symbols"``. A falsy value disables all cleaning flags.
    """

    # Patterns are compiled once for the class instead of on every call.
    _DIGITS_RE = re.compile(r'\d+')
    # ASCII punctuation ranges: ! to /, : to @, [ to `, { to ~.
    _SYMBOLS_RE = re.compile(r'[!-/:-@[-`{-~]')
    # Part-of-speech tags kept by ``tokenize`` (noun, verb, adjective).
    _CONTENT_POS = frozenset(('名詞', '動詞', '形容詞'))

    def __init__(self, stopwords=None, cleaning_settings=None):
        self.tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
        self.stopwords = stopwords if stopwords else []
        self.cleaning_settings = cleaning_settings if cleaning_settings else {}

    def clean_text(self, text):
        """Return ``text`` stripped, lower-cased and optionally filtered.

        Non-string input yields an empty string. Digits are removed when the
        ``"remove_digits"`` flag is set, ASCII symbols when
        ``"remove_symbols"`` is set.
        """
        if not isinstance(text, str):
            return ""
        text = text.strip().lower()
        if self.cleaning_settings.get("remove_digits"):
            text = self._DIGITS_RE.sub('', text)
        if self.cleaning_settings.get("remove_symbols"):
            # The original applied this substitution twice; one pass already
            # removes every matching character.
            text = self._SYMBOLS_RE.sub('', text)
        return text

    def tokenize(self, text):
        """Return the content words of ``text`` joined by single spaces.

        The text is cleaned with ``clean_text`` and parsed with the MeCab
        tagger. A token is kept when the first field of its feature string is
        one of ``_CONTENT_POS``, its surface form is longer than one
        character, and it is not a stopword.
        """
        processed = self.clean_text(text)
        # Snapshot the stopwords as a set so each membership test is O(1);
        # built per call so later changes to ``self.stopwords`` are honoured.
        stopwords = frozenset(self.stopwords)
        node = self.tagger.parseToNode(processed)
        tokens = []
        while node:
            pos = node.feature.split(',', 1)[0]
            if pos in self._CONTENT_POS:
                token = node.surface
                if len(token) > 1 and token not in stopwords:
                    tokens.append(token)
            node = node.next
        return " ".join(tokens)
# --- 3. Streamlit UI Config ---
st.set_page_config(page_title="Text Analytics Dashboard", layout="wide")
st.title("🇯🇵 日本語テキストアナリティクス ダッシュボード")

# Sidebar: widgets created inside this context are placed in the sidebar, in
# the order they appear here.
with st.sidebar:
    # Section 1: input file.
    st.header("1. データアップロード")
    uploaded_file = st.file_uploader(
        "CSVまたはExcelファイルを選択",
        type=['csv', 'xlsx'],
    )

    # Section 2: preprocessing and vectorizer options.
    st.header("2. 前処理・ベクトル化設定")
    analyzer = st.selectbox("Analyzer", ["word", "char"], index=0)
    ngram_range = st.slider("n-gram 範囲", 1, 6, (1, 3))
    use_stopword = st.checkbox("ストップワードを使用", value=True)
    add_stopwords = st.text_area("追加ストップワード(改行区切り)", "")

    # Flags handed to JapaneseTextProcessor as its cleaning settings.
    clean_options = {
        "remove_digits": st.checkbox("数字の除去", value=True),
        "remove_symbols": st.checkbox("記号の除去", value=True),
    }
# --- State Management (Tab Persistence) ---
# Seed the session with defaults; keys that already exist are left untouched
# so the selected tab and the tracked file/column persist across reruns.
_SESSION_DEFAULTS = {
    "active_tab": "感情分析",
    "current_file": None,
    "current_column": None,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
@st.cache_data
def load_data(file):
    """Read an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: Uploaded file object; must expose ``.name`` and be readable by
            pandas (and seekable, for the encoding retry).

    Returns:
        pandas.DataFrame parsed from the file. Names ending in ``.csv``
        (case-insensitive) are read as CSV, everything else as Excel.
    """
    # Compare the extension case-insensitively so "DATA.CSV" is not sent to
    # the Excel reader.
    if file.name.lower().endswith('.csv'):
        try:
            return pd.read_csv(file)
        except UnicodeDecodeError:
            # Not valid UTF-8: retry as cp932 (Shift-JIS superset), a common
            # encoding for Japanese CSV exports. Rewind first because the
            # failed attempt may have consumed part of the stream.
            file.seek(0)
            return pd.read_csv(file, encoding="cp932")
    return pd.read_excel(file)
@st.cache_resource
def load_sentiment_model():
    """Build the Japanese sentiment-analysis pipeline (cached by Streamlit)."""
    return pipeline("sentiment-analysis", model="koheiduck/bert-japanese-finetuned-sentiment")


@st.cache_resource
def run_bertopic(docs):
    """Fit BERTopic on ``docs`` and return ``(model, topics, probabilities)``."""
    m = BERTopic(language="japanese", calculate_probabilities=True)
    t, p = m.fit_transform(docs)
    return m, t, p


def _build_stopword_list(use_default, extra_text):
    """Return the stopwords from the bundled list file plus user additions."""
    words = []
    if use_default:
        sw_path = os.path.join(BASE_DIR, "Japanese_stopword_list.txt")
        if os.path.exists(sw_path):
            with open(sw_path, "r", encoding="utf-8") as f:
                words = [line.strip() for line in f if line.strip()]
    # NOTE(review): the source's indentation was lost; user-supplied stopwords
    # are assumed to apply independently of the checkbox — confirm.
    if extra_text:
        words.extend(w.strip() for w in extra_text.split('\n') if w.strip())
    return words


def _render_sentiment_tab(df, text_col):
    """Run (or reuse) sentiment inference and show its distribution and rows.

    Adds ``sentiment`` and ``score`` columns to ``df`` in place.
    """
    st.header("1) 感情分析・分類")
    texts = df[text_col].tolist()

    # Cached predictions are only valid for exactly these texts. Changing the
    # preprocessing options can change which rows survive filtering, so the
    # cache is tied to the texts themselves, not just the file/column name.
    texts_key = hash(tuple(texts))
    if st.session_state.get("sentiment_key") != texts_key:
        if "sentiment_df" in st.session_state:
            del st.session_state.sentiment_df

    if "sentiment_df" not in st.session_state:
        try:
            classifier = load_sentiment_model()
            batch_size = 16
            all_results = []
            status_text = st.empty()
            progress_bar = st.progress(0)
            for i in range(0, len(texts), batch_size):
                batch = texts[i:i + batch_size]
                all_results.extend(classifier(batch, truncation=True))
                done = i + len(batch)
                progress_bar.progress(min(100, int(done / len(texts) * 100)))
                status_text.text(f"推論中: {done} / {len(texts)} 件")
            status_text.success("推論完了")
            st.session_state.sentiment_df = pd.DataFrame(all_results)
            st.session_state.sentiment_key = texts_key
        except Exception as e:
            # UI boundary: surface any model/inference failure to the user.
            st.error(f"モデルエラー: {e}")

    if "sentiment_df" in st.session_state:
        df['sentiment'] = st.session_state.sentiment_df['label'].values
        df['score'] = st.session_state.sentiment_df['score'].values
        c1, c2 = st.columns(2)
        with c1:
            fig, ax = plt.subplots()
            df['sentiment'].value_counts().plot(kind='bar', ax=ax, color='skyblue')
            ax.set_title("感情分布", fontproperties=jp_font_bold)
            ax.set_xticklabels(ax.get_xticklabels(), fontproperties=jp_font, rotation=45)
            st.pyplot(fig)
            # Release the figure; otherwise pyplot keeps one per rerun.
            plt.close(fig)
        with c2:
            st.dataframe(df[[text_col, 'sentiment', 'score']].head(20))


def _render_keyword_tab(tfidf_matrix, vocab):
    """Show the 20 terms with the highest summed TF-IDF weight."""
    st.header("2) TF-IDF 重要語")
    tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
    top_idx = tfidf_sum.argsort()[-20:][::-1]
    top_words = [vocab[i] for i in top_idx]
    top_scores = tfidf_sum[top_idx]
    fig = px.bar(x=top_scores, y=top_words, orientation='h', title="TF-IDF上位語")
    fig.update_layout(yaxis={'categoryorder': 'total ascending'})
    st.plotly_chart(fig, use_container_width=True)


def _render_similarity_tab(df, text_col, processor, vectorizer, tfidf_matrix):
    """Show the five documents most similar (cosine, TF-IDF) to a query."""
    st.header("3) 類似文検索")
    query = st.text_input("検索クエリを入力(Enterで実行)", key="search_box")
    if not query:
        return
    q_proc = processor.tokenize(query)
    if not q_proc:
        # Previously an untokenizable query produced no output at all.
        st.warning("検索クエリから有効な語を抽出できませんでした。")
        return
    q_vec = vectorizer.transform([q_proc])
    sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
    for idx in sims.argsort()[-5:][::-1]:
        doc_text = df[text_col].iloc[idx]
        with st.expander(f"Score: {sims[idx]:.4f} | {doc_text[:40]}..."):
            st.write(doc_text)


def _render_lda_tab(df, text_col, tfidf_matrix, vocab):
    """Fit LDA on the TF-IDF matrix and show topic terms, heatmap and UMAP."""
    st.header("4) LDAトピックモデル")
    n_topics = st.slider("トピック数", 2, 10, 5)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda_dist = lda.fit_transform(tfidf_matrix)

    st.subheader("トピック別重要語")
    cols = st.columns(2)
    for i, topic in enumerate(lda.components_):
        # Indices of the ten highest-weighted terms, highest first.
        top_idx = topic.argsort()[-10:][::-1]
        top_w = [vocab[j] for j in top_idx]
        with cols[i % 2]:
            fig, ax = plt.subplots(figsize=(6, 3))
            ax.barh(top_w, topic[top_idx], color='teal')
            ax.set_title(f"Topic {i}", fontproperties=jp_font_bold)
            ax.invert_yaxis()
            for label in ax.get_yticklabels():
                label.set_fontproperties(jp_font)
            st.pyplot(fig)
            # Release the figure; otherwise pyplot keeps one per topic per rerun.
            plt.close(fig)

    st.subheader("文書-トピック分布(ヒートマップ)")
    st.plotly_chart(
        px.imshow(lda_dist[:50], x=[f"T{i}" for i in range(n_topics)]),
        use_container_width=True,
    )

    st.subheader("文書配置 (UMAP)")
    if len(df) > n_topics:
        reducer = UMAP(n_components=2, random_state=42)
        embed = reducer.fit_transform(lda_dist)
        df_u = pd.DataFrame(embed, columns=['x', 'y'])
        df_u['topic'] = lda_dist.argmax(axis=1).astype(str)
        df_u['text'] = df[text_col].str[:50]
        st.plotly_chart(
            px.scatter(df_u, x='x', y='y', color='topic', hover_data=['text']),
            use_container_width=True,
        )


def _render_bertopic_tab(df, text_col):
    """Fit BERTopic on the raw texts and show its built-in visualisations."""
    st.header("5) BERTopic 分析")
    docs = df[text_col].tolist()
    try:
        m, _, _ = run_bertopic(docs)
        st.dataframe(m.get_topic_info())
        st.plotly_chart(m.visualize_topics(), use_container_width=True)
        st.plotly_chart(m.visualize_barchart(), use_container_width=True)
        st.plotly_chart(m.visualize_heatmap(), use_container_width=True)
        st.plotly_chart(m.visualize_documents(docs), use_container_width=True)
    except Exception as e:
        # UI boundary: fitting or any visualisation may fail on small inputs.
        st.error(f"BERTopicエラー: {e}")


if uploaded_file:
    df_raw = load_data(uploaded_file)
    str_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
    text_col = st.sidebar.selectbox("分析対象列", str_cols if str_cols else df_raw.columns)

    # Reset cached sentiment results when the file or the target column changes.
    if (st.session_state.current_file != uploaded_file.name
            or st.session_state.current_column != text_col):
        st.session_state.current_file = uploaded_file.name
        st.session_state.current_column = text_col
        if "sentiment_df" in st.session_state:
            del st.session_state.sentiment_df

    stopword_list = _build_stopword_list(use_stopword, add_stopwords)
    processor = JapaneseTextProcessor(stopwords=stopword_list, cleaning_settings=clean_options)

    with st.spinner("前処理中..."):
        # Drop missing cells instead of analysing them as the literal text
        # "nan", and coerce the column to str once so every tab can rely on
        # string values (slicing, model input).
        df = df_raw.dropna(subset=[text_col]).copy()
        df[text_col] = df[text_col].astype(str)
        df['processed_text'] = df[text_col].apply(processor.tokenize)
        df = df[df['processed_text'].str.strip() != ""].reset_index(drop=True)

    if df.empty:
        st.error("前処理の結果、有効なテキストがなくなりました。")
    else:
        # Vectorization
        vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, min_df=1, max_df=0.95)
        try:
            tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
            vocab = vectorizer.get_feature_names_out()
        except Exception as e:
            st.error(f"ベクトル化エラー: {e}")
            st.stop()

        # --- Horizontal Radio Navigation ---
        tabs = ["感情分析", "キーワード", "類似検索", "LDAトピック", "BERTopic"]
        st.session_state.active_tab = st.radio(
            "分析メニューを選択(他タブの操作中も状態を維持します)",
            tabs,
            index=tabs.index(st.session_state.active_tab),
            horizontal=True
        )
        st.markdown("---")

        # --- Content Routing ---
        active_tab = st.session_state.active_tab
        if active_tab == "感情分析":
            _render_sentiment_tab(df, text_col)
        elif active_tab == "キーワード":
            _render_keyword_tab(tfidf_matrix, vocab)
        elif active_tab == "類似検索":
            _render_similarity_tab(df, text_col, processor, vectorizer, tfidf_matrix)
        elif active_tab == "LDAトピック":
            _render_lda_tab(df, text_col, tfidf_matrix, vocab)
        elif active_tab == "BERTopic":
            _render_bertopic_tab(df, text_col)
else:
    st.info("サイドバーから分析ファイルをアップロードしてください。")