Spaces:

Msk7000
/

text_analytics_dashboard_bert

Sleeping

App Files Files Community

text_analytics_dashboard_bert / text_analytics_dashboard_bert.py

Msk7000

Upload 5 files

c0aee5d verified 4 months ago

raw

history blame contribute delete

12.2 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import os
	import re
	import MeCab
	import ipadic
	import matplotlib.pyplot as plt
	from matplotlib.font_manager import FontProperties
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from sklearn.decomposition import LatentDirichletAllocation
	from transformers import pipeline
	from bertopic import BERTopic
	import plotly.express as px
	import plotly.graph_objects as go
	from umap import UMAP

	# --- 1. Font Configuration ---
	# The Noto Sans JP font files are resolved relative to this script's directory.
	BASE_DIR = os.path.dirname(__file__)
	FONT_PATH_REG, FONT_PATH_BOLD = (
	    os.path.join(BASE_DIR, font_file)
	    for font_file in ("NotoSansJP-Regular.ttf", "NotoSansJP-Bold.ttf")
	)

	# Build FontProperties straight from the font files (avoids findfont warnings).
	jp_font, jp_font_bold = (
	    FontProperties(fname=font_path)
	    for font_path in (FONT_PATH_REG, FONT_PATH_BOLD)
	)

	# Global Matplotlib settings.
	plt.rcParams.update({
	    'font.family': 'sans-serif',
	    'axes.unicode_minus': False,
	})

	# --- 2. Text Processing Logic ---
	class JapaneseTextProcessor:
	    """Clean Japanese text and reduce it to space-separated content words.

	    Tokenization is delegated to a MeCab tagger created with the ipadic
	    arguments. Only nouns, verbs and adjectives are kept; stopwords and
	    surfaces of a single character are dropped.
	    """

	    # Parts of speech (first field of the MeCab feature string) that are kept.
	    _CONTENT_POS = frozenset({'名詞', '動詞', '形容詞'})

	    # Patterns are compiled once here rather than re-resolved on every call.
	    _DIGITS_RE = re.compile(r'\d+')
	    _ASCII_SYMBOLS_RE = re.compile(r'[!-/:-@[-`{-~]')
	    _FULLWIDTH_SYMBOLS_RE = re.compile(r'[！-／：-＠［-｀｛-～]')

	    def __init__(self, stopwords=None, cleaning_settings=None):
	        """Create the tagger and store the filtering configuration.

	        Args:
	            stopwords: Optional iterable of token surfaces to exclude.
	            cleaning_settings: Optional mapping; the keys ``remove_digits``
	                and ``remove_symbols`` are read by :meth:`clean_text`.
	        """
	        self.tagger = MeCab.Tagger(ipadic.MECAB_ARGS)
	        self.stopwords = stopwords if stopwords else []
	        # Hashed copy so the per-token membership test in tokenize() is O(1)
	        # instead of a linear scan over the stopword list.
	        self._stopword_set = frozenset(self.stopwords)
	        self.cleaning_settings = cleaning_settings if cleaning_settings else {}

	    def clean_text(self, text):
	        """Normalize ``text`` before tokenization.

	        Strips surrounding whitespace, lowercases, and, depending on
	        ``cleaning_settings``, removes digit runs and ASCII / full-width
	        symbols.

	        Returns:
	            The cleaned string, or ``""`` when ``text`` is not a ``str``.
	        """
	        if not isinstance(text, str):
	            return ""
	        text = text.strip().lower()
	        if self.cleaning_settings.get("remove_digits"):
	            text = self._DIGITS_RE.sub('', text)
	        if self.cleaning_settings.get("remove_symbols"):
	            text = self._ASCII_SYMBOLS_RE.sub('', text)
	            text = self._FULLWIDTH_SYMBOLS_RE.sub('', text)
	        return text

	    def tokenize(self, text):
	        """Return the content words of ``text`` joined by single spaces.

	        A token is kept when its part of speech is a noun, verb or adjective,
	        its surface is longer than one character, and it is not a stopword.
	        """
	        processed = self.clean_text(text)
	        node = self.tagger.parseToNode(processed)
	        # Fall back to the public list if the instance was built without
	        # __init__ or the cached set is otherwise missing.
	        stopwords = getattr(self, "_stopword_set", None)
	        if stopwords is None:
	            stopwords = frozenset(self.stopwords)
	        tokens = []
	        while node:
	            pos = node.feature.split(',')[0]
	            if pos in self._CONTENT_POS:
	                token = node.surface
	                if len(token) > 1 and token not in stopwords:
	                    tokens.append(token)
	            node = node.next
	        return " ".join(tokens)

	# --- 3. Streamlit UI Config ---
	st.set_page_config(page_title="Text Analytics Dashboard", layout="wide")
	st.title("🇯🇵 日本語テキストアナリティクスダッシュボード")

	# Sidebar: data upload
	st.sidebar.header("1. データアップロード")
	uploaded_file = st.sidebar.file_uploader("CSVまたはExcelファイルを選択", type=['csv', 'xlsx'])

	# Sidebar: preprocessing / vectorization settings (passed to TfidfVectorizer
	# and JapaneseTextProcessor further down the script)
	st.sidebar.header("2. 前処理・ベクトル化設定")
	analyzer = st.sidebar.selectbox("Analyzer", ["word", "char"], index=0)
	ngram_range = st.sidebar.slider("n-gram 範囲", 1, 6, (1, 3))
	use_stopword = st.sidebar.checkbox("ストップワードを使用", value=True)
	add_stopwords = st.sidebar.text_area("追加ストップワード（改行区切り）", "")

	# Keys match what JapaneseTextProcessor.clean_text reads.
	clean_options = {
	    "remove_digits": st.sidebar.checkbox("数字の除去", value=True),
	    "remove_symbols": st.sidebar.checkbox("記号の除去", value=True),
	}

	# --- State Management (Tab Persistence) ---
	# Seed session state once so the selected tab and the identity of the
	# analysed file/column survive Streamlit reruns.
	if "active_tab" not in st.session_state:
	    st.session_state.active_tab = "感情分析"
	if "current_file" not in st.session_state:
	    st.session_state.current_file = None
	if "current_column" not in st.session_state:
	    st.session_state.current_column = None

	@st.cache_data
	def load_data(file):
	    """Load an uploaded CSV or Excel file into a DataFrame.

	    Args:
	        file: Uploaded file-like object exposing ``name``, ``read`` and ``seek``.

	    Returns:
	        pandas.DataFrame with the file contents.

	    A name ending in ``.csv`` (any letter case) is parsed as CSV; everything
	    else is handed to ``pd.read_excel``. CSV files that are not valid UTF-8
	    are retried as cp932 (Shift-JIS), which is common for Japanese exports.
	    """
	    if file.name.lower().endswith('.csv'):
	        try:
	            return pd.read_csv(file)
	        except UnicodeDecodeError:
	            # Rewind: the failed attempt has already consumed part of the stream.
	            file.seek(0)
	            return pd.read_csv(file, encoding="cp932")
	    return pd.read_excel(file)

	if uploaded_file:
	    df_raw = load_data(uploaded_file)
	    str_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
	    text_col = st.sidebar.selectbox("分析対象列", str_cols if str_cols else df_raw.columns)

	    # Reset the cached sentiment results when the file or target column changes.
	    if st.session_state.current_file != uploaded_file.name or st.session_state.current_column != text_col:
	        st.session_state.current_file = uploaded_file.name
	        st.session_state.current_column = text_col
	        if "sentiment_df" in st.session_state:
	            del st.session_state.sentiment_df

	    # Stopwords: optional list file next to the script plus user-supplied additions.
	    # NOTE(review): the nesting of the "additional stopwords" branch under
	    # `use_stopword` was reconstructed from a whitespace-mangled source — confirm.
	    stopword_list = []
	    if use_stopword:
	        sw_path = os.path.join(BASE_DIR, "Japanese_stopword_list.txt")
	        if os.path.exists(sw_path):
	            with open(sw_path, "r", encoding="utf-8") as f:
	                stopword_list = [line.strip() for line in f if line.strip()]
	        if add_stopwords:
	            stopword_list.extend([w.strip() for w in add_stopwords.split('\n') if w.strip()])

	    processor = JapaneseTextProcessor(stopwords=stopword_list, cleaning_settings=clean_options)

	    with st.spinner("前処理中..."):
	        # Drop missing cells and normalise the column to str once, so every
	        # later consumer (sentiment pipeline, slicing, .str accessor) gets strings.
	        df = df_raw.dropna(subset=[text_col]).copy()
	        df[text_col] = df[text_col].astype(str)
	        df['processed_text'] = df[text_col].apply(processor.tokenize)
	        df = df[df['processed_text'].str.strip() != ""].reset_index(drop=True)

	    if df.empty:
	        st.error("前処理の結果、有効なテキストがなくなりました。")
	    else:
	        # Vectorization
	        vectorizer = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, min_df=1, max_df=0.95)
	        try:
	            tfidf_matrix = vectorizer.fit_transform(df['processed_text'])
	            vocab = vectorizer.get_feature_names_out()
	        except Exception as e:
	            st.error(f"ベクトル化エラー: {e}")
	            st.stop()

	        # --- Horizontal Radio Navigation ---
	        tabs = ["感情分析", "キーワード", "類似検索", "LDAトピック", "BERTopic"]
	        st.session_state.active_tab = st.radio(
	            "分析メニューを選択（他タブの操作中も状態を維持します）",
	            tabs,
	            index=tabs.index(st.session_state.active_tab),
	            horizontal=True
	        )
	        st.markdown("---")

	        # --- Content Routing ---
	        if st.session_state.active_tab == "感情分析":
	            st.header("1) 感情分析・分類")
	            texts = df[text_col].tolist()

	            # Cached predictions are only valid for exactly these rows. The set
	            # of surviving rows depends on the cleaning/stopword settings, so
	            # discard the cache whenever the analysed texts differ.
	            if "sentiment_df" in st.session_state and (
	                "sentiment_texts" not in st.session_state
	                or st.session_state.sentiment_texts != texts
	            ):
	                del st.session_state.sentiment_df

	            if "sentiment_df" not in st.session_state:
	                @st.cache_resource
	                def load_sentiment_model():
	                    return pipeline("sentiment-analysis", model="koheiduck/bert-japanese-finetuned-sentiment")

	                try:
	                    classifier = load_sentiment_model()
	                    batch_size = 16
	                    all_results = []
	                    status_text = st.empty()
	                    progress_bar = st.progress(0)
	                    for start in range(0, len(texts), batch_size):
	                        batch = texts[start:start + batch_size]
	                        all_results.extend(classifier(batch, truncation=True))
	                        done = start + len(batch)
	                        progress_bar.progress(min(100, int(done / len(texts) * 100)))
	                        status_text.text(f"推論中: {done} / {len(texts)} 件")
	                    status_text.success("推論完了")
	                    st.session_state.sentiment_df = pd.DataFrame(all_results)
	                    st.session_state.sentiment_texts = texts
	                except Exception as e:
	                    st.error(f"モデルエラー: {e}")

	            if "sentiment_df" in st.session_state:
	                df['sentiment'] = st.session_state.sentiment_df['label'].values
	                df['score'] = st.session_state.sentiment_df['score'].values
	                c1, c2 = st.columns(2)
	                with c1:
	                    fig, ax = plt.subplots()
	                    df['sentiment'].value_counts().plot(kind='bar', ax=ax, color='skyblue')
	                    ax.set_title("感情分布", fontproperties=jp_font_bold)
	                    ax.set_xticklabels(ax.get_xticklabels(), fontproperties=jp_font, rotation=45)
	                    st.pyplot(fig)
	                    # pyplot keeps a global reference to every figure; release it
	                    # so figures do not pile up across reruns.
	                    plt.close(fig)
	                with c2:
	                    st.dataframe(df[[text_col, 'sentiment', 'score']].head(20))

	        elif st.session_state.active_tab == "キーワード":
	            st.header("2) TF-IDF 重要語")
	            # Sum TF-IDF weights per term over all documents and show the top 20.
	            tfidf_sum = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
	            top_idx = tfidf_sum.argsort()[-20:][::-1]
	            top_words = [vocab[i] for i in top_idx]
	            top_scores = tfidf_sum[top_idx]
	            fig = px.bar(x=top_scores, y=top_words, orientation='h', title="TF-IDF上位語")
	            fig.update_layout(yaxis={'categoryorder':'total ascending'})
	            st.plotly_chart(fig, use_container_width=True)

	        elif st.session_state.active_tab == "類似検索":
	            st.header("3) 類似文検索")
	            query = st.text_input("検索クエリを入力（Enterで実行）", key="search_box")
	            if query:
	                # The query goes through the same tokenizer as the corpus.
	                q_proc = processor.tokenize(query)
	                if q_proc:
	                    q_vec = vectorizer.transform([q_proc])
	                    sims = cosine_similarity(q_vec, tfidf_matrix).flatten()
	                    top_k = sims.argsort()[-5:][::-1]
	                    for idx in top_k:
	                        doc = df[text_col].iloc[idx]
	                        with st.expander(f"Score: {sims[idx]:.4f} | {doc[:40]}..."):
	                            st.write(doc)

	        elif st.session_state.active_tab == "LDAトピック":
	            st.header("4) LDAトピックモデル")
	            n_topics = st.slider("トピック数", 2, 10, 5)
	            lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
	            lda_dist = lda.fit_transform(tfidf_matrix)

	            st.subheader("トピック別重要語")
	            cols = st.columns(2)
	            for i, topic in enumerate(lda.components_):
	                # Indices of the 10 highest-weighted terms, strongest first.
	                top_term_idx = topic.argsort()[-10:][::-1]
	                top_w = [vocab[j] for j in top_term_idx]
	                with cols[i % 2]:
	                    fig, ax = plt.subplots(figsize=(6, 3))
	                    ax.barh(top_w, topic[top_term_idx], color='teal')
	                    ax.set_title(f"Topic {i}", fontproperties=jp_font_bold)
	                    ax.invert_yaxis()
	                    for tick_label in ax.get_yticklabels():
	                        tick_label.set_fontproperties(jp_font)
	                    st.pyplot(fig)
	                    plt.close(fig)

	            st.subheader("文書-トピック分布（ヒートマップ）")
	            # Only the first 50 documents are shown in the heatmap.
	            st.plotly_chart(px.imshow(lda_dist[:50], x=[f"T{i}" for i in range(n_topics)]), use_container_width=True)

	            st.subheader("文書配置 (UMAP)")
	            if len(df) > n_topics:
	                reducer = UMAP(n_components=2, random_state=42)
	                embed = reducer.fit_transform(lda_dist)
	                df_u = pd.DataFrame(embed, columns=['x', 'y'])
	                df_u['topic'] = lda_dist.argmax(axis=1).astype(str)
	                df_u['text'] = df[text_col].str[:50]
	                st.plotly_chart(px.scatter(df_u, x='x', y='y', color='topic', hover_data=['text']), use_container_width=True)

	        elif st.session_state.active_tab == "BERTopic":
	            st.header("5) BERTopic 分析")

	            @st.cache_resource
	            def run_bertopic(docs):
	                m = BERTopic(language="japanese", calculate_probabilities=True)
	                t, p = m.fit_transform(docs)
	                return m, t, p

	            try:
	                docs = df[text_col].tolist()
	                m, t, p = run_bertopic(docs)
	                st.dataframe(m.get_topic_info())
	                st.plotly_chart(m.visualize_topics(), use_container_width=True)
	                st.plotly_chart(m.visualize_barchart(), use_container_width=True)
	                st.plotly_chart(m.visualize_heatmap(), use_container_width=True)
	                st.plotly_chart(m.visualize_documents(docs), use_container_width=True)
	            except Exception as e:
	                st.error(f"BERTopicエラー: {e}")

	else:
	    st.info("サイドバーから分析ファイルをアップロードしてください。")