# Origin: Hugging Face Space app.py (uploader: Corin1998, commit 770385f)
import os
os.environ.setdefault("NUMBA_CACHE_DIR", "/tmp/numba_cache")
from typing import List, Tuple
import json
import pandas as pd
import gradio as gr
from lib.pipeline import analyze_texts
from lib.viz import scatter_2d
from lib.utils import now_utc_str, load_sample_df
# Title shown in the browser tab and as the page header.
# (Japanese: "SNS Analyzer v3 (clustering + summary + sentiment + dedup) — OpenAI API")
APP_NAME = "🔎 SNS Analyzer v3(クラスタ+要約+感情+重複排除)— OpenAI API"
# ----------------- Robust CSV loading -----------------
def _file_path(file_obj) -> str | None:
# gr.File はパス(str)か、{'name','path'}のdict、または file-like を返すことがある
if file_obj is None:
return None
if isinstance(file_obj, str):
return file_obj
if hasattr(file_obj, "name"):
return file_obj.name
if isinstance(file_obj, dict):
return file_obj.get("path") or file_obj.get("name")
return None
def read_csv_flex(file_obj) -> pd.DataFrame:
    """Load an uploaded CSV, trying several encodings and delimiters.

    Iterates over common Japanese/Western encodings crossed with likely
    separators (``None`` lets pandas sniff) and returns the first parse that
    yields a DataFrame with at least one column. Raises RuntimeError when
    the path cannot be resolved or every combination fails.
    """
    path = _file_path(file_obj)
    if not path or not os.path.exists(path):
        raise RuntimeError("アップロードされた CSV のパスが取得できませんでした。")
    attempts = [
        (enc, sep)
        for enc in ("utf-8", "utf-8-sig", "cp932", "shift_jis", "latin1")
        for sep in (None, ",", "\t", ";", "|")
    ]
    last_err = None
    for enc, sep in attempts:
        try:
            frame = pd.read_csv(path, encoding=enc, sep=sep, engine="python", on_bad_lines="skip")
            if isinstance(frame, pd.DataFrame) and frame.shape[1] >= 1:
                return frame
        except Exception as exc:
            last_err = exc
    raise RuntimeError(f"CSV 読み込みに失敗しました(encoding/区切りの自動判定に失敗): {last_err}")
def guess_text_col(df: pd.DataFrame, hint: str | None) -> str:
if hint and hint in df.columns:
return hint
candidates = [
"text","TEXT","Text","review","comment","content","body","message",
"本文","内容","テキスト","レビュー","クチコミ","コメント"
]
for c in candidates:
if c in df.columns:
return c
for c in df.columns:
if df[c].dtype == "object":
return c
return df.columns[0]
# --------------------------------------------------------
def run_analysis(
    text_blob: str,
    k: int,
    max_items: int,
    seed: int,
    dedup_on: bool,
    dedup_thr: float,
    csv_file,
    csv_text_col: str,
    out_lang_ui: str,  # one of "日本語" / "English" / "自動"
) -> Tuple[str, object, pd.DataFrame, str]:
    """Run the full clustering/summary pipeline and build the UI outputs.

    Returns a 4-tuple of (markdown summary, figure-or-None, cluster table
    DataFrame, result JSON string). On invalid input the first element
    carries a warning message and the remaining outputs are empty.

    NOTE: the original annotated the figure slot as ``any`` — the builtin
    function, not a type. ``object`` is used instead; behavior unchanged.
    """
    lang_map = {"日本語": "ja", "English": "en", "自動": "auto"}
    out_lang = lang_map.get(out_lang_ui, "ja")
    # --- collect input texts: an uploaded CSV takes precedence over the textbox ---
    texts: List[str] = []
    if csv_file is not None:
        try:
            df = read_csv_flex(csv_file)
            col = guess_text_col(df, csv_text_col.strip() if csv_text_col else None)
            if col not in df.columns:
                return f"⚠️ CSVに列 `{col}` が見つかりません。列名を指定してください。", None, pd.DataFrame(), ""
            # Drop blank rows after string coercion.
            texts = [str(s).strip() for s in df[col].astype(str).fillna("").tolist() if str(s).strip()]
        except Exception as e:
            return f"⚠️ CSV読み込みでエラー: {e}", None, pd.DataFrame(), ""
    else:
        texts = [t.strip() for t in (text_blob or "").split("\n") if t.strip()]
    if max_items and max_items > 0:
        texts = texts[:max_items]
    if len(texts) < 2:
        return "⚠️ 2件以上のテキストを入力してください。", None, pd.DataFrame(), ""
    # --- main analysis (clustering + summarization + sentiment + dedup) ---
    result = analyze_texts(
        texts=texts,
        k=k,
        random_state=seed,
        dedup=dedup_on,
        dedup_threshold=dedup_thr,
        output_lang=out_lang,
    )
    # --- build the markdown summary shown in the right pane ---
    lines = []
    if result.get("dedup", {}).get("removed", 0) > 0:
        kept = result["dedup"]["kept"]
        removed = result["dedup"]["removed"]
        lines.append(f"> 近似重複を {removed} 件除外(残り {kept} 件)\n")
    for c in result["clusters"]:
        # Prefer a user/model label, then the summary title, then a generic name.
        title = c.get("label") or c.get("summary", {}).get("title") or f"クラスタ {c['id']}"
        lines.append(f"### クラスタ {c['id']} (size={c['size']})")
        lines.append(f"**{title}**")
        if c.get("summary", {}).get("overview"):
            lines.append(c["summary"]["overview"])
        if "sentiment" in c:
            s = c["sentiment"]
            lines.append(f"感情比率 — 👍 {s['positive']:.0%} / 😐 {s['neutral']:.0%} / 👎 {s['negative']:.0%}")
        if c.get("summary", {}).get("actions"):
            lines.append("- **推奨アクション**")
            for a in c["summary"]["actions"]:
                lines.append(f"  - {a}")
        lines.append("")
    # 2-D scatter of the UMAP embedding, colored by cluster label.
    fig = scatter_2d(result["umap"], result["labels"])
    # Editable cluster table (id / size / label / top terms).
    df_out = pd.DataFrame([
        {
            "id": c["id"],
            "size": c["size"],
            "label": (c.get("label") or c.get("summary", {}).get("title") or f"クラスタ {c['id']}"),
            "top_terms": ", ".join(c["top_terms"]),
        }
        for c in result["clusters"]
    ])
    dl_json = json.dumps(result, ensure_ascii=False, indent=2)
    return "\n".join(lines), fig, df_out, dl_json
def on_load_sample():
    """Load the bundled demo dataset and return it as newline-joined text."""
    sample = load_sample_df()
    sample_lines = sample["text"].astype(str).tolist()
    return "\n".join(sample_lines)
def apply_labels(json_str: str, edited_df: pd.DataFrame) -> str:
    """Merge user-edited cluster labels from the table back into the result JSON.

    Rows whose label is blank (after stripping) are ignored. On any parse or
    shape problem the original JSON string is returned unchanged, so the UI
    never loses its current state.
    """
    if not json_str:
        return json_str
    try:
        payload = json.loads(json_str)
    except Exception:
        return json_str
    if not isinstance(edited_df, pd.DataFrame) or "id" not in edited_df or "label" not in edited_df:
        return json_str
    # Collect non-blank overrides keyed by integer cluster id.
    overrides = {}
    for _, row in edited_df.iterrows():
        label = str(row["label"])
        if label.strip():
            overrides[int(row["id"])] = label
    for cluster in payload.get("clusters", []):
        new_label = overrides.get(int(cluster["id"]))
        if new_label is not None:
            cluster["label"] = new_label
            # Keep the summary title in sync with the edited label.
            cluster.setdefault("summary", {})
            cluster["summary"]["title"] = new_label
    return json.dumps(payload, ensure_ascii=False, indent=2)
def to_json_file(json_str: str):
    """Persist the result JSON to /tmp and reveal the download widget.

    Returns a Gradio update object: hidden when there is nothing to save,
    otherwise pointing the gr.File component at the written file.
    """
    # gr.update(...) works on both Gradio 3.x and 4.x; the component-level
    # gr.File.update(...) API was removed in Gradio 4.
    if not json_str:
        return gr.update(visible=False)
    path = f"/tmp/result_{now_utc_str()}.json"
    with open(path, "w", encoding="utf-8") as f:
        f.write(json_str)
    return gr.update(value=path, visible=True)
def to_csv_file(json_str: str):
    """Flatten the result JSON into a per-text CSV under /tmp.

    Emits one row per kept text with its cluster id and display label.
    Best-effort: when *json_str* is empty or malformed, the download widget
    is simply kept hidden rather than surfacing an error.
    """
    # gr.update(...) works on both Gradio 3.x and 4.x; the component-level
    # gr.File.update(...) API was removed in Gradio 4.
    if not json_str:
        return gr.update(visible=False)
    try:
        payload = json.loads(json_str)
        texts = payload.get("texts_kept", [])
        labels = payload.get("labels", [])
        # Map cluster id -> display name (user label > summary title > generic fallback).
        cid2name = {
            int(c["id"]): (c.get("label") or c.get("summary", {}).get("title") or f"クラスタ {c['id']}")
            for c in payload.get("clusters", [])
        }
        rows = []
        for i, t in enumerate(texts):
            cid = int(labels[i])
            rows.append({"text": t, "cluster_id": cid, "cluster_label": cid2name.get(cid, f"クラスタ {cid}")})
        df = pd.DataFrame(rows)
        path = f"/tmp/result_{now_utc_str()}.csv"
        df.to_csv(path, index=False)
        return gr.update(value=path, visible=True)
    except Exception:
        # Deliberate best-effort swallow: a broken payload just hides the download.
        return gr.update(visible=False)
# ---------------------------- UI layout ----------------------------
with gr.Blocks(title=APP_NAME, theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {APP_NAME}")
    with gr.Row():
        # Left pane: text/CSV input and analysis parameters.
        with gr.Column(scale=6):
            text_in = gr.Textbox(label="テキスト(1行1件)", lines=14, placeholder="1行1件で貼り付け。CSVを使う場合は下でファイル選択。")
            with gr.Row():
                csv_in = gr.File(label="CSVをアップロード(任意)", file_types=[".csv"])
                csv_text_col = gr.Textbox(label="CSVのテキスト列名(空なら自動判定)", placeholder="text")
            with gr.Row():
                k_in = gr.Slider(label="クラスタ数(0=自動)", minimum=0, maximum=30, step=1, value=0)
                max_items_in = gr.Slider(label="最大件数", minimum=0, maximum=2000, step=50, value=0)
                seed_in = gr.Slider(label="乱数Seed", minimum=0, maximum=9999, step=1, value=42)
            with gr.Row():
                dedup_in = gr.Checkbox(label="近似重複を除外する", value=True)
                dedup_thr_in = gr.Slider(label="重複しきい値(コサイン類似 ≥)", minimum=0.80, maximum=0.99, step=0.01, value=0.95)
            with gr.Row():
                lang_in = gr.Radio(label="出力言語", choices=["日本語", "English", "自動"], value="日本語")
            with gr.Row():
                btn_sample = gr.Button("🧪 デモデータを読み込む")
                btn_run = gr.Button("🧭 解析する", variant="primary")
        # Right pane: summary markdown, UMAP plot, editable cluster table, downloads.
        with gr.Column(scale=6):
            summary_out = gr.Markdown(label="クラスタ要約")
            fig_out = gr.Plot(label="2D配置(UMAP)")
            table_out = gr.Dataframe(label="クラスタ編集(ラベルは編集可)", interactive=True, headers=["id", "size", "label", "top_terms"])
            # Hidden JSON component holding the full result; acts as the
            # source of truth for both download files.
            json_out = gr.JSON(label="結果JSON(内部)", visible=False)
            with gr.Row():
                btn_apply = gr.Button("✏️ ラベル変更を適用")
            file_json = gr.File(label="結果JSONダウンロード", visible=False)
            file_csv = gr.File(label="結果CSVダウンロード", visible=False)

    # ---------------------------- event wiring ----------------------------
    btn_sample.click(on_load_sample, outputs=text_in)
    outputs = [summary_out, fig_out, table_out, json_out]
    # Run the analysis, then regenerate both download files from the fresh JSON.
    btn_run.click(
        run_analysis,
        inputs=[text_in, k_in, max_items_in, seed_in, dedup_in, dedup_thr_in, csv_in, csv_text_col, lang_in],
        outputs=outputs,
        api_name="analyze",
    ).then(
        to_json_file, inputs=json_out, outputs=file_json
    ).then(
        to_csv_file, inputs=json_out, outputs=file_csv
    )
    # Apply edited table labels into the JSON, then refresh the JSON download.
    btn_apply.click(
        apply_labels,
        inputs=[json_out, table_out],
        outputs=json_out,
    ).then(
        to_json_file, inputs=json_out, outputs=file_json
    )
if __name__ == "__main__":
    # Bind all interfaces so the app is reachable inside a container;
    # the port defaults to 7860 but honors the PORT environment variable.
    port = int(os.getenv("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=port)