Spaces:

zbller
/

Mecari

Sleeping

App Files Files Community

Mecari / app.py

zbller

Upload folder using huggingface_hub

a113688 verified 5 months ago

raw

history blame contribute delete

6.67 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	import os
	import subprocess
	import gradio as gr

	# Weights & Biases must stay off inside a Space, so disable it up front.
	os.environ["WANDB_MODE"] = "disabled"

	# Pick the MeCab executable for this process: an existing MECAB_BIN value
	# wins; otherwise use /usr/bin/mecab when present, else rely on PATH lookup.
	if os.path.exists("/usr/bin/mecab"):
	    _default_mecab = "/usr/bin/mecab"
	else:
	    _default_mecab = "mecab"
	# setdefault both reads the effective value and publishes it to the environment.
	MECAB_BIN = os.environ.setdefault("MECAB_BIN", _default_mecab)

	# Model state, filled in on first use by _ensure_model().
	_model = None
	_exp_info = None


	def _ensure_model():
	    """Load the model on first call and cache it in module globals.

	    Returns immediately when ``_model`` is already set. Otherwise calls
	    ``infer.load_model()`` and stores the returned pair in ``_model`` and
	    ``_exp_info``.

	    Raises:
	        RuntimeError: If ``load_model()`` returns ``None``.
	    """
	    global _model, _exp_info
	    if _model is not None:
	        return

	    # Imported here rather than at module top, so merely importing this
	    # module does not import ``infer``.
	    from infer import load_model

	    loaded = load_model()
	    if loaded is None:
	        raise RuntimeError(
	            "Model could not be loaded. Ensure sample_model/ exists with config.yaml and model.pt."
	        )
	    _model, _exp_info = loaded


	def _to_mecab_lines(results, optimal_morphemes=None) -> str:
	    """Format morphemes as MeCab-style text.

	    Each morpheme becomes one line made of the surface, a tab, and seven
	    comma-separated features (pos, pos_detail1, pos_detail2, inflection_type,
	    inflection_form, base form, reading). A final ``EOS`` line is appended.

	    Args:
	        results: Iterable of dicts, used only when ``optimal_morphemes`` is
	            falsy. ``surface``, ``pos`` and ``reading`` are read from each
	            entry; the remaining features are filled with ``*`` and the base
	            form is set to the surface.
	        optimal_morphemes: Optional list of morpheme dicts, used as-is when
	            truthy.

	    Returns:
	        The lines joined with newlines, always ending in ``EOS``.
	    """

	    def _from_result(entry):
	        # Expand a bare result entry into the full morpheme shape.
	        return {
	            "surface": entry.get("surface", ""),
	            "pos": entry.get("pos", "*"),
	            "pos_detail1": "*",
	            "pos_detail2": "*",
	            "inflection_type": "*",
	            "inflection_form": "*",
	            "base_form": entry.get("surface", ""),
	            "reading": entry.get("reading", "*"),
	        }

	    def _feature_csv(morph):
	        # ``base_form`` falls back to ``lemma`` only when the key is absent;
	        # a falsy value becomes the empty string.
	        base = morph.get("base_form", morph.get("lemma", "")) or ""
	        # Mecari output carries the reading as the seventh field.
	        reading = morph.get("reading", "") or ""
	        return "{},{},{},{},{},{},{}".format(
	            morph.get("pos", "*"),
	            morph.get("pos_detail1", "*"),
	            morph.get("pos_detail2", "*"),
	            morph.get("inflection_type", "*"),
	            morph.get("inflection_form", "*"),
	            base,
	            reading,
	        )

	    if optimal_morphemes:
	        morphemes = optimal_morphemes
	    else:
	        morphemes = [_from_result(entry) for entry in results]

	    rendered = []
	    for morph in morphemes:
	        rendered.append("{}\t{}".format(morph.get("surface", ""), _feature_csv(morph)))
	    rendered.append("EOS")
	    return "\n".join(rendered)


	def _normalize_mecab_output(stdout: str) -> str:
	    """Trim every MeCab feature list to six fields and end the text with EOS.

	    Blank lines are dropped, ``EOS`` lines are kept, lines without a tab are
	    passed through unchanged, and a final ``EOS`` is appended when missing.
	    """
	    lines = []
	    for line in stdout.splitlines():
	        stripped = line.strip()
	        if not stripped:
	            # A blank line is not a sentence boundary; turning it into EOS
	            # would fabricate duplicate markers.
	            continue
	        if stripped == "EOS":
	            lines.append("EOS")
	        elif "\t" in line:
	            surface, feats = line.split("\t", 1)
	            # Keep the first six features and drop trailing extras
	            # (e.g. カテゴリ:, ドメイン:); pad short lists with "*".
	            fields = [part.strip() for part in feats.split(",")][:6]
	            fields.extend(["*"] * (6 - len(fields)))
	            lines.append(f"{surface}\t{','.join(fields)}")
	        else:
	            lines.append(line)
	    if not lines or lines[-1] != "EOS":
	        lines.append("EOS")
	    return "\n".join(lines)


	def mecab_plain(text: str) -> str:
	    """Run the system MeCab on ``text`` and return its normalized output.

	    The binary comes from the ``MECAB_BIN`` environment variable, falling back
	    to ``MeCabAnalyzer().mecab_bin``. When the analyzer's ``jumandic_path`` is
	    an existing directory it is passed to MeCab with ``-d``.

	    Args:
	        text: Text sent to MeCab on standard input.

	    Returns:
	        On success, one line per morpheme (surface, a tab, six comma-separated
	        features) followed by ``EOS``. On a non-zero exit status, MeCab's
	        combined stdout/stderr (or a short ``rc=`` message when both are
	        empty). On any exception, a human-readable error message; this
	        function does not raise.
	    """
	    try:
	        from mecari.analyzers.mecab import MeCabAnalyzer

	        analyzer = MeCabAnalyzer()
	        mecab_bin = os.getenv("MECAB_BIN", analyzer.mecab_bin)
	        args = [mecab_bin]
	        if isinstance(analyzer.jumandic_path, str) and os.path.isdir(analyzer.jumandic_path):
	            args += ["-d", analyzer.jumandic_path]
	        p = subprocess.run(args, input=text, text=True, capture_output=True)
	        if p.returncode != 0:
	            out = (p.stdout or "") + ("\n" + p.stderr if p.stderr else "")
	            return out.strip() or f"mecab error rc={p.returncode}"
	        # On success only stdout is analysis output; stderr (warnings) must
	        # not be parsed as morpheme lines.
	        return _normalize_mecab_output(p.stdout or "")
	    except FileNotFoundError:
	        return "MeCabバイナリが見つかりません（MECAB_BINやpackages.txtを確認）。"
	    except Exception as e:
	        return f"mecab実行時エラー: {e}"


	def analyze(text: str):
	    """Analyze ``text`` with Mecari and with system MeCab.

	    Args:
	        text: User input; surrounding whitespace is stripped before analysis.

	    Returns:
	        A ``(mecari_output, mecab_output)`` pair of strings. Both are empty
	        for missing or blank input. When inference yields a falsy result the
	        first element is a failure message and the second is still the MeCab
	        output. When an exception is raised the first element describes the
	        error (including a traceback for unexpected errors) and the second is
	        empty.
	    """
	    if not (text and text.strip()):
	        return "", ""

	    try:
	        _ensure_model()
	        from infer import predict_morphemes_from_text

	        cleaned = text.strip()
	        prediction = predict_morphemes_from_text(cleaned, _model, _exp_info, silent=True)
	        if prediction:
	            results, optimal_morphemes = prediction
	            # Mecari output is rendered first, then MeCab is run on the same text.
	            return _to_mecab_lines(results, optimal_morphemes), mecab_plain(cleaned)
	        return "推論に失敗しました。", mecab_plain(cleaned)
	    except FileNotFoundError:
	        message = (
	            "MeCabが見つかりません。Spaceのpackages.txtに 'mecab' と 'mecab-jumandic-utf8' を含めてビルドし直すか、\n"
	            "変数 MECAB_BIN=/usr/bin/mecab を設定してください。"
	        )
	        return message, ""
	    except Exception as e:
	        import traceback

	        # Show the full traceback in the output box so failures are visible in the demo.
	        return "エラー: {}\n\n{}".format(e, traceback.format_exc()), ""


	# Custom CSS handed to gr.Blocks(css=...). It forces a system sans-serif font
	# stack (with !important) on the page body, the Gradio container, prose
	# blocks, and text inputs, textareas and buttons.
	FONT_CSS = """
	/* Prefer common system fonts for Latin text */
	body, .gradio-container, .prose, textarea, input, button,
	.gr-text-input input, .gr-text-input textarea, .gr-textbox textarea {
	font-family: system-ui, -apple-system, 'Segoe UI', Roboto, 'Noto Sans',
	'Helvetica Neue', Arial, 'Apple Color Emoji', 'Segoe UI Emoji',
	sans-serif !important;
	}
	"""

	# Build the demo UI: a Markdown header, a text input, an analyze button, two
	# output boxes (Mecari and MeCab), two groups of sample inputs, and the
	# button wiring. The Blocks object is bound to the module-level name `demo`.
	# NOTE(review): the indentation of this block is not visible here, so which
	# components sit inside each gr.Row() must be confirmed before reformatting.
	with gr.Blocks(theme=gr.themes.Soft(), css=FONT_CSS) as demo:
	gr.Markdown(
	"""
	# Mecari Morpheme Analyzer

	形態素解析器"Mecari"のデモです。Googleが発表した手法の非公式再現実装です。GitHub: https://github.com/zbller/Mecari
	"""
	)

	# Input area; the textbox is pre-filled with a sample sentence.
	with gr.Row():
	inp = gr.Textbox(label="テキスト入力", value="外国人参政権", placeholder="とうきょうに行った", lines=3)
	btn = gr.Button("解析する")
	# Output boxes, one per analyzer.
	with gr.Row():
	out_mecari = gr.Textbox(label="Mecari", lines=10)
	out_mecab = gr.Textbox(label="MeCab（Jumandic）", lines=10)
	# Sample inputs labelled "Good examples". run_on_click=True asks Gradio to
	# call analyze() when an example is selected; example caching is disabled.
	gr.Examples(
	examples=[
	["とうきょうに行った"],
	["吾輩わがはいは猫である。名前はまだ無い。"]
	],
	inputs=inp,
	outputs=[out_mecari, out_mecab],
	fn=analyze,
	label="Good examples",
	run_on_click=True,
	cache_examples=False,
	)
	# Sample inputs labelled "Bad examples", wired identically to the group above.
	gr.Examples(
	examples=[
	["すもももももももものうち"],
	["こちら葛飾区亀有公園前派出所"]
	],
	inputs=inp,
	outputs=[out_mecari, out_mecab],
	fn=analyze,
	label="Bad examples",
	run_on_click=True,
	cache_examples=False,
	)
	# The button runs the same analyze() callback on the textbox contents.
	btn.click(fn=analyze, inputs=inp, outputs=[out_mecari, out_mecab])

	# Optional warm-up
	def _warmup():
	    """Try to load the model ahead of the first request; never raises.

	    A failure is logged as a warning instead of being silently discarded.
	    ``_ensure_model()`` leaves the cached model unset when it fails, so the
	    load is attempted again the next time it is called.
	    """
	    try:
	        _ensure_model()
	    except Exception:
	        # Best-effort only: startup must not crash, but the reason for a
	        # failed warm-up should be visible in the logs.
	        import logging

	        logging.getLogger(__name__).warning(
	            "Model warm-up failed; loading will be retried on the first request",
	            exc_info=True,
	        )


	# Runs at import time so the model is loaded before the first request.
	_warmup()

	if __name__ == "__main__":
	    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
	    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))