File size: 6,674 Bytes
34c8a90 a113688 34c8a90 a113688 34c8a90 a113688 34c8a90 a5597da |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import subprocess
import gradio as gr
# Keep Weights & Biases disabled: it must never start inside a Space.
os.environ["WANDB_MODE"] = "disabled"

# Decide which MeCab executable this process uses. An explicit MECAB_BIN
# environment variable wins; otherwise use /usr/bin/mecab when that path
# exists, and fall back to the bare command name "mecab".
if os.path.exists("/usr/bin/mecab"):
    _default_mecab = "/usr/bin/mecab"
else:
    _default_mecab = "mecab"
MECAB_BIN = os.environ.get("MECAB_BIN", _default_mecab)
# Write the resolved value back so later os.getenv("MECAB_BIN") calls see it.
os.environ["MECAB_BIN"] = MECAB_BIN

# Model state; both stay None until the model is loaded on first use.
_model = _exp_info = None
def _ensure_model():
    """Load the model once and cache it in the module globals.

    Returns immediately when ``_model`` is already set. Otherwise calls
    ``infer.load_model()`` and unpacks its result into ``_model`` and
    ``_exp_info``.

    Raises:
        RuntimeError: If ``load_model()`` returns ``None``.
    """
    global _model, _exp_info
    if _model is not None:
        return

    # Deferred import: ``infer`` is only imported when a load is actually needed.
    from infer import load_model

    loaded = load_model()
    if loaded is None:
        raise RuntimeError(
            "Model could not be loaded. Ensure sample_model/ exists with config.yaml and model.pt."
        )
    _model, _exp_info = loaded
def _to_mecab_lines(results, optimal_morphemes=None) -> str:
    """Render morphemes as MeCab-style text.

    Each morpheme becomes one ``surface<TAB>f1,...,f7`` line and the output
    always ends with an ``EOS`` line. The seven features are, in order:
    ``pos``, ``pos_detail1``, ``pos_detail2``, ``inflection_type``,
    ``inflection_form``, base form (``base_form``, else ``lemma``) and
    ``reading``.

    Args:
        results: Fallback sequence of dicts, used only when
            ``optimal_morphemes`` is empty or ``None``. Only ``surface``,
            ``pos`` and ``reading`` are read; the surface doubles as the
            base form. ``None`` is treated as an empty sequence.
        optimal_morphemes: Preferred sequence of morpheme dicts carrying the
            feature keys listed above.

    Returns:
        The joined lines, without a trailing newline.
    """
    detail_keys = (
        "pos",
        "pos_detail1",
        "pos_detail2",
        "inflection_type",
        "inflection_form",
    )

    def mecab_features(m):
        # "*" is the placeholder for a missing feature. Apply it to every
        # field (not just base form and reading) so that a None value never
        # shows up as the text "None" and an empty string never leaves an
        # empty CSV column.
        fields = [m.get(key) or "*" for key in detail_keys]
        fields.append(m.get("base_form", m.get("lemma", "*")) or "*")
        # Mecari output includes the reading as the 7th field.
        fields.append(m.get("reading") or "*")
        return ",".join(str(value) for value in fields)

    if optimal_morphemes:
        items = optimal_morphemes
    else:
        # Keys that are not set here fall back to "*" in mecab_features().
        items = [
            {
                "surface": r.get("surface", ""),
                "pos": r.get("pos"),
                "base_form": r.get("surface", ""),
                "reading": r.get("reading"),
            }
            for r in (results or [])
        ]

    lines = [f"{m.get('surface') or ''}\t{mecab_features(m)}" for m in items]
    lines.append("EOS")
    return "\n".join(lines)
def _trim_mecab_output(stdout: str, n_features: int = 6) -> str:
    """Normalise MeCab stdout so every token line carries exactly *n_features* fields."""
    trimmed_lines = []
    for line in stdout.splitlines():
        if not line:
            # A blank line is not a sentence boundary; MeCab marks those with EOS.
            continue
        if line.strip() == "EOS":
            trimmed_lines.append("EOS")
        elif "\t" in line:
            surface, feats = line.split("\t", 1)
            # Drop extra tail fields (e.g. trailing category/domain
            # annotations) and pad short feature lists with "*".
            parts = [s.strip() for s in feats.split(",")][:n_features]
            parts += ["*"] * (n_features - len(parts))
            trimmed_lines.append(f"{surface}\t{','.join(parts)}")
        else:
            trimmed_lines.append(line)
    # Guarantee the output ends with EOS without duplicating an existing one.
    if not trimmed_lines or trimmed_lines[-1] != "EOS":
        trimmed_lines.append("EOS")
    return "\n".join(trimmed_lines)


def mecab_plain(text: str) -> str:
    """Run the system MeCab on *text* and return its parse as display text.

    The executable comes from the ``MECAB_BIN`` environment variable, falling
    back to ``MeCabAnalyzer().mecab_bin``. ``-d <jumandic_path>`` is passed
    when the analyzer's ``jumandic_path`` is an existing directory.

    Args:
        text: Raw text fed to MeCab on stdin.

    Returns:
        On success, MeCab's stdout as ``surface<TAB>features`` lines with
        each feature list cut or padded to six fields and a final ``EOS``.
        On a non-zero exit status, MeCab's combined stdout/stderr text, or
        ``"mecab error rc=N"`` when both are empty. If the binary is missing
        or any other exception occurs, an error message string. Exceptions
        derived from ``Exception`` are never propagated.
    """
    try:
        from mecari.analyzers.mecab import MeCabAnalyzer

        analyzer = MeCabAnalyzer()
        mecab_bin = os.getenv("MECAB_BIN", analyzer.mecab_bin)
        args = [mecab_bin]
        if isinstance(analyzer.jumandic_path, str) and os.path.isdir(analyzer.jumandic_path):
            args += ["-d", analyzer.jumandic_path]
        # Pin UTF-8 instead of relying on the container locale; the dictionary
        # package named in this file's setup hint is mecab-jumandic-utf8.
        p = subprocess.run(
            args,
            input=text,
            capture_output=True,
            text=True,
            encoding="utf-8",
        )
        if p.returncode != 0:
            # Only a failure report includes stderr.
            out = (p.stdout or "") + ("\n" + p.stderr if p.stderr else "")
            return out.strip() or f"mecab error rc={p.returncode}"
        # On success parse stdout alone, so warnings written to stderr are
        # not mistaken for token lines and do not add an extra EOS.
        return _trim_mecab_output(p.stdout or "")
    except FileNotFoundError:
        return "MeCabバイナリが見つかりません(MECAB_BINやpackages.txtを確認)。"
    except Exception as e:
        return f"mecab実行時エラー: {e}"
def analyze(text: str):
    """Analyse *text* with Mecari and with system MeCab for side-by-side display.

    Args:
        text: User input; surrounding whitespace is ignored.

    Returns:
        A ``(mecari_output, mecab_output)`` pair of strings. Both are empty
        for empty or whitespace-only input. If prediction yields nothing, the
        first element is a failure notice and the second is still the MeCab
        parse. If an exception occurs, the first element is an error message
        (with traceback for unexpected errors) and the second is empty.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        return "", ""

    try:
        _ensure_model()
        from infer import predict_morphemes_from_text

        prediction = predict_morphemes_from_text(stripped, _model, _exp_info, silent=True)
        if not prediction:
            return "推論に失敗しました。", mecab_plain(stripped)

        results, optimal_morphemes = prediction
        mecari_out = _to_mecab_lines(results, optimal_morphemes)
        return mecari_out, mecab_plain(stripped)
    except FileNotFoundError:
        hint = (
            "MeCabが見つかりません。Spaceのpackages.txtに 'mecab' と 'mecab-jumandic-utf8' を含めてビルドし直すか、\n"
            "変数 MECAB_BIN=/usr/bin/mecab を設定してください。"
        )
        return hint, ""
    except Exception as e:
        import traceback

        # Show the full traceback in the UI so failures can be diagnosed.
        return f"エラー: {e}\n\n{traceback.format_exc()}", ""
# Custom stylesheet passed to gr.Blocks(css=...). It applies a system
# sans-serif font stack (with !important) to the page body, the Gradio
# container, prose, and text inputs/buttons.
FONT_CSS = """
/* Prefer common system fonts for Latin text */
body, .gradio-container, .prose, textarea, input, button,
.gr-text-input input, .gr-text-input textarea, .gr-textbox textarea {
font-family: system-ui, -apple-system, 'Segoe UI', Roboto, 'Noto Sans',
'Helvetica Neue', Arial, 'Apple Color Emoji', 'Segoe UI Emoji',
sans-serif !important;
}
"""
# Assemble the Gradio UI. `demo` is launched under the __main__ guard at the
# end of the file.
# NOTE(review): the nesting below is reconstructed; it assumes the button
# shares the first row with the input box and that the example galleries sit
# below the output row. Confirm against the deployed layout.
with gr.Blocks(theme=gr.themes.Soft(), css=FONT_CSS) as demo:
    # Page header. The string content stays unindented so it is unchanged.
    gr.Markdown(
        """
# Mecari Morpheme Analyzer
形態素解析器"Mecari"のデモです。Googleが発表した手法の非公式再現実装です。GitHub: https://github.com/zbller/Mecari
"""
    )
    # Input row: the text box (pre-filled with a sample) and the run button.
    with gr.Row():
        inp = gr.Textbox(label="テキスト入力", value="外国人参政権", placeholder="とうきょうに行った", lines=3)
        btn = gr.Button("解析する")
    # Output row: Mecari result next to the system MeCab result.
    with gr.Row():
        out_mecari = gr.Textbox(label="Mecari", lines=10)
        out_mecab = gr.Textbox(label="MeCab(Jumandic)", lines=10)
    # Example inputs labelled "Good examples". They are wired to `analyze`
    # with run_on_click=True and are not cached.
    gr.Examples(
        examples=[
            ["とうきょうに行った"],
            ["吾輩わがはいは猫である。名前はまだ無い。"]
        ],
        inputs=inp,
        outputs=[out_mecari, out_mecab],
        fn=analyze,
        label="Good examples",
        run_on_click=True,
        cache_examples=False,
    )
    # Example inputs labelled "Bad examples", wired the same way.
    gr.Examples(
        examples=[
            ["すもももももももものうち"],
            ["こちら葛飾区亀有公園前派出所"]
        ],
        inputs=inp,
        outputs=[out_mecari, out_mecab],
        fn=analyze,
        label="Bad examples",
        run_on_click=True,
        cache_examples=False,
    )
    # The button runs `analyze` on the input and fills both output boxes.
    btn.click(fn=analyze, inputs=inp, outputs=[out_mecari, out_mecab])
# Optional warm-up
def _warmup():
    """Attempt to load the model eagerly, at import time.

    Best-effort: a failure is logged with its traceback and then swallowed,
    so importing this module never fails because of the model. ``analyze``
    calls ``_ensure_model()`` again, so loading is retried on the first
    request.
    """
    import logging

    try:
        _ensure_model()
    except Exception:
        # Stay non-fatal, but record why warm-up failed instead of hiding it.
        logging.getLogger(__name__).warning(
            "Model warm-up failed; loading will be retried on the first request.",
            exc_info=True,
        )


_warmup()
# Script entry point: serve the demo on all interfaces. The port comes from
# the PORT environment variable and defaults to 7860.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", "7860")),
    )
|