File size: 15,005 Bytes

357ae2c

#!/usr/bin/env python3
"""
Pick an ONNX SaT model and segment text with it (local CPU test).

Examples:
    # interactive: choose a model from a menu, then type/paste text
    python scripts/run_segmentation.py

    # one-shot
    python scripts/run_segmentation.py --model sat-1l-sm-en_zh-int8 \
        --text "Your text here. 这是中文。" --max-length 80

    # from a file, tighter Chinese-style budget
    python scripts/run_segmentation.py -m sat-3l-sm-en_zh-int8 -f test.txt \
        --max-length 40 --min-length 15

Notes:
- "*-en_zh-*" models use a pruned vocab; the id-remap is recomputed on the fly
  (deterministic from the tokenizer) whenever its cache,
  onnx_models/remap_en_zh.npy, is missing, so no extra files need to be shipped.
- onnxruntime needs the conda libstdc++ on this box; the script auto-preloads it
  and re-execs once if needed.
"""
import argparse
import math
import os
import re
import string
import sys
from pathlib import Path

# --- bootstrap: onnxruntime needs conda's libstdc++ preloaded on this machine ---
def _ensure_onnxruntime():
    """Make sure ``import onnxruntime`` works, re-executing the script if needed.

    The import is attempted with ``sys.stderr`` temporarily replaced by an
    in-memory buffer. If it fails and
    ``<CONDA_PREFIX or sys.prefix>/lib/libstdc++.so.6`` exists, that library is
    prepended to ``LD_PRELOAD`` and the interpreter is re-executed with the
    same argv. The ``_ORT_PRELOADED`` environment flag limits this to a single
    retry; when no retry is possible the original import error propagates.
    """
    import contextlib
    import io

    try:
        # Probe quietly: a failed import dumps a long numpy/GLIBCXX message to stderr.
        with contextlib.redirect_stderr(io.StringIO()):
            import onnxruntime  # noqa
    except Exception:
        env = os.environ
        conda_root = env.get("CONDA_PREFIX") or sys.prefix
        libstdcxx = Path(conda_root) / "lib" / "libstdc++.so.6"
        retry_allowed = libstdcxx.exists() and env.get("_ORT_PRELOADED") != "1"
        if retry_allowed:
            inherited = env.get("LD_PRELOAD", "")
            env["LD_PRELOAD"] = f"{libstdcxx}:{inherited}".strip(":")
            env["_ORT_PRELOADED"] = "1"
            # On success os.execv does not return: the process is replaced.
            os.execv(sys.executable, [sys.executable] + sys.argv)
        raise


_ensure_onnxruntime()

import importlib.util  # noqa: E402
import types  # noqa: E402

import numpy as np  # noqa: E402
import onnxruntime as ort  # noqa: E402

# Column of the model's logits that boundary_probs reads as the boundary score.
NEWLINE_INDEX = 0
# Parent of the directory containing this script; the helper sources and
# onnx_models/ are resolved relative to it.
ROOT = Path(__file__).resolve().parent.parent
# Directory searched recursively for *.onnx files (also holds the remap cache).
MODELS_DIR = ROOT / "onnx_models"


# --- load the two tiny pure-numpy helper modules WITHOUT importing the heavy
#     wtpsplit package (which pulls torch/onnx/skops and costs ~5s on startup).
#     constraints.py references wtpsplit.utils.indices_to_sentences but
#     constrained_segmentation() never calls it, so we stub that one symbol. ---
def _load_light(path, name):
    """Execute the Python source file at *path* as a module called *name*.

    Before loading, placeholder ``wtpsplit`` and ``wtpsplit.utils`` modules are
    registered in ``sys.modules`` (only if ``wtpsplit`` is not already there)
    so that ``from wtpsplit.utils import indices_to_sentences`` inside the
    loaded file resolves to a no-op instead of importing the real package.

    Returns the executed module object; it is not added to ``sys.modules``.
    """
    if "wtpsplit" not in sys.modules:
        stub_pkg = types.ModuleType("wtpsplit")
        stub_pkg.__path__ = []
        stub_utils = types.ModuleType("wtpsplit.utils")
        stub_utils.__path__ = []
        stub_utils.indices_to_sentences = lambda *a, **k: None  # unused here
        sys.modules["wtpsplit"] = stub_pkg
        sys.modules["wtpsplit.utils"] = stub_utils

    module_spec = importlib.util.spec_from_file_location(name, path)
    loaded = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(loaded)
    return loaded


# Directory holding the two helper source files that are loaded by path.
_WT_UTILS = ROOT / "wtpsplit" / "utils"
# Each helper file is executed under a private module name and only the single
# callable this script uses is kept.
constrained_segmentation = _load_light(_WT_UTILS / "constraints.py",
                                       "onnxseg_constraints").constrained_segmentation
create_prior_function = _load_light(_WT_UTILS / "priors.py",
                                    "onnxseg_priors").create_prior_function

def get_token_spans(offsets_mapping, tokens, special_tokens):
    """Select the non-special tokens and their character spans.

    Args:
        offsets_mapping: per-token ``(start, end)`` character offsets.
        tokens: token strings, parallel to ``offsets_mapping``.
        special_tokens: token strings to skip (e.g. ``<s>``, ``</s>``).

    Returns:
        ``(valid, spans)``: ``valid`` is a 1-D integer array with the positions
        of the kept tokens, ``spans`` the matching ``(len(valid), 2)`` integer
        array of offsets. Both are empty (but correctly shaped) when every
        token is special.
    """
    n_offsets = len(offsets_mapping)
    # Explicit integer dtype: an empty selection would otherwise default to
    # float64, which NumPy refuses to use as an index array.
    valid = np.array(
        [i for i, tok in enumerate(tokens)
         if i < n_offsets and tok not in special_tokens],
        dtype=np.intp,
    )
    # reshape keeps the (n, 2) layout even if there are no offsets at all.
    spans = np.asarray(offsets_mapping, dtype=np.intp).reshape(-1, 2)
    return valid, spans[valid]


def token_to_char_probs(text, tokens, token_logits, special_tokens, offsets_mapping):
    """Spread per-token logits onto a per-character grid.

    Returns an array of shape ``(len(text), token_logits.shape[1])`` filled
    with ``-inf``, except that the row of each non-special token's last
    character (its span end minus one) holds that token's logits.
    """
    n_labels = token_logits.shape[1]
    per_char = np.full((len(text), n_labels), -np.inf)
    kept, spans = get_token_spans(offsets_mapping, tokens, special_tokens)
    last_char = spans[:, 1] - 1  # span ends are exclusive
    per_char[last_char] = token_logits[kept]
    return per_char


# tokenizer.json kept in a hidden folder next to this script; load_tokenizer
# creates it when missing and reads it on every run.
_TOK_CACHE = Path(__file__).resolve().parent / ".xlmr_tokenizer" / "tokenizer.json"


class FastTok:
    """Thin wrapper over the `tokenizers` Rust lib (loads in ~0.4s vs ~4.3s for
    transformers + AutoTokenizer). Exposes only what this script needs.

    Attributes:
        special_tokens: set of special token strings.
        unk_token_id: id of ``<unk>`` as reported by the wrapped tokenizer.
        all_special_ids: ids of the special tokens the tokenizer knows, in the
            fixed order of ``_SPECIAL_TOKENS``.
    """

    # Fixed order so all_special_ids is reproducible from run to run; iterating
    # a set of str depends on hash randomization.
    _SPECIAL_TOKENS = ("<s>", "</s>", "<pad>", "<unk>", "<mask>")

    def __init__(self, tok):
        """Wrap *tok*, an object offering token_to_id/encode/get_vocab."""
        self._t = tok
        self.special_tokens = set(self._SPECIAL_TOKENS)
        self.unk_token_id = tok.token_to_id("<unk>")
        looked_up = (tok.token_to_id(s) for s in self._SPECIAL_TOKENS)
        # token_to_id returns None for tokens missing from the vocab.
        self.all_special_ids = [i for i in looked_up if i is not None]

    def encode(self, text):
        """Return ``(ids, offsets, tokens)`` for *text*."""
        e = self._t.encode(text)  # XLM-R template adds <s> ... </s>
        return e.ids, e.offsets, e.tokens

    def get_vocab(self):
        """Return the wrapped tokenizer's token -> id mapping."""
        return self._t.get_vocab()


def load_tokenizer():
    """Return a FastTok backed by the cached tokenizer.json.

    When the cache file does not exist yet, transformers is imported (lazily)
    to fetch "xlm-roberta-base" and save it into the cache directory. Once the
    file exists only the `tokenizers` lib is imported.
    """
    from tokenizers import Tokenizer

    cache_file = _TOK_CACHE
    if not cache_file.exists():
        from transformers import AutoTokenizer  # lazy: only on first build

        hf_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
        hf_tokenizer.save_pretrained(str(cache_file.parent))
    fast = Tokenizer.from_file(str(cache_file))
    return FastTok(fast)


def compute_keep_ids(tokenizer):
    """Return the sorted vocab ids kept by EN+ZH pruning.

    A token is kept when, after turning the SentencePiece marker "▁" into a
    space, it is pure ASCII or contains at least one character for which
    _is_cjk is true. The tokenizer's special ids are always kept.
    """
    def _wanted(piece):
        plain = piece.replace("▁", " ")  # SP underscore -> space
        if all(ord(c) < 128 for c in plain):
            return True
        return any(_is_cjk(c) for c in plain)

    kept = {idx for piece, idx in tokenizer.get_vocab().items() if _wanted(piece)}
    kept.update(tokenizer.all_special_ids)
    return sorted(kept)


def get_remap(tokenizer):
    """Return ``(remap, unk_new)`` for the EN+ZH pruned vocabulary.

    ``remap`` is an int64 array of length 250002 mapping an original token id
    to its index in the sorted keep-set, or -1 when the id was pruned.
    ``unk_new`` is the remapped id of the tokenizer's unknown token. The table
    is read from ``MODELS_DIR / "remap_en_zh.npy"`` when that file exists;
    otherwise it is computed and written there.
    """
    cache_file = MODELS_DIR / "remap_en_zh.npy"
    if not cache_file.exists():
        kept_ids = np.asarray(compute_keep_ids(tokenizer), dtype=np.int64)
        table = np.full(250002, -1, dtype=np.int64)
        # kept_ids is sorted and unique, so position in it == new id.
        table[kept_ids] = np.arange(kept_ids.size, dtype=np.int64)
        MODELS_DIR.mkdir(parents=True, exist_ok=True)
        np.save(cache_file, table)
    else:
        table = np.load(cache_file)
    return table, int(table[tokenizer.unk_token_id])


def find_models(root: Path):
    """Map a display name to the path of every ``*.onnx`` file below *root*.

    The display name is ``<parent directory name>-<int8|fp32>``; a file counts
    as int8 when its file name contains ``.int8.``. Files are visited in sorted
    path order, so when two files produce the same name the later one wins.
    """
    def _display_name(onnx_file):
        precision = "int8" if ".int8." in onnx_file.name else "fp32"
        return f"{onnx_file.parent.name}-{precision}"  # e.g. sat-1l-sm-en_zh-int8

    return {_display_name(f): f for f in sorted(root.rglob("*.onnx"))}


def choose_model(models: dict):
    """Interactively ask which model to use and return its display name.

    Prints a numbered menu of *models* (display name -> path, with the file
    size in MB) and keeps prompting until the reply is either a menu number in
    range or an exact display name.
    """
    names = list(models)
    print("\nAvailable ONNX models:")
    for i, n in enumerate(names, 1):
        mb = models[n].stat().st_size / 1e6
        print(f"  {i:2d}) {n:30s} {mb:7.1f} MB")
    while True:
        sel = input("\nSelect model [number or name]: ").strip()
        # isdecimal(), not isdigit(): isdigit() is also true for characters
        # such as "²", which int() rejects with ValueError.
        if sel.isdecimal() and 1 <= int(sel) <= len(names):
            return names[int(sel) - 1]
        if sel in models:
            return sel
        print("  invalid choice, try again")


def get_text(args):
    """Return the text to segment, trying each source in priority order.

    Order: a non-empty ``args.text``; else the UTF-8 contents of ``args.file``;
    else whatever is read from stdin until EOF. If the stdin input is blank, a
    built-in bilingual sample sentence is returned instead.
    """
    if args.text:
        return args.text
    if args.file:
        source = Path(args.file)
        return source.read_text(encoding="utf-8")

    print("\nEnter/paste text, then Ctrl-D (Ctrl-Z on Windows) to finish:")
    typed = sys.stdin.read()
    if typed.strip():
        return typed
    return "Breaking News: Scientists announced a discovery. 这是一个测试。It works well!"


# Inclusive code-point ranges that _is_cjk treats as "CJK". The last two cover
# punctuation and width variants, not only ideographs.
CJK_RANGES = [
    (0x4E00, 0x9FFF),  # CJK Unified Ideographs
    (0x3400, 0x4DBF),  # CJK Unified Ideographs Extension A
    (0xF900, 0xFAFF),  # CJK Compatibility Ideographs
    (0x3000, 0x303F),  # CJK Symbols and Punctuation
    (0xFF00, 0xFFEF),  # Halfwidth and Fullwidth Forms
]


def _is_cjk(ch):
    """Return True when the single character *ch* lies in one of CJK_RANGES."""
    code = ord(ch)
    for low, high in CJK_RANGES:
        if low <= code <= high:
            return True
    return False


# Punctuation that marks a prosodic pause, by strength (used as break-priority
# floors when a long sentence must be split below max_length). ASCII
# sentence-ending punctuation (. ! ?) is intentionally NOT floored here -- the
# model already predicts those boundaries well, and overriding it would create
# false breaks after abbreviations like "A.I.". CJK sentence-ending punctuation
# is listed separately (CJK_SENT_PUNCT) and does get a floor in
# pause_aware_mask.
CLAUSE_PUNCT = set(",;:)]}—–"           # , ; : ) ] } em/en-dash
                   "，、；："      # fullwidth/CJK ， 、 ； ：
                   "”’")                 # closing curly quotes ” ’ (the opening “ is not included)
# Sentence-final marks; pause_aware_mask raises the break probability after
# these to at least 0.9.
CJK_SENT_PUNCT = set("。！？…")   # 。 ！ ？ …

# Words that introduce a clause/phrase: breaking *before* one of these sounds
# more natural than a random word gap when a long span has no punctuation.
# _connector_break_positions compares against this set after lower-casing the
# word and stripping ASCII punctuation from its ends.
CONNECTORS = {
    "and", "but", "or", "nor", "yet", "so", "for",
    "which", "that", "who", "whom", "whose", "where", "when", "while",
    "because", "although", "though", "since", "if", "unless", "until",
    "after", "before", "as", "than", "whether",
}

# Lower bounds that pause_aware_mask applies to a boundary probability with
# max(); FORBID is instead an upper bound applied with min().
FLOOR_CLAUSE = 0.25       # comma / semicolon / colon -> strongly preferred
FLOOR_CONNECTOR = 0.05    # break before "and/which/that..." in a comma-free span
FLOOR_HANZI = 5e-3        # between two Chinese chars (no spaces in zh)
FLOOR_SPACE = 1e-4        # plain word gap -> last-resort break
FORBID = 1e-9             # mid-word -> effectively never


def _connector_break_positions(text):
    """Return the set of indices i such that a break after char i lands right
    before a connector word.

    Every whitespace run followed by a word is inspected; the word is compared
    against CONNECTORS after stripping ASCII punctuation from both ends and
    lower-casing. The recorded index is the character just before the
    whitespace run (the last char of the preceding word).
    """
    found = set()
    for match in re.finditer(r"\s+(\S+)", text):
        gap_start = match.start()
        if gap_start == 0:
            continue  # text begins with whitespace: nothing precedes the gap
        bare = match.group(1).strip(string.punctuation).lower()
        if bare in CONNECTORS:
            found.add(gap_start - 1)
    return found


def pause_aware_mask(probs, text):
    """Bias forced breaks toward natural prosodic pauses so TTS doesn't pause
    mid-phrase. probs[i] = boundary prob *after* char i (between i and i+1).

    Returns a modified copy; *probs* itself is left untouched.

    Existing probabilities are only ever raised to a floor (never lowered)
    except at forbidden positions. Floors, strongest first:
      CJK sentence punctuation (0.9) > clause punctuation (, ; : 、 ， …) >
      connector word (and/which/that) > between two CJK chars > plain word gap,
    and mid-word positions are capped at FORBID so words/abbreviations are
    never cut. The result: long sentences break at the nearest comma/clause in
    range, then before a connecting word, and only at a bare space as a last
    resort.

    A break is never offered directly in front of clause/sentence punctuation
    that is glued to the preceding character (e.g. between "。" and "”",
    inside "……" or "？！", or between a hanzi and "，"): that would start a
    chunk with punctuation. Such positions are capped at FORBID, and the
    sentence-level floor (or a higher model probability) seen inside the
    punctuation run is applied after the run's last character instead.
    """
    p = probs.copy()
    connectors = _connector_break_positions(text)
    sentence_floor = 0.9
    owed = 0.0  # strength deferred from inside a punctuation run to its end
    for i in range(len(text) - 1):  # never break before end-of-text marker
        ch, nxt = text[i], text[i + 1]

        nxt_is_punct = nxt in CLAUSE_PUNCT or nxt in CJK_SENT_PUNCT
        if nxt_is_punct and not ch.isspace():
            # The next char belongs to the text on its left; keep them together.
            if ch in CJK_SENT_PUNCT:
                owed = max(owed, sentence_floor, p[i])
            p[i] = min(p[i], FORBID)
            continue

        ends_token = nxt.isspace() or _is_cjk(nxt)
        if ch in CLAUSE_PUNCT and ends_token:
            p[i] = max(p[i], FLOOR_CLAUSE)
        elif ch in CJK_SENT_PUNCT:                       # zh sentence end
            p[i] = max(p[i], sentence_floor)
        elif i in connectors:                            # break before connector
            p[i] = max(p[i], FLOOR_CONNECTOR)
        elif nxt.isspace() or ch.isspace():              # plain word boundary
            p[i] = max(p[i], FLOOR_SPACE)
        elif _is_cjk(ch) and _is_cjk(nxt):               # between hanzi
            p[i] = max(p[i], FLOOR_HANZI)
        else:                                            # mid-word/abbreviation
            p[i] = min(p[i], FORBID)

        if owed:
            # End of a punctuation run: settle the deferred sentence strength.
            p[i] = max(p[i], owed)
            owed = 0.0
    return p


# kept as an alias so existing imports (benchmark) keep working
word_safe_mask = pause_aware_mask


def boundary_probs(session, tokenizer, text, remap, unk_new):
    """Run the model on *text* and return one boundary probability per char.

    Args:
        session: object whose ``run(output_names, feeds)`` returns the model
            outputs (here an onnxruntime session with a "logits" output).
        tokenizer: provides ``encode(text) -> (ids, offsets, tokens)`` and
            ``special_tokens``.
        text: the string to score.
        remap: optional old->new token id table; entries of -1 are replaced
            by *unk_new*. ``None`` means ids are fed unchanged.
        unk_new: replacement id for pruned tokens (used only with *remap*).

    Returns:
        1-D float array of length ``len(text)``: the sigmoid of logit column
        NEWLINE_INDEX. Characters that are not the last char of a token carry
        ``-inf`` logits and therefore probability 0.

    NOTE(review): the whole text is fed as one sequence, with no truncation or
    windowing -- confirm the model accepts the longest expected input.
    """
    token_ids, offsets, tokens = tokenizer.encode(text)
    input_ids = np.array([token_ids], dtype=np.int64)  # batch of one
    attention = np.ones_like(input_ids)
    if remap is not None:
        input_ids = remap[input_ids]
        input_ids[input_ids == -1] = unk_new

    feeds = {"input_ids": input_ids, "attention_mask": attention}
    outputs = session.run(["logits"], feeds)
    logits = outputs[0]

    char_logits = token_to_char_probs(text, tokens, logits[0],
                                      tokenizer.special_tokens, offsets)
    boundary_logit = char_logits[:, NEWLINE_INDEX]
    return 1.0 / (1.0 + np.exp(-boundary_logit))


def _build_arg_parser():
    """Return the argparse parser defining this script's command-line flags."""
    ap = argparse.ArgumentParser(description="Segment text with a local ONNX SaT model")
    ap.add_argument("-m", "--model", help="model name (see menu if omitted)")
    ap.add_argument("-t", "--text", help="text to segment")
    ap.add_argument("-f", "--file", help="read text from this file")
    ap.add_argument("--max-length", type=int, default=80, help="target max chars per chunk")
    ap.add_argument("--min-length", type=int, default=40, help="min chars per chunk")
    ap.add_argument("--overflow", type=int, default=0,
                    help="chars a chunk may exceed --max-length to reach a comma/"
                         "clause/sentence pause (soft cap; 0 = hard cap)")
    ap.add_argument("--prior", default="gaussian",
                    choices=["uniform", "gaussian", "clipped_polynomial"])
    ap.add_argument("--target", type=int, default=70, help="gaussian target length")
    ap.add_argument("--spread", type=int, default=12, help="gaussian spread")
    ap.add_argument("--algorithm", default="viterbi", choices=["viterbi", "greedy"])
    ap.add_argument("--allow-midword", action="store_true",
                    help="permit breaks inside words/abbreviations (off by default)")
    return ap


def _build_length_prior(args, hard_max):
    """Return the chunk-length prior (a callable of the length) for the DP.

    The base prior comes from create_prior_function with ``max_length`` set to
    *hard_max*. With --overflow > 0 it is multiplied, for lengths beyond
    --max-length, by a decay tail exp(-((L - max_length) / overflow) ** 2).
    """
    prior_kwargs = {"max_length": hard_max}
    if args.prior != "uniform":
        prior_kwargs.update(target_length=args.target, spread=args.spread)
    base_prior = create_prior_function(args.prior, prior_kwargs)
    if args.overflow <= 0:
        return base_prior

    soft, decay = args.max_length, float(args.overflow)

    def soft_capped_prior(length):
        penalty = 1.0 if length <= soft else math.exp(-((length - soft) / decay) ** 2)
        return base_prior(length) * penalty

    return soft_capped_prior


def main():
    """CLI entry point: pick a model, segment the input text, print the chunks.

    Exits with an error message when no model is available, the requested
    model is unknown, the input is empty or whitespace-only, or the produced
    chunks do not rejoin to the original text.
    """
    args = _build_arg_parser().parse_args()

    models = find_models(MODELS_DIR)
    if not models:
        sys.exit(f"No ONNX models found under {MODELS_DIR}. Run build_and_test_onnx.py first.")

    name = args.model or choose_model(models)
    if name not in models:
        sys.exit(f"Unknown model '{name}'. Choices: {', '.join(models)}")
    path = models[name]

    tokenizer = load_tokenizer()
    remap = unk_new = None
    if "en_zh" in name:
        remap, unk_new = get_remap(tokenizer)

    session = ort.InferenceSession(str(path), providers=["CPUExecutionProvider"])
    text = get_text(args)
    # An empty file would otherwise reach the model with no real tokens and
    # fail deep inside the NumPy indexing code.
    if not text.strip():
        sys.exit("No text to segment (input is empty or whitespace-only).")

    probs = boundary_probs(session, tokenizer, text, remap, unk_new)
    if not args.allow_midword:
        probs = word_safe_mask(probs, text)

    # Hard ceiling for the DP. With --overflow, allow chunks past --max-length up
    # to this ceiling; a decay tail past --max-length keeps plain spaces from
    # exploiting the slack while still letting a strong pause (comma/sentence)
    # pull the break into the overflow zone.
    hard_max = args.max_length + max(0, args.overflow)
    prior = _build_length_prior(args, hard_max)

    idx = constrained_segmentation(probs, prior, min_length=args.min_length,
                                   max_length=hard_max, algorithm=args.algorithm)
    cuts = [0] + list(idx) + [len(text)]
    chunks = [text[start:end] for start, end in zip(cuts, cuts[1:])]

    print(f"\nModel: {name}  ({path.stat().st_size/1e6:.1f} MB)")
    print(f"Config: max={args.max_length} overflow={args.overflow} "
          f"min={args.min_length} prior={args.prior} algo={args.algorithm}")
    print(f"Input: {len(text)} chars -> {len(chunks)} chunks\n")
    for c in chunks:
        n = len(c)
        # "!" = above the hard ceiling, "+" = inside the overflow zone.
        flag = "!" if n > hard_max else ("+" if n > args.max_length else " ")
        print(f"  {flag}[{n:3d}] {c.strip()[:90]}")
    # Explicit check rather than assert, so it still runs under `python -O`.
    if "".join(chunks) != text:
        sys.exit("TEXT NOT PRESERVED")
    print("\n  ✓ text preserved (chunks rejoin to original)")


if __name__ == "__main__":
    main()