|
|
""" |
|
|
Benchmark: underthesea_core FastText (Rust/PyO3) vs all Python fasttext libraries. |
|
|
|
|
|
Compares: model loading time, single prediction latency, batch throughput. |
|
|
|
|
|
Libraries tested: |
|
|
1. underthesea_core - Pure Rust (PyO3), predict-only |
|
|
2. fasttext-predict - C++ stripped predict-only, no numpy (<1MB) |
|
|
3. fasttext-wheel - Full Facebook C++ fasttext |
|
|
4. fast-langdetect - Wrapper around fasttext-predict, bundles lid.176.ftz |
|
|
5. fasttext-langdetect - Wrapper around full fasttext |
|
|
""" |
|
|
|
|
|
import json
import os
import subprocess
import sys
|
|
|
|
|
# Path of the fastText model file handed to every benchmark subprocess.
MODEL_PATH = "/tmp/lid.176.ftz"
|
|
|
|
|
# Benchmark inputs: a mix of languages and lengths (short words up to a long
# paragraph). The list is serialized as JSON and passed to each subprocess.
SENTENCES = [
    # Vietnamese
    "Xin chào, tôi là sinh viên Việt Nam",
    "Hôm nay thời tiết rất đẹp, tôi muốn đi dạo công viên",
    "Việt Nam là một quốc gia nằm ở phía đông bán đảo Đông Dương thuộc khu vực Đông Nam Á",
    # English
    "The quick brown fox jumps over the lazy dog",
    "Machine learning is a subset of artificial intelligence that focuses on building systems",
    "Natural language processing enables computers to understand human language",
    # French
    "Bonjour le monde, comment allez-vous aujourd'hui",
    "La France est un pays dont la métropole se situe en Europe de l'Ouest",
    # Chinese
    "今天天气很好我想出去走走",
    "机器学习是人工智能的一个重要分支",
    # Japanese
    "今日はとても良い天気ですね",
    "自然言語処理は人工知能の重要な分野です",
    # Very short inputs
    "hello",
    "xin chào",
    "bonjour",
    # One long Vietnamese paragraph; the three literals are concatenated
    # into a single list element.
    (
        "Việt Nam, tên gọi chính thức là Cộng hòa Xã hội chủ nghĩa Việt Nam, "
        "là một quốc gia nằm ở cực Đông của bán đảo Đông Dương thuộc khu vực "
        "Đông Nam Á, giáp với Lào, Campuchia, Trung Quốc, biển Đông và vịnh Thái Lan."
    ),
]
|
|
|
|
|
|
|
|
# Self-contained benchmark program, executed with ``python -c`` by
# run_benchmark(). It reads argv[1] = model path, argv[2] = JSON list of
# sentences, argv[3] = library name, and prints one JSON object with the
# measurements on stdout.
RUNNER_SCRIPT = r'''
import json
import statistics
import sys
import time

MODEL_PATH = sys.argv[1]
SENTENCES = json.loads(sys.argv[2])
LIB_NAME = sys.argv[3]
K = 3               # k passed to predict() (not used by the detect() wrappers)
WARMUP = 50         # untimed passes over SENTENCES before measuring
REPEATS = 500       # timed calls per sentence for the latency median
BATCH_CALLS = 5000  # timed passes over SENTENCES for the throughput figure


def run():
    # --- Load ---
    # Imports are deferred to the selected branch so that only the library
    # under test has to be importable in the current interpreter.
    if LIB_NAME == "underthesea_core":
        from underthesea_core import FastText

        def load():
            return FastText.load(MODEL_PATH)

        def predict(m, t):
            return m.predict(t, k=K)

        def fmt(r):
            return r[0][0] if r else "?"

    elif LIB_NAME in ("fasttext-predict", "fasttext-wheel"):
        # Both packages are imported as ``fasttext`` and are driven the same way.
        import fasttext

        def load():
            return fasttext.load_model(MODEL_PATH)

        def predict(m, t):
            return m.predict(t, k=K)

        def fmt(r):
            return r[0][0].replace("__label__", "") if r[0] else "?"

    elif LIB_NAME == "fast-langdetect":
        from fast_langdetect import detect

        # preload to avoid download during benchmark
        detect("warmup")

        # load() is a no-op here, so the reported load time is ~0.
        def load():
            return None

        def predict(m, t):
            return detect(t)

        def fmt(r):
            return r[0]["lang"] if isinstance(r, list) else r.get("lang", "?")

    elif LIB_NAME == "fasttext-langdetect":
        from ftlangdetect import detect

        detect("warmup")

        # load() is a no-op here, so the reported load time is ~0.
        def load():
            return None

        def predict(m, t):
            return detect(t)

        def fmt(r):
            return r.get("lang", "?")

    else:
        # Fail with a clear message instead of an unbound ``load`` further down.
        raise ValueError(f"unknown library: {LIB_NAME!r}")

    # --- Benchmark Load ---
    load_times = []
    for _ in range(5):
        t0 = time.perf_counter()
        model = load()
        t1 = time.perf_counter()
        load_times.append(t1 - t0)
    load_ms = statistics.median(load_times) * 1000

    # --- Warmup ---
    for _ in range(WARMUP):
        for s in SENTENCES:
            predict(model, s)

    # --- Single prediction latency ---
    # One median (in microseconds) per sentence over REPEATS timed calls.
    per_sentence_us = []
    for s in SENTENCES:
        times = []
        for _ in range(REPEATS):
            t0 = time.perf_counter()
            predict(model, s)
            t1 = time.perf_counter()
            times.append(t1 - t0)
        per_sentence_us.append(statistics.median(times) * 1e6)

    avg_us = statistics.mean(per_sentence_us)
    throughput_single = 1e6 / avg_us

    # --- Batch throughput ---
    # Wall-clock time for BATCH_CALLS full passes over SENTENCES.
    t0 = time.perf_counter()
    for _ in range(BATCH_CALLS):
        for s in SENTENCES:
            predict(model, s)
    t1 = time.perf_counter()
    total = BATCH_CALLS * len(SENTENCES)
    throughput_batch = total / (t1 - t0)

    # --- Predictions for verification ---
    preds = []
    for s in SENTENCES:
        r = predict(model, s)
        preds.append(fmt(r))

    result = {
        "lib": LIB_NAME,
        "load_ms": round(load_ms, 1),
        "avg_us": round(avg_us, 1),
        "min_us": round(min(per_sentence_us), 1),
        "max_us": round(max(per_sentence_us), 1),
        "throughput_single": int(throughput_single),
        "throughput_batch": int(throughput_batch),
        "preds": preds,
    }
    print(json.dumps(result))


run()
'''
|
|
|
|
|
# Library name -> Python interpreter of the virtualenv it is benchmarked in.
# Insertion order matters: the first library that produces a result is used
# as the baseline for the throughput ratios and the prediction comparison.
VENVS = {
    "underthesea_core": "/tmp/venv_ftpredict/bin/python3",
    "fasttext-predict": "/tmp/venv_ftpredict/bin/python3",
    "fasttext-wheel": "/tmp/venv_ftwheel/bin/python3",
    "fast-langdetect": "/tmp/venv_fastlang/bin/python3",
    "fasttext-langdetect": "/tmp/venv_ftlangdetect/bin/python3",
}
|
|
|
|
|
|
|
|
def run_benchmark(lib_name, python_bin, *, timeout=600, script=None,
                  model_path=None, sentences=None):
    """Run the benchmark for one library in a subprocess of its own venv.

    Args:
        lib_name: Library identifier, passed to the runner script as argv[3].
        python_bin: Path of the Python interpreter to execute the script with.
        timeout: Seconds to wait for the subprocess before giving up.
        script: Source passed to ``python -c``; defaults to ``RUNNER_SCRIPT``.
        model_path: Model file passed as argv[1]; defaults to ``MODEL_PATH``.
        sentences: Sentences passed as JSON in argv[2]; defaults to
            ``SENTENCES``.

    Returns:
        The dict parsed from the first stdout line that is a valid JSON
        object, or ``None`` if the subprocess could not be started, timed
        out, or printed no such line. Failures are reported on stderr.
    """
    script = RUNNER_SCRIPT if script is None else script
    model_path = MODEL_PATH if model_path is None else model_path
    sentences = SENTENCES if sentences is None else sentences

    # Drop VIRTUAL_ENV so the child does not inherit the parent's venv marker.
    env = os.environ.copy()
    env.pop("VIRTUAL_ENV", None)

    cmd = [python_bin, "-c", script, model_path, json.dumps(sentences), lib_name]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True, text=True, errors="replace",
            timeout=timeout, env=env,
        )
    except subprocess.TimeoutExpired:
        # One slow or hung library must not abort the remaining benchmarks.
        print(f" ERROR ({lib_name}): timed out after {timeout}s", file=sys.stderr)
        return None
    except OSError as exc:
        # Interpreter missing or not executable.
        print(f" ERROR ({lib_name}): cannot run {python_bin}: {exc}", file=sys.stderr)
        return None

    # Libraries may print their own messages; take the first line that
    # actually parses as a JSON object.
    for line in result.stdout.splitlines():
        line = line.strip()
        if not line.startswith("{"):
            continue
        try:
            return json.loads(line)
        except json.JSONDecodeError:
            continue

    print(f" ERROR ({lib_name}): {result.stderr[-500:]}", file=sys.stderr)
    return None
|
|
|
|
|
|
|
|
def main():
    """Benchmark every configured library and print the comparison tables.

    Libraries whose interpreter is missing are skipped; libraries whose
    benchmark fails are reported as FAILED. The first library that returns
    a result is the baseline for throughput ratios and label comparison.
    """
    print("=" * 80)
    print("FastText Library Benchmark")
    print("=" * 80)
    print(f"Model: {MODEL_PATH}")
    print(f"Sentences: {len(SENTENCES)}")
    print()

    results = _collect_results()
    if not results:
        print("No results!")
        return

    _print_performance_table(results)
    _print_prediction_table(results)

    print()
    print("Done.")


def _collect_results():
    """Run run_benchmark() for each entry of VENVS and return the successful results."""
    results = []
    for lib_name, python_bin in VENVS.items():
        if not os.path.exists(python_bin):
            print(f" SKIP {lib_name}: venv not found at {python_bin}")
            continue
        print(f" Benchmarking {lib_name}...", end="", flush=True)
        outcome = run_benchmark(lib_name, python_bin)
        if outcome:
            print(f" done ({outcome['throughput_batch']:,} pred/s)")
            results.append(outcome)
        else:
            print(" FAILED")
    return results


def _print_performance_table(results):
    """Print load time, latency and throughput per library, with ratios vs. results[0]."""
    print()
    print("=" * 80)
    print(f"{'Library':<22s} {'Load':>8s} {'Avg':>8s} {'Min':>8s} {'Max':>8s} {'Throughput':>12s}")
    print(f"{'':<22s} {'(ms)':>8s} {'(µs)':>8s} {'(µs)':>8s} {'(µs)':>8s} {'(pred/s)':>12s}")
    print("-" * 80)

    baseline_lib = results[0]["lib"]
    baseline = results[0]["throughput_batch"]
    for r in results:
        ratio = r["throughput_batch"] / baseline if baseline else 0
        mark = "" if r["lib"] == baseline_lib else f" ({ratio:.2f}x)"
        # Cell widths mirror the header (22 / 8 / 8 / 8 / 8 / 12) so the
        # columns line up; the ratio is appended after the last column.
        label = f" {r['lib']}"
        print(f"{label:<22s} {r['load_ms']:>8.1f} {r['avg_us']:>8.1f} "
              f"{r['min_us']:>8.1f} {r['max_us']:>8.1f} {r['throughput_batch']:>12,}{mark}")


def _print_prediction_table(results):
    """Print each library's top-1 label per sentence, flagging differences from results[0] with '*'."""
    print()
    print("=" * 80)
    print("Prediction Verification (top-1 label)")
    print("-" * 80)

    ref = results[0]
    # Library names are cut to 10 characters to fit the column width.
    header = f" {'Text':<50s}" + "".join(f" {r['lib'][:10]:>10s}" for r in results)
    print(header)
    print(" " + "-" * (50 + 11 * len(results)))

    for i, s in enumerate(SENTENCES):
        preview = s[:48] + ".." if len(s) > 48 else s
        cells = []
        for r in results:
            pred = r["preds"][i]
            flag = "" if pred == ref["preds"][i] else "*"
            cells.append(f" {pred + flag:>10s}")
        print(f" {preview:<50s}" + "".join(cells))

    print()
    for r in results[1:]:
        matches = sum(mine == theirs for mine, theirs in zip(r["preds"], ref["preds"]))
        print(f" {r['lib']} vs {ref['lib']}: {matches}/{len(SENTENCES)} match")
|
|
|
|
|
|
|
|
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
|