Add exact tokenizer accounting to compression API
Browse files
README.md
CHANGED
|
@@ -27,7 +27,7 @@ Live Space:
|
|
| 27 |
- `https://wchen22-touchdown-compression-classifier.hf.space`
|
| 28 |
- Verified 2026-06-11 with HF CLI: runtime stage `RUNNING`, hardware
|
| 29 |
`cpu-basic`, domain `READY`, repo/runtime SHA
|
| 30 |
-
`
|
| 31 |
- The deployed scaffold supports chunked ONNX artifact inference for long
|
| 32 |
prompts. Use `hf spaces info wchen22/touchdown-compression-classifier --format
|
| 33 |
json` for the current repo/runtime SHA.
|
|
@@ -36,6 +36,11 @@ Live Space:
|
|
| 36 |
validates `/health`, `/v1/classify`, single `/v1/compress`, and managed
|
| 37 |
`inputs[]` batch, managed `messages[]`, plus gzipped JSON request/response
|
| 38 |
transport.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
- Full deployment receipt:
|
| 40 |
`python3 scripts/verify_compression_space.py --expected-sha <sha> --out reports/generated/compression_space/hf_space_verification.json`
|
| 41 |
validates HF runtime metadata, repo/runtime SHA agreement, API smoke, and
|
|
@@ -44,7 +49,7 @@ Live Space:
|
|
| 44 |
`reports/generated/compression_space/`; run the full verifier with the
|
| 45 |
current Space SHA to check runtime, API smoke, and remote/local file parity.
|
| 46 |
Current live receipt:
|
| 47 |
-
`reports/generated/compression_space/hf_space_verification_2026-06-11-
|
| 48 |
- Latest live result: `/v1/compress` saved 27/102 estimated tokens;
|
| 49 |
managed `inputs[]` returned `input_count=2`, `succeeded=2`, `failed=0`,
|
| 50 |
managed `messages[]` returned `message_count=2` with system-role protection,
|
|
@@ -64,6 +69,10 @@ Live Space:
|
|
| 64 |
is mounted. `/v1/compress` is rules-first deletion-only compression with
|
| 65 |
safety receipts. The Space app supports both single `input` requests and
|
| 66 |
managed `inputs[]` batches with per-item receipts and partial-error rows.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
- Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
|
| 68 |
set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
|
| 69 |
labels through ONNX Runtime or the manifest fallback. ONNX labels are
|
|
|
|
| 27 |
- `https://wchen22-touchdown-compression-classifier.hf.space`
|
| 28 |
- Verified 2026-06-11 with HF CLI: runtime stage `RUNNING`, hardware
|
| 29 |
`cpu-basic`, domain `READY`, repo/runtime SHA
|
| 30 |
+
`0dfe65a6c82c9e7fa37d2c4a32c8eda3ed4e96d7`.
|
| 31 |
- The deployed scaffold supports chunked ONNX artifact inference for long
|
| 32 |
prompts. Use `hf spaces info wchen22/touchdown-compression-classifier --format
|
| 33 |
json` for the current repo/runtime SHA.
|
|
|
|
| 36 |
validates `/health`, `/v1/classify`, single `/v1/compress`, and managed
|
| 37 |
`inputs[]` batch, managed `messages[]`, plus gzipped JSON request/response
|
| 38 |
transport.
|
| 39 |
+
- Real-corpus API benchmark:
|
| 40 |
+
`python3 scripts/benchmark_compression_api.py --base-url https://wchen22-touchdown-compression-classifier.hf.space --input-jsonl benchmarks/prompts/real/kv_stress_seed.jsonl --limit 4 --tokenizer-model Qwen/Qwen2.5-7B-Instruct --require-exact-tokens`.
|
| 41 |
+
This calls hosted `/v1/compress` over real prompt rows and fails the run if
|
| 42 |
+
receipts return estimated token counts. Use this before claiming real-token
|
| 43 |
+
savings.
|
| 44 |
- Full deployment receipt:
|
| 45 |
`python3 scripts/verify_compression_space.py --expected-sha <sha> --out reports/generated/compression_space/hf_space_verification.json`
|
| 46 |
validates HF runtime metadata, repo/runtime SHA agreement, API smoke, and
|
|
|
|
| 49 |
`reports/generated/compression_space/`; run the full verifier with the
|
| 50 |
current Space SHA to check runtime, API smoke, and remote/local file parity.
|
| 51 |
Current live receipt:
|
| 52 |
+
`reports/generated/compression_space/hf_space_verification_2026-06-11-managed-messages.json`.
|
| 53 |
- Latest live result: `/v1/compress` saved 27/102 estimated tokens;
|
| 54 |
managed `inputs[]` returned `input_count=2`, `succeeded=2`, `failed=0`,
|
| 55 |
managed `messages[]` returned `message_count=2` with system-role protection,
|
|
|
|
| 69 |
is mounted. `/v1/compress` is rules-first deletion-only compression with
|
| 70 |
safety receipts. The Space app supports both single `input` requests and
|
| 71 |
managed `inputs[]` batches with per-item receipts and partial-error rows.
|
| 72 |
+
`/v1/compress` now accepts `tokenizer_model`; when the tokenizer loads,
|
| 73 |
+
receipts report `token_count_exact=true`, `token_count_method=tokenizer`, and
|
| 74 |
+
the requested model. If it cannot load, receipts remain estimated and the
|
| 75 |
+
benchmark `--require-exact-tokens` gate fails.
|
| 76 |
- Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
|
| 77 |
set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
|
| 78 |
labels through ONNX Runtime or the manifest fallback. ONNX labels are
|
app.py
CHANGED
|
@@ -380,6 +380,13 @@ def _get_tokenizer():
|
|
| 380 |
return AutoTokenizer.from_pretrained(CLASSIFIER_MODEL)
|
| 381 |
|
| 382 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
@lru_cache(maxsize=1)
|
| 384 |
def _classifier_manifest() -> dict[str, Any] | None:
|
| 385 |
if not CLASSIFIER_ARTIFACT_DIR:
|
|
@@ -772,6 +779,29 @@ def _tool_schema_missing_groups(
|
|
| 772 |
return missing
|
| 773 |
|
| 774 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 775 |
def _protected_spans(
|
| 776 |
text: str,
|
| 777 |
protected_values: list[str],
|
|
@@ -834,6 +864,7 @@ def _compress_text(payload: dict[str, Any]) -> dict[str, Any]:
|
|
| 834 |
idempotency_key = payload.get("idempotency_key")
|
| 835 |
if idempotency_key is not None and not isinstance(idempotency_key, str):
|
| 836 |
raise HTTPException(status_code=400, detail="idempotency_key must be a string")
|
|
|
|
| 837 |
protected_values = payload.get("protected_spans") or []
|
| 838 |
if not isinstance(protected_values, list) or not all(
|
| 839 |
isinstance(value, str) for value in protected_values
|
|
@@ -907,8 +938,11 @@ def _compress_text(payload: dict[str, Any]) -> dict[str, Any]:
|
|
| 907 |
cursor = end
|
| 908 |
chunks.append(text[cursor:])
|
| 909 |
output = "".join(chunks)
|
| 910 |
-
before =
|
| 911 |
-
after =
|
|
|
|
|
|
|
|
|
|
| 912 |
saved = max(0, before - after)
|
| 913 |
missing = [value for value in protected_values if value and value not in output]
|
| 914 |
code_preserved = all(text[start:end] in output for start, end in code_spans)
|
|
@@ -983,6 +1017,9 @@ def _compress_text(payload: dict[str, Any]) -> dict[str, Any]:
|
|
| 983 |
"classifier_drop_chars": sum(end - start for start, end in classifier_ranges),
|
| 984 |
"dropped_segments_count": len(drops),
|
| 985 |
"dropped_segments": dropped_segments,
|
|
|
|
|
|
|
|
|
|
| 986 |
}
|
| 987 |
receipt["input_sha256"] = _sha256_text(text)
|
| 988 |
receipt["output_sha256"] = _sha256_text(output)
|
|
@@ -1027,6 +1064,7 @@ def _merge_batch_item_payload(
|
|
| 1027 |
"compression_settings": payload.get("compression_settings"),
|
| 1028 |
"protected_spans": payload.get("protected_spans"),
|
| 1029 |
"tool_schemas": payload.get("tool_schemas", payload.get("tools")),
|
|
|
|
| 1030 |
"request_id": payload.get("request_id"),
|
| 1031 |
"idempotency_key": payload.get("idempotency_key"),
|
| 1032 |
}
|
|
@@ -1058,6 +1096,7 @@ def _merge_batch_item_payload(
|
|
| 1058 |
"tool_schemas",
|
| 1059 |
item.get("tools", payload.get("tool_schemas", payload.get("tools"))),
|
| 1060 |
),
|
|
|
|
| 1061 |
"request_id": item.get("request_id", payload.get("request_id")),
|
| 1062 |
"idempotency_key": item.get(
|
| 1063 |
"idempotency_key",
|
|
@@ -1180,6 +1219,7 @@ def _handle_messages(payload: dict[str, Any]) -> dict[str, Any]:
|
|
| 1180 |
idempotency_key = payload.get("idempotency_key")
|
| 1181 |
if idempotency_key is not None and not isinstance(idempotency_key, str):
|
| 1182 |
raise HTTPException(status_code=400, detail="idempotency_key must be a string")
|
|
|
|
| 1183 |
|
| 1184 |
output_messages: list[dict[str, Any]] = []
|
| 1185 |
receipts: list[dict[str, Any]] = []
|
|
@@ -1220,6 +1260,7 @@ def _handle_messages(payload: dict[str, Any]) -> dict[str, Any]:
|
|
| 1220 |
"compression_settings": settings,
|
| 1221 |
"protected_spans": item_protected,
|
| 1222 |
"tool_schemas": payload.get("tool_schemas", payload.get("tools")),
|
|
|
|
| 1223 |
"request_id": request_id,
|
| 1224 |
"idempotency_key": idempotency_key,
|
| 1225 |
})
|
|
|
|
| 380 |
return AutoTokenizer.from_pretrained(CLASSIFIER_MODEL)
|
| 381 |
|
| 382 |
|
| 383 |
+
@lru_cache(maxsize=8)
def _get_count_tokenizer(model_name: str):
    """Load the tokenizer used for token accounting, memoized per model name.

    Results are kept in an LRU cache holding at most eight distinct
    ``model_name`` values, so repeated requests for the same model reuse the
    already-loaded tokenizer. Exceptions raised while loading propagate to the
    caller and are not cached.

    Args:
        model_name: Identifier handed straight to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever ``AutoTokenizer.from_pretrained(model_name)`` returns.
    """
    # NOTE(review): model_name appears to come from the request payload's
    # `tokenizer_model` field; confirm whether it should be restricted to a
    # known set before it reaches from_pretrained.
    # Imported lazily so the dependency is only needed when counting is used.
    from transformers import AutoTokenizer as tokenizer_factory

    return tokenizer_factory.from_pretrained(model_name)
|
| 388 |
+
|
| 389 |
+
|
| 390 |
@lru_cache(maxsize=1)
|
| 391 |
def _classifier_manifest() -> dict[str, Any] | None:
|
| 392 |
if not CLASSIFIER_ARTIFACT_DIR:
|
|
|
|
| 779 |
return missing
|
| 780 |
|
| 781 |
|
| 782 |
+
def _optional_string(payload: dict[str, Any], key: str) -> str | None:
|
| 783 |
+
value = payload.get(key)
|
| 784 |
+
if value is None:
|
| 785 |
+
return None
|
| 786 |
+
if not isinstance(value, str):
|
| 787 |
+
raise HTTPException(status_code=400, detail=f"{key} must be a string")
|
| 788 |
+
return value
|
| 789 |
+
|
| 790 |
+
|
| 791 |
+
def _count_tokens(text: str, tokenizer_model: str | None) -> tuple[int, bool, str]:
|
| 792 |
+
if tokenizer_model:
|
| 793 |
+
try:
|
| 794 |
+
tokenizer = _get_count_tokenizer(tokenizer_model)
|
| 795 |
+
return (
|
| 796 |
+
len(tokenizer.encode(text, add_special_tokens=False)),
|
| 797 |
+
True,
|
| 798 |
+
"tokenizer",
|
| 799 |
+
)
|
| 800 |
+
except Exception:
|
| 801 |
+
pass
|
| 802 |
+
return max(1, round(len(text) / 4.0)), False, "chars_per_token_estimate"
|
| 803 |
+
|
| 804 |
+
|
| 805 |
def _protected_spans(
|
| 806 |
text: str,
|
| 807 |
protected_values: list[str],
|
|
|
|
| 864 |
idempotency_key = payload.get("idempotency_key")
|
| 865 |
if idempotency_key is not None and not isinstance(idempotency_key, str):
|
| 866 |
raise HTTPException(status_code=400, detail="idempotency_key must be a string")
|
| 867 |
+
tokenizer_model = _optional_string(payload, "tokenizer_model")
|
| 868 |
protected_values = payload.get("protected_spans") or []
|
| 869 |
if not isinstance(protected_values, list) or not all(
|
| 870 |
isinstance(value, str) for value in protected_values
|
|
|
|
| 938 |
cursor = end
|
| 939 |
chunks.append(text[cursor:])
|
| 940 |
output = "".join(chunks)
|
| 941 |
+
before, before_exact, token_method = _count_tokens(text, tokenizer_model)
|
| 942 |
+
after, after_exact, after_method = _count_tokens(output, tokenizer_model)
|
| 943 |
+
token_count_exact = before_exact and after_exact
|
| 944 |
+
if after_method != token_method:
|
| 945 |
+
token_method = "chars_per_token_estimate"
|
| 946 |
saved = max(0, before - after)
|
| 947 |
missing = [value for value in protected_values if value and value not in output]
|
| 948 |
code_preserved = all(text[start:end] in output for start, end in code_spans)
|
|
|
|
| 1017 |
"classifier_drop_chars": sum(end - start for start, end in classifier_ranges),
|
| 1018 |
"dropped_segments_count": len(drops),
|
| 1019 |
"dropped_segments": dropped_segments,
|
| 1020 |
+
"token_count_exact": token_count_exact,
|
| 1021 |
+
"token_count_method": token_method,
|
| 1022 |
+
"tokenizer_model": tokenizer_model,
|
| 1023 |
}
|
| 1024 |
receipt["input_sha256"] = _sha256_text(text)
|
| 1025 |
receipt["output_sha256"] = _sha256_text(output)
|
|
|
|
| 1064 |
"compression_settings": payload.get("compression_settings"),
|
| 1065 |
"protected_spans": payload.get("protected_spans"),
|
| 1066 |
"tool_schemas": payload.get("tool_schemas", payload.get("tools")),
|
| 1067 |
+
"tokenizer_model": payload.get("tokenizer_model"),
|
| 1068 |
"request_id": payload.get("request_id"),
|
| 1069 |
"idempotency_key": payload.get("idempotency_key"),
|
| 1070 |
}
|
|
|
|
| 1096 |
"tool_schemas",
|
| 1097 |
item.get("tools", payload.get("tool_schemas", payload.get("tools"))),
|
| 1098 |
),
|
| 1099 |
+
"tokenizer_model": item.get("tokenizer_model", payload.get("tokenizer_model")),
|
| 1100 |
"request_id": item.get("request_id", payload.get("request_id")),
|
| 1101 |
"idempotency_key": item.get(
|
| 1102 |
"idempotency_key",
|
|
|
|
| 1219 |
idempotency_key = payload.get("idempotency_key")
|
| 1220 |
if idempotency_key is not None and not isinstance(idempotency_key, str):
|
| 1221 |
raise HTTPException(status_code=400, detail="idempotency_key must be a string")
|
| 1222 |
+
tokenizer_model = _optional_string(payload, "tokenizer_model")
|
| 1223 |
|
| 1224 |
output_messages: list[dict[str, Any]] = []
|
| 1225 |
receipts: list[dict[str, Any]] = []
|
|
|
|
| 1260 |
"compression_settings": settings,
|
| 1261 |
"protected_spans": item_protected,
|
| 1262 |
"tool_schemas": payload.get("tool_schemas", payload.get("tools")),
|
| 1263 |
+
"tokenizer_model": tokenizer_model,
|
| 1264 |
"request_id": request_id,
|
| 1265 |
"idempotency_key": idempotency_key,
|
| 1266 |
})
|