wchen22 commited on
Commit
1212e7d
·
verified ·
1 Parent(s): 0dfe65a

Add exact tokenizer accounting to compression API

Browse files
Files changed (2) hide show
  1. README.md +11 -2
  2. app.py +43 -2
README.md CHANGED
@@ -27,7 +27,7 @@ Live Space:
27
  - `https://wchen22-touchdown-compression-classifier.hf.space`
28
  - Verified 2026-06-11 with HF CLI: runtime stage `RUNNING`, hardware
29
  `cpu-basic`, domain `READY`, repo/runtime SHA
30
- `b402ba63bf08ce65bd30da071256555382be4fe0`.
31
  - The deployed scaffold supports chunked ONNX artifact inference for long
32
  prompts. Use `hf spaces info wchen22/touchdown-compression-classifier --format
33
  json` for the current repo/runtime SHA.
@@ -36,6 +36,11 @@ Live Space:
36
  validates `/health`, `/v1/classify`, single `/v1/compress`, and managed
37
  `inputs[]` batch, managed `messages[]`, plus gzipped JSON request/response
38
  transport.
 
 
 
 
 
39
  - Full deployment receipt:
40
  `python3 scripts/verify_compression_space.py --expected-sha <sha> --out reports/generated/compression_space/hf_space_verification.json`
41
  validates HF runtime metadata, repo/runtime SHA agreement, API smoke, and
@@ -44,7 +49,7 @@ Live Space:
44
  `reports/generated/compression_space/`; run the full verifier with the
45
  current Space SHA to check runtime, API smoke, and remote/local file parity.
46
  Current live receipt:
47
- `reports/generated/compression_space/hf_space_verification_2026-06-11-idempotency-replay-health.json`.
48
  - Latest live result: `/v1/compress` saved 27/102 estimated tokens;
49
  managed `inputs[]` returned `input_count=2`, `succeeded=2`, `failed=0`,
50
  managed `messages[]` returned `message_count=2` with system-role protection,
@@ -64,6 +69,10 @@ Live Space:
64
  is mounted. `/v1/compress` is rules-first deletion-only compression with
65
  safety receipts. The Space app supports both single `input` requests and
66
  managed `inputs[]` batches with per-item receipts and partial-error rows.
 
 
 
 
67
  - Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
68
  set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
69
  labels through ONNX Runtime or the manifest fallback. ONNX labels are
 
27
  - `https://wchen22-touchdown-compression-classifier.hf.space`
28
  - Verified 2026-06-11 with HF CLI: runtime stage `RUNNING`, hardware
29
  `cpu-basic`, domain `READY`, repo/runtime SHA
30
+ `0dfe65a6c82c9e7fa37d2c4a32c8eda3ed4e96d7`.
31
  - The deployed scaffold supports chunked ONNX artifact inference for long
32
  prompts. Use `hf spaces info wchen22/touchdown-compression-classifier --format
33
  json` for the current repo/runtime SHA.
 
36
  validates `/health`, `/v1/classify`, single `/v1/compress`, and managed
37
  `inputs[]` batch, managed `messages[]`, plus gzipped JSON request/response
38
  transport.
39
+ - Real-corpus API benchmark:
40
+ `python3 scripts/benchmark_compression_api.py --base-url https://wchen22-touchdown-compression-classifier.hf.space --input-jsonl benchmarks/prompts/real/kv_stress_seed.jsonl --limit 4 --tokenizer-model Qwen/Qwen2.5-7B-Instruct --require-exact-tokens`.
41
+ This calls hosted `/v1/compress` over real prompt rows and fails the run if
42
+ receipts return estimated token counts. Use this before claiming real-token
43
+ savings.
44
  - Full deployment receipt:
45
  `python3 scripts/verify_compression_space.py --expected-sha <sha> --out reports/generated/compression_space/hf_space_verification.json`
46
  validates HF runtime metadata, repo/runtime SHA agreement, API smoke, and
 
49
  `reports/generated/compression_space/`; run the full verifier with the
50
  current Space SHA to check runtime, API smoke, and remote/local file parity.
51
  Current live receipt:
52
+ `reports/generated/compression_space/hf_space_verification_2026-06-11-managed-messages.json`.
53
  - Latest live result: `/v1/compress` saved 27/102 estimated tokens;
54
  managed `inputs[]` returned `input_count=2`, `succeeded=2`, `failed=0`,
55
  managed `messages[]` returned `message_count=2` with system-role protection,
 
69
  is mounted. `/v1/compress` is rules-first deletion-only compression with
70
  safety receipts. The Space app supports both single `input` requests and
71
  managed `inputs[]` batches with per-item receipts and partial-error rows.
72
+ `/v1/compress` now accepts `tokenizer_model`; when the tokenizer loads,
73
+ receipts report `token_count_exact=true`, `token_count_method=tokenizer`, and
74
+ the requested model. If it cannot load, receipts remain estimated and the
75
+ benchmark `--require-exact-tokens` gate fails.
76
  - Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
77
  set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
78
  labels through ONNX Runtime or the manifest fallback. ONNX labels are
app.py CHANGED
@@ -380,6 +380,13 @@ def _get_tokenizer():
380
  return AutoTokenizer.from_pretrained(CLASSIFIER_MODEL)
381
 
382
 
 
 
 
 
 
 
 
383
  @lru_cache(maxsize=1)
384
  def _classifier_manifest() -> dict[str, Any] | None:
385
  if not CLASSIFIER_ARTIFACT_DIR:
@@ -772,6 +779,29 @@ def _tool_schema_missing_groups(
772
  return missing
773
 
774
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
775
  def _protected_spans(
776
  text: str,
777
  protected_values: list[str],
@@ -834,6 +864,7 @@ def _compress_text(payload: dict[str, Any]) -> dict[str, Any]:
834
  idempotency_key = payload.get("idempotency_key")
835
  if idempotency_key is not None and not isinstance(idempotency_key, str):
836
  raise HTTPException(status_code=400, detail="idempotency_key must be a string")
 
837
  protected_values = payload.get("protected_spans") or []
838
  if not isinstance(protected_values, list) or not all(
839
  isinstance(value, str) for value in protected_values
@@ -907,8 +938,11 @@ def _compress_text(payload: dict[str, Any]) -> dict[str, Any]:
907
  cursor = end
908
  chunks.append(text[cursor:])
909
  output = "".join(chunks)
910
- before = max(1, round(len(text) / 4.0))
911
- after = max(1, round(len(output) / 4.0))
 
 
 
912
  saved = max(0, before - after)
913
  missing = [value for value in protected_values if value and value not in output]
914
  code_preserved = all(text[start:end] in output for start, end in code_spans)
@@ -983,6 +1017,9 @@ def _compress_text(payload: dict[str, Any]) -> dict[str, Any]:
983
  "classifier_drop_chars": sum(end - start for start, end in classifier_ranges),
984
  "dropped_segments_count": len(drops),
985
  "dropped_segments": dropped_segments,
 
 
 
986
  }
987
  receipt["input_sha256"] = _sha256_text(text)
988
  receipt["output_sha256"] = _sha256_text(output)
@@ -1027,6 +1064,7 @@ def _merge_batch_item_payload(
1027
  "compression_settings": payload.get("compression_settings"),
1028
  "protected_spans": payload.get("protected_spans"),
1029
  "tool_schemas": payload.get("tool_schemas", payload.get("tools")),
 
1030
  "request_id": payload.get("request_id"),
1031
  "idempotency_key": payload.get("idempotency_key"),
1032
  }
@@ -1058,6 +1096,7 @@ def _merge_batch_item_payload(
1058
  "tool_schemas",
1059
  item.get("tools", payload.get("tool_schemas", payload.get("tools"))),
1060
  ),
 
1061
  "request_id": item.get("request_id", payload.get("request_id")),
1062
  "idempotency_key": item.get(
1063
  "idempotency_key",
@@ -1180,6 +1219,7 @@ def _handle_messages(payload: dict[str, Any]) -> dict[str, Any]:
1180
  idempotency_key = payload.get("idempotency_key")
1181
  if idempotency_key is not None and not isinstance(idempotency_key, str):
1182
  raise HTTPException(status_code=400, detail="idempotency_key must be a string")
 
1183
 
1184
  output_messages: list[dict[str, Any]] = []
1185
  receipts: list[dict[str, Any]] = []
@@ -1220,6 +1260,7 @@ def _handle_messages(payload: dict[str, Any]) -> dict[str, Any]:
1220
  "compression_settings": settings,
1221
  "protected_spans": item_protected,
1222
  "tool_schemas": payload.get("tool_schemas", payload.get("tools")),
 
1223
  "request_id": request_id,
1224
  "idempotency_key": idempotency_key,
1225
  })
 
380
  return AutoTokenizer.from_pretrained(CLASSIFIER_MODEL)
381
 
382
 
383
+ @lru_cache(maxsize=8)
384
+ def _get_count_tokenizer(model_name: str):
385
+ from transformers import AutoTokenizer
386
+
387
+ return AutoTokenizer.from_pretrained(model_name)
388
+
389
+
390
  @lru_cache(maxsize=1)
391
  def _classifier_manifest() -> dict[str, Any] | None:
392
  if not CLASSIFIER_ARTIFACT_DIR:
 
779
  return missing
780
 
781
 
782
+ def _optional_string(payload: dict[str, Any], key: str) -> str | None:
783
+ value = payload.get(key)
784
+ if value is None:
785
+ return None
786
+ if not isinstance(value, str):
787
+ raise HTTPException(status_code=400, detail=f"{key} must be a string")
788
+ return value
789
+
790
+
791
+ def _count_tokens(text: str, tokenizer_model: str | None) -> tuple[int, bool, str]:
792
+ if tokenizer_model:
793
+ try:
794
+ tokenizer = _get_count_tokenizer(tokenizer_model)
795
+ return (
796
+ len(tokenizer.encode(text, add_special_tokens=False)),
797
+ True,
798
+ "tokenizer",
799
+ )
800
+ except Exception:
801
+ pass
802
+ return max(1, round(len(text) / 4.0)), False, "chars_per_token_estimate"
803
+
804
+
805
  def _protected_spans(
806
  text: str,
807
  protected_values: list[str],
 
864
  idempotency_key = payload.get("idempotency_key")
865
  if idempotency_key is not None and not isinstance(idempotency_key, str):
866
  raise HTTPException(status_code=400, detail="idempotency_key must be a string")
867
+ tokenizer_model = _optional_string(payload, "tokenizer_model")
868
  protected_values = payload.get("protected_spans") or []
869
  if not isinstance(protected_values, list) or not all(
870
  isinstance(value, str) for value in protected_values
 
938
  cursor = end
939
  chunks.append(text[cursor:])
940
  output = "".join(chunks)
941
+ before, before_exact, token_method = _count_tokens(text, tokenizer_model)
942
+ after, after_exact, after_method = _count_tokens(output, tokenizer_model)
943
+ token_count_exact = before_exact and after_exact
944
+ if after_method != token_method:
945
+ token_method = "chars_per_token_estimate"
946
  saved = max(0, before - after)
947
  missing = [value for value in protected_values if value and value not in output]
948
  code_preserved = all(text[start:end] in output for start, end in code_spans)
 
1017
  "classifier_drop_chars": sum(end - start for start, end in classifier_ranges),
1018
  "dropped_segments_count": len(drops),
1019
  "dropped_segments": dropped_segments,
1020
+ "token_count_exact": token_count_exact,
1021
+ "token_count_method": token_method,
1022
+ "tokenizer_model": tokenizer_model,
1023
  }
1024
  receipt["input_sha256"] = _sha256_text(text)
1025
  receipt["output_sha256"] = _sha256_text(output)
 
1064
  "compression_settings": payload.get("compression_settings"),
1065
  "protected_spans": payload.get("protected_spans"),
1066
  "tool_schemas": payload.get("tool_schemas", payload.get("tools")),
1067
+ "tokenizer_model": payload.get("tokenizer_model"),
1068
  "request_id": payload.get("request_id"),
1069
  "idempotency_key": payload.get("idempotency_key"),
1070
  }
 
1096
  "tool_schemas",
1097
  item.get("tools", payload.get("tool_schemas", payload.get("tools"))),
1098
  ),
1099
+ "tokenizer_model": item.get("tokenizer_model", payload.get("tokenizer_model")),
1100
  "request_id": item.get("request_id", payload.get("request_id")),
1101
  "idempotency_key": item.get(
1102
  "idempotency_key",
 
1219
  idempotency_key = payload.get("idempotency_key")
1220
  if idempotency_key is not None and not isinstance(idempotency_key, str):
1221
  raise HTTPException(status_code=400, detail="idempotency_key must be a string")
1222
+ tokenizer_model = _optional_string(payload, "tokenizer_model")
1223
 
1224
  output_messages: list[dict[str, Any]] = []
1225
  receipts: list[dict[str, Any]] = []
 
1260
  "compression_settings": settings,
1261
  "protected_spans": item_protected,
1262
  "tool_schemas": payload.get("tool_schemas", payload.get("tools")),
1263
+ "tokenizer_model": tokenizer_model,
1264
  "request_id": request_id,
1265
  "idempotency_key": idempotency_key,
1266
  })