Spaces:

wchen22
/

touchdown-compression-classifier

Sleeping

App Files Community

wchen22 committed 24 days ago

Commit

472c58c

verified ·

1 Parent(s): 87c9bb7

feat: add compression classifier score calibration

Browse files

Files changed (2) hide show

README.md +3 -2
app.py +22 -6

README.md CHANGED Viewed

@@ -48,8 +48,9 @@ Live Space:
 - Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
   set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
   labels through ONNX Runtime or the manifest fallback. ONNX labels are
-  evaluated in chunked windows using manifest `max_length` and `stride`; labels
-  still pass through protected-span and deletion-only safety gates.
 Deploy:

 - Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
   set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
   labels through ONNX Runtime or the manifest fallback. ONNX labels are
+  evaluated in chunked windows using manifest `max_length` and `stride`; mounted
+  ONNX labels expose `keep_score`, `drop_score`, and `drop_score_threshold`.
+  DROP spans still pass through protected-span and deletion-only safety gates.
 Deploy:

app.py CHANGED Viewed

@@ -207,29 +207,37 @@ def _onnx_labels(
         str(key): value
         for key, value in (manifest.get("id2label") or {"0": "KEEP", "1": "DROP"}).items()
     }
     best_by_span: dict[tuple[int, int], dict[str, Any]] = {}
     for chunk_index, chunk_logits in enumerate(logits):
         tokens = tokenizer.convert_ids_to_tokens(input_ids[chunk_index])
         for token_index, token_logits in enumerate(chunk_logits):
             probs = _softmax(np.asarray(token_logits, dtype=float).tolist())
-            label_id = int(np.argmax(probs))
             start, end = offsets[chunk_index][token_index]
             start = int(start)
             end = int(end)
             if end <= start:
                 continue
-            score = round(float(probs[label_id]), 6)
             key = (start, end)
             item = {
                 "token": tokens[token_index],
                 "label": id2label.get(str(label_id), "KEEP"),
-                "score": score,
                 "start": start,
                 "end": end,
                 "source": "onnx_token_classifier",
                 "chunk_index": chunk_index,
             }
-            if score > float(best_by_span.get(key, {}).get("score", -1.0)):
                 best_by_span[key] = item
     return [best_by_span[key] for key in sorted(best_by_span)]
@@ -245,13 +253,21 @@ def _safe_classifier_drop_ranges(
     drop_labels = 0
     blocked = 0
     for item in labels:
-        if str(item.get("label") or item.get("entity") or "").upper() != "DROP":
             continue
         drop_labels += 1
         try:
             start = int(item["start"])
             end = int(item["end"])
-            score = float(item.get("score", 1.0))
         except Exception:
             blocked += 1
             continue

         str(key): value
         for key, value in (manifest.get("id2label") or {"0": "KEEP", "1": "DROP"}).items()
     }
+    label2id = {str(value).upper(): int(key) for key, value in id2label.items()}
+    keep_id = label2id.get("KEEP", 0)
+    drop_id = label2id.get("DROP", 1)
+    drop_score_threshold = float(manifest.get("drop_score_threshold", 0.5))
     best_by_span: dict[tuple[int, int], dict[str, Any]] = {}
     for chunk_index, chunk_logits in enumerate(logits):
         tokens = tokenizer.convert_ids_to_tokens(input_ids[chunk_index])
         for token_index, token_logits in enumerate(chunk_logits):
             probs = _softmax(np.asarray(token_logits, dtype=float).tolist())
+            keep_score = float(probs[keep_id]) if keep_id < len(probs) else 0.0
+            drop_score = float(probs[drop_id]) if drop_id < len(probs) else 0.0
+            label_id = drop_id if drop_score >= drop_score_threshold else keep_id
             start, end = offsets[chunk_index][token_index]
             start = int(start)
             end = int(end)
             if end <= start:
                 continue
             key = (start, end)
             item = {
                 "token": tokens[token_index],
                 "label": id2label.get(str(label_id), "KEEP"),
+                "score": round(drop_score if label_id == drop_id else keep_score, 6),
+                "keep_score": round(keep_score, 6),
+                "drop_score": round(drop_score, 6),
+                "drop_score_threshold": drop_score_threshold,
                 "start": start,
                 "end": end,
                 "source": "onnx_token_classifier",
                 "chunk_index": chunk_index,
             }
+            if drop_score > float(best_by_span.get(key, {}).get("drop_score", -1.0)):
                 best_by_span[key] = item
     return [best_by_span[key] for key in sorted(best_by_span)]
     drop_labels = 0
     blocked = 0
     for item in labels:
+        raw_drop_score = item.get("drop_score")
+        try:
+            drop_score = float(raw_drop_score) if raw_drop_score is not None else None
+        except (TypeError, ValueError):
+            drop_score = None
+        if (
+            str(item.get("label") or item.get("entity") or "").upper() != "DROP"
+            and (drop_score is None or drop_score < min_score)
+        ):
             continue
         drop_labels += 1
         try:
             start = int(item["start"])
             end = int(item["end"])
+            score = drop_score if drop_score is not None else float(item.get("score", 1.0))
         except Exception:
             blocked += 1
             continue