Spaces:

wchen22
/

touchdown-compression-classifier

Sleeping

App Files Files Community

wchen22 committed 23 days ago

Commit

b5632b3

verified ·

1 Parent(s): e4bdbeb

feat: chunk classifier artifact windows

Browse files

Files changed (2) hide show

README.md +3 -2
app.py +31 -19

README.md CHANGED Viewed

@@ -44,8 +44,9 @@ Live Space:
   managed `inputs[]` batches with per-item receipts and partial-error rows.
 - Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
   set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
-  labels through ONNX Runtime or the manifest fallback. Those labels still pass
-  through protected-span and deletion-only safety gates.
 Deploy:

   managed `inputs[]` batches with per-item receipts and partial-error rows.
 - Mount `classifier_manifest.json`, tokenizer files, and optional `model.onnx`;
   set `TOUCHDOWN_CLASSIFIER_ARTIFACT_DIR` to let the Space use artifact DROP
+  labels through ONNX Runtime or the manifest fallback. ONNX labels are
+  evaluated in chunked windows using manifest `max_length` and `stride`; labels
+  still pass through protected-span and deletion-only safety gates.
 Deploy:

app.py CHANGED Viewed

@@ -193,33 +193,45 @@ def _onnx_labels(
         return_tensors="np",
         truncation=True,
         max_length=int(manifest.get("max_length", 512)),
     )
-    offsets = encoded.pop("offset_mapping")[0]
     session = _get_onnx_session(str(model_path))
     input_names = {item.name for item in session.get_inputs()}
     inputs = {key: value for key, value in encoded.items() if key in input_names}
-    logits = session.run(None, inputs)[0][0]
     id2label = {
         str(key): value
         for key, value in (manifest.get("id2label") or {"0": "KEEP", "1": "DROP"}).items()
     }
-    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])
-    labels = []
-    for index, token_logits in enumerate(logits):
-        probs = _softmax(np.asarray(token_logits, dtype=float).tolist())
-        label_id = int(np.argmax(probs))
-        start, end = offsets[index]
-        if int(end) <= int(start):
-            continue
-        labels.append({
-            "token": tokens[index],
-            "label": id2label.get(str(label_id), "KEEP"),
-            "score": round(float(probs[label_id]), 6),
-            "start": int(start),
-            "end": int(end),
-            "source": "onnx_token_classifier",
-        })
-    return labels
 def _safe_classifier_drop_ranges(

         return_tensors="np",
         truncation=True,
         max_length=int(manifest.get("max_length", 512)),
+        stride=int(manifest.get("stride", 0)),
+        return_overflowing_tokens=True,
+        padding=True,
     )
+    offsets = encoded.pop("offset_mapping")
+    input_ids = encoded["input_ids"]
     session = _get_onnx_session(str(model_path))
     input_names = {item.name for item in session.get_inputs()}
     inputs = {key: value for key, value in encoded.items() if key in input_names}
+    logits = session.run(None, inputs)[0]
     id2label = {
         str(key): value
         for key, value in (manifest.get("id2label") or {"0": "KEEP", "1": "DROP"}).items()
     }
+    best_by_span: dict[tuple[int, int], dict[str, Any]] = {}
+    for chunk_index, chunk_logits in enumerate(logits):
+        tokens = tokenizer.convert_ids_to_tokens(input_ids[chunk_index])
+        for token_index, token_logits in enumerate(chunk_logits):
+            probs = _softmax(np.asarray(token_logits, dtype=float).tolist())
+            label_id = int(np.argmax(probs))
+            start, end = offsets[chunk_index][token_index]
+            start = int(start)
+            end = int(end)
+            if end <= start:
+                continue
+            score = round(float(probs[label_id]), 6)
+            key = (start, end)
+            item = {
+                "token": tokens[token_index],
+                "label": id2label.get(str(label_id), "KEEP"),
+                "score": score,
+                "start": start,
+                "end": end,
+                "source": "onnx_token_classifier",
+                "chunk_index": chunk_index,
+            }
+            if score > float(best_by_span.get(key, {}).get("score", -1.0)):
+                best_by_span[key] = item
+    return [best_by_span[key] for key in sorted(best_by_span)]
 def _safe_classifier_drop_ranges(