chest2vec
/

chest2vec_labeler

@@ -282,6 +282,71 @@ class Chest2VecLabelerModel(PreTrainedModel):
         return res
 def report_f1(gt_reports: List[str], pred_reports: List[str], model=None, tokenizer=None,
               model_id: str = "chest2vec/chest2vec_labeler", **kw) -> Dict[str, Any]:
     """Convenience wrapper: load the labeler (if not supplied) and score GT vs predicted reports."""

         return res
+    # ---- per-label best F1 (threshold swept to maximize F1) vs ground-truth labels ----
+    def _to_positive_matrix(self, gt, names):
+        """Coerce ground-truth labels to a [N, len(names)] binary positive matrix.
+        Accepts a pandas DataFrame with the label columns (ternary 1/0/-1/NaN; positive == 1),
+        or a numpy/torch array (ternary -> ==1, or already-binary 0/1)."""
+        import numpy as np
+        try:
+            import pandas as pd
+            if isinstance(gt, pd.DataFrame):
+                out = np.zeros((len(gt), len(names)), dtype=int)
+                for j, c in enumerate(names):
+                    if c in gt.columns:
+                        out[:, j] = (pd.to_numeric(gt[c], errors="coerce").fillna(0).values == 1).astype(int)
+                return out
+        except ImportError:
+            pass
+        arr = gt.detach().cpu().numpy() if hasattr(gt, "detach") else np.asarray(gt)
+        return (arr == 1).astype(int)
+    @torch.no_grad()
+    def per_label_best_f1(self, reports: List[str], gt, tokenizer=None, level: str = "leaf",
+                          min_pos: int = 30, batch_size: int = 16, max_len: Optional[int] = None,
+                          device=None) -> Dict[str, Any]:
+        """
+        For each label, sweep the decision threshold and report the **F1-maximizing** operating
+        point (best F1 + the threshold that achieves it), evaluated against ground-truth labels.
+        `gt` is a ground-truth label matrix for `reports` (DataFrame with the 137 label columns,
+        or array). `level` is "leaf" / "upper" / "anatomy". Returns per-label best F1 / threshold /
+        n_pos, plus macro best-F1 over all labels and over labels with >= `min_pos` positives.
+        """
+        import numpy as np
+        from sklearn.metrics import precision_recall_curve
+        leaf_names = list(self.config.labels)
+        gt_leaf = self._to_positive_matrix(gt, leaf_names)
+        pr_leaf = self.predict_proba(reports, tokenizer=tokenizer, batch_size=batch_size,
+                                     max_len=max_len, device=device).numpy()
+        if level == "leaf":
+            prob, names, gtb = pr_leaf, leaf_names, gt_leaf
+        else:
+            pu, un, pa, an = self.aggregate_hierarchy(pr_leaf)
+            gu, _, ga, _ = self.aggregate_hierarchy(gt_leaf.astype(np.float32))
+            prob, names, gtb = (pu, un, (gu >= 0.5).astype(int)) if level == "upper" else (pa, an, (ga >= 0.5).astype(int))
+        per: Dict[str, Any] = {}
+        all_best, ge_best = [], []
+        for j, lab in enumerate(names):
+            t = gtb[:, j].astype(int); s = prob[:, j].astype(float); npos = int(t.sum())
+            if npos == 0 or len(np.unique(t)) < 2:
+                bf, bt = 0.0, None
+            else:
+                p, r, thr = precision_recall_curve(t, s)
+                f1 = (2 * p * r / (p + r + 1e-12))[:-1]
+                bi = int(np.nanargmax(f1)); bf = float(f1[bi]); bt = float(thr[bi])
+            per[lab] = {"best_f1": bf, "best_threshold": bt, "n_pos": npos}
+            all_best.append(bf)
+            if npos >= min_pos:
+                ge_best.append(bf)
+        return {"level": level, "min_pos": min_pos,
+                "macro_best_f1": float(np.mean(all_best)) if all_best else 0.0,
+                "macro_best_f1_min_pos": float(np.mean(ge_best)) if ge_best else 0.0,
+                "n_labels_min_pos": len(ge_best), "per_label": per}
 def report_f1(gt_reports: List[str], pred_reports: List[str], model=None, tokenizer=None,
               model_id: str = "chest2vec/chest2vec_labeler", **kw) -> Dict[str, Any]:
     """Convenience wrapper: load the labeler (if not supplied) and score GT vs predicted reports."""