asjc-classification
/

scibert_multilabel_asjc_classifier

@@ -87,24 +87,16 @@ For **26 parent subjects**, F1-score improves to **0.934** with full metadata.
 from transformers import TextClassificationPipeline, pipeline
 import torch
-# Define the Custom Pipeline
 class ASJCMultiLabelPipeline(TextClassificationPipeline):
     """
-    Custom pipeline for multi-label ASJC classification.
-    This pipeline:
-      - Applies sigmoid to the model logits.
-      - Filters labels by a threshold.
-      - Returns all labels with scores above the threshold.
-    Threshold can be specified during pipeline creation.
-    If not provided, it defaults to the `threshold` in the model's config.json, or 0.3.
     """
     def __init__(self, *args, **kwargs):
         self.threshold = kwargs.pop("threshold", None)
         super().__init__(*args, **kwargs)
-        # Use threshold from config if none is passed explicitly
         if self.threshold is None:
             self.threshold = getattr(self.model.config, "threshold", 0.3)
@@ -113,39 +105,36 @@ class ASJCMultiLabelPipeline(TextClassificationPipeline):
         scores = torch.sigmoid(torch.tensor(model_outputs["logits"])).tolist()
         results = []
-        # Collect labels above the threshold
         for i, score in enumerate(scores):
             if score >= self.threshold:
                 label = self.model.config.id2label[str(i)]
                 results.append({"label": label, "score": float(score)})
-        # Sort results by descending probability
         results = sorted(results, key=lambda x: x["score"], reverse=True)
         return results
-```
-```python
-# Create pipeline with the multi-label model
 pipe = pipeline(
-    "text-classification",
-    model="asjc-classification/scibert_multilabel_asjc_classifier"
 )
-# Example text input (title, container_title, abstract)
 text = (
     "title={Jodometrie}, "
     "container_title={Fresenius' Zeitschrift für analytische Chemie, Zeitschrift für analytische Chemie}, "
     "abstract={}"
 )
-# Get predictions
 result = pipe(text)
 print(result)
-# Expected labels (based on actual ASJC categories):
 # - Clinical Biochemistry
 # - Analytical Chemistry
 ```
 ---

 from transformers import TextClassificationPipeline, pipeline
 import torch
+# --- Custom multi-label pipeline ---
 class ASJCMultiLabelPipeline(TextClassificationPipeline):
     """
+    Multi-label classification pipeline for ASJC categories.
+    Applies a sigmoid to the logits and returns every label whose score is at or above a configurable threshold.
     """
     def __init__(self, *args, **kwargs):
+        # Allow a threshold override; if none is given, fall back to the model config's `threshold`, then to 0.3
         self.threshold = kwargs.pop("threshold", None)
         super().__init__(*args, **kwargs)
         if self.threshold is None:
             self.threshold = getattr(self.model.config, "threshold", 0.3)
         scores = torch.sigmoid(torch.tensor(model_outputs["logits"])).tolist()
         results = []
         for i, score in enumerate(scores):
             if score >= self.threshold:
                 label = self.model.config.id2label[str(i)]
                 results.append({"label": label, "score": float(score)})
+        # Sort by descending score
         results = sorted(results, key=lambda x: x["score"], reverse=True)
         return results
+# --- Create the pipeline explicitly using the custom class ---
 pipe = pipeline(
+    task="text-classification",
+    model="asjc-classification/scibert_multilabel_asjc_classifier",
+    pipeline_class=ASJCMultiLabelPipeline
 )
+# --- Example text input ---
 text = (
     "title={Jodometrie}, "
     "container_title={Fresenius' Zeitschrift für analytische Chemie, Zeitschrift für analytische Chemie}, "
     "abstract={}"
 )
+# --- Get multi-label predictions ---
 result = pipe(text)
 print(result)
+# Expected labels:
 # - Clinical Biochemistry
 # - Analytical Chemistry
 ```
 ---