OvermindLab
/

nerpa

@@ -31,7 +31,7 @@ model-index:
 pipeline_tag: token-classification
 ---
-# NERPA — Fine-Tuned GLiNER2 for PII Anonymisation
 A fine-tuned [GLiNER2 Large](https://huggingface.co/fastino/gliner2-large-v1) (340M params) model trained to detect Personally Identifiable Information (PII) in text. Built as a flexible, self-hosted replacement for AWS Comprehend at [Overmind](https://overmindlab.ai).
@@ -164,7 +164,7 @@ entities = detect_entities(model, text, entities={
 The inference pipeline in `anonymise.py`:
-1. **Chunking** — Long texts are split into 3000-character chunks with 100-char overlap to stay within the model's context window. Specific chunk size can be varied since DeBERTa-v2 (underlying encoder) uses relative position encoding. We found that this size works as well as smaller ones.
 2. **Batch prediction** — Chunks are fed through `GLiNER2.batch_extract_entities()` with `include_spans=True` to get character-level offsets.
 3. **Date disambiguation** — Both `DATE_TIME` and `DATE_OF_BIRTH` are always detected together so the model can choose the best label per span.
 4. **De-duplication** — Overlapping detections from chunk boundaries are merged, keeping the highest-confidence label for each position.
@@ -205,4 +205,4 @@ If you use NERPA, please cite both this model and the original GLiNER2 paper:
 Built by [Akhat Rakishev](https://github.com/akhatre) at [Overmind](https://overmindlab.ai).
-Overmind is infrastructure to make agents more reliable. Learn more at [overmindlab.ai](https://overmindlab.ai).

 pipeline_tag: token-classification
 ---
+# NERPA - Fine-Tuned GLiNER2 for PII Anonymisation
 A fine-tuned [GLiNER2 Large](https://huggingface.co/fastino/gliner2-large-v1) (340M params) model trained to detect Personally Identifiable Information (PII) in text. Built as a flexible, self-hosted replacement for AWS Comprehend at [Overmind](https://overmindlab.ai).
 The inference pipeline in `anonymise.py`:
+1. **Chunking** — Long texts are split into 3000-character chunks with a 100-character overlap to stay within the model's context window. The chunk size can be varied, since DeBERTa-v3 (the underlying encoder) uses relative position encoding; we found that 3000 characters works as well as smaller sizes.
 2. **Batch prediction** — Chunks are fed through `GLiNER2.batch_extract_entities()` with `include_spans=True` to get character-level offsets.
 3. **Date disambiguation** — Both `DATE_TIME` and `DATE_OF_BIRTH` are always detected together so the model can choose the best label per span.
 4. **De-duplication** — Overlapping detections from chunk boundaries are merged, keeping the highest-confidence label for each position.
 Built by [Akhat Rakishev](https://github.com/akhatre) at [Overmind](https://overmindlab.ai).
+Overmind is infrastructure for end-to-end agent optimisation. Learn more at [overmindlab.ai](https://overmindlab.ai).

anonymise.py CHANGED Viewed

@@ -8,16 +8,21 @@ Usage:
 """
 import argparse
 import sys
-from typing import Dict, List, Tuple
 import torch
 from gliner2 import GLiNER2
 # Entity types the model was fine-tuned to recognise, with descriptions
 # that guide the bi-encoder towards better detection.
-PII_ENTITIES = {
     "LOCATION": "Address, country, city, postcode, street, any other location",
     "AGE": "Age of a person",
     "DIGITAL_KEYS": "Digital keys, passwords, pins used to access anything like servers, banks, APIs, accounts etc",
@@ -39,6 +44,7 @@ PII_ENTITIES = {
 CONFIDENCE_THRESHOLD = 0.25
 CHUNK_SIZE = 3000
 CHUNK_OVERLAP = 100
 def load_model(model_path: str = ".") -> GLiNER2:
@@ -51,40 +57,47 @@ def load_model(model_path: str = ".") -> GLiNER2:
         device = torch.device("cpu")
     model = GLiNER2.from_pretrained(model_path)
-    model.to(device)
     return model
-def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> Tuple[List[str], List[int]]:
     """Split text into overlapping chunks, returning chunks and their start offsets."""
     if not text:
         return [], []
-    chunks, starts = [], []
     step = chunk_size - overlap
-    pos = 0
-    while pos < len(text):
         chunks.append(text[pos : pos + chunk_size])
         starts.append(pos)
-        if pos + chunk_size >= len(text):
-            break
-        pos += step
     return chunks, starts
 def detect_entities(
     model: GLiNER2,
     text: str,
-    entities: Dict[str, str] = None,
     threshold: float = CONFIDENCE_THRESHOLD,
-) -> List[dict]:
     """
     Detect PII entities in text, returning a list of
-    {"type": str, "start": int, "end": int, "score": float} dicts
     with character offsets into the original text.
     """
     entities = entities or PII_ENTITIES
-    # Always detect both date types so the model can disambiguate
     detect = dict(entities)
     if "DATE_TIME" in detect and "DATE_OF_BIRTH" not in detect:
         detect["DATE_OF_BIRTH"] = PII_ENTITIES["DATE_OF_BIRTH"]
@@ -93,9 +106,9 @@ def detect_entities(
     chunks, offsets = chunk_text(text)
-    all_chunk_results = []
-    for batch_start in range(0, len(chunks), 32):
-        batch = chunks[batch_start : batch_start + 32]
         results = model.batch_extract_entities(
             batch,
             detect,
@@ -105,63 +118,108 @@ def detect_entities(
         )
         all_chunk_results.extend(results)
-    # Merge results across chunks: de-duplicate overlapping detections
-    seen: Dict[Tuple[int, int], dict] = {}
     for chunk_result, chunk_offset in zip(all_chunk_results, offsets):
         for label, occurrences in chunk_result["entities"].items():
-            for occ in occurrences:
-                start = occ["start"] + chunk_offset
-                end = occ["end"] + chunk_offset
-                pos = (start, end)
-                if pos not in seen or seen[pos]["score"] < occ["confidence"]:
-                    seen[pos] = {"type": label, "score": occ["confidence"]}
-    # Merge overlapping spans, keeping highest confidence label
     items = sorted(
-        [(s, e, info) for (s, e), info in seen.items() if info["type"] in entities],
         key=lambda x: (x[0], x[1]),
     )
     if not items:
         return []
-    merged = []
-    cur_s, cur_e, cur_info = items[0]
-    for s, e, info in items[1:]:
-        if s < cur_e:  # overlapping
-            cur_e = max(cur_e, e)
-            if info["score"] > cur_info["score"]:
-                cur_info = info
         else:
-            merged.append({"type": cur_info["type"], "start": cur_s, "end": cur_e, "score": cur_info["score"]})
-            cur_s, cur_e, cur_info = s, e, info
-    merged.append({"type": cur_info["type"], "start": cur_s, "end": cur_e, "score": cur_info["score"]})
     return merged
-def anonymise(text: str, detected: List[dict]) -> str:
-    """Replace detected entities with placeholders like [PERSON_NAME]."""
-    # Process from end to start so offsets stay valid
-    result = text
-    for entity in sorted(detected, key=lambda e: e["start"], reverse=True):
-        placeholder = f'[{entity["type"]}]'
-        result = result[: entity["start"]] + placeholder + result[entity["end"] :]
-    return result
-def main():
-    parser = argparse.ArgumentParser(description="Anonymise PII in text using the NERPA model.")
-    parser.add_argument("text", nargs="?", help="Text to anonymise (or use --file)")
-    parser.add_argument("--file", "-f", help="Read text from a file instead")
-    parser.add_argument("--output", "-o", help="Write anonymised text to file (default: stdout)")
-    parser.add_argument("--model", "-m", default=".", help="Path to model directory (default: current dir)")
-    parser.add_argument("--threshold", "-t", type=float, default=CONFIDENCE_THRESHOLD, help="Confidence threshold (default: 0.25)")
-    parser.add_argument("--show-entities", action="store_true", help="Print detected entities before anonymised text")
     args = parser.parse_args()
     if args.file:
-        with open(args.file) as f:
-            text = f.read()
     elif args.text:
         text = args.text
     else:
@@ -171,15 +229,22 @@ def main():
     detected = detect_entities(model, text, threshold=args.threshold)
     if args.show_entities:
-        for e in detected:
-            print(f'  {e["type"]:25s} [{e["start"]:5d}:{e["end"]:5d}] (score={e["score"]:.2f})  "{text[e["start"]:e["end"]]}"', file=sys.stderr)
-        print(file=sys.stderr)
     result = anonymise(text, detected)
     if args.output:
-        with open(args.output, "w") as f:
-            f.write(result)
     else:
         print(result)

 """
 import argparse
+import logging
 import sys
+import warnings
+from typing import Optional
+warnings.filterwarnings("ignore", message=r".*incorrect regex pattern.*fix_mistral_regex.*")
 import torch
 from gliner2 import GLiNER2
+logger = logging.getLogger(__name__)
 # Entity types the model was fine-tuned to recognise, with descriptions
 # that guide the bi-encoder towards better detection.
+PII_ENTITIES: dict[str, str] = {
     "LOCATION": "Address, country, city, postcode, street, any other location",
     "AGE": "Age of a person",
     "DIGITAL_KEYS": "Digital keys, passwords, pins used to access anything like servers, banks, APIs, accounts etc",
 CONFIDENCE_THRESHOLD = 0.25
 CHUNK_SIZE = 3000
 CHUNK_OVERLAP = 100
+BATCH_SIZE = 32
 def load_model(model_path: str = ".") -> GLiNER2:
         device = torch.device("cpu")
     model = GLiNER2.from_pretrained(model_path)
+    try:
+        model.to(device)
+    except RuntimeError:
+        logger.warning(
+            "Failed to load model on %s, falling back to CPU.", device
+        )
+        model.to(torch.device("cpu"))
     return model
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> tuple[list[str], list[int]]:
    """Split text into overlapping chunks, returning chunks and their start offsets.

    Args:
        text: The text to split. An empty string yields ``([], [])``.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A ``(chunks, starts)`` pair of equal-length lists where
        ``chunks[i] == text[starts[i] : starts[i] + chunk_size]``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` would make the window
            stand still, move backwards, or skip over text.
    """
    if not text:
        return [], []

    # A non-positive step would either make range() raise an opaque error
    # (step == 0) or silently produce no chunks at all (step < 0), and a
    # negative overlap would leave gaps between chunks that are never scanned.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    chunks: list[str] = []
    starts: list[int] = []
    step = chunk_size - overlap
    text_length = len(text)
    for pos in range(0, text_length, step):
        chunks.append(text[pos : pos + chunk_size])
        starts.append(pos)
        # Once a chunk reaches the end of the text, any further window would
        # lie entirely inside this one, so stop instead of emitting a
        # redundant tail chunk.
        if pos + chunk_size >= text_length:
            break
    return chunks, starts
 def detect_entities(
     model: GLiNER2,
     text: str,
+    entities: Optional[dict[str, str]] = None,
     threshold: float = CONFIDENCE_THRESHOLD,
+) -> list[dict]:
     """
     Detect PII entities in text, returning a list of
+    ``{"type": str, "start": int, "end": int, "score": float}`` dicts
     with character offsets into the original text.
     """
     entities = entities or PII_ENTITIES
+    # Always detect both date types so the model can disambiguate.
     detect = dict(entities)
     if "DATE_TIME" in detect and "DATE_OF_BIRTH" not in detect:
         detect["DATE_OF_BIRTH"] = PII_ENTITIES["DATE_OF_BIRTH"]
     chunks, offsets = chunk_text(text)
+    all_chunk_results: list[dict] = []
+    for batch_start in range(0, len(chunks), BATCH_SIZE):
+        batch = chunks[batch_start : batch_start + BATCH_SIZE]
         results = model.batch_extract_entities(
             batch,
             detect,
         )
         all_chunk_results.extend(results)
+    # Merge results across chunks: de-duplicate overlapping detections.
+    seen: dict[tuple[int, int], dict] = {}
     for chunk_result, chunk_offset in zip(all_chunk_results, offsets):
         for label, occurrences in chunk_result["entities"].items():
+            for occurrence in occurrences:
+                start = occurrence["start"] + chunk_offset
+                end = occurrence["end"] + chunk_offset
+                position = (start, end)
+                if (
+                    position not in seen
+                    or seen[position]["score"] < occurrence["confidence"]
+                ):
+                    seen[position] = {
+                        "type": label,
+                        "score": occurrence["confidence"],
+                    }
+    # Merge overlapping spans, keeping the highest-confidence label.
+    # NOTE: when two spans overlap they are fused into one span and
+    # assigned the label with the higher confidence score.
     items = sorted(
+        [
+            (start, end, info)
+            for (start, end), info in seen.items()
+            if info["type"] in entities
+        ],
         key=lambda x: (x[0], x[1]),
     )
     if not items:
         return []
+    merged: list[dict] = []
+    current_start, current_end, current_info = items[0]
+    for start, end, info in items[1:]:
+        if start < current_end:  # overlapping
+            current_end = max(current_end, end)
+            if info["score"] > current_info["score"]:
+                current_info = info
         else:
+            merged.append({
+                "type": current_info["type"],
+                "start": current_start,
+                "end": current_end,
+                "score": current_info["score"],
+            })
+            current_start, current_end, current_info = start, end, info
+    merged.append({
+        "type": current_info["type"],
+        "start": current_start,
+        "end": current_end,
+        "score": current_info["score"],
+    })
     return merged
def anonymise(text: str, detected: list[dict]) -> str:
    """Replace detected entities with placeholders like ``[PERSON_NAME]``.

    Args:
        text: The original text.
        detected: Entity dicts with at least ``"type"``, ``"start"`` and
            ``"end"`` keys, where ``start``/``end`` are character offsets
            into ``text``. The list may be in any order.

    Returns:
        ``text`` with every detected span replaced by ``[<type>]``. Spans
        that overlap an earlier span are folded into that span's
        placeholder, so the covered characters are never emitted.
    """
    parts: list[str] = []
    prev_end = 0
    # Sort by start, longest span first on ties, so that when spans share a
    # start offset the enclosing span supplies the placeholder label.
    ordered = sorted(detected, key=lambda e: (e["start"], -e["end"]))
    for entity in ordered:
        if entity["start"] < prev_end:
            # Overlaps text that is already redacted: extend the redacted
            # region if needed, but never move the cursor backwards, which
            # would re-emit characters covered by the earlier span.
            prev_end = max(prev_end, entity["end"])
            continue
        parts.append(text[prev_end : entity["start"]])
        parts.append(f'[{entity["type"]}]')
        prev_end = entity["end"]
    parts.append(text[prev_end:])
    return "".join(parts)
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Anonymise PII in text using the NERPA model.",
+    )
+    parser.add_argument(
+        "text", nargs="?", help="Text to anonymise (or use --file)",
+    )
+    parser.add_argument(
+        "--file", "-f", help="Read text from a file instead",
+    )
+    parser.add_argument(
+        "--output", "-o",
+        help="Write anonymised text to file (default: stdout)",
+    )
+    parser.add_argument(
+        "--model", "-m", default=".",
+        help="Path to model directory (default: current dir)",
+    )
+    parser.add_argument(
+        "--threshold", "-t", type=float, default=CONFIDENCE_THRESHOLD,
+        help=f"Confidence threshold (default: {CONFIDENCE_THRESHOLD})",
+    )
+    parser.add_argument(
+        "--show-entities", action="store_true",
+        help="Print detected entities before anonymised text",
+    )
     args = parser.parse_args()
     if args.file:
+        try:
+            with open(args.file, encoding="utf-8") as f:
+                text = f.read()
+        except OSError as exc:
+            sys.exit(f"Error reading {args.file}: {exc}")
     elif args.text:
         text = args.text
     else:
     detected = detect_entities(model, text, threshold=args.threshold)
     if args.show_entities:
+        for entity in detected:
+            span = text[entity["start"] : entity["end"]]
+            logger.info(
+                "  %-25s [%5d:%5d] (score=%.2f)  %r",
+                entity["type"], entity["start"], entity["end"],
+                entity["score"], span,
+            )
     result = anonymise(text, detected)
     if args.output:
+        try:
+            with open(args.output, "w", encoding="utf-8") as f:
+                f.write(result)
+        except OSError as exc:
+            sys.exit(f"Error writing {args.output}: {exc}")
     else:
         print(result)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gliner2>=1.2.4
2	+ torch>=2.8.0