noteguard / src /pipeline.py
Chaeyoon
Refactor to Gold-RAP structure: src/ package, tests/, docs/, data/, output/
84981a4
Raw
History Blame Contribute Delete
1.69 kB
"""End-to-end single-note pipeline: detect -> de-identify -> audit.
This is the unit the demo UI and the CLI both call.
"""
from __future__ import annotations
from collections import Counter
from dataclasses import dataclass, field
from .detect import Detector, build_detector
from .recognisers import Span
from .transform import REDACTION, PseudonymVault, Replacement, apply_transform
@dataclass
class SanitiseResult:
    """Outcome of sanitising one note.

    Bundles the input text, the transformed text, the spans the detector
    reported, the replacements that were applied, the name of the transform
    method, and an audit summary dictionary (empty by default).
    """

    original: str
    sanitised: str
    spans: list[Span]
    replacements: list[Replacement]
    method: str
    audit: dict = field(default_factory=dict)

    @property
    def review_items(self) -> list[Span]:
        """Return the spans whose ``needs_review`` flag is truthy, in order."""
        flagged = []
        for span in self.spans:
            if span.needs_review:
                flagged.append(span)
        return flagged
class Pipeline:
    """Single-note pipeline: detect spans, apply a transform, build an audit.

    Holds one detector and one pseudonym vault that are reused across every
    ``sanitise`` call made on this instance.
    """

    def __init__(self, detector: Detector | None = None, vault: PseudonymVault | None = None):
        """Store the collaborators, building defaults only when none are given.

        Args:
            detector: Object exposing ``detect(text)``. When ``None``, the
                result of ``build_detector()`` is used.
            vault: Pseudonym vault handed to ``apply_transform``. When
                ``None``, a fresh ``PseudonymVault()`` is created.
        """
        # Test against None rather than truthiness: a caller-supplied object
        # that happens to be falsy (e.g. a container-like vault that is still
        # empty) must be kept, not silently swapped for a new default.
        self.detector = build_detector() if detector is None else detector
        self.vault = PseudonymVault() if vault is None else vault

    def sanitise(self, text: str, method: str = REDACTION, person_id: str = "") -> SanitiseResult:
        """Detect spans in ``text``, transform them, and summarise the run.

        Args:
            text: The note to process.
            method: Transform method forwarded to ``apply_transform``;
                defaults to ``REDACTION``.
            person_id: Identifier forwarded to ``apply_transform`` together
                with the vault.

        Returns:
            A ``SanitiseResult`` carrying the original text, the transformed
            text, the detected spans, the replacements, the method, and an
            audit dict with the keys ``detector``, ``method``,
            ``entities_removed``, ``by_type`` and ``needs_review``.
        """
        spans = self.detector.detect(text)
        sanitised, repls = apply_transform(text, spans, method, self.vault, person_id)

        by_type = Counter(s.entity_type for s in spans)
        # Count by truthiness of the flag, whatever type the detector uses.
        needs_review = sum(map(bool, (s.needs_review for s in spans)))

        audit = {
            # Detectors without a ``name`` attribute are reported as "?".
            "detector": getattr(self.detector, "name", "?"),
            "method": method,
            # Total number of detected spans (sum of the per-type counts).
            "entities_removed": sum(by_type.values()),
            "by_type": dict(by_type),
            "needs_review": needs_review,
        }
        return SanitiseResult(text, sanitised, spans, repls, method, audit)