Upload 3 files
Browse files- config_dcpg.json +22 -0
- dcpg_encoder.py +112 -257
- inference_dcpg.py +40 -0
config_dcpg.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "dcpg_encoder",
|
| 3 |
+
"architecture": "GAT",
|
| 4 |
+
"node_feat_dim": 19,
|
| 5 |
+
"hidden_dim": 32,
|
| 6 |
+
"embed_dim": 16,
|
| 7 |
+
"num_layers": 2,
|
| 8 |
+
"pooling": "attention",
|
| 9 |
+
"attention": "single_head",
|
| 10 |
+
"edge_weight_formula": "0.30*f_temporal + 0.30*f_semantic + 0.25*f_modality + 0.15*f_trust",
|
| 11 |
+
"input_sources": [
|
| 12 |
+
"DCPGAdapter.graph_summary",
|
| 13 |
+
"CRDTGraph.summary"
|
| 14 |
+
],
|
| 15 |
+
"output": {
|
| 16 |
+
"patient_embedding": 16,
|
| 17 |
+
"node_embeddings": "per_node",
|
| 18 |
+
"risk_score": "scalar_sigmoid"
|
| 19 |
+
},
|
| 20 |
+
"dependencies": [],
|
| 21 |
+
"framework": "pure_python"
|
| 22 |
+
}
|
dcpg_encoder.py
CHANGED
|
@@ -6,40 +6,34 @@ from dataclasses import dataclass, field
|
|
| 6 |
from typing import Any, Dict, List, Optional, Tuple
|
| 7 |
|
| 8 |
|
| 9 |
-
# ---------------------------------------------------------------------------
|
| 10 |
-
# Node feature extraction
|
| 11 |
-
# ---------------------------------------------------------------------------
|
| 12 |
-
|
| 13 |
MODALITY_INDEX = {
|
| 14 |
-
"text": 0,
|
| 15 |
-
"
|
| 16 |
-
"image_proxy": 2,
|
| 17 |
-
"waveform_proxy": 3,
|
| 18 |
-
"audio_proxy": 4,
|
| 19 |
-
"image_link": 5,
|
| 20 |
-
"audio_link": 6,
|
| 21 |
}
|
| 22 |
-
MODALITY_DIM = len(MODALITY_INDEX) + 1
|
| 23 |
|
| 24 |
PHI_TYPE_INDEX = {
|
| 25 |
-
"NAME_DATE_MRN_FACILITY": 0,
|
| 26 |
-
"
|
| 27 |
-
"FACE_IMAGE": 2,
|
| 28 |
-
"WAVEFORM_HEADER": 3,
|
| 29 |
-
"VOICE": 4,
|
| 30 |
-
"FACE_LINK": 5,
|
| 31 |
-
"VOICE_LINK": 6,
|
| 32 |
}
|
| 33 |
PHI_TYPE_DIM = len(PHI_TYPE_INDEX) + 1
|
| 34 |
|
| 35 |
-
NODE_SCALAR_DIM = 3
|
| 36 |
-
NODE_FEAT_DIM = MODALITY_DIM + PHI_TYPE_DIM + NODE_SCALAR_DIM #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
def _one_hot(idx_map: Dict[str, int], key: str, dim: int) -> List[float]:
|
| 40 |
vec = [0.0] * dim
|
| 41 |
-
|
| 42 |
-
vec[i] = 1.0
|
| 43 |
return vec
|
| 44 |
|
| 45 |
|
|
@@ -51,30 +45,15 @@ def node_features(
|
|
| 51 |
pseudonym_version: int,
|
| 52 |
max_pv: int = 10,
|
| 53 |
) -> List[float]:
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
# ---------------------------------------------------------------------------
|
| 65 |
-
# Linear layer (no deps)
|
| 66 |
-
# ---------------------------------------------------------------------------
|
| 67 |
-
|
| 68 |
-
def _matmul(A: List[List[float]], B: List[List[float]]) -> List[List[float]]:
|
| 69 |
-
rows, mid, cols = len(A), len(B), len(B[0])
|
| 70 |
-
out = [[0.0] * cols for _ in range(rows)]
|
| 71 |
-
for i in range(rows):
|
| 72 |
-
for k in range(mid):
|
| 73 |
-
if A[i][k] == 0.0:
|
| 74 |
-
continue
|
| 75 |
-
for j in range(cols):
|
| 76 |
-
out[i][j] += A[i][k] * B[k][j]
|
| 77 |
-
return out
|
| 78 |
|
| 79 |
|
| 80 |
def _matvec(W: List[List[float]], x: List[float]) -> List[float]:
|
|
@@ -92,12 +71,8 @@ def _softmax(x: List[float]) -> List[float]:
|
|
| 92 |
return [v / s for v in e]
|
| 93 |
|
| 94 |
|
| 95 |
-
def _norm(x: List[float]) -> float:
|
| 96 |
-
return math.sqrt(sum(v * v for v in x)) or 1.0
|
| 97 |
-
|
| 98 |
-
|
| 99 |
def _normalize(x: List[float]) -> List[float]:
|
| 100 |
-
n =
|
| 101 |
return [v / n for v in x]
|
| 102 |
|
| 103 |
|
|
@@ -105,13 +80,24 @@ def _add(a: List[float], b: List[float]) -> List[float]:
|
|
| 105 |
return [a[i] + b[i] for i in range(len(a))]
|
| 106 |
|
| 107 |
|
| 108 |
-
def
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
@dataclass
|
| 117 |
class GATLayer:
|
|
@@ -138,7 +124,6 @@ class GATLayer:
|
|
| 138 |
n = len(node_feats)
|
| 139 |
h = [_relu(_matvec(self.W, x)) for x in node_feats]
|
| 140 |
|
| 141 |
-
# attention coefficients
|
| 142 |
e: Dict[Tuple[int, int], float] = {}
|
| 143 |
for (src, dst), w in zip(edge_index, edge_weights):
|
| 144 |
score = (
|
|
@@ -147,167 +132,23 @@ class GATLayer:
|
|
| 147 |
)
|
| 148 |
e[(src, dst)] = math.exp(score) * float(w)
|
| 149 |
|
| 150 |
-
# per-node normalization
|
| 151 |
norm_sum: List[float] = [0.0] * n
|
| 152 |
for (src, dst), v in e.items():
|
| 153 |
norm_sum[dst] += v
|
| 154 |
for (src, dst) in e:
|
| 155 |
-
|
| 156 |
-
e[(src, dst)] /= denom
|
| 157 |
|
| 158 |
-
# aggregate
|
| 159 |
out = [[0.0] * self.out_dim for _ in range(n)]
|
| 160 |
for (src, dst), alpha in e.items():
|
| 161 |
for k in range(self.out_dim):
|
| 162 |
out[dst][k] += alpha * h[src][k]
|
| 163 |
|
| 164 |
-
# residual add (project if needed)
|
| 165 |
for i in range(n):
|
| 166 |
out[i] = _add(out[i], h[i])
|
| 167 |
|
| 168 |
return out
|
| 169 |
|
| 170 |
|
| 171 |
-
def _xavier_init(rows: int, cols: int) -> List[List[float]]:
|
| 172 |
-
limit = math.sqrt(6.0 / (rows + cols))
|
| 173 |
-
import random
|
| 174 |
-
rng = random.Random(42)
|
| 175 |
-
return [
|
| 176 |
-
[rng.uniform(-limit, limit) for _ in range(cols)]
|
| 177 |
-
for _ in range(rows)
|
| 178 |
-
]
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
# ---------------------------------------------------------------------------
|
| 182 |
-
# Pooling
|
| 183 |
-
# ---------------------------------------------------------------------------
|
| 184 |
-
|
| 185 |
-
def mean_pool(node_embeds: List[List[float]]) -> List[float]:
|
| 186 |
-
if not node_embeds:
|
| 187 |
-
return []
|
| 188 |
-
dim = len(node_embeds[0])
|
| 189 |
-
out = [0.0] * dim
|
| 190 |
-
for h in node_embeds:
|
| 191 |
-
for k in range(dim):
|
| 192 |
-
out[k] += h[k]
|
| 193 |
-
return [v / len(node_embeds) for v in out]
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
def max_pool(node_embeds: List[List[float]]) -> List[float]:
|
| 197 |
-
if not node_embeds:
|
| 198 |
-
return []
|
| 199 |
-
dim = len(node_embeds[0])
|
| 200 |
-
out = [-1e9] * dim
|
| 201 |
-
for h in node_embeds:
|
| 202 |
-
for k in range(dim):
|
| 203 |
-
if h[k] > out[k]:
|
| 204 |
-
out[k] = h[k]
|
| 205 |
-
return out
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
def attention_pool(
|
| 209 |
-
node_embeds: List[List[float]],
|
| 210 |
-
risk_entropies: List[float],
|
| 211 |
-
) -> List[float]:
|
| 212 |
-
if not node_embeds:
|
| 213 |
-
return []
|
| 214 |
-
weights = _softmax(risk_entropies)
|
| 215 |
-
dim = len(node_embeds[0])
|
| 216 |
-
out = [0.0] * dim
|
| 217 |
-
for h, w in zip(node_embeds, weights):
|
| 218 |
-
for k in range(dim):
|
| 219 |
-
out[k] += w * h[k]
|
| 220 |
-
return out
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
# ---------------------------------------------------------------------------
|
| 224 |
-
# Encoder
|
| 225 |
-
# ---------------------------------------------------------------------------
|
| 226 |
-
|
| 227 |
-
HIDDEN_DIM = 32
|
| 228 |
-
EMBED_DIM = 16
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
@dataclass
|
| 232 |
-
class DCPGEncoder:
|
| 233 |
-
"""
|
| 234 |
-
Two-layer GAT encoder over a DCPG graph.
|
| 235 |
-
|
| 236 |
-
Input: graph_summary dict from DCPGAdapter.graph_summary()
|
| 237 |
-
or CRDTGraph.summary() enriched with node features
|
| 238 |
-
Output: patient_embedding (EMBED_DIM floats) + risk_score (float)
|
| 239 |
-
"""
|
| 240 |
-
layer1: GATLayer = field(default_factory=lambda: GATLayer(NODE_FEAT_DIM, HIDDEN_DIM))
|
| 241 |
-
layer2: GATLayer = field(default_factory=lambda: GATLayer(HIDDEN_DIM, EMBED_DIM))
|
| 242 |
-
risk_head: List[List[float]] = field(default_factory=lambda: _xavier_init(1, EMBED_DIM))
|
| 243 |
-
|
| 244 |
-
def encode(self, graph: "DCPGGraph") -> "EncoderOutput":
|
| 245 |
-
if not graph.nodes:
|
| 246 |
-
zero = [0.0] * EMBED_DIM
|
| 247 |
-
return EncoderOutput(
|
| 248 |
-
patient_embedding=zero,
|
| 249 |
-
node_embeddings=[],
|
| 250 |
-
risk_score=0.0,
|
| 251 |
-
node_ids=[],
|
| 252 |
-
)
|
| 253 |
-
|
| 254 |
-
feats = [n.feature_vec() for n in graph.nodes]
|
| 255 |
-
ei = graph.edge_index()
|
| 256 |
-
ew = graph.edge_weights()
|
| 257 |
-
|
| 258 |
-
h1 = self.layer1.forward(feats, ei, ew)
|
| 259 |
-
h2 = self.layer2.forward(h1, ei, ew)
|
| 260 |
-
|
| 261 |
-
risk_entropies = [n.risk_entropy for n in graph.nodes]
|
| 262 |
-
patient_emb = attention_pool(h2, risk_entropies)
|
| 263 |
-
patient_emb = _normalize(patient_emb)
|
| 264 |
-
|
| 265 |
-
risk_score = math.sigmoid_approx(
|
| 266 |
-
sum(self.risk_head[0][k] * patient_emb[k] for k in range(EMBED_DIM))
|
| 267 |
-
)
|
| 268 |
-
|
| 269 |
-
return EncoderOutput(
|
| 270 |
-
patient_embedding=patient_emb,
|
| 271 |
-
node_embeddings=[_normalize(h) for h in h2],
|
| 272 |
-
risk_score=round(risk_score, 4),
|
| 273 |
-
node_ids=[n.node_id for n in graph.nodes],
|
| 274 |
-
)
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
def _sigmoid(x: float) -> float:
|
| 278 |
-
if x >= 0:
|
| 279 |
-
return 1.0 / (1.0 + math.exp(-x))
|
| 280 |
-
e = math.exp(x)
|
| 281 |
-
return e / (1.0 + e)
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
# patch into math namespace for use above
|
| 285 |
-
math.sigmoid_approx = _sigmoid # type: ignore[attr-defined]
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
@dataclass
|
| 289 |
-
class EncoderOutput:
|
| 290 |
-
patient_embedding: List[float]
|
| 291 |
-
node_embeddings: List[List[float]]
|
| 292 |
-
risk_score: float
|
| 293 |
-
node_ids: List[str]
|
| 294 |
-
|
| 295 |
-
def to_dict(self) -> Dict[str, Any]:
|
| 296 |
-
return {
|
| 297 |
-
"patient_embedding": [round(v, 5) for v in self.patient_embedding],
|
| 298 |
-
"node_embeddings": {
|
| 299 |
-
nid: [round(v, 5) for v in emb]
|
| 300 |
-
for nid, emb in zip(self.node_ids, self.node_embeddings)
|
| 301 |
-
},
|
| 302 |
-
"risk_score": self.risk_score,
|
| 303 |
-
"embed_dim": len(self.patient_embedding),
|
| 304 |
-
}
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
# ---------------------------------------------------------------------------
|
| 308 |
-
# DCPGGraph — thin wrapper to consume DCPGAdapter.graph_summary() output
|
| 309 |
-
# ---------------------------------------------------------------------------
|
| 310 |
-
|
| 311 |
@dataclass
|
| 312 |
class DCPGGraphNode:
|
| 313 |
node_id: str
|
|
@@ -319,11 +160,8 @@ class DCPGGraphNode:
|
|
| 319 |
|
| 320 |
def feature_vec(self) -> List[float]:
|
| 321 |
return node_features(
|
| 322 |
-
self.modality,
|
| 323 |
-
self.
|
| 324 |
-
self.risk_entropy,
|
| 325 |
-
self.context_confidence,
|
| 326 |
-
self.pseudonym_version,
|
| 327 |
)
|
| 328 |
|
| 329 |
|
|
@@ -339,68 +177,99 @@ class DCPGGraph:
|
|
| 339 |
idx = self._node_index()
|
| 340 |
ei: List[Tuple[int, int]] = []
|
| 341 |
for e in self.edges:
|
| 342 |
-
s = idx.get(e["source"])
|
| 343 |
-
t = idx.get(e["target"])
|
| 344 |
if s is not None and t is not None:
|
| 345 |
-
ei
|
| 346 |
-
ei.append((t, s)) # undirected
|
| 347 |
return ei
|
| 348 |
|
| 349 |
def edge_weights(self) -> List[float]:
|
| 350 |
idx = self._node_index()
|
| 351 |
ew: List[float] = []
|
| 352 |
for e in self.edges:
|
| 353 |
-
s = idx.get(e["source"])
|
| 354 |
-
t = idx.get(e["target"])
|
| 355 |
if s is not None and t is not None:
|
| 356 |
w = float(e.get("weight", 1.0))
|
| 357 |
-
ew
|
| 358 |
return ew
|
| 359 |
|
| 360 |
@classmethod
|
| 361 |
def from_summary(cls, summary: Dict[str, Any]) -> "DCPGGraph":
|
| 362 |
nodes = [
|
| 363 |
DCPGGraphNode(
|
| 364 |
-
node_id=n["node_id"],
|
| 365 |
-
modality=n["modality"],
|
| 366 |
-
phi_type=n["phi_type"],
|
| 367 |
risk_entropy=float(n.get("risk_entropy", 0.0)),
|
| 368 |
context_confidence=float(n.get("context_confidence", 1.0)),
|
| 369 |
pseudonym_version=int(n.get("pseudonym_version", 0)),
|
| 370 |
)
|
| 371 |
for n in summary.get("nodes", [])
|
| 372 |
]
|
| 373 |
-
|
| 374 |
-
return cls(nodes=nodes, edges=edges)
|
| 375 |
|
| 376 |
@classmethod
|
| 377 |
-
def from_crdt_summary(
|
| 378 |
-
cls,
|
| 379 |
-
summary: Dict[str, Any],
|
| 380 |
-
provisional_risk: float = 0.0,
|
| 381 |
-
) -> "DCPGGraph":
|
| 382 |
nodes = []
|
| 383 |
for n in summary.get("nodes", []):
|
| 384 |
parts = str(n["node_id"]).split("::")
|
| 385 |
modality = parts[1] if len(parts) > 1 else "text"
|
| 386 |
-
nodes.append(
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
1.0, float(n.get("total_phi_units", 1)) / 10.0
|
| 394 |
-
),
|
| 395 |
-
pseudonym_version=int(n.get("pseudonym_version", 0)),
|
| 396 |
-
)
|
| 397 |
-
)
|
| 398 |
return cls(nodes=nodes, edges=[])
|
| 399 |
|
| 400 |
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
def encode_patient(
|
| 406 |
graph_summary: Dict[str, Any],
|
|
@@ -415,18 +284,11 @@ def encode_patient(
|
|
| 415 |
)
|
| 416 |
else:
|
| 417 |
g = DCPGGraph.from_summary(graph_summary)
|
| 418 |
-
|
| 419 |
-
return out.to_dict()
|
| 420 |
|
| 421 |
|
| 422 |
-
# ---------------------------------------------------------------------------
|
| 423 |
-
# Smoke test
|
| 424 |
-
# ---------------------------------------------------------------------------
|
| 425 |
-
|
| 426 |
if __name__ == "__main__":
|
| 427 |
summary = {
|
| 428 |
-
"node_count": 3,
|
| 429 |
-
"edge_count": 2,
|
| 430 |
"nodes": [
|
| 431 |
{"node_id": "p1::text::NAME_DATE_MRN_FACILITY", "modality": "text",
|
| 432 |
"phi_type": "NAME_DATE_MRN_FACILITY", "risk_entropy": 0.72,
|
|
@@ -440,17 +302,10 @@ if __name__ == "__main__":
|
|
| 440 |
],
|
| 441 |
"edges": [
|
| 442 |
{"source": "p1::text::NAME_DATE_MRN_FACILITY",
|
| 443 |
-
"target": "p1::asr::NAME_DATE_MRN",
|
| 444 |
-
"type": "co_occurrence", "weight": 0.71},
|
| 445 |
{"source": "p1::text::NAME_DATE_MRN_FACILITY",
|
| 446 |
-
"target": "p1::image_proxy::FACE_IMAGE",
|
| 447 |
-
"type": "cross_modal", "weight": 0.58},
|
| 448 |
],
|
| 449 |
-
"provisional_risk": 0.664,
|
| 450 |
}
|
| 451 |
-
|
| 452 |
result = encode_patient(summary)
|
| 453 |
print(json.dumps(result, indent=2))
|
| 454 |
-
print(f"\nrisk_score: {result['risk_score']}")
|
| 455 |
-
print(f"embed_dim: {result['embed_dim']}")
|
| 456 |
-
print(f"nodes encoded: {len(result['node_embeddings'])}")
|
|
|
|
| 6 |
from typing import Any, Dict, List, Optional, Tuple
|
| 7 |
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
# Categorical vocabularies for node features.  Unknown keys fall into the
# trailing "other" slot, hence the +1 on each dimension.
MODALITY_INDEX = {
    "text": 0,
    "asr": 1,
    "image_proxy": 2,
    "waveform_proxy": 3,
    "audio_proxy": 4,
    "image_link": 5,
    "audio_link": 6,
}
MODALITY_DIM = len(MODALITY_INDEX) + 1

PHI_TYPE_INDEX = {
    "NAME_DATE_MRN_FACILITY": 0,
    "NAME_DATE_MRN": 1,
    "FACE_IMAGE": 2,
    "WAVEFORM_HEADER": 3,
    "VOICE": 4,
    "FACE_LINK": 5,
    "VOICE_LINK": 6,
}
PHI_TYPE_DIM = len(PHI_TYPE_INDEX) + 1

# Three scalar features: risk_entropy, context_confidence, pseudonym_version.
NODE_SCALAR_DIM = 3
NODE_FEAT_DIM = MODALITY_DIM + PHI_TYPE_DIM + NODE_SCALAR_DIM  # 19
HIDDEN_DIM = 32
EMBED_DIM = 16
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _sigmoid(x: float) -> float:
|
| 28 |
+
if x >= 0:
|
| 29 |
+
return 1.0 / (1.0 + math.exp(-x))
|
| 30 |
+
e = math.exp(x)
|
| 31 |
+
return e / (1.0 + e)
|
| 32 |
|
| 33 |
|
| 34 |
def _one_hot(idx_map: Dict[str, int], key: str, dim: int) -> List[float]:
|
| 35 |
vec = [0.0] * dim
|
| 36 |
+
vec[idx_map.get(key, dim - 1)] = 1.0
|
|
|
|
| 37 |
return vec
|
| 38 |
|
| 39 |
|
|
|
|
| 45 |
def node_features(
    modality: str,
    phi_type: str,
    risk_entropy: float,
    context_confidence: float,
    pseudonym_version: int,
    max_pv: int = 10,
) -> List[float]:
    """Build the NODE_FEAT_DIM-sized feature vector for one DCPG node.

    Layout: modality one-hot (MODALITY_DIM) + phi_type one-hot
    (PHI_TYPE_DIM) + three scalars, each normalized into [0, 1].
    Unknown modality/phi_type values fall into the "other" slot.
    """
    return (
        _one_hot(MODALITY_INDEX, modality, MODALITY_DIM)
        + _one_hot(PHI_TYPE_INDEX, phi_type, PHI_TYPE_DIM)
        + [
            float(max(0.0, min(1.0, risk_entropy))),
            float(max(0.0, min(1.0, context_confidence))),
            # Clamp to [0, max_pv] before scaling: a negative version must
            # not produce a negative feature (the other scalars are clamped
            # to their valid range the same way).
            float(max(0, min(pseudonym_version, max_pv))) / float(max_pv),
        ]
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
def _matvec(W: List[List[float]], x: List[float]) -> List[float]:
|
|
|
|
| 71 |
return [v / s for v in e]
|
| 72 |
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def _normalize(x: List[float]) -> List[float]:
|
| 75 |
+
n = math.sqrt(sum(v * v for v in x)) or 1.0
|
| 76 |
return [v / n for v in x]
|
| 77 |
|
| 78 |
|
|
|
|
| 80 |
return [a[i] + b[i] for i in range(len(a))]
|
| 81 |
|
| 82 |
|
| 83 |
+
def _xavier_init(rows: int, cols: int) -> List[List[float]]:
|
| 84 |
+
import random
|
| 85 |
+
limit = math.sqrt(6.0 / (rows + cols))
|
| 86 |
+
rng = random.Random(42)
|
| 87 |
+
return [[rng.uniform(-limit, limit) for _ in range(cols)] for _ in range(rows)]
|
| 88 |
|
| 89 |
|
| 90 |
+
def attention_pool(node_embeds: List[List[float]], risk_entropies: List[float]) -> List[float]:
    """Softmax-weighted mean of node embeddings.

    Node risk entropies act as attention logits, so riskier nodes
    contribute more to the pooled patient vector.  Returns [] for an
    empty graph.
    """
    if not node_embeds:
        return []
    alphas = _softmax(risk_entropies)
    dim = len(node_embeds[0])
    pooled = [0.0] * dim
    for embed, alpha in zip(node_embeds, alphas):
        for k in range(dim):
            pooled[k] += alpha * embed[k]
    return pooled
|
| 100 |
+
|
| 101 |
|
| 102 |
@dataclass
|
| 103 |
class GATLayer:
|
|
|
|
| 124 |
n = len(node_feats)
|
| 125 |
h = [_relu(_matvec(self.W, x)) for x in node_feats]
|
| 126 |
|
|
|
|
| 127 |
e: Dict[Tuple[int, int], float] = {}
|
| 128 |
for (src, dst), w in zip(edge_index, edge_weights):
|
| 129 |
score = (
|
|
|
|
| 132 |
)
|
| 133 |
e[(src, dst)] = math.exp(score) * float(w)
|
| 134 |
|
|
|
|
| 135 |
norm_sum: List[float] = [0.0] * n
|
| 136 |
for (src, dst), v in e.items():
|
| 137 |
norm_sum[dst] += v
|
| 138 |
for (src, dst) in e:
|
| 139 |
+
e[(src, dst)] /= norm_sum[dst] or 1.0
|
|
|
|
| 140 |
|
|
|
|
| 141 |
out = [[0.0] * self.out_dim for _ in range(n)]
|
| 142 |
for (src, dst), alpha in e.items():
|
| 143 |
for k in range(self.out_dim):
|
| 144 |
out[dst][k] += alpha * h[src][k]
|
| 145 |
|
|
|
|
| 146 |
for i in range(n):
|
| 147 |
out[i] = _add(out[i], h[i])
|
| 148 |
|
| 149 |
return out
|
| 150 |
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
@dataclass
|
| 153 |
class DCPGGraphNode:
|
| 154 |
node_id: str
|
|
|
|
| 160 |
|
| 161 |
def feature_vec(self) -> List[float]:
|
| 162 |
return node_features(
|
| 163 |
+
self.modality, self.phi_type,
|
| 164 |
+
self.risk_entropy, self.context_confidence, self.pseudonym_version,
|
|
|
|
|
|
|
|
|
|
| 165 |
)
|
| 166 |
|
| 167 |
|
|
|
|
| 177 |
idx = self._node_index()
|
| 178 |
ei: List[Tuple[int, int]] = []
|
| 179 |
for e in self.edges:
|
| 180 |
+
s, t = idx.get(e["source"]), idx.get(e["target"])
|
|
|
|
| 181 |
if s is not None and t is not None:
|
| 182 |
+
ei += [(s, t), (t, s)]
|
|
|
|
| 183 |
return ei
|
| 184 |
|
| 185 |
def edge_weights(self) -> List[float]:
|
| 186 |
idx = self._node_index()
|
| 187 |
ew: List[float] = []
|
| 188 |
for e in self.edges:
|
| 189 |
+
s, t = idx.get(e["source"]), idx.get(e["target"])
|
|
|
|
| 190 |
if s is not None and t is not None:
|
| 191 |
w = float(e.get("weight", 1.0))
|
| 192 |
+
ew += [w, w]
|
| 193 |
return ew
|
| 194 |
|
| 195 |
@classmethod
|
| 196 |
def from_summary(cls, summary: Dict[str, Any]) -> "DCPGGraph":
|
| 197 |
nodes = [
|
| 198 |
DCPGGraphNode(
|
| 199 |
+
node_id=n["node_id"], modality=n["modality"], phi_type=n["phi_type"],
|
|
|
|
|
|
|
| 200 |
risk_entropy=float(n.get("risk_entropy", 0.0)),
|
| 201 |
context_confidence=float(n.get("context_confidence", 1.0)),
|
| 202 |
pseudonym_version=int(n.get("pseudonym_version", 0)),
|
| 203 |
)
|
| 204 |
for n in summary.get("nodes", [])
|
| 205 |
]
|
| 206 |
+
return cls(nodes=nodes, edges=summary.get("edges", []))
|
|
|
|
| 207 |
|
| 208 |
@classmethod
|
| 209 |
+
def from_crdt_summary(cls, summary: Dict[str, Any], provisional_risk: float = 0.0) -> "DCPGGraph":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
nodes = []
|
| 211 |
for n in summary.get("nodes", []):
|
| 212 |
parts = str(n["node_id"]).split("::")
|
| 213 |
modality = parts[1] if len(parts) > 1 else "text"
|
| 214 |
+
nodes.append(DCPGGraphNode(
|
| 215 |
+
node_id=n["node_id"], modality=modality,
|
| 216 |
+
phi_type=modality.upper(),
|
| 217 |
+
risk_entropy=float(n.get("risk_entropy", provisional_risk)),
|
| 218 |
+
context_confidence=min(1.0, float(n.get("total_phi_units", 1)) / 10.0),
|
| 219 |
+
pseudonym_version=int(n.get("pseudonym_version", 0)),
|
| 220 |
+
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
return cls(nodes=nodes, edges=[])
|
| 222 |
|
| 223 |
|
| 224 |
+
@dataclass
class EncoderOutput:
    """Result bundle produced by DCPGEncoder.encode()."""

    patient_embedding: List[float]      # pooled patient vector
    node_embeddings: List[List[float]]  # per-node vectors, aligned with node_ids
    risk_score: float                   # scalar risk value
    node_ids: List[str]                 # ids aligned with node_embeddings

    def to_dict(self) -> Dict[str, Any]:
        """JSON-friendly view; embedding floats are rounded to 5 decimals."""
        per_node = {
            node_id: [round(v, 5) for v in embedding]
            for node_id, embedding in zip(self.node_ids, self.node_embeddings)
        }
        return {
            "patient_embedding": [round(v, 5) for v in self.patient_embedding],
            "node_embeddings": per_node,
            "risk_score": self.risk_score,
            "embed_dim": len(self.patient_embedding),
        }
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
@dataclass
class DCPGEncoder:
    """Two-layer GAT encoder producing a patient embedding and risk score.

    Consumes a DCPGGraph (built from DCPGAdapter.graph_summary() or
    CRDTGraph.summary()); all weights are deterministic (seed 42).
    """

    layer1: GATLayer = field(default_factory=lambda: GATLayer(NODE_FEAT_DIM, HIDDEN_DIM))
    layer2: GATLayer = field(default_factory=lambda: GATLayer(HIDDEN_DIM, EMBED_DIM))
    risk_head: List[List[float]] = field(default_factory=lambda: _xavier_init(1, EMBED_DIM))

    def encode(self, graph: DCPGGraph) -> EncoderOutput:
        """Run the GAT stack over *graph* and pool into one patient vector."""
        if not graph.nodes:
            # Empty graph: zero embedding, zero risk.
            return EncoderOutput(
                patient_embedding=[0.0] * EMBED_DIM,
                node_embeddings=[],
                risk_score=0.0,
                node_ids=[],
            )

        features = [node.feature_vec() for node in graph.nodes]
        edge_idx = graph.edge_index()
        edge_wts = graph.edge_weights()

        hidden = self.layer1.forward(features, edge_idx, edge_wts)
        embedded = self.layer2.forward(hidden, edge_idx, edge_wts)

        # Pool with risk-entropy attention, then project to a scalar risk.
        entropies = [node.risk_entropy for node in graph.nodes]
        patient_emb = _normalize(attention_pool(embedded, entropies))
        logit = sum(w * v for w, v in zip(self.risk_head[0], patient_emb))

        return EncoderOutput(
            patient_embedding=patient_emb,
            node_embeddings=[_normalize(vec) for vec in embedded],
            risk_score=round(_sigmoid(logit), 4),
            node_ids=[node.node_id for node in graph.nodes],
        )
|
| 272 |
+
|
| 273 |
|
| 274 |
def encode_patient(
|
| 275 |
graph_summary: Dict[str, Any],
|
|
|
|
| 284 |
)
|
| 285 |
else:
|
| 286 |
g = DCPGGraph.from_summary(graph_summary)
|
| 287 |
+
return enc.encode(g).to_dict()
|
|
|
|
| 288 |
|
| 289 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
if __name__ == "__main__":
|
| 291 |
summary = {
|
|
|
|
|
|
|
| 292 |
"nodes": [
|
| 293 |
{"node_id": "p1::text::NAME_DATE_MRN_FACILITY", "modality": "text",
|
| 294 |
"phi_type": "NAME_DATE_MRN_FACILITY", "risk_entropy": 0.72,
|
|
|
|
| 302 |
],
|
| 303 |
"edges": [
|
| 304 |
{"source": "p1::text::NAME_DATE_MRN_FACILITY",
|
| 305 |
+
"target": "p1::asr::NAME_DATE_MRN", "type": "co_occurrence", "weight": 0.71},
|
|
|
|
| 306 |
{"source": "p1::text::NAME_DATE_MRN_FACILITY",
|
| 307 |
+
"target": "p1::image_proxy::FACE_IMAGE", "type": "cross_modal", "weight": 0.58},
|
|
|
|
| 308 |
],
|
|
|
|
| 309 |
}
|
|
|
|
| 310 |
result = encode_patient(summary)
|
| 311 |
print(json.dumps(result, indent=2))
|
|
|
|
|
|
|
|
|
inference_dcpg.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import sys
|
| 5 |
+
|
| 6 |
+
from dcpg_encoder import DCPGEncoder, encode_patient
|
| 7 |
+
|
| 8 |
+
# Shared module-level encoder: its weights are deterministic (seed 42), so
# reusing one instance keeps every call's output consistent.
_encoder = DCPGEncoder()


def predict(graph_summary: dict, source: str = "dcpg") -> dict:
    """Encode one patient graph summary into embeddings plus a risk score.

    *source* selects the summary schema ("dcpg" or "crdt" — forwarded to
    encode_patient); the return value is encode_patient's result dict.
    """
    return encode_patient(graph_summary, encoder=_encoder, source=source)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def predict_batch(summaries: list, source: str = "dcpg") -> list:
    """Apply predict() to each summary in order; one result dict per input."""
    return [predict(s, source=source) for s in summaries]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # A path to a JSON graph summary was supplied on the command line.
        with open(sys.argv[1]) as f:
            summary = json.load(f)
    else:
        # Built-in demo: a text PHI node cross-linked to a voice node.
        summary = {
            "nodes": [
                {"node_id": "p1::text::NAME_DATE_MRN_FACILITY", "modality": "text",
                 "phi_type": "NAME_DATE_MRN_FACILITY", "risk_entropy": 0.8,
                 "context_confidence": 0.9, "pseudonym_version": 2},
                {"node_id": "p1::audio_proxy::VOICE", "modality": "audio_proxy",
                 "phi_type": "VOICE", "risk_entropy": 0.55,
                 "context_confidence": 0.6, "pseudonym_version": 1},
            ],
            "edges": [
                {"source": "p1::text::NAME_DATE_MRN_FACILITY",
                 "target": "p1::audio_proxy::VOICE",
                 "type": "cross_modal", "weight": 0.63},
            ],
        }
    result = predict(summary)
    print(json.dumps(result, indent=2))
|