File size: 4,334 Bytes
406250b
 
 
 
 
 
 
 
 
 
 
 
797f2cf
406250b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Knowledge-graph lookup and reference selection for lab markers."""

from __future__ import annotations

import json
import re
from functools import lru_cache
from pathlib import Path
from typing import Any


# Parent of the directory that contains this module (``parents[0]`` is the
# module's own directory). Presumably the project root -- confirm against the
# repository layout.
ROOT = Path(__file__).resolve().parents[1]
# JSON file that ``LabKnowledgeGraph.load()`` reads when no path is passed.
DEFAULT_KNOWLEDGE_GRAPH_PATH = ROOT / "kb" / "cbc_knowledge_graph.json"  # lab-wide marker graph (107 tests)


class LabKnowledgeGraph:
    """Small deterministic lookup layer over the JSON knowledge graph.

    The payload is read for a ``tests`` list of marker nodes. Each node may
    carry ``id``, ``display_name``, ``aliases`` and per-age-group statistics
    blocks. Nodes are indexed once, at construction time, by case-folded id
    and by the normalised keys of their id, display name and aliases.
    """

    def __init__(self, payload: dict[str, Any]) -> None:
        """Store *payload* and build the id and alias indexes from its tests."""
        self.payload = payload
        # ``or []`` also covers an explicit JSON ``null`` under "tests";
        # ``dict.get``'s default only applies when the key is missing.
        self.tests: list[dict[str, Any]] = list(payload.get("tests") or [])
        self._by_id = {str(test.get("id", "")).casefold(): test for test in self.tests}
        self._alias_index = self._build_alias_index()

    @classmethod
    def load(cls, path: str | Path = DEFAULT_KNOWLEDGE_GRAPH_PATH) -> "LabKnowledgeGraph":
        """Build a graph from the UTF-8 encoded JSON file at *path*.

        Errors from reading the file or decoding the JSON are not caught
        here and propagate to the caller.
        """
        graph_path = Path(path)
        return cls(json.loads(graph_path.read_text(encoding="utf-8")))

    def resolve(self, marker_name: str | None) -> dict[str, Any] | None:
        """Return the graph node matching a raw marker name or alias.

        The candidate keys derived from *marker_name* are tried in order and
        the first one present in the alias index wins. Returns ``None`` when
        no candidate matches (including for ``None`` or blank names).
        """
        for key in _candidate_keys(marker_name):
            match = self._alias_index.get(key)
            if match is not None:
                return match
        return None

    def get(self, marker_id: str | None) -> dict[str, Any] | None:
        """Return the node whose id equals *marker_id*, ignoring case.

        Returns ``None`` for a falsy *marker_id* or an unknown id.
        """
        if not marker_id:
            return None
        return self._by_id.get(str(marker_id).casefold())

    def select_statistics(
        self,
        node: dict[str, Any],
        age_group: str,
        sex: str,
    ) -> dict[str, Any] | None:
        """Select the best statistics block for age/sex context.

        Sex-specific ranges are preferred when available. The JSON keeps an
        `unknown` sex bucket for high-impact markers, and age-only statistics
        remain the compatibility fallback for every marker.

        Args:
            node: A marker node from the graph.
            age_group: Key of the age bucket to look up.
            sex: ``"male"`` or ``"female"``; any other value is treated as
                ``"unknown"``.

        Returns:
            A dict with ``basis`` (name of the node field the values came
            from), ``age_group``, ``sex`` and ``values``, or ``None`` when the
            node has no usable statistics for *age_group*.
        """
        normalized_sex = sex if sex in {"male", "female"} else "unknown"
        sex_stats = node.get("sex_specific_statistics_per_group_age")
        if isinstance(sex_stats, dict):
            group_stats = sex_stats.get(age_group)
            if isinstance(group_stats, dict):
                # A missing or empty bucket for the requested sex falls back
                # to the "unknown" bucket.
                values = group_stats.get(normalized_sex) or group_stats.get("unknown")
                if isinstance(values, dict):
                    # NOTE(review): "sex" echoes the requested (normalised)
                    # sex even when the "unknown" bucket supplied the values.
                    return {
                        "basis": "sex_specific_statistics_per_group_age",
                        "age_group": age_group,
                        "sex": normalized_sex,
                        "values": values,
                    }

        # Guard the age-only fallback the same way as the sex-specific path:
        # the field may be present but null (or otherwise not a mapping), in
        # which case there is simply nothing to select.
        age_stats = node.get("statistics_per_group_age")
        if isinstance(age_stats, dict):
            values = age_stats.get(age_group)
            if isinstance(values, dict):
                return {
                    "basis": "statistics_per_group_age",
                    "age_group": age_group,
                    "sex": "not_applied",
                    "values": values,
                }
        return None

    def _build_alias_index(self) -> dict[str, dict[str, Any]]:
        """Map every normalised id/display-name/alias key to its node.

        ``setdefault`` keeps the first node registered for a key, so when two
        tests share a key the one listed earlier in ``self.tests`` wins.
        """
        index: dict[str, dict[str, Any]] = {}
        for test in self.tests:
            names = [
                test.get("id"),
                test.get("display_name"),
                *(test.get("aliases") or []),
            ]
            for name in names:
                for key in _candidate_keys(name):
                    index.setdefault(key, test)
        return index


@lru_cache(maxsize=1)
def default_knowledge_graph() -> LabKnowledgeGraph:
    """Return the graph loaded from the default path.

    The result is cached, so the file is read on the first call only and
    later calls return the same instance.
    """
    graph = LabKnowledgeGraph.load()
    return graph


def _candidate_keys(value: str | None) -> list[str]:
    """Return the normalised lookup keys derived from a raw marker name.

    Candidates are produced in a fixed priority order:

    1. the full text,
    2. the text with every parenthesised group removed,
    3. for each parenthesised group, its content followed by that content
       split on ``/``, ``,`` and ``;``.

    Each candidate is normalised with ``_marker_key``; empty keys and
    duplicates are dropped, keeping the first occurrence.

    Args:
        value: Raw marker name; ``None`` and blank strings yield no keys.

    Returns:
        The unique, non-empty keys in priority order.
    """
    if value is None:
        return []

    text = str(value).strip()
    if not text:
        return []

    # Collect candidates in a list rather than a set: set iteration order for
    # strings varies between interpreter runs (hash randomisation), and the
    # order of the returned keys must be stable from run to run.
    pieces: list[str] = [text, re.sub(r"\([^)]*\)", "", text).strip()]
    for inner in re.findall(r"\(([^)]*)\)", text):
        pieces.append(inner)
        pieces.extend(part.strip() for part in re.split(r"[/,;]", inner))

    normalized = (_marker_key(piece) for piece in pieces)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(key for key in normalized if key))


def _marker_key(value: str) -> str:
    """Collapse a marker name into a compact key for comparison.

    The text is case-folded, both micro/mu glyphs become ``u``, the words
    ``percent`` and ``number`` become ``%`` and ``#``, and every character
    outside ``a-z``, ``0-9``, ``%`` and ``#`` is dropped.
    """
    substitutions = (
        ("µ", "u"),
        ("μ", "u"),
        ("percent", "%"),
        ("number", "#"),
    )
    folded = value.casefold()
    # Applied sequentially, in this order, on the running result.
    for old, new in substitutions:
        folded = folded.replace(old, new)
    # Keeping only the allowed runs is the same as deleting everything else.
    kept_runs = re.findall(r"[a-z0-9%#]+", folded)
    return "".join(kept_runs)