"""Knowledge graph schema: node and edge data classes.""" from __future__ import annotations from dataclasses import dataclass, field from enum import Enum from typing import Optional class DomainTag(str, Enum): """High-level domain classification for concepts.""" NEUROANATOMY = "neuroanatomy" DISEASE = "disease" GENE = "gene" NEUROTRANSMITTER = "neurotransmitter" DRUG = "drug" COGNITIVE_FUNCTION = "cognitive_function" CELL_TYPE = "cell_type" BIOMARKER = "biomarker" PARADIGM = "paradigm" # experimental paradigm (BrainMap) CONNECTIVITY = "connectivity" # functional/structural connections IMAGING_FEATURE = "imaging_feature" # cortical thickness, volume, FA, FC, SUVR, etc. DATASET_VARIABLE = "dataset_variable" # genetics, environment, medication, etc. # Phase 1.5 Experiment infrastructure (atlas/modality/dataset/ml_model) # + reserved RECIPE tag (former Phase 4.3, removed 2026-05-13 but kept # in UMLS-skip set for forward compat). RECIPE = "recipe" # reserved ATLAS = "atlas" # brain parcellation (ATLAS:*) MODALITY = "modality" # imaging/data modality (MODALITY:*) DATASET = "dataset" # research dataset (DATASET:*) ML_MODEL = "ml_model" # ML architecture (MODEL:*) # Brain decoding stimuli & psychological-state targets VISUAL_STIMULUS = "visual_stimulus" # image/video stimulus (NSD/BOLD5000/SEED-DV) EMOTION = "emotion" # affective state label (SEED family) VIGILANCE = "vigilance" # alertness/drowsiness label (SEED-VIG) class SemanticType(str, Enum): """UMLS semantic types relevant to neuroscience.""" DISEASE_OR_SYNDROME = "T047" MENTAL_DYSFUNCTION = "T048" NEOPLASTIC_PROCESS = "T191" BODY_PART_ORGAN = "T023" BODY_LOCATION = "T029" CELL = "T025" NEUROTRANSMITTER = "T116" AMINO_ACID_PEPTIDE = "T116" # overlaps with neurotransmitter in UMLS PHARMACOLOGIC_SUBSTANCE = "T121" GENE_OR_GENOME = "T028" INTELLECTUAL_PRODUCT = "T170" @dataclass class ConceptNode: """A concept node in the knowledge graph.""" id: str # unique identifier (CUI, or custom like "NN:1234") preferred_name: str # standard display name semantic_types: list[str] = field(default_factory=list) # TUI codes domain_tags: list[str] = field(default_factory=list) # DomainTag values source_vocab: str = "" # originating vocabulary (MeSH, NeuroNames, etc.) definition: str = "" # text definition aliases: list[str] = field(default_factory=list) # synonyms / alternate names external_ids: dict[str, str] = field(default_factory=dict) # cross-references atlas_mapping: Optional[dict] = None # MNI coords, atlas region ID, etc. metadata: dict = field(default_factory=dict) # catch-all for extra info def to_dict(self) -> dict: return { "id": self.id, "preferred_name": self.preferred_name, "semantic_types": self.semantic_types, "domain_tags": self.domain_tags, "source_vocab": self.source_vocab, "definition": self.definition, "aliases": self.aliases, "external_ids": self.external_ids, "atlas_mapping": self.atlas_mapping, "metadata": self.metadata, } @classmethod def from_dict(cls, d: dict) -> ConceptNode: return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__}) RELATION_TYPES = { # taxonomic / structural "is_a", # A is a subtype of B "part_of", # A is anatomical part of B "has_part", # inverse of part_of # causal / functional "causes", # A causes B "associated_with", # A is associated with B (loose) "predisposes", # A increases risk of B # therapeutic "treats", # A treats B "contraindicated_for", # A is contraindicated for B # molecular / genetic "gene_associated_with_disease", "protein_encoded_by", "modulates", # A modulates activity of B "binds_to", # A binds to receptor B # neuroanatomy "projects_to", # A projects neural connections to B "connects_to", # structural connectivity A-B "activates", # A functionally activates B "coactivates", # A and B co-activate (BrainMap) # evidence "supported_by", "contradicts", "about", # claim predicates (from paper extraction) "reduces", "increases", "correlates_with", "is_biomarker_of", "is_risk_factor_for", "is_associated_with", "predicts", "mediates", "inhibits", "distinguishes", # Deprecated Phase 4.3 Input Recipe edges — reserved, unused after # 2026-05-13 removal of input_recipe/recipe_kg_ingest modules. "tests_hypothesis", # (deprecated) Recipe → Hypothesis "predicts_outcome", # (deprecated) Recipe → target ConceptNode "uses_biomarker", # (deprecated) Recipe → Biomarker atom "uses_atlas", # (deprecated) Recipe → Atlas "uses_modality", # (deprecated) Recipe → Modality "uses_model", # (deprecated) Recipe → Model "evaluated_on", # (deprecated) Recipe → Dataset "measured_in", # (deprecated) Biomarker → Neuroanatomy ROI "measured_by", # (deprecated) Biomarker → Modality # Phase 1.5 Experiment infrastructure edges "supports_modality", # Model → Modality (compat declaration) "provides_modality", # Dataset → Modality (what the dataset contains) # Brain decoding edges (NSD/BOLD5000/SEED-DV/SEED family) "evokes", # visual_stimulus → neuroanatomy (encoding direction) "decoded_from", # visual_stimulus ← neuroanatomy (decoding direction) "elicits", # stimulus → emotion/vigilance (behavioral label) } # Claim-specific predicates (extracted from papers) CLAIM_PREDICATES = { "reduces", # A reduces B "increases", # A increases B "correlates_with", # A correlates with B "causes", # A causes B "is_biomarker_of", # A is a biomarker for B "is_risk_factor_for", # A is a risk factor for B "treats", # A treats B "modulates", # A modulates B "activates", # A activates B "inhibits", # A inhibits B "predicts", # A predicts B "mediates", # A mediates the relationship between B and C "is_associated_with", # A is associated with B "distinguishes", # A distinguishes B from C } @dataclass class Edge: """A directed edge in the knowledge graph.""" source_id: str # source ConceptNode.id target_id: str # target ConceptNode.id relation_type: str # one of RELATION_TYPES source: str = "" # provenance: 'NeuroNames', 'MeSH', 'DisGeNET', etc. confidence: float = 1.0 # 0.0-1.0 evidence_ref: str = "" # citation or reference metadata: dict = field(default_factory=dict) def to_dict(self) -> dict: return { "source_id": self.source_id, "target_id": self.target_id, "relation_type": self.relation_type, "source": self.source, "confidence": self.confidence, "evidence_ref": self.evidence_ref, "metadata": self.metadata, } @classmethod def from_dict(cls, d: dict) -> Edge: return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__}) @dataclass class Evidence: """Experimental evidence supporting a scientific claim.""" study_type: str = "" # "fMRI", "lesion", "meta-analysis", "GWAS", "animal_model" methodology: str = "" # "resting-state FC", "voxel-based morphometry", "DTI", ... p_value: Optional[float] = None effect_size: Optional[float] = None # Cohen's d, r, OR, beta effect_metric: str = "" # "Cohen's d", "r", "OR", "beta", "AUC" sample_size: Optional[int] = None replicability: str = "single_study" # "replicated", "single_study", "controversial" direction: str = "" # "positive", "negative" def to_dict(self) -> dict: return { "study_type": self.study_type, "methodology": self.methodology, "p_value": self.p_value, "effect_size": self.effect_size, "effect_metric": self.effect_metric, "sample_size": self.sample_size, "replicability": self.replicability, "direction": self.direction, } @classmethod def from_dict(cls, d: dict) -> Evidence: return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__}) @dataclass class PaperRef: """Reference to a source paper.""" pmid: str = "" # PubMed ID doi: str = "" title: str = "" authors: str = "" year: Optional[int] = None journal: str = "" def to_dict(self) -> dict: return { "pmid": self.pmid, "doi": self.doi, "title": self.title, "authors": self.authors, "year": self.year, "journal": self.journal, } @classmethod def from_dict(cls, d: dict) -> PaperRef: return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__}) @dataclass class Claim: """A structured scientific claim extracted from a paper. A claim is both stored as a node (for detailed querying) and generates simplified edges (for multi-hop traversal). """ id: str # CLM:uuid subject_id: str # ConceptNode.id in the graph subject_name: str # human-readable subject name predicate: str # one of CLAIM_PREDICATES object_id: str # ConceptNode.id in the graph object_name: str # human-readable object name negated: bool = False # "X does NOT affect Y" confidence: float = 0.5 # overall confidence 0-1 evidence: Evidence = field(default_factory=Evidence) source_paper: PaperRef = field(default_factory=PaperRef) raw_text: str = "" # original sentence from paper metadata: dict = field(default_factory=dict) def to_dict(self) -> dict: return { "id": self.id, "subject_id": self.subject_id, "subject_name": self.subject_name, "predicate": self.predicate, "object_id": self.object_id, "object_name": self.object_name, "negated": self.negated, "confidence": self.confidence, "evidence": self.evidence.to_dict(), "source_paper": self.source_paper.to_dict(), "raw_text": self.raw_text, "metadata": self.metadata, } @classmethod def from_dict(cls, d: dict) -> Claim: d = d.copy() if "evidence" in d and isinstance(d["evidence"], dict): d["evidence"] = Evidence.from_dict(d["evidence"]) if "source_paper" in d and isinstance(d["source_paper"], dict): d["source_paper"] = PaperRef.from_dict(d["source_paper"]) return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__}) def to_edge(self) -> Edge: """Convert claim to a simplified graph edge for traversal.""" return Edge( source_id=self.subject_id, target_id=self.object_id, relation_type=self.predicate, source=f"claim:{self.source_paper.pmid or self.id}", confidence=self.confidence, evidence_ref=self.source_paper.title, metadata={"claim_id": self.id, "negated": self.negated}, )