zxcvb6958
Optimize search with trigram index + precomputed top lists
6e7a2fd
"""Knowledge graph schema: node and edge data classes."""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
class DomainTag(str, Enum):
    """Coarse domain label that can be attached to a concept.

    Members also subclass ``str``, so each one compares equal to its raw
    value (for example ``DomainTag.GENE == "gene"``).
    """

    NEUROANATOMY = "neuroanatomy"
    DISEASE = "disease"
    GENE = "gene"
    NEUROTRANSMITTER = "neurotransmitter"
    DRUG = "drug"
    COGNITIVE_FUNCTION = "cognitive_function"
    CELL_TYPE = "cell_type"
    BIOMARKER = "biomarker"
    PARADIGM = "paradigm"  # experimental paradigm (BrainMap)
    CONNECTIVITY = "connectivity"  # functional / structural connections
    IMAGING_FEATURE = "imaging_feature"  # cortical thickness, volume, FA, FC, SUVR, ...
    DATASET_VARIABLE = "dataset_variable"  # genetics, environment, medication, ...

    # Phase 1.5 experiment infrastructure: atlas / modality / dataset / ml_model.
    # RECIPE belonged to the former Phase 4.3 (removed 2026-05-13); the tag is
    # only reserved, and stays in the UMLS-skip set for forward compatibility.
    RECIPE = "recipe"  # reserved
    ATLAS = "atlas"  # brain parcellation (ATLAS:*)
    MODALITY = "modality"  # imaging / data modality (MODALITY:*)
    DATASET = "dataset"  # research dataset (DATASET:*)
    ML_MODEL = "ml_model"  # ML architecture (MODEL:*)

    # Brain-decoding stimuli and psychological-state targets.
    VISUAL_STIMULUS = "visual_stimulus"  # image / video stimulus (NSD/BOLD5000/SEED-DV)
    EMOTION = "emotion"  # affective state label (SEED family)
    VIGILANCE = "vigilance"  # alertness / drowsiness label (SEED-VIG)
class SemanticType(str, Enum):
    """UMLS semantic type identifiers (TUIs) relevant to neuroscience.

    Members also subclass ``str`` and compare equal to their TUI string.

    ``NEUROTRANSMITTER`` and ``AMINO_ACID_PEPTIDE`` share the value
    ``"T116"``.  Because Enum treats a repeated value as an alias,
    ``AMINO_ACID_PEPTIDE`` is the same object as ``NEUROTRANSMITTER``: it is
    skipped when iterating the class, and ``SemanticType("T116")`` returns
    ``NEUROTRANSMITTER``.
    """

    # Disorders
    DISEASE_OR_SYNDROME = "T047"
    MENTAL_DYSFUNCTION = "T048"
    NEOPLASTIC_PROCESS = "T191"

    # Anatomy
    BODY_PART_ORGAN = "T023"
    BODY_LOCATION = "T029"
    CELL = "T025"

    # Substances
    NEUROTRANSMITTER = "T116"
    AMINO_ACID_PEPTIDE = "T116"  # alias: overlaps with neurotransmitter in UMLS
    PHARMACOLOGIC_SUBSTANCE = "T121"

    # Other
    GENE_OR_GENOME = "T028"
    INTELLECTUAL_PRODUCT = "T170"
@dataclass
class ConceptNode:
    """One concept (vertex) of the knowledge graph.

    Only ``id`` and ``preferred_name`` are required; every other field has
    an empty default.
    """

    id: str  # unique identifier (a CUI, or a custom one such as "NN:1234")
    preferred_name: str  # standard display name
    semantic_types: list[str] = field(default_factory=list)  # TUI codes
    domain_tags: list[str] = field(default_factory=list)  # DomainTag values
    source_vocab: str = ""  # originating vocabulary (MeSH, NeuroNames, ...)
    definition: str = ""  # free-text definition
    aliases: list[str] = field(default_factory=list)  # synonyms / alternate names
    external_ids: dict[str, str] = field(default_factory=dict)  # cross-references
    atlas_mapping: Optional[dict] = None  # MNI coords, atlas region ID, etc.
    metadata: dict = field(default_factory=dict)  # catch-all for extra info

    def to_dict(self) -> dict:
        """Return the node as a plain dict keyed by field name.

        Values are not copied: list and dict entries in the result are the
        very objects held by this node.
        """
        exported = (
            "id",
            "preferred_name",
            "semantic_types",
            "domain_tags",
            "source_vocab",
            "definition",
            "aliases",
            "external_ids",
            "atlas_mapping",
            "metadata",
        )
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, d: dict) -> ConceptNode:
        """Build a node from ``d``, ignoring keys that are not fields.

        The constructor raises ``TypeError`` when a required field
        (``id`` or ``preferred_name``) is absent from ``d``.
        """
        known = cls.__dataclass_fields__
        kwargs = {}
        for key, value in d.items():
            if key in known:
                kwargs[key] = value
        return cls(**kwargs)
RELATION_TYPES = {
    # --- taxonomic / structural ---
    "is_a",  # A is a subtype of B
    "part_of",  # A is an anatomical part of B
    "has_part",  # inverse of part_of
    # --- causal / functional ---
    "causes",  # A causes B
    "associated_with",  # A is (loosely) associated with B
    "predisposes",  # A increases the risk of B
    # --- therapeutic ---
    "treats",  # A treats B
    "contraindicated_for",  # A is contraindicated for B
    # --- molecular / genetic ---
    "gene_associated_with_disease",
    "protein_encoded_by",
    "modulates",  # A modulates the activity of B
    "binds_to",  # A binds to receptor B
    # --- neuroanatomy ---
    "projects_to",  # A projects neural connections to B
    "connects_to",  # structural connectivity A-B
    "activates",  # A functionally activates B
    "coactivates",  # A and B co-activate (BrainMap)
    # --- evidence ---
    "supported_by",
    "contradicts",
    "about",
    # --- claim predicates (from paper extraction) ---
    "reduces",
    "increases",
    "correlates_with",
    "is_biomarker_of",
    "is_risk_factor_for",
    "is_associated_with",
    "predicts",
    "mediates",
    "inhibits",
    "distinguishes",
    # --- deprecated Phase 4.3 Input Recipe edges ---
    # Reserved but unused since the 2026-05-13 removal of the
    # input_recipe / recipe_kg_ingest modules.
    "tests_hypothesis",  # (deprecated) Recipe -> Hypothesis
    "predicts_outcome",  # (deprecated) Recipe -> target ConceptNode
    "uses_biomarker",  # (deprecated) Recipe -> Biomarker atom
    "uses_atlas",  # (deprecated) Recipe -> Atlas
    "uses_modality",  # (deprecated) Recipe -> Modality
    "uses_model",  # (deprecated) Recipe -> Model
    "evaluated_on",  # (deprecated) Recipe -> Dataset
    "measured_in",  # (deprecated) Biomarker -> Neuroanatomy ROI
    "measured_by",  # (deprecated) Biomarker -> Modality
    # --- Phase 1.5 experiment infrastructure edges ---
    "supports_modality",  # Model -> Modality (compat declaration)
    "provides_modality",  # Dataset -> Modality (what the dataset contains)
    # --- brain decoding edges (NSD/BOLD5000/SEED-DV/SEED family) ---
    "evokes",  # visual_stimulus -> neuroanatomy (encoding direction)
    "decoded_from",  # visual_stimulus <- neuroanatomy (decoding direction)
    "elicits",  # stimulus -> emotion/vigilance (behavioral label)
}
# Predicates allowed on claims extracted from papers.
CLAIM_PREDICATES = {
    "reduces",  # A reduces B
    "increases",  # A increases B
    "correlates_with",  # A correlates with B
    "causes",  # A causes B
    "is_biomarker_of",  # A is a biomarker for B
    "is_risk_factor_for",  # A is a risk factor for B
    "treats",  # A treats B
    "modulates",  # A modulates B
    "activates",  # A activates B
    "inhibits",  # A inhibits B
    "predicts",  # A predicts B
    "mediates",  # A mediates the relationship between B and C
    "is_associated_with",  # A is associated with B
    "distinguishes",  # A distinguishes B from C
}
@dataclass
class Edge:
    """A directed relation between two concept nodes.

    No field is validated on construction; the ranges and vocabularies
    mentioned in the field comments are conventions only.
    """

    source_id: str  # id of the source ConceptNode
    target_id: str  # id of the target ConceptNode
    relation_type: str  # expected to be one of RELATION_TYPES
    source: str = ""  # provenance: 'NeuroNames', 'MeSH', 'DisGeNET', etc.
    confidence: float = 1.0  # intended range 0.0-1.0
    evidence_ref: str = ""  # citation or reference
    metadata: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the edge as a plain dict keyed by field name.

        ``metadata`` is returned by reference, not copied.
        """
        exported = (
            "source_id",
            "target_id",
            "relation_type",
            "source",
            "confidence",
            "evidence_ref",
            "metadata",
        )
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, d: dict) -> Edge:
        """Build an edge from ``d``, ignoring keys that are not fields.

        The constructor raises ``TypeError`` when ``source_id``,
        ``target_id`` or ``relation_type`` is absent from ``d``.
        """
        known = cls.__dataclass_fields__
        kwargs = {}
        for key, value in d.items():
            if key in known:
                kwargs[key] = value
        return cls(**kwargs)
@dataclass
class Evidence:
    """Experimental evidence backing a scientific claim.

    Every field is optional; an ``Evidence()`` built with no arguments
    represents "nothing recorded".
    """

    study_type: str = ""  # e.g. "fMRI", "lesion", "meta-analysis", "GWAS", "animal_model"
    methodology: str = ""  # e.g. "resting-state FC", "voxel-based morphometry", "DTI"
    p_value: Optional[float] = None
    effect_size: Optional[float] = None  # Cohen's d, r, OR, beta
    effect_metric: str = ""  # names the statistic: "Cohen's d", "r", "OR", "beta", "AUC"
    sample_size: Optional[int] = None
    replicability: str = "single_study"  # "replicated", "single_study", "controversial"
    direction: str = ""  # "positive", "negative"

    def to_dict(self) -> dict:
        """Return the evidence record as a plain dict keyed by field name."""
        exported = (
            "study_type",
            "methodology",
            "p_value",
            "effect_size",
            "effect_metric",
            "sample_size",
            "replicability",
            "direction",
        )
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, d: dict) -> Evidence:
        """Build an instance from ``d``, ignoring keys that are not fields."""
        known = cls.__dataclass_fields__
        kwargs = {}
        for key, value in d.items():
            if key in known:
                kwargs[key] = value
        return cls(**kwargs)
@dataclass
class PaperRef:
    """Bibliographic pointer to the paper a claim came from.

    Every field is optional and defaults to an empty value.
    """

    pmid: str = ""  # PubMed ID
    doi: str = ""
    title: str = ""
    authors: str = ""
    year: Optional[int] = None
    journal: str = ""

    def to_dict(self) -> dict:
        """Return the reference as a plain dict keyed by field name."""
        exported = ("pmid", "doi", "title", "authors", "year", "journal")
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, d: dict) -> PaperRef:
        """Build an instance from ``d``, ignoring keys that are not fields."""
        known = cls.__dataclass_fields__
        kwargs = {}
        for key, value in d.items():
            if key in known:
                kwargs[key] = value
        return cls(**kwargs)
@dataclass
class Claim:
    """A structured scientific claim extracted from a paper.

    A claim is both stored as a node (for detailed querying) and
    generates simplified edges (for multi-hop traversal).
    """

    id: str  # CLM:uuid
    subject_id: str  # ConceptNode.id in the graph
    subject_name: str  # human-readable subject name
    predicate: str  # expected to be one of CLAIM_PREDICATES (not validated)
    object_id: str  # ConceptNode.id in the graph
    object_name: str  # human-readable object name
    negated: bool = False  # "X does NOT affect Y"
    confidence: float = 0.5  # overall confidence, intended range 0-1
    evidence: Evidence = field(default_factory=Evidence)
    source_paper: PaperRef = field(default_factory=PaperRef)
    raw_text: str = ""  # original sentence from the paper
    metadata: dict = field(default_factory=dict)

    def to_dict(self) -> dict:
        """Return the claim as a plain dict.

        ``evidence`` and ``source_paper`` are serialized through their own
        ``to_dict``; ``metadata`` is returned by reference, not copied.
        """
        return {
            "id": self.id,
            "subject_id": self.subject_id,
            "subject_name": self.subject_name,
            "predicate": self.predicate,
            "object_id": self.object_id,
            "object_name": self.object_name,
            "negated": self.negated,
            "confidence": self.confidence,
            "evidence": self.evidence.to_dict(),
            "source_paper": self.source_paper.to_dict(),
            "raw_text": self.raw_text,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, d: dict) -> Claim:
        """Build a claim from ``d``, ignoring keys that are not fields.

        Nested ``evidence`` / ``source_paper`` values are handled as follows:

        * a ``dict`` is converted with the nested class's ``from_dict``;
        * ``None`` (e.g. a JSON ``null``) is treated like a missing key, so
          the field falls back to its default empty instance -- otherwise
          ``to_dict()`` and ``to_edge()`` would later fail on ``None``;
        * anything else (such as an already-built instance) is passed
          through unchanged.

        The input mapping itself is never mutated.  The constructor raises
        ``TypeError`` when a required field is absent.
        """
        data = dict(d)
        nested_types = (("evidence", Evidence), ("source_paper", PaperRef))
        for key, nested_cls in nested_types:
            value = data.get(key)
            if value is None:
                # Missing or null: let the dataclass default_factory apply.
                data.pop(key, None)
            elif isinstance(value, dict):
                data[key] = nested_cls.from_dict(value)

        known = cls.__dataclass_fields__
        return cls(**{k: v for k, v in data.items() if k in known})

    def to_edge(self) -> Edge:
        """Convert the claim to a simplified graph edge for traversal.

        The edge's ``source`` is ``"claim:<pmid>"``, falling back to the
        claim id when the paper has no PMID.  ``relation_type`` is the bare
        predicate even for negated claims, so consumers must check
        ``metadata["negated"]`` to tell the two apart.
        """
        return Edge(
            source_id=self.subject_id,
            target_id=self.object_id,
            relation_type=self.predicate,
            source=f"claim:{self.source_paper.pmid or self.id}",
            confidence=self.confidence,
            evidence_ref=self.source_paper.title,
            metadata={"claim_id": self.id, "negated": self.negated},
        )