Spaces:
Sleeping
Sleeping
| """Hypothesis engine: batch-generate, persist, and rank testable hypotheses. | |
| Phase 3 of the NeuroClaw discovery loop: | |
| 1. batch_generate() — traverse the graph to produce hypotheses at scale | |
| 2. save / load — persist hypotheses to JSON | |
| 3. rank_hypotheses() — sort by novelty, evidence, testability, confidence | |
| 4. (Phase 5-6) hypotheses become executable NeuroClaw analysis tasks | |
| Usage: | |
| from core.knowledge_graph import load_graph, HypothesisEngine | |
| kg = load_graph() | |
| engine = HypothesisEngine(kg) | |
| # batch generate across all domain pairs | |
| hypotheses = engine.batch_generate() | |
| engine.save_hypotheses(hypotheses, "data/hypotheses.json") | |
| # or load and re-rank | |
| hypotheses = engine.load_hypotheses("data/hypotheses.json") | |
| ranked = engine.rank_hypotheses(hypotheses) | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| import math | |
| import re | |
| from dataclasses import asdict, dataclass, field | |
| from pathlib import Path | |
| from typing import Optional | |
| import networkx as nx | |
| from .graph_manager import KnowledgeGraph | |
| from .schema import ConceptNode | |
| logger = logging.getLogger(__name__) | |
| # ── data structures ──────────────────────────────────────────────────── | |
@dataclass
class HypothesisLink:
    """A single step (one edge) in a hypothesis chain.

    The class must be a dataclass: ``Hypothesis.from_dict`` rebuilds links
    with ``HypothesisLink(**p)``, and the ``field(default_factory=...)``
    defaults below only produce per-instance dicts when ``@dataclass``
    processes them. Without the decorator the class has no keyword
    constructor and ``evidence`` / ``source_paper`` stay raw ``Field`` objects.
    """

    # Endpoints of the step (id plus display name for each side).
    from_id: str
    from_name: str
    to_id: str
    to_name: str
    # Relation label on the edge and the confidence attached to it.
    relation_type: str
    confidence: float
    # Optional provenance; every default is "empty".
    claim_id: str = ""
    raw_text: str = ""
    evidence: dict = field(default_factory=dict)
    source_paper: dict = field(default_factory=dict)
@dataclass
class Hypothesis:
    """A generated hypothesis with its full evidence chain.

    Every field has a default, so ``Hypothesis()`` is valid and
    ``from_dict`` can build an instance from a partial mapping.

    The class must be a dataclass (``to_dict`` calls ``asdict`` and
    ``from_dict`` reads ``__dataclass_fields__``), and ``from_dict`` must be a
    classmethod because it is an alternate constructor taking ``cls``.
    """

    id: str = ""
    hypothesis_type: str = ""  # "path", "bridge", "gap", "contradiction"
    source_id: str = ""
    source_name: str = ""
    target_id: str = ""
    target_name: str = ""
    path: list[HypothesisLink] = field(default_factory=list)
    confidence_score: float = 0.0
    novelty_score: float = 0.0
    evidence_score: float = 0.0
    testability_score: float = 0.0
    composite_score: float = 0.0
    supporting_claims: list[str] = field(default_factory=list)
    explanation: str = ""
    testability_reason: str = ""
    metadata: dict = field(default_factory=dict)
    critic_score: float = 0.0
    critic_feedback: list[dict] = field(default_factory=list)
    critic_rounds: int = 0
    evolve_score: float = 0.0

    def to_dict(self) -> dict:
        """Return a plain-dict copy of this hypothesis.

        ``asdict`` recurses into nested dataclasses, so each entry of
        ``path`` is converted to a dict as well.
        """
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Hypothesis:
        """Build a hypothesis from a mapping such as ``to_dict`` produces.

        Args:
            d: Field name -> value mapping. It is shallow-copied, so the
                caller's dict is not modified. Keys that are not dataclass
                fields are ignored.

        Returns:
            A new instance; dict entries of ``path`` are rebuilt as
            ``HypothesisLink`` objects, other entries are kept as-is.
        """
        d = d.copy()
        if "path" in d and isinstance(d["path"], list):
            d["path"] = [
                HypothesisLink(**p) if isinstance(p, dict) else p
                for p in d["path"]
            ]
        # Drop unknown keys so extra entries do not break construction.
        known = cls.__dataclass_fields__
        return cls(**{k: v for k, v in d.items() if k in known})
@dataclass
class Contradiction:
    """A pair of conflicting claims about the same two concepts.

    Declared as a dataclass, like the other record types in this module,
    so instances can be built with keyword arguments and compared by value.
    Every field defaults to an empty value.
    """

    # The two concepts the claims are about.
    concept_a_id: str = ""
    concept_a_name: str = ""
    concept_b_id: str = ""
    concept_b_name: str = ""
    # One side of the conflict ("for").
    claim_for_id: str = ""
    claim_for_predicate: str = ""
    claim_for_text: str = ""
    # The opposing side ("against").
    claim_against_id: str = ""
    claim_against_predicate: str = ""
    claim_against_text: str = ""
    severity: float = 0.0
@dataclass
class Gap:
    """An unexplored relationship between two concepts.

    Must be a dataclass: ``connecting_concepts`` uses
    ``field(default_factory=list)``, which only yields a fresh list per
    instance when ``@dataclass`` processes the class.
    """

    concept_a_id: str = ""
    concept_a_name: str = ""
    concept_b_id: str = ""
    concept_b_name: str = ""
    distance: int = 0
    connecting_concepts: list[str] = field(default_factory=list)
    domain_a: str = ""
    domain_b: str = ""
    potential_relation: str = ""
| # ── constants ────────────────────────────────────────────────────────── | |
# Antagonistic predicate pairs, each written once. OPPOSING_PREDICATES
# holds both orientations of every pair, i.e. (a, b) and (b, a).
_OPPOSING_BASE_PAIRS = (
    ("increases", "reduces"),
    ("causes", "inhibits"),
    ("treats", "contraindicated_for"),
    ("activates", "inhibits"),
)
OPPOSING_PREDICATES = {
    oriented
    for first, second in _OPPOSING_BASE_PAIRS
    for oriented in ((first, second), (second, first))
}
# Study types that are reviews only (no independent empirical evidence).
# Used by compute_frequency_boost and compute_temporal_decay. Edge-level
# weighting by study_type lives in phase4_optimize.apply_evidence_weighting.
_REVIEW_TYPES = {
    "review",
    "narrative_review",
    "systematic_review",
}

# Relation labels grouped as "common".
COMMON_RELATIONS = {
    "is_a",
    "part_of",
    "associated_with",
    "about",
    "is_associated_with",
}
# Noisy entity name patterns — hypotheses involving these are low quality.
# Two categories:
# (a) process-word ≠ entity: nominalized verbs/states ("loss", "progression")
#     that pop up as bridge nodes but carry no biological content.
# (b) generic containers: vague collective terms ("tissue volumes", "Family")
#     that don't refer to a specific measurable thing.
#
# The words are kept as whitespace-separated runs (one run per category)
# and split into the final frozenset.
_NOISE_WORDS = frozenset(
    (
        # original set
        "unseen risk effect level status change type "
        "group factor model method unknown other none "
        "miscellaneous various difference increase decrease "
        # nominalized processes/states (category a)
        "loss progression reduction elevation alteration "
        "disruption dysfunction impairment deterioration "
        "improvement recovery response onset activation "
        "inhibition regulation modulation stimulation "
        "expression function functions "
        # generic containers (category b)
        "family members phenomenon phenomena processes "
        "mechanisms pathways symptoms manifestations "
        "volumes volume "
        # life events / demographics that are not biological entities
        "stress life events exposure outcome outcomes "
        "quality"
    ).split()
)
# Compiled regexes for names that are too short or purely numeric.
NOISE_PATTERNS = [
    re.compile(expression)
    for expression in (
        r"^[A-Z][a-z]?$",      # 1-2 letter: "Id", "Ca", "Mg"
        r"^[A-Z][a-z]{2,4}$",  # Short mixed-case: "Tics", "Risk"
        r"^\d+$",              # Pure numbers
    )
]
# (C-1) Generic-phrase patterns for INTERMEDIATE nodes. The token-based
# `_NOISE_WORDS` filter misses phrases like "functional connectivity" or
# "neural activity" because no individual word is in the noise list, but
# the WHOLE phrase carries no measurable content. We only block these when
# they appear as INTERMEDIATE nodes (paths can legitimately end in
# "functional connectivity" as an outcome metric).
#
# Every expression is anchored (^...$) and compiled case-insensitively.
_GENERIC_INTERMEDIATE_PATTERNS = [
    re.compile(expression, re.I)
    for expression in (
        # [optional modifier] + brain-level noun + generic activity noun
        (
            r"^(abnormal|altered|impaired|reduced|increased|disrupted|aberrant)?\s*"
            r"(brain|neural|neuronal|cortical|cerebral)\s+"
            r"(activity|activation|function|functioning|connectivity|"
            r"network|networks|signaling|metabolism|response|responses)$"
        ),
        # functional/structural + connectivity-like noun
        (
            r"^(functional|structural|anatomical|effective)\s+"
            r"(connectivity|network|networks|integrity|abnormalit(y|ies))$"
        ),
        # disease/clinical + course-or-severity noun
        (
            r"^(disease|symptom|clinical|treatment|therapeutic)\s+"
            r"(progression|outcome|outcomes|response|severity|burden|stage|staging)$"
        ),
        # vague quantifier + collective noun
        (
            r"^(common|typical|specific|various|different)\s+"
            r"(features|patterns|mechanisms|processes)$"
        ),
        # bare (neuro)process words
        r"^(neuro)?(degeneration|inflammation|protection|plasticity|genesis|imaging)$",
        # grey/gray/white matter with no region
        r"^(grey|gray|white)\s+matter$",
        # domain + deficit-like noun
        (
            r"^(cognitive|behavioral|emotional|motor|sensory)\s+"
            r"(deficit|deficits|dysfunction|impairment|abnormalit(y|ies))$"
        ),
    )
]
# (C-3) Target-name patterns that LOOK like outcomes (so they pass
# _is_dataset_outcome's keyword fallback) but are actually too broad to
# drive a DL experiment. We block these even if their domain says
# disease/cognitive_function.
#
# Every expression is anchored (^...$) and compiled case-insensitively.
_TARGET_TOO_BROAD_PATTERNS = [
    re.compile(expression, re.I)
    for expression in (
        # bare umbrella nouns (single token)
        (
            r"^(skill|skills|ability|abilities|outcome|outcomes|"
            r"symptom|symptoms|manifestation|manifestations|"
            r"phenomenon|phenomena|finding|findings|"
            r"deficit|deficits|impairment|impairments|"
            r"function|functions|functioning|behavior|behaviors|"
            r"capability|capabilities|condition|conditions|"
            r"disease|diseases|disorder|disorders|syndrome|syndromes|"
            r"focus|integration|balance|knowledge|autonomy|"
            r"performance|adaptation|resilience|vulnerability|"
            r"recovery|progression|mechanism|process)$"
        ),
        # broad-category disease umbrellas (when these are the literal target,
        # they're too generic — but specific subtypes like "Alzheimer Disease"
        # don't match these patterns)
        (
            r"^(neurological|psychiatric|mental|cognitive|behavioral|"
            r"neurodegenerative|cardiovascular)\s+"
            r"(disease|diseases|disorder|disorders|condition|conditions)$"
        ),
        r"^(human\s+)?(disease|diseases|disorder|disorders)$",
        r"^(brain|mental|psychiatric|psychological)\s+health$",
        r"^clinical\s+(features|outcome|outcomes|presentation|status)$",
        # "X deficits/impairments" patterns (too vague as targets)
        (
            r"^(motor|cognitive|neurocognitive|functional|social|"
            r"verbal|visual|sensory|emotional|behavioral)\s+"
            r"(deficit|deficits|impairment|impairments|dysfunction|"
            r"disability|decline|deterioration)$"
        ),
    )
]
# Relation types considered vague: they add little signal.
VAGUE_RELATIONS = {
    "is_associated_with",
    "associated_with",
    "about",
}
# CognitiveAtlas / MeSH concept ids that are top-degree generic hubs
# in the KG. The audit found these at degrees 700-9000+, with names that
# are real English words (not caught by _NOISE_WORDS) but referring to
# extremely abstract umbrella concepts.
#
# Hypotheses with these as intermediate nodes or endpoints are too vague
# to drive a downstream DL experiment ("FPN -> memory" is not testable
# because we don't know which memory subsystem). Filtered in post_process.
#
# Each row is (node id, concept name); the trailing comment is the degree
# recorded by the audit.
_PATH_IGNORE_HUBS = (
    ("COGAT_CONCEPT:trm_4a3fd79d0a891", "memory"),      # degree 2248
    ("COGAT_CONCEPT:trm_4a3fd79d0a80f", "logic"),       # degree 2052
    ("COGAT_CONCEPT:trm_5159c80c1dd24", "loss"),        # degree 1034
    ("COGAT_CONCEPT:trm_4a3fd79d09741", "activation"),  # degree 840
    ("COGAT_CONCEPT:trm_4a3fd79d0afcf", "risk"),        # degree 722
    ("COGAT_CONCEPT:trm_4a3fd79d0b2a8", "stress"),      # degree 139
    ("MSH:D001921", "Brain"),                           # degree 9157 (umbrella)
    ("MSH:D009474", "Neurons"),                         # degree 1354 (umbrella)
)
PATH_IGNORE_NODE_IDS = frozenset(node_id for node_id, _name in _PATH_IGNORE_HUBS)
# Disease/category mega-hubs that are valid as hypothesis endpoints
# ("predict Alzheimer" is fine) but NOT as intermediate transit nodes
# ("A → Alzheimer → B" is just "A relates to AD, AD relates to B" — no
# discovery value). Audit found 37.8% of hypotheses transit through these.
#
# Each row is (node id, concept name); the trailing comment is the degree.
_INTERMEDIATE_ONLY_HUBS = (
    ("COGAT_DISORDER:dso_5419", "schizophrenia"),                # 1005
    ("MSH:D009103", "Multiple Sclerosis"),                       # 816
    ("COGAT_DISORDER:dso_3312", "bipolar disorder"),             # 703
    ("MSH:D000544", "Alzheimer Disease"),                        # 746
    ("MSH:D004827", "Epilepsy"),                                 # 750
    ("MSH:D010300", "Parkinson Disease"),                        # 709
    ("COGAT_DISORDER:dso_0060041", "autism spectrum disorder"),  # 613
    ("MSH:D001289", "ADHD"),                                     # 601
    ("MSH:D003863", "Depression"),                               # 577
    ("MSH:D001523", "Mental Disorders"),                         # 489
)
INTERMEDIATE_ONLY_IGNORE_IDS = frozenset(
    node_id for node_id, _name in _INTERMEDIATE_ONLY_HUBS
)
# Relation types treated as directional. Assembled from two groups: the
# general predicates and the brain-decoding predicates.
DIRECTIONAL_RELATIONS = {
    "causes",
    "treats",
    "increases",
    "reduces",
    "modulates",
    "activates",
    "inhibits",
    "is_biomarker_of",
    "is_risk_factor_for",
    "predicts",
    "distinguishes",
    "mediates",
} | {
    # Brain decoding directional predicates
    "evokes",
    "decoded_from",
    "elicits",
}
# domain pairs worth exploring — aligned with NeuroClaw imaging experiments
# target datasets: UKB (T1w/dMRI/rfMRI/SWI), ADNI (T1w/PET/fMRI/DTI), HCP-YA (T1w/T2w/fMRI/dMRI/MEG)
# experiment models: BrainGNN, NeuroStorm, SVM, XGBoost on raw images + handcrafted features
#
# Design principle: target should be a dataset OUTCOME (what we want to predict),
# source should be a MEASURABLE feature (what the dataset provides as input).
# - UKB outcomes: fluid intelligence, neuroticism, dementia diagnosis, motor tests
# - ADNI outcomes: MCI→AD conversion, CDR-SB, cognitive composite
# - HCP outcomes: fluid/crystallized IQ, emotion recognition, personality traits
#
# Allowed sources (what we can measure): neuroanatomy (MRI regions), connectivity
# networks, gene, biomarker (CSF/PET), drug (for intervention studies).
# Allowed targets (what we predict): disease (diagnostic labels), cognitive_function
# (the OUTCOMES — includes behavior, personality, affect).
_PAIR_SOURCE_DOMAINS = (
    "neuroanatomy",  # MRI → diagnosis / cognition / behavior
    "connectivity",  # dMRI/fMRI connectivity → diagnosis / cognition
    "gene",          # genetics → outcomes (UKB 500k WGS; GWAS → behavior/IQ)
    "biomarker",     # fluid biomarkers → outcomes (ADNI CSF, blood)
    "drug",          # drug → outcomes (ADNI pharmaceutical arms)
)
_PAIR_TARGET_DOMAINS = ("disease", "cognitive_function")

DEFAULT_DOMAIN_PAIRS = [
    # core: every measurable feature domain → each clinical/behavioral OUTCOME
    (source_domain, target_domain)
    for source_domain in _PAIR_SOURCE_DOMAINS
    for target_domain in _PAIR_TARGET_DOMAINS
] + [
    # cross-outcome (comorbidity, transdiagnostic)
    ("disease", "disease"),
    ("cognitive_function", "disease"),  # e.g. anxiety → MS diagnosis risk
    ("disease", "cognitive_function"),  # e.g. AD → processing speed decline
]
# Domains that are NOT directly measurable from brain imaging
# These hypotheses will be filtered out in post_process
NON_MEASURABLE_BIOMARKER_TYPES = {
    # molecular entities
    "neurotransmitter",  # needs specialized PET tracers (e.g., 11C-raclopride for DA)
    "protein",           # needs tissue biopsy or CSF
    "enzyme",            # needs molecular assays
    "receptor",          # needs specialized PET (e.g., 11C-PIB for Aβ, but that's biomarker domain)
} | {
    # fluid biomarkers — not available in UKB/HCP-YA, only ADNI CSF subset
    "csf_biomarker",
    "blood_biomarker",
    "saliva_biomarker",
    "tear_biomarker",
}
# Specific entity name patterns that are NOT directly measurable from imaging.
# The expressions are unanchored and compiled case-insensitively.
_NON_MEASURABLE_PATTERNS = [
    re.compile(expression, re.I)
    for expression in (
        # neurotransmitter quantities
        r"(neurotransmitter|dopamine|serotonin|norepinephrine|gaba|glutamate|acetylcholine)\s+(level|concentration|release|synthesis)",
        # protein pathology / aggregation
        r"(alpha|beta|gamma|delta|kappa)\s*synuclein\s*(pathology|aggregation|expression)",
        r"(amyloid|tau|phosphorylated)\s*(beta|protein|peptide)\s*(aggregation|production|clearance)",
        # enzymes and receptors
        r"(enzyme|kinase|phosphatase)\s*(activity|expression)",
        r"(receptor|transporter)\s*(density|binding|expression)",
        # inflammatory markers
        r"(TNF|interleukin|IL-\d|cytokine|chemokine)\s*(alpha|beta|level|concentration|production)",
        # fluid-derived measures
        r"CSF\s+(Aβ|amyloid|tau|p-tau|NFL|neurofilament)",
        r"(blood|plasma|serum)\s+(biomarker|marker|level|concentration)",
        r"(CSF|cerebrospinal fluid)\s+",
        r"(saliva|tear|urine)\s+(biomarker|marker|level)",
        # tissue sampling
        r"(biopsy|tissue sample)",
    )
]
# Non-neurological target domains — brain regions should not directly predict these
# The alternation is assembled from a term list into one case-insensitive,
# unanchored regex of the form "(term1|term2|...)".
_NON_NEUROLOGICAL_TERMS = (
    "urinary", "incontinence", "frequency", "enuresis", "bladder",
    "renal", "kidney", "liver",
    "gastrointestinal", "cardiac", "pulmonary", "dermatol",
    "orthopedic", "musculoskeletal",
    "fracture", "sprain",
    "tumor", "cancer", "carcinoma", "leukemia", "lymphoma",
)
_NON_NEUROLOGICAL_TARGETS = re.compile(
    "(" + "|".join(_NON_NEUROLOGICAL_TERMS) + ")", re.I
)
# DATASET-OUTCOME whitelist — covers actual predicted variables in UKB/ADNI/HCP-YA
# papers (see README "Dataset Outcomes" for references to typical prediction tasks).
# Target must match one of these patterns to pass the post_process filter.
# We also auto-accept any concept in the `disease` domain (clinical diagnosis
# IS the most common outcome) and any MSH/CogAtlas concept in the
# `cognitive_function` domain (behavior/cognition).
#
# Categories cover:
# - Clinical diagnostic labels (Alzheimer, schizophrenia, MCI, etc.) — all 3 datasets
# - AD staging / conversion (CN→MCI→AD, ATN) — ADNI
# - Clinical scales (CDR, MMSE, ADAS-Cog, PHQ-9, MoCA, NPI) — ADNI + UKB
# - Cognitive abilities (IQ, memory, attention, processing speed) — all 3
# - Specific cognitive tests (PMAT, flanker, N-back, delay discounting) — HCP
# - Personality (Big Five) — HCP + UKB
# - Behavior/affect (anxiety, depression, aggression, risk-taking) — all 3
# - Motor/sensory (grip strength, gait, reaction time, dexterity) — UKB + HCP
# - Brain age / neurodegeneration markers — UKB + ADNI
# - NeuroSTORM-evaluated phenotypes: MND, early psychosis (HCP-EP), ADHD200,
#   COBRE, UCLA L5c, TCP psychiatric scales, fMRI task state classification
# - Subject fingerprinting / re-identification
#
# The regex is unanchored and case-insensitive, so most alternatives match
# as substrings. The short acronyms "mnd" and "als" carry a leading word
# boundary: bare "als" also matched inside ordinary plurals such as
# "signals", "individuals" or "animals", letting non-outcome names through
# the whitelist. Only the leading boundary is added so names that START
# with the acronym (e.g. "ALSFRS-R", "ALS-FTD") still match.
_OUTCOME_KEYWORDS = re.compile(
    r"("
    # cognitive abilities — general
    r"intelligence|cognition|cognitive\s+(function|ability|performance|deterioration|impairment|dysfunction|decline|test|assessment|composite|score)|"
    r"memory|attention|executive|processing\s+speed|reasoning|language|"
    r"fluency|perception|reaction\s+time|fluid\s+intelligence|"
    r"crystallized\s+intelligence|working\s+memory|episodic\s+memory|"
    r"semantic\s+memory|verbal\s+(memory|fluency|learning)|visuospatial|"
    # specific HCP NIH Toolbox / cognitive tasks
    r"pmat|flanker|card\s+sort|n-?back|list\s+sort|picture\s+sequence|"
    r"pattern\s+comparison|picture\s+vocabulary|oral\s+reading|"
    r"delay\s+discounting|risk[- ]taking|go[- ]no[- ]go|"
    # HCP Penn CNB cognitive battery
    r"penn\s+(word|matrix|line\s+orientation|continuous\s+performance|progressive\s+matrices|fear|emotion|cnb)|"
    r"matrix\s+pattern|numeric\s+memory|prospective\s+memory|pairs\s+matching|"
    r"trail\s+making|symbol\s+digit|boston\s+naming|animal\s+fluency|"
    r"category\s+fluency|logical\s+memory|clock\s+drawing|ravlt|"
    # HCP 7 task states (NeuroSTORM state classification)
    r"emotion\s+task|gambling\s+task|language\s+task|motor\s+task|"
    r"relational\s+task|social\s+task|working\s+memory\s+task|"
    # clinical scales (ADNI/UKB/TCP/HCP)
    r"\b(cdr|cdr-sb|mmse|moca|adas|adas-cog|npi|faq|gds|phq-?9|gad-?7|bai|hdrs|hrsd|hamd|ham-d|"
    r"bdi|ymrs|panss|sans|saps|audit|asrs|pro|adi|srs|tci|neo-?ffi|asr|abcl|"
    r"cidi|cidi-sf|eysenck|swemwbs|psqi|ftnd|ssaga|masq|promis|upsit)\b|"
    r"adult\s+self\s+report|adult\s+behavior\s+checklist|"
    # personality / affect
    r"neuroticism|extraversion|agreeableness|conscientiousness|openness|"
    r"personality|temperament|affect|mood|emotion|anxiety|depression|"
    r"well-?being|satisfaction|life\s+satisfaction|psychological|stress\s+response|"
    r"anxiety\s+sensitivity|cautiousness|"
    r"affect\s+(positive|negative)|emotion\s+recognition|emotional\s+regulation|"
    r"perceived\s+(stress|rejection|hostility)|anger|fear|sadness|"
    # social functioning (HCP + UKB)
    r"loneliness|social\s+(isolation|support|relationship|cognition)|"
    r"meaning\s+and\s+purpose|instrumental\s+support|emotional\s+support|"
    r"friendship|"
    # behavior
    r"behavior|aggression|impulsivity|addiction|substance|alcohol|smoking|"
    r"tobacco|cannabis|cocaine|opiate|opioid|hallucinogen|"
    r"drug\s+use|substance\s+use|sleep\s+quality|insomnia|"
    # diagnoses / clinical outcomes — added NeuroSTORM-evaluated cohorts and ADNI stages
    r"alzheimer|parkinson|schizophrenia|autism|adhd|bipolar|epilepsy|"
    r"mci|mild\s+cognitive|dementia|psychosis|early\s+psychosis|stroke|post[- ]stroke|"
    r"multiple\s+sclerosis|huntington|frontotemporal|lewy\s+body|"
    # leading \b: do not match "als"/"mnd" at the tail of unrelated words
    r"motor\s+neuron\s+disease|\b(?:mnd|als)|"
    r"transdiagnostic|psychiatric\s+disorder|mental\s+health\s+disorder|"
    r"ocd|ptsd|phobia|panic|agoraphobia|somatoform|eating\s+disorder|"
    # ADNI-specific diagnostic stages
    r"\b(cn|smc|emci|lmci|ad\b|preclinical|at\b|atn|alzheimer\s+continuum)\b|"
    r"significant\s+memory\s+concern|subjective\s+(memory|cognitive)\s+(concern|complaint|decline)|"
    r"cognitively\s+(normal|unimpaired)|"
    r"disorder|syndrome|diagnosis|onset|conversion|progression|severity|"
    r"symptom|manifestation|prognosis|outcome|treatment\s+response|"
    r"disease\s+(stage|staging|duration|burden)|"
    # cardiovascular / metabolic diseases (UKB ICD-10)
    r"myocardial\s+infarction|heart\s+failure|hypertension|atrial\s+fibrillation|"
    r"coronary|cardiovascular\s+disease|diabetes|type\s*[12]\s+diabetes|"
    r"chronic\s+kidney|fatty\s+liver|nafld|metabolic\s+syndrome|obesity|"
    # AD-specific biomarker status
    r"amyloid\s+(status|positivity|positive|negative|load|burden|suvr)|"
    r"tau\s+(status|positivity|positive|tangle|pathology|burden|suvr)|"
    r"atn\s+(profile|stage|classification)|"
    r"neurodegeneration\s+(stage|status)|"
    # brain age / aging
    r"brain\s+age|brain-?age(-?gap)?|aging|age[- ]related|age\s+acceleration|"
    # motor / sensory
    r"grip\s+strength|gait|motor\s+coordination|motor\s+function|"
    r"balance|tremor|dexterity|walking\s+speed|two[- ]minute\s+walk|endurance|"
    r"visual\s+(acuity|field)|audition|hearing|olfaction|taste|pain|"
    r"chronic\s+pain|musculoskeletal\s+pain|"
    # mortality / longevity
    r"mortality|all-?cause\s+death|survival|life\s+expectancy"
    r")", re.I
)
# Target domains considered as valid dataset outcomes
_OUTCOME_DOMAINS = {
    "disease",
    "cognitive_function",
}
# NeuroClaw testable modalities and their keywords
# Aligned with UKB/ADNI/HCP-YA available data + deep learning models
#
# Maps a modality label to its keyword list (insertion order: sMRI, fMRI,
# dMRI, PET, EEG, organ_volume).
TESTABLE_MODALITIES = dict(
    sMRI=[
        "cortical thickness", "volume", "atrophy", "gray matter", "white matter",
        "brain structure", "morphometry", "VBM", "FreeSurfer", "recon-all",
        "brain region", "hippocampus", "amygdala", "thalamus", "caudate",
        "putamen", "cerebellum", "insula", "cortex", "ventricle",
    ],
    fMRI=[
        "functional connectivity", "BOLD", "activation", "resting-state",
        "task-based", "network", "default mode", "fMRI", "brain response",
        "neural activity", "brain activation",
    ],
    dMRI=[
        "DTI", "diffusion", "fractional anisotropy", "tractography",
        "white matter integrity", "structural connectivity", "FA", "MD",
        "connectivity matrix", "fiber bundle", "white matter tract",
    ],
    PET=[
        "PET", "tracer", "amyloid", "tau", "FDG", "SUVr", "binding potential",
        "glucose metabolism", "florbetapir", "flortaucipir",
    ],
    EEG=[
        "EEG", "ERP", "oscillation", "power spectrum", "alpha", "beta", "theta",
        "delta", "gamma", "microstate", "coherence", "event-related",
    ],
    organ_volume=[
        "organ volume", "liver volume", "kidney volume", "spleen volume",
        "MedSAM", "segmentation", "organ size",
    ],
)
# Deep learning model keywords for testability scoring
DL_MODEL_KEYWORDS = [
    # graph-based models and their inputs
    *("BrainGNN", "NeuroStorm", "GNN", "graph neural", "region of interest", "ROI"),
    *("connectivity matrix", "adjacency", "node feature", "graph convolution"),
    # generic deep-learning architectures
    *("deep learning", "CNN", "ResNet", "attention", "transformer"),
    # input granularity
    *("voxel", "patch", "whole-brain"),
]
# ── Dataset-Available Variables ──────────────────────────────────────
# Defines what can be measured in each dataset. Hypotheses must start
# from these features and end at dataset-available outcomes.
#
# DATASET_FEATURES maps dataset name -> feature name -> descriptor dict
# with keys "modality", "tool", "level" (plus "stimulus" for the visual
# decoding sets). The small builders below return a NEW dict on every
# call, so no descriptor object is shared between datasets.
def _feature(modality: str, tool: str, level: str, **extra: str) -> dict:
    """Build one feature descriptor; ``extra`` keys are appended last."""
    return {"modality": modality, "tool": tool, "level": level, **extra}


def _freesurfer_roi() -> dict:
    """sMRI ROI measure derived with FreeSurfer."""
    return _feature("sMRI", "FreeSurfer", "ROI")


def _smri_voxel() -> dict:
    """Voxel-level sMRI."""
    return _feature("sMRI", "voxel", "voxel")


def _rest_fc() -> dict:
    """Resting-state fMRI functional connectivity."""
    return _feature("fMRI", "rfMRI", "connectivity")


def _rest_roi_ts() -> dict:
    """Resting-state fMRI ROI time series."""
    return _feature("fMRI", "rfMRI", "ROI")


def _task_activation() -> dict:
    """Task fMRI activation."""
    return _feature("fMRI", "task fMRI", "activation")


def _eeg(tool: str) -> dict:
    """Channel-level EEG feature computed with ``tool``."""
    return _feature("EEG", tool, "channel")


def _eye(tool: str) -> dict:
    """Eye-tracking variable described by ``tool``."""
    return _feature("eye_tracking", tool, "variable")


def _rest_only_cohort() -> dict:
    """Feature set of the rfMRI-only cohorts (ABIDE, ADHD200, COBRE)."""
    return {"rfmri_fc": _rest_fc(), "rfmri_roi_ts": _rest_roi_ts()}


def _seed_de_eye_cohort() -> dict:
    """Feature set shared by SEED_IV / V / VII / GER / FRA."""
    return {"eeg_de": _eeg("DE"), "eye_movement": _eye("saccade/fixation")}


def _visual_decoding_cohort(stimulus: str) -> dict:
    """Feature set of the image-stimulus fMRI sets (NSD, BOLD5000)."""
    return {
        "smri_cortical_thickness": _freesurfer_roi(),
        "tfmri_visual_voxel": _feature("fMRI", "task fMRI", "voxel", stimulus=stimulus),
        "tfmri_visual_roi": _feature("fMRI", "task fMRI", "ROI", stimulus=stimulus),
    }


DATASET_FEATURES = {
    "UKB": {
        # sMRI (T1w): FreeSurfer-derived ROI measures
        "smri_cortical_thickness": _freesurfer_roi(),
        "smri_subcortical_volume": _freesurfer_roi(),
        "smri_cortical_area": _freesurfer_roi(),
        "smri_cortical_volume": _freesurfer_roi(),
        "smri_voxel": _smri_voxel(),
        # dMRI: diffusion metrics per tract
        "dmri_fa": _feature("dMRI", "TBSS", "tract"),
        "dmri_md": _feature("dMRI", "TBSS", "tract"),
        "dmri_sc": _feature("dMRI", "tractography", "connectivity"),
        # rfMRI: functional connectivity
        "rfmri_fc": _rest_fc(),
        # lesion segmentation
        "lesion_volume": _feature("sMRI", "MedSAM", "ROI"),
        # non-imaging
        "genetics": _feature("genetics", "WGS/GSA", "SNP"),
        "environment": _feature("environment", "questionnaire", "variable"),
        "physical": _feature("physical", "measurement", "variable"),
        "hospitalization": _feature("clinical", "ICD10", "outcome"),
    },
    "ADNI": {
        "smri_cortical_thickness": _freesurfer_roi(),
        "smri_subcortical_volume": _freesurfer_roi(),
        "smri_voxel": _smri_voxel(),
        "pet_amyloid": _feature("PET", "florbetapir", "ROI"),
        "pet_tau": _feature("PET", "flortaucipir", "ROI"),
        "pet_fdg": _feature("PET", "FDG", "ROI"),
        "fmri_fc": _feature("fMRI", "task/resting", "connectivity"),
        "dti_fa": _feature("dMRI", "DTI", "tract"),
        "lesion_volume": _feature("sMRI", "MedSAM", "ROI"),
        "genetics": _feature("genetics", "APOE/GWAS", "SNP"),
        "medication": _feature("clinical", "medication_log", "variable"),
    },
    "HCP_YA": {
        "smri_cortical_thickness": _freesurfer_roi(),
        "smri_myelin": _feature("sMRI", "T1w/T2w", "ROI"),
        "smri_voxel": _smri_voxel(),
        "rfmri_fc": _rest_fc(),
        "tfmri_task": _task_activation(),
        "dmri_sc": _feature("dMRI", "HARDI", "connectivity"),
        "meg": _feature("MEG", "MEG", "connectivity"),
    },
    # NAS-available patient cohorts with preprocessed ROI time series.
    # Phenotype CSVs live under Z:\Dataset\fMRI\phenotype and the dataset-
    # specific rest csvs. All supply rfMRI volumes or ROI series; structural
    # T1 is available for HCP-EP and HCP-Aging (the other four are rfMRI-only
    # public releases).
    "ABIDE": _rest_only_cohort(),
    "ADHD200": _rest_only_cohort(),
    "COBRE": _rest_only_cohort(),
    "UCLA": {
        # UCLA CNP — rest + 6 task contrasts, cross-diagnosis cohort.
        "rfmri_fc": _rest_fc(),
        "rfmri_roi_ts": _rest_roi_ts(),
        "tfmri_task": _task_activation(),
    },
    "HCP_EP": {
        # HCP Early Psychosis — patient cohort, T1w + rfMRI cleaned.
        "smri_cortical_thickness": _freesurfer_roi(),
        "smri_subcortical_volume": _freesurfer_roi(),
        "rfmri_fc": _rest_fc(),
        "rfmri_roi_ts": _rest_roi_ts(),
    },
    "HCP_AGING": {
        # HCP-Aging — T1w + rfMRI REST1/REST2 + 3 task contrasts.
        "smri_cortical_thickness": _freesurfer_roi(),
        "smri_subcortical_volume": _freesurfer_roi(),
        "smri_myelin": _feature("sMRI", "T1w/T2w", "ROI"),
        "rfmri_fc": _rest_fc(),
        "rfmri_roi_ts": _rest_roi_ts(),
        "tfmri_task": _task_activation(),
    },
    # ── Visual decoding (fMRI) ──────────────────────────────────────────
    # NSD & BOLD5000: image-stimulus visual task fMRI, no rest.
    "NSD": _visual_decoding_cohort("natural_image"),
    "BOLD5000": _visual_decoding_cohort("ImageNet_COCO_Scene"),
    # ── Visual decoding (EEG) ───────────────────────────────────────────
    "SEED_DV": {
        "eeg_psd": _eeg("PSD"),
        "eeg_de": _eeg("DE"),
    },
    # ── Emotion decoding (EEG + eye tracking) ───────────────────────────
    "SEED": {
        "eeg_de": _eeg("DE"),
        "eeg_psd": _eeg("PSD"),
    },
    "SEED_IV": _seed_de_eye_cohort(),
    "SEED_V": _seed_de_eye_cohort(),
    "SEED_VII": _seed_de_eye_cohort(),
    "SEED_GER": _seed_de_eye_cohort(),
    "SEED_FRA": _seed_de_eye_cohort(),
    # ── Vigilance decoding (EEG) ────────────────────────────────────────
    "SEED_VIG": {
        "eeg_de": _eeg("DE"),
        "eog": _feature("EOG", "EOG", "channel"),
        "eye_movement": _eye("gaze/blink"),
    },
}
# Outcome labels available per dataset, keyed by the normalized dataset name.
# Each value lists the outcome names that dataset can supply.
DATASET_OUTCOMES = {
    # UKB: ICD10 codes, death registry, touchscreen cognitive tests,
    # derived imaging phenotypes.
    "UKB": ["disease_diagnosis", "mortality", "cognitive_score", "imaging_phenotype"],
    # ADNI: CN / MCI / AD diagnosis, MCI -> AD conversion, ADAS-Cog / MMSE
    # decline, amyloid+/tau+ status.
    "ADNI": ["diagnosis", "conversion", "cognitive_decline", "biomarker_status"],
    # HCP-YA: NIH Toolbox, task fMRI performance, NEO-FFI.
    "HCP_YA": ["behavioral_score", "cognitive_task", "personality"],
    # ABIDE: ASD vs controls (rest only); ADOS / ADI-R / SRS severity;
    # FIQ / VIQ / PIQ.
    "ABIDE": ["diagnosis", "symptom_severity", "cognitive_score"],
    # ADHD200: ADHD (combined/inattentive/hyperactive) vs TDC; ADHD-RS /
    # Conners severity; WASI / WISC.
    "ADHD200": ["diagnosis", "symptom_severity", "cognitive_score"],
    # COBRE: schizophrenia vs HC; PANSS positive/negative/general; WAIS.
    "COBRE": ["diagnosis", "symptom_severity", "cognitive_score"],
    # UCLA CNP: SCZ / BP / ADHD / HC; HAM-D, YMRS, ADHD-RS; 6 task contrasts.
    "UCLA": ["diagnosis", "symptom_severity", "cognitive_task"],
    # HCP-EP: early psychosis (FES + AR), affective/non-affective psychosis
    # vs HC; PANSS, SANS, YMRS; MATRICS Consensus Cognitive Battery.
    "HCP_EP": ["diagnosis", "symptom_severity", "cognitive_score"],
    # HCP-Aging: lifespan 36-100 yrs, healthy aging; NIH Toolbox across age,
    # same battery as HCP-YA; CARIT / FACENAME / VISMOTOR tasks.
    "HCP_AGING": ["cognitive_decline", "behavioral_score", "cognitive_task"],

    # ── Visual decoding outcomes ────────────────────────────────────────
    # NSD: COCO 80-class category, CLIP / language-model embedding,
    # pixel / latent reconstruction.
    "NSD": ["image_category", "image_semantic", "stimulus_reconstruction"],
    # BOLD5000: ImageNet 1000-class / COCO / Scene category, Scene 365-class.
    "BOLD5000": ["image_category", "scene_type", "image_semantic"],
    # SEED-DV: discrete video categories.
    "SEED_DV": ["video_class", "video_semantic", "video_reconstruction"],

    # ── Emotion decoding outcomes ───────────────────────────────────────
    # positive / neutral / negative
    "SEED": [
        "emotion_3class",
    ],
    # happy / sad / fear / neutral
    "SEED_IV": [
        "emotion_4class",
    ],
    # the SEED-IV classes plus disgust
    "SEED_V": [
        "emotion_5class",
    ],
    "SEED_VII": [
        "emotion_7class",
        "emotion_continuous",
    ],
    "SEED_GER": [
        "emotion_3class",
    ],
    "SEED_FRA": [
        "emotion_3class",
    ],

    # ── Vigilance decoding outcomes ─────────────────────────────────────
    "SEED_VIG": [
        "vigilance_continuous",
        "perclos",
    ],
}
# Imaging feature templates, combined with AAL atlas regions at generation
# time: the "{region}" placeholder is replaced with an actual neuroanatomy
# node name.  Each entry records the modality, tool and level of the feature
# plus the datasets for which the template applies.
IMAGING_FEATURE_TEMPLATES = {
    # ── sMRI FreeSurfer ROI features ────────────────────────────────────
    "cortical thickness of {region}": {
        "modality": "sMRI",
        "tool": "FreeSurfer",
        "level": "ROI",
        "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"],
    },
    "gray matter volume of {region}": {
        "modality": "sMRI",
        "tool": "FreeSurfer",
        "level": "ROI",
        "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"],
    },
    "subcortical volume of {region}": {
        "modality": "sMRI",
        "tool": "FreeSurfer",
        "level": "ROI",
        "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"],
    },
    "cortical area of {region}": {
        "modality": "sMRI",
        "tool": "FreeSurfer",
        "level": "ROI",
        "datasets": ["UKB", "HCP_YA", "HCP_AGING"],
    },
    # ── dMRI tract features ─────────────────────────────────────────────
    "fractional anisotropy of {region}": {
        "modality": "dMRI",
        "tool": "TBSS",
        "level": "tract",
        "datasets": ["UKB", "HCP_YA"],
    },
    "mean diffusivity of {region}": {
        "modality": "dMRI",
        "tool": "TBSS",
        "level": "tract",
        "datasets": ["UKB", "HCP_YA"],
    },
    # ── PET ROI features (ADNI) ─────────────────────────────────────────
    "amyloid SUVR of {region}": {
        "modality": "PET",
        "tool": "florbetapir",
        "level": "ROI",
        "datasets": ["ADNI"],
    },
    "tau SUVR of {region}": {
        "modality": "PET",
        "tool": "flortaucipir",
        "level": "ROI",
        "datasets": ["ADNI"],
    },
    "FDG uptake of {region}": {
        "modality": "PET",
        "tool": "FDG",
        "level": "ROI",
        "datasets": ["ADNI"],
    },
    # ── lesion segmentation ─────────────────────────────────────────────
    "lesion volume of {region}": {
        "modality": "sMRI",
        "tool": "MedSAM",
        "level": "ROI",
        "datasets": ["UKB", "ADNI"],
    },
}
# Connectivity feature templates: the "{a}" and "{b}" placeholders are AAL
# regions.  Entries carry the same fields as the ROI-level templates.
CONNECTIVITY_FEATURE_TEMPLATES = {
    "functional connectivity between {a} and {b}": {
        "modality": "fMRI",
        "tool": "rfMRI",
        "level": "connectivity",
        "datasets": [
            "UKB",
            "ADNI",
            "HCP_YA",
            "ABIDE",
            "ADHD200",
            "COBRE",
            "UCLA",
            "HCP_EP",
            "HCP_AGING",
        ],
    },
    "effective connectivity from {a} to {b}": {
        "modality": "fMRI",
        "tool": "DCM/GC",
        "level": "connectivity",
        "datasets": ["ADNI", "HCP_YA", "UCLA", "HCP_EP", "HCP_AGING"],
    },
    "structural connectivity between {a} and {b}": {
        "modality": "dMRI",
        "tool": "tractography",
        "level": "connectivity",
        "datasets": ["UKB", "HCP_YA"],
    },
}
# Domain pairs for imaging-driven hypothesis generation.  Each tuple reads
# (source domain, target domain) and is aligned with dataset modalities.
IMAGING_DOMAIN_PAIRS = [
    ("neuroanatomy", "disease"),             # sMRI features -> disease
    ("connectivity", "disease"),             # connectivity -> disease
    ("neuroanatomy", "cognitive_function"),  # sMRI features -> cognitive function
    ("gene", "neuroanatomy"),                # gene -> brain structure (UKB genetics + imaging)
    ("disease", "drug"),                     # disease -> drug (ADNI)
]
# Brain decoding domain pairs (NSD / BOLD5000 / SEED family).
# Kept SEPARATE from IMAGING_DOMAIN_PAIRS: decoding hypotheses reverse the
# usual "brain feature -> clinical outcome" direction and instead run
# "stimulus <-> brain" or "brain -> psychological-state label".
DECODING_DOMAIN_PAIRS = [
    # Encoding — the stimulus drives the brain response.
    ("visual_stimulus", "neuroanatomy"),
    ("visual_stimulus", "imaging_feature"),
    ("visual_stimulus", "connectivity"),

    # Decoding — the brain predicts stimulus identity.
    ("neuroanatomy", "visual_stimulus"),
    ("imaging_feature", "visual_stimulus"),

    # EEG -> emotion (SEED / SEED-IV / SEED-V / SEED-VII / SEED-GER / SEED-FRA).
    ("imaging_feature", "emotion"),
    ("neuroanatomy", "emotion"),

    # EEG -> vigilance (SEED-VIG).
    ("imaging_feature", "vigilance"),
    ("neuroanatomy", "vigilance"),
]
# AAL atlas regions used for imaging feature generation: a subset of the
# neuroanatomy nodes from the NN_AAL source.  List order is preserved as-is.
_AAL_REGION_KEYWORDS = [
    "Precentral",
    "Frontal_Sup",
    "Frontal_Mid",
    "Frontal_Inf",
    "Rolandic_Oper",
    "Supp_Motor",
    "Olfactory",
    "Frontal_Sup_Med",
    "Frontal_Med_Orb",
    "Rectus",
    "Insula",
    "Cingulate",
    "Hippocampus",
    "Parahippocampal",
    "Amygdala",
    "Calcarine",
    "Cuneus",
    "Lingual",
    "Occipital",
    "Fusiform",
    "Postcentral",
    "Parietal",
    "SupraMarginal",
    "Angular",
    "Precuneus",
    "Paracentral",
    "Caudate",
    "Putamen",
    "Pallidum",
    "Thalamus",
    "Heschl",
    "Temporal",
    "Temporal_Pole",
]
| # ── engine ───────────────────────────────────────────────────────────── | |
| class HypothesisEngine: | |
| """Batch-generate, persist, and rank testable hypotheses from a knowledge graph.""" | |
| def __init__(self, kg: KnowledgeGraph): | |
| self.kg = kg | |
| self.G = kg.G | |
| self._index = kg._index | |
| # Build claims index for frequency_boost: (subj, pred, obj) → [claim_meta, ...] | |
| self._claims_by_triple: dict[tuple[str, str, str], list[dict]] = {} | |
| for nid, node in self._index.items(): | |
| if "claim" not in node.domain_tags: | |
| continue | |
| meta = node.metadata | |
| key = (meta.get("subject_id", ""), meta.get("predicate", ""), meta.get("object_id", "")) | |
| if key[0] and key[2]: | |
| self._claims_by_triple.setdefault(key, []).append(meta) | |
| # ── batch generation ─────────────────────────────────────────────── | |
def batch_generate(
    self,
    domain_pairs: Optional[list[tuple[str, str]]] = None,
    max_hops: int = 3,
    max_paths_per_pair: int = 5,
    max_seeds_per_domain: int = 50,
) -> list[Hypothesis]:
    """Generate "bridge" hypotheses for every requested domain pair.

    For each ``(domain_a, domain_b)`` pair, seed nodes are sampled from
    ``domain_a`` and a hop-limited shortest-path search is run from each
    seed.  Every reached node tagged ``domain_b`` (claim nodes and members
    of ``PATH_IGNORE_NODE_IDS`` excluded) yields one scored hypothesis.
    An unordered (seed, target) pair is considered only once per call.
    The combined list is run through :meth:`post_process` before return.

    Args:
        domain_pairs: ``(source_domain, target_domain)`` pairs to explore;
            ``None`` selects ``DEFAULT_DOMAIN_PAIRS``.
        max_hops: Cutoff handed to the shortest-path search.
        max_paths_per_pair: Number of hypotheses emitted for one seed
            before moving to the next seed (the counter restarts per seed).
        max_seeds_per_domain: Sample size requested from
            ``_sample_domain_nodes`` for the source domain.

    Returns:
        The hypotheses that survive ``post_process`` with default settings.
    """
    pairs = DEFAULT_DOMAIN_PAIRS if domain_pairs is None else domain_pairs

    generated: list[Hypothesis] = []
    visited_pairs: set[tuple[str, str]] = set()
    serial = 0

    for dom_a, dom_b in pairs:
        logger.info(f"generating hypotheses: {dom_a} -> {dom_b}")
        seeds_a = self._sample_domain_nodes(dom_a, max_seeds_per_domain)

        # Eligible targets: tagged dom_b, not a claim node, not an ignored hub.
        targets_b = set()
        for nid, data in self.G.nodes(data=True):
            tags = data.get("domain_tags", [])
            if dom_b not in tags or "claim" in tags:
                continue
            if nid in PATH_IGNORE_NODE_IDS:
                continue
            targets_b.add(nid)

        for seed_id in seeds_a:
            if seed_id not in self.G:
                continue

            # Hop-limited shortest paths from the seed: {node_id: [path nodes]}.
            try:
                reachable = nx.single_source_shortest_path(
                    self.G, seed_id, cutoff=max_hops
                )
            except nx.NetworkXError:
                continue

            emitted_for_seed = 0
            for target_id in reachable:
                if target_id == seed_id or target_id not in targets_b:
                    continue

                # Unordered key: A->B and B->A count as the same pair.
                pair_key = tuple(sorted([seed_id, target_id]))
                if pair_key in visited_pairs:
                    continue
                visited_pairs.add(pair_key)

                links = self._enrich_path(reachable[target_id])
                if not links:
                    continue

                conf = self._compute_confidence_score(links)
                nov = self._compute_novelty_score(links)
                evi = self._compute_evidence_score(links)
                test, test_reason = self._compute_testability_score(links)
                claim_ids = [link.claim_id for link in links if link.claim_id]

                serial += 1
                hyp = Hypothesis(
                    id=f"HYP:{serial:06d}",
                    hypothesis_type="bridge",
                    source_id=seed_id,
                    source_name=self._index[seed_id].preferred_name,
                    target_id=target_id,
                    target_name=self._index[target_id].preferred_name,
                    path=links,
                    confidence_score=conf,
                    novelty_score=nov,
                    evidence_score=evi,
                    testability_score=test,
                    composite_score=0.0,  # filled in right after construction
                    supporting_claims=claim_ids,
                    testability_reason=test_reason,
                    metadata={"domain_a": dom_a, "domain_b": dom_b},
                )
                hyp.explanation = self._generate_explanation(hyp)
                hyp.composite_score = self._composite_score(hyp)
                generated.append(hyp)

                emitted_for_seed += 1
                if emitted_for_seed >= max_paths_per_pair:
                    break

    logger.info(f"batch generation complete: {len(generated)} hypotheses from {len(pairs)} domain pairs")
    return self.post_process(generated)
def post_process(
    self,
    hypotheses: list[Hypothesis],
    min_hops: int = 2,
    filter_vague_relations: bool = True,
    filter_non_measurable: bool = True,
    max_hops_filter: int = 5,
) -> list[Hypothesis]:
    """Drop low-quality hypotheses, then cap duplicates per endpoint pair.

    A hypothesis is discarded as soon as one rule below fires (rules are
    evaluated in this order):

    1.  Any endpoint or path node name is a noisy entity.
    2.  The path has fewer than ``min_hops`` links.
    3.  ``filter_vague_relations``: every relation is in ``VAGUE_RELATIONS``.
    4.  Paths of 2+ links where exactly one distinct non-empty PMID is
        found across the hops (hops without a PMID are ignored).
    5.  ``filter_non_measurable``: an endpoint is a non-measurable entity.
    6.  The path is biologically implausible.
    7.  The evidence is weak (see ``_has_weak_evidence``).
    8.  Some edge joins two hub nodes.
    9.  Some node is in ``PATH_IGNORE_NODE_IDS``.
    10. An intermediate node is an intermediate-only hub.
    11. (C-1) An intermediate node name is a generic phrase.
    12. (C-2) Directional relations are too sparse.
    13. The target is not a dataset outcome.
    14. (C-3) The target name is an umbrella concept.
    15. Paths of 2+ links with no relation in ``DIRECTIONAL_RELATIONS``.
    16. The path has more than ``max_hops_filter`` links.

    Survivors are grouped by ``(source_id, target_id)`` and the two with
    the highest ``composite_score`` are kept from each group.

    Args:
        hypotheses: Candidates to filter.
        min_hops: Minimum number of links a path must have.
        filter_vague_relations: Enable rule 3.
        filter_non_measurable: Enable rule 5.
        max_hops_filter: Maximum number of links a path may have.

    Returns:
        The filtered and de-duplicated hypotheses.
    """
    from collections import defaultdict

    def rejected(h) -> bool:
        """Return True when any filter rule fires for ``h``."""
        hops = h.path

        # Noisy entities: source, target and every node along the path.
        names = {h.source_name, h.target_name}
        for link in hops:
            names.update((link.from_name, link.to_name))
        if any(self._is_noisy_entity(name) for name in names):
            return True

        # Too short: a single direct edge only restates an existing edge.
        if len(hops) < min_hops:
            return True

        relation_types = {link.relation_type for link in hops}

        # Every relation is vague.
        if filter_vague_relations and relation_types and relation_types <= VAGUE_RELATIONS:
            return True

        # Single-PMID bridges: only one distinct paper behind the chain.
        if len(hops) >= 2:
            cited = (
                link.source_paper.get("pmid", "") if isinstance(link.source_paper, dict) else ""
                for link in hops
            )
            if len({pmid for pmid in cited if pmid}) == 1:
                return True

        # Non-measurable biomarkers (not testable from imaging).
        if filter_non_measurable and self._has_non_measurable_entity(h):
            return True
        # Brain region -> non-neurological target.
        if self._has_implausible_path(h):
            return True
        # Target not mentioned in any raw_text.
        if self._has_weak_evidence(h):
            return True
        # Both ends of some edge are broad hubs
        # ("Brain Diseases --causes--> Cognitive Dysfunction").
        if self._has_hub_to_hub_edge(h):
            return True
        # Touches a vague COGAT/MeSH umbrella hub anywhere on the path.
        if self._touches_path_ignore_node(h):
            return True
        # Disease mega-hub used as a middle hop (still fine as an endpoint).
        if self._transits_intermediate_only_hub(h):
            return True
        # (C-1) Generic phrase as an intermediate node; endpoints unchecked.
        if self._has_intermediate_generic_phrase(h):
            return True
        # (C-2) 3+ hops with under 50% directional relations.
        if self._has_thin_directional_density(h):
            return True
        # Target must be a dataset outcome, not an imaging input.
        if not self._is_dataset_outcome(h):
            return True
        # (C-3) Target is an umbrella concept that cannot anchor a label.
        if self._is_too_broad_target(h.target_name):
            return True
        # Pure association chain: no directional predicate at all.
        if len(hops) >= 2 and not (relation_types & DIRECTIONAL_RELATIONS):
            return True
        # Overly long path (noise accumulation).
        return len(hops) > max_hops_filter

    before = len(hypotheses)
    filtered = [h for h in hypotheses if not rejected(h)]

    # De-duplicate: keep the top 2 by composite score per (source, target).
    pair_groups = defaultdict(list)
    for h in filtered:
        pair_groups[(h.source_id, h.target_id)].append(h)

    deduplicated = []
    for group in pair_groups.values():
        best_first = sorted(group, key=lambda x: x.composite_score, reverse=True)
        deduplicated.extend(best_first[:2])

    logger.info(f"post_process: {before} -> {len(filtered)} filtered -> {len(deduplicated)} deduplicated "
                f"(removed {before - len(deduplicated)} total)")
    return deduplicated
def _has_non_measurable_entity(self, h: Hypothesis) -> bool:
    """Tell whether an endpoint of ``h`` is not measurable from brain imaging.

    Each endpoint (source, then target) is checked two ways:

    * Domain: if the endpoint's domain tags (ignoring ``"claim"``) overlap
      ``NON_MEASURABLE_BIOMARKER_TYPES``, the hypothesis is rejected unless
      the opposite endpoint is tagged ``"neuroanatomy"`` (an "X affects
      brain region" hypothesis is still accepted).  If the opposite
      endpoint is missing from the index, this check does not reject.
    * Name: the endpoint name matches any regex in
      ``_NON_MEASURABLE_PATTERNS``.

    Args:
        h: Hypothesis whose ``source_*`` / ``target_*`` fields are read.

    Returns:
        True when the hypothesis should be filtered out.
    """
    # Endpoints are paired with their opposite end by id.  Comparing display
    # names instead would pick the wrong "other end" whenever source and
    # target happen to share the same name.
    endpoints = (
        (h.source_name, h.source_id, h.target_id),
        (h.target_name, h.target_id, h.source_id),
    )
    for node_name, node_id, other_id in endpoints:
        # Domain-tag check.
        node = self._index.get(node_id)
        if node:
            domains = set(node.domain_tags) - {"claim"}
            if domains & NON_MEASURABLE_BIOMARKER_TYPES:
                other_node = self._index.get(other_id)
                if other_node and "neuroanatomy" not in other_node.domain_tags:
                    return True

        # Name-pattern check.
        if any(pattern.search(node_name) for pattern in _NON_MEASURABLE_PATTERNS):
            return True
    return False
| def _is_noisy_entity(name: str) -> bool: | |
| """Check if an entity name matches known noise patterns.""" | |
| if not name or len(name.strip()) == 0: | |
| return True | |
| name_clean = name.strip() | |
| for pattern in NOISE_PATTERNS: | |
| if pattern.match(name_clean): | |
| return True | |
| # check if name contains any noise word | |
| words = set(re.split(r"[\s\-_,/]+", name_clean.lower())) | |
| if words & _NOISE_WORDS: | |
| return True | |
| return False | |
| def _is_generic_intermediate(name: str) -> bool: | |
| """(C-1) Phrase-level filter for intermediate node names that pass | |
| token-level `_NOISE_WORDS` but are still too vague. | |
| Examples that get blocked: | |
| - "neural activity" (no individual noise token) | |
| - "functional connectivity" (legit metric but not a mechanism) | |
| - "disease progression" | |
| - "grey matter" (umbrella) | |
| - "cognitive deficit" | |
| Only call on intermediate nodes — these phrases can be valid as | |
| endpoints (e.g. "functional connectivity" as a target metric). | |
| """ | |
| if not name: | |
| return True | |
| s = name.strip() | |
| for pattern in _GENERIC_INTERMEDIATE_PATTERNS: | |
| if pattern.match(s): | |
| return True | |
| return False | |
| def _is_too_broad_target(name: str) -> bool: | |
| """(C-3) Block target names that pass the outcome keyword regex but | |
| are umbrella concepts ("disease", "skill", "neurological disorder", | |
| "clinical features"). A DL experiment can't be designed against | |
| these — you don't know which subtype to label. | |
| """ | |
| if not name: | |
| return True | |
| s = name.strip() | |
| for pattern in _TARGET_TOO_BROAD_PATTERNS: | |
| if pattern.match(s): | |
| return True | |
| return False | |
| def _has_intermediate_generic_phrase(self, h: Hypothesis) -> bool: | |
| """(C-1) Reject paths whose intermediate node is a generic phrase | |
| like "neural activity" or "disease progression". Endpoints are | |
| excluded from this check because some metrics (e.g. "functional | |
| connectivity") legitimately appear as outcomes. | |
| """ | |
| if len(h.path) < 2: | |
| return False | |
| intermediate_names: list[str] = [] | |
| for i, link in enumerate(h.path): | |
| # link.from_name is intermediate when i >= 1 | |
| # link.to_name is intermediate when i < len(path) - 1 | |
| if i >= 1: | |
| intermediate_names.append(link.from_name or "") | |
| if i < len(h.path) - 1: | |
| intermediate_names.append(link.to_name or "") | |
| for name in intermediate_names: | |
| if self._is_generic_intermediate(name): | |
| return True | |
| return False | |
| def _has_thin_directional_density(self, h: Hypothesis) -> bool: | |
| """(C-2) Reject paths where directional relations are too sparse. | |
| Current rule (older): >= 1 directional anywhere = pass. | |
| Problem: a 4-hop path with 1 directional + 3 vague edges still | |
| looks like a real chain to scoring but is essentially a vague | |
| association narrative. | |
| New rule: | |
| - 1-2 hop path: at least 1 directional (unchanged) | |
| - 3+ hop path: at least half of the edges must be directional | |
| """ | |
| n = len(h.path) | |
| if n < 3: | |
| return False | |
| directional = sum(1 for l in h.path if l.relation_type in DIRECTIONAL_RELATIONS) | |
| return directional * 2 < n # < 50% directional | |
| def _has_implausible_path(self, h: Hypothesis) -> bool: | |
| """Check if hypothesis path has biologically implausible connections. | |
| Filters paths where a brain region directly predicts a non-neurological | |
| condition (e.g., amygdala → urinary incontinence) without a plausible | |
| intermediate neurological mechanism. | |
| """ | |
| # Check if source is a brain region and target is non-neurological | |
| source_node = self._index.get(h.source_id) | |
| target_node = self._index.get(h.target_id) | |
| if not source_node or not target_node: | |
| return False | |
| source_is_brain = "neuroanatomy" in source_node.domain_tags | |
| target_is_neuro = any(d in target_node.domain_tags for d in | |
| ["neuroanatomy", "disease", "cognitive_function", | |
| "biomarker", "gene", "drug", "neurotransmitter"]) | |
| # If source is brain region and target is non-neurological, check target name | |
| if source_is_brain and not target_is_neuro: | |
| if _NON_NEUROLOGICAL_TARGETS.search(h.target_name): | |
| return True | |
| # Also check intermediate nodes in the path | |
| for link in h.path: | |
| if _NON_NEUROLOGICAL_TARGETS.search(link.to_name): | |
| # Check if the previous node is a brain region | |
| prev_node = self._index.get(link.from_id) | |
| if prev_node and "neuroanatomy" in prev_node.domain_tags: | |
| # Only filter if there's no disease intermediate | |
| has_disease_intermediate = any( | |
| "disease" in self._index.get(l.from_id, ConceptNode(id="", preferred_name="")).domain_tags | |
| for l in h.path[:h.path.index(link)] | |
| ) | |
| if not has_disease_intermediate: | |
| return True | |
| return False | |
| def _has_hub_to_hub_edge(self, h: Hypothesis) -> bool: | |
| """Reject paths containing any edge whose endpoints are both broad hubs. | |
| Example: "Brain Diseases --causes--> Cognitive Dysfunction" — both ends | |
| are top-level categories; the edge is too generic to be a mechanistic | |
| step in a hypothesis. | |
| Hub set is the top-N nodes by non-'about' degree, computed once and | |
| cached. Uses a low bar (N=50) because hubs are self-evidently generic. | |
| """ | |
| if not hasattr(self, "_hub_id_set"): | |
| # Build once per engine instance | |
| from collections import Counter | |
| degree = Counter() | |
| for u, v, data in self.G.edges(data=True): | |
| if data.get("relation_type") != "about": | |
| degree[u] += 1 | |
| degree[v] += 1 | |
| top = degree.most_common(50) | |
| self._hub_id_set = {cid for cid, _ in top} | |
| for link in h.path: | |
| if link.from_id in self._hub_id_set and link.to_id in self._hub_id_set: | |
| return True | |
| return False | |
def _touches_path_ignore_node(self, h: Hypothesis) -> bool:
    """Tell whether any node of the hypothesis is in ``PATH_IGNORE_NODE_IDS``.

    The source, the target and both ends of every link are checked.  This
    catches vague COGAT/MeSH umbrella hubs that the token-based
    ``_is_noisy_entity`` misses: names like "memory", "logic", "Brain" or
    "Neurons" are ordinary English words, but the KG concept id refers to
    an over-general umbrella that is not testable.

    Args:
        h: Hypothesis whose endpoint ids and ``path`` links are examined.

    Returns:
        True if any inspected id is in ``PATH_IGNORE_NODE_IDS``.
    """
    node_ids = [h.source_id, h.target_id]
    for link in h.path:
        node_ids.extend((link.from_id, link.to_id))
    return any(node_id in PATH_IGNORE_NODE_IDS for node_id in node_ids)
| def _transits_intermediate_only_hub(h: Hypothesis) -> bool: | |
| """Reject paths that use disease mega-hubs as intermediate transit. | |
| INTERMEDIATE_ONLY_IGNORE_IDS nodes are valid as source/target | |
| (predicting Alzheimer is a real hypothesis) but not as middle | |
| hops (A → Alzheimer → B is just "both relate to AD"). | |
| """ | |
| if len(h.path) < 2: | |
| return False | |
| for i, link in enumerate(h.path): | |
| if i >= 1 and link.from_id in INTERMEDIATE_ONLY_IGNORE_IDS: | |
| return True | |
| if i < len(h.path) - 1 and link.to_id in INTERMEDIATE_ONLY_IGNORE_IDS: | |
| return True | |
| return False | |
| def _is_dataset_outcome(self, h: Hypothesis) -> bool: | |
| """Check if target is a UKB/ADNI/HCP-testable outcome. | |
| The goal of our hypotheses is to predict SOMETHING from brain imaging. | |
| Valid targets: | |
| - Clinical diagnoses (disease domain) — Alzheimer, MCI, schizophrenia, etc. | |
| - Cognitive/behavioral/personality measures (cognitive_function domain) | |
| - Brain decoding targets: | |
| * neuroanatomy (for encoding: stimulus → brain activation) | |
| * visual_stimulus (for decoding: brain → stimulus category) | |
| * emotion (SEED family: EEG → affect label) | |
| * vigilance (SEED-VIG: EEG → alertness) | |
| Invalid targets: | |
| - Molecular entities (gene, biomarker, drug, neurotransmitter) — these | |
| may be predictors, not predicted quantities | |
| - Overly generic disease categories (Brain Diseases, Mental Disorders) — | |
| already filtered by hub-to-hub, but double-check by keyword. | |
| Accepts target if EITHER: | |
| a) target's domain is in _OUTCOME_DOMAINS ∪ decoding domains, OR | |
| b) target name matches _OUTCOME_KEYWORDS regex (as fallback for | |
| claim_extraction concepts whose domain may be uncertain) | |
| """ | |
| target = self._index.get(h.target_id) | |
| if target is None: | |
| return False | |
| domains = set(target.domain_tags) | |
| # Accept: disease, cognitive_function, or decoding-target domains | |
| outcome_domains = _OUTCOME_DOMAINS | {"visual_stimulus", "emotion", "vigilance"} | |
| if domains & outcome_domains: | |
| return True | |
| # Accept: neuroanatomy targets when the hypothesis is a brain-decoding | |
| # encoding path (stimulus → brain region). Excludes the clinical- | |
| # prediction case where a target of 'White Matter' would be an input. | |
| if "neuroanatomy" in domains: | |
| source = self._index.get(h.source_id) | |
| if source: | |
| source_domains = set(source.domain_tags) | |
| if source_domains & {"visual_stimulus", "emotion", "vigilance"}: | |
| return True | |
| # Fallback: outcome keyword match (catches claim_extraction concepts | |
| # that describe outcomes but have wrong domain tags) | |
| if _OUTCOME_KEYWORDS.search(h.target_name): | |
| return True | |
| return False | |
| def _has_weak_evidence(self, h: Hypothesis) -> bool: | |
| """Check if hypothesis path has weak evidence (target not mentioned in raw_text). | |
| For hypotheses where the target is a specific brain region, check if any hop's | |
| raw_text actually mentions that region. If not, the path is likely spurious | |
| (e.g., IL-1β → Internal Capsula where the evidence text talks about "grey matter" | |
| but never mentions internal capsule). | |
| Exception: paths anchored by curated functional facts (e.g. `evokes` from | |
| visual_stimulus to a functional ROI) carry programmatic confidence, not | |
| paper evidence — skip the raw_text requirement for them. | |
| """ | |
| target_node = self._index.get(h.target_id) | |
| if not target_node or "neuroanatomy" not in target_node.domain_tags: | |
| return False | |
| # Skip paths whose source is a visual_stimulus / emotion / vigilance node, or | |
| # which contain at least one curated functional edge (evokes / decoded_from / | |
| # elicits). These are seeded from neuroscience textbooks, not paper claims. | |
| source_node = self._index.get(h.source_id) | |
| if source_node: | |
| decoding_domains = {"visual_stimulus", "emotion", "vigilance"} | |
| if any(t in decoding_domains for t in source_node.domain_tags): | |
| return False | |
| if any(l.relation_type in {"evokes", "decoded_from", "elicits"} for l in h.path): | |
| return False | |
| # Extract key terms from target name (e.g., "Internal Capsula" → ["internal", "capsula"]) | |
| target_terms = set(re.findall(r'\b\w{4,}\b', h.target_name.lower())) | |
| if not target_terms: | |
| return False | |
| # Check if any hop mentions the target region | |
| for link in h.path: | |
| raw = link.raw_text or link.evidence.get("raw_text", "") if isinstance(link.evidence, dict) else "" | |
| if raw: | |
| raw_lower = raw.lower() | |
| # If any target term appears in raw_text, evidence is OK | |
| if any(term in raw_lower for term in target_terms): | |
| return False | |
| # No hop mentions the target region → weak evidence | |
| logger.debug(f"weak evidence: {h.id} target '{h.target_name}' not mentioned in any raw_text") | |
| return True | |
| # ── imaging-driven batch generation ────────────────────────────── | |
def batch_generate_imaging(
    self,
    dataset: str = "UKB",
    max_paths_per_pair: int = 5,
    max_seeds: int = 50,
    max_hops: int = 3,
    include_connectivity: bool = True,
) -> list[Hypothesis]:
    """Generate hypotheses driven by imaging features available in a dataset.

    Strategy:
    1. Find AAL atlas neuroanatomy nodes in the graph as ROI seeds
    2. For each ROI x imaging feature template, construct a feature name
       by substituting the region name for ``{region}`` in the template
    3. Take the shortest graph path (at most ``max_hops`` edges) from each
       ROI to every reachable disease/cognitive_function node
    4. Drop paths that touch non-measurable entities
    5. Annotate each hypothesis with dataset metadata

    Args:
        dataset: Dataset name. It is upper-cased and ``-`` is replaced by
            ``_`` before being looked up in ``DATASET_FEATURES``.
        max_paths_per_pair: Maximum number of hypotheses emitted for one
            (ROI, feature template) combination.
        max_seeds: Maximum number of AAL regions used as seeds.
        max_hops: Path-length cutoff passed to the shortest-path search.
        include_connectivity: When True, also emit region-pair
            connectivity hypotheses for the templates that list this dataset.

    Returns:
        The hypotheses after ``self.post_process``. An empty list is
        returned when the graph has no AAL regions or no outcome nodes.

    Raises:
        ValueError: If the normalised dataset name is not a key of
            ``DATASET_FEATURES``.
    """
    dataset_key = dataset.upper().replace("-", "_")
    if dataset_key not in DATASET_FEATURES:
        raise ValueError(f"Unknown dataset: {dataset}. Available: {list(DATASET_FEATURES.keys())}")

    # 1. Find AAL atlas ROI nodes
    aal_nodes = self._find_aal_regions(max_seeds)
    if not aal_nodes:
        logger.warning("No AAL atlas regions found in graph")
        return []
    logger.info(f"Found {len(aal_nodes)} AAL regions for imaging hypothesis generation")

    # 2. Collect outcome nodes (disease, cognitive_function)
    outcome_nodes = self._collect_outcome_nodes()
    if not outcome_nodes:
        logger.warning("No outcome nodes (disease/cognitive_function) found")
        return []

    # 3. Determine which imaging templates apply to this dataset
    applicable_templates = {
        name: meta for name, meta in IMAGING_FEATURE_TEMPLATES.items()
        if dataset_key in meta["datasets"]
    }

    all_hypotheses: list[Hypothesis] = []
    _hyp_counter = 0
    # Keys are (region_id, target_id, feature_template).
    seen_pairs: set[tuple[str, str, str]] = set()

    # 4. Generate ROI-level imaging hypotheses
    for region_id, region_name in aal_nodes.items():
        # The reachable set depends only on the region, so search once per
        # region instead of once per (region, template).
        try:
            reachable = nx.single_source_shortest_path(
                self.G, region_id, cutoff=max_hops
            )
        except nx.NetworkXError:
            continue
        candidates = [
            nid for nid in reachable
            if nid in outcome_nodes and nid != region_id
        ]

        for feat_template, feat_meta in applicable_templates.items():
            feature_name = feat_template.replace("{region}", region_name)
            pair_count = 0
            for target_id in candidates:
                pair_key = (region_id, target_id, feat_template)
                if pair_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)

                links = self._enrich_path(reachable[target_id])
                if not links:
                    continue
                # Skip if path contains non-measurable entities
                if self._path_has_non_measurable(links):
                    continue

                conf = self._compute_confidence_score(links)
                nov = self._compute_novelty_score(links)
                evi = self._compute_evidence_score(links)
                test, test_reason = self._compute_testability_score(links)
                # Boost testability for imaging-driven hypotheses
                test = min(test + 0.15, 1.0)
                claim_ids = [l.claim_id for l in links if l.claim_id]

                _hyp_counter += 1
                target_node = self._index.get(target_id)
                h = Hypothesis(
                    id=f"HYP:IMG:{_hyp_counter:06d}",
                    hypothesis_type="imaging",
                    source_id=region_id,
                    source_name=feature_name,
                    target_id=target_id,
                    target_name=target_node.preferred_name if target_node else target_id,
                    path=links,
                    confidence_score=conf,
                    novelty_score=nov,
                    evidence_score=evi,
                    testability_score=test,
                    composite_score=0.0,
                    supporting_claims=claim_ids,
                    testability_reason=test_reason,
                    metadata={
                        "dataset": dataset_key,
                        "input_modality": feat_meta["modality"],
                        "input_feature": feature_name,
                        "input_level": feat_meta["level"],
                        "input_tool": feat_meta["tool"],
                        "input_region": region_name,
                        "outcome_type": self._classify_outcome(target_node),
                    },
                )
                h.explanation = self._generate_explanation(h)
                h.composite_score = self._composite_score(h)
                all_hypotheses.append(h)

                pair_count += 1
                if pair_count >= max_paths_per_pair:
                    break

    # 5. Generate connectivity-level hypotheses
    if include_connectivity:
        conn_templates = {
            name: meta for name, meta in CONNECTIVITY_FEATURE_TEMPLATES.items()
            if dataset_key in meta["datasets"]
        }
        if conn_templates:
            # The running counter is handed over so ids stay unique.
            hyps = self._generate_connectivity_hypotheses(
                aal_nodes, outcome_nodes, conn_templates,
                dataset_key, max_paths_per_pair, max_hops, _hyp_counter, seen_pairs,
            )
            all_hypotheses.extend(hyps)

    logger.info(
        f"imaging batch generation ({dataset_key}): "
        f"{len(all_hypotheses)} hypotheses from {len(aal_nodes)} regions"
    )
    all_hypotheses = self.post_process(all_hypotheses)
    return all_hypotheses
| def _find_aal_regions(self, max_n: int) -> dict[str, str]: | |
| """Find AAL atlas neuroanatomy nodes. Returns {node_id: region_name}.""" | |
| candidates = {} | |
| for nid, data in self.G.nodes(data=True): | |
| if "neuroanatomy" not in data.get("domain_tags", []): | |
| continue | |
| name = data.get("preferred_name", "") | |
| # Match against AAL region keywords | |
| name_lower = name.lower() | |
| for kw in _AAL_REGION_KEYWORDS: | |
| if kw.lower() in name_lower: | |
| candidates[nid] = name | |
| break | |
| # Sort by degree (more connected = richer paths) | |
| sorted_items = sorted( | |
| candidates.items(), | |
| key=lambda item: self.G.degree(item[0]), | |
| reverse=True, | |
| ) | |
| return dict(sorted_items[:max_n]) | |
| def _collect_outcome_nodes(self) -> set[str]: | |
| """Collect all disease + cognitive_function nodes as potential outcomes.""" | |
| outcome_ids = set() | |
| for nid, data in self.G.nodes(data=True): | |
| domains = set(data.get("domain_tags", [])) | |
| if "claim" in domains: | |
| continue | |
| if nid in PATH_IGNORE_NODE_IDS: | |
| continue | |
| if domains & {"disease", "cognitive_function"}: | |
| outcome_ids.add(nid) | |
| return outcome_ids | |
| def _classify_outcome(self, node: Optional[ConceptNode]) -> str: | |
| """Classify outcome node type for metadata.""" | |
| if not node: | |
| return "unknown" | |
| domains = set(node.domain_tags) | |
| if "disease" in domains: | |
| return "disease" | |
| if "cognitive_function" in domains: | |
| return "cognitive_function" | |
| if "biomarker" in domains: | |
| return "biomarker" | |
| return "other" | |
| def _path_has_non_measurable(self, links: list[HypothesisLink]) -> bool: | |
| """Check if any intermediate node in the path is non-measurable.""" | |
| for link in links: | |
| for name, nid in [(link.from_name, link.from_id), (link.to_name, link.to_id)]: | |
| node = self._index.get(nid) | |
| if node: | |
| domains = set(node.domain_tags) - {"claim"} | |
| if domains & NON_MEASURABLE_BIOMARKER_TYPES: | |
| return True | |
| for pattern in _NON_MEASURABLE_PATTERNS: | |
| if pattern.search(name): | |
| return True | |
| return False | |
| def _generate_connectivity_hypotheses( | |
| self, | |
| aal_nodes: dict[str, str], | |
| outcome_nodes: set[str], | |
| conn_templates: dict, | |
| dataset_key: str, | |
| max_paths_per_pair: int, | |
| max_hops: int, | |
| hyp_counter_start: int, | |
| seen_pairs: set, | |
| ) -> list[Hypothesis]: | |
| """Generate hypotheses for connectivity features (FC/EC/SC between region pairs).""" | |
| hypotheses = [] | |
| counter = hyp_counter_start | |
| region_ids = list(aal_nodes.keys()) | |
| # Sample region pairs (limit to avoid O(n^2) explosion) | |
| max_pairs = min(len(region_ids) * 3, 200) | |
| import random | |
| if len(region_ids) > 20: | |
| sampled_pairs = [] | |
| for _ in range(max_pairs): | |
| a, b = random.sample(region_ids, 2) | |
| sampled_pairs.append((a, b)) | |
| else: | |
| sampled_pairs = [(a, b) for i, a in enumerate(region_ids) for b in region_ids[i+1:]] | |
| sampled_pairs = sampled_pairs[:max_pairs] | |
| for region_a_id, region_b_id in sampled_pairs: | |
| name_a = aal_nodes[region_a_id] | |
| name_b = aal_nodes[region_b_id] | |
| for feat_template, feat_meta in conn_templates.items(): | |
| feature_name = feat_template.replace("{a}", name_a).replace("{b}", name_b) | |
| # Find paths from region_a to outcomes (potentially through region_b) | |
| try: | |
| reachable = nx.single_source_shortest_path( | |
| self.G, region_a_id, cutoff=max_hops | |
| ) | |
| except nx.NetworkXError: | |
| continue | |
| candidates = [ | |
| nid for nid in reachable | |
| if nid in outcome_nodes and nid != region_a_id | |
| ] | |
| pair_count = 0 | |
| for target_id in candidates: | |
| pair_key = (region_a_id, target_id, feat_template) | |
| if pair_key in seen_pairs: | |
| continue | |
| seen_pairs.add(pair_key) | |
| raw_path = reachable[target_id] | |
| links = self._enrich_path(raw_path) | |
| if not links: | |
| continue | |
| if self._path_has_non_measurable(links): | |
| continue | |
| conf = self._compute_confidence_score(links) | |
| nov = self._compute_novelty_score(links) | |
| evi = self._compute_evidence_score(links) | |
| test, test_reason = self._compute_testability_score(links) | |
| test = min(test + 0.15, 1.0) | |
| claim_ids = [l.claim_id for l in links if l.claim_id] | |
| counter += 1 | |
| target_node = self._index.get(target_id) | |
| h = Hypothesis( | |
| id=f"HYP:IMG:{counter:06d}", | |
| hypothesis_type="imaging_connectivity", | |
| source_id=region_a_id, | |
| source_name=feature_name, | |
| target_id=target_id, | |
| target_name=target_node.preferred_name if target_node else target_id, | |
| path=links, | |
| confidence_score=conf, | |
| novelty_score=nov, | |
| evidence_score=evi, | |
| testability_score=test, | |
| composite_score=0.0, | |
| supporting_claims=claim_ids, | |
| testability_reason=test_reason, | |
| metadata={ | |
| "dataset": dataset_key, | |
| "input_modality": feat_meta["modality"], | |
| "input_feature": feature_name, | |
| "input_level": feat_meta["level"], | |
| "input_tool": feat_meta["tool"], | |
| "input_region_a": name_a, | |
| "input_region_b": name_b, | |
| "input_region": f"{name_a} - {name_b}", | |
| "outcome_type": self._classify_outcome(target_node), | |
| }, | |
| ) | |
| h.explanation = self._generate_explanation(h) | |
| h.composite_score = self._composite_score(h) | |
| hypotheses.append(h) | |
| pair_count += 1 | |
| if pair_count >= max_paths_per_pair: | |
| break | |
| return hypotheses | |
| # ── persistence ──────────────────────────────────────────────────── | |
def save_hypotheses(self, hypotheses: list[Hypothesis], path: str | Path) -> None:
    """Write hypotheses to a UTF-8 JSON file.

    The file holds an object with "n_hypotheses" (the count) and
    "hypotheses" (each item's ``to_dict()`` result). Missing parent
    directories are created.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    payload = {
        "n_hypotheses": len(hypotheses),
        "hypotheses": [item.to_dict() for item in hypotheses],
    }
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    target.write_text(serialized, encoding="utf-8")

    logger.info(f"saved {len(hypotheses)} hypotheses to {target}")
def load_hypotheses(self, path: str | Path) -> list[Hypothesis]:
    """Read hypotheses from a UTF-8 JSON file.

    Every entry of the file's "hypotheses" list is rebuilt with
    ``Hypothesis.from_dict``.
    """
    source = Path(path)
    payload = json.loads(source.read_text(encoding="utf-8"))

    loaded = []
    for entry in payload["hypotheses"]:
        loaded.append(Hypothesis.from_dict(entry))

    logger.info(f"loaded {len(loaded)} hypotheses from {source}")
    return loaded
| # ── ranking ──────────────────────────────────────────────────────── | |
def rank_hypotheses(
    self,
    hypotheses: list[Hypothesis],
    weights: Optional[dict[str, float]] = None,
    top_n: int = 100,
    skip_post_process: bool = False,
) -> list[Hypothesis]:
    """Rank hypotheses by a weighted geometric composite of their scores.

    Each hypothesis' ``composite_score`` is overwritten with the product of
    its confidence, evidence, novelty and testability scores, each raised
    to its weight. Testability is floored at 0.01 before exponentiation.
    The list being ranked is sorted in place.

    Args:
        hypotheses: list of hypotheses to rank
        weights: custom weights dict, keys: confidence, evidence, novelty,
            testability. Defaults to 0.20 / 0.20 / 0.25 / 0.35.
        top_n: return top N results
        skip_post_process: if True, skip the post-processing filter

    Returns:
        The ``top_n`` highest-scoring hypotheses, best first.
    """
    if not skip_post_process:
        hypotheses = self.post_process(hypotheses)

    if weights is None:
        # testability weighted highest — must be verifiable with imaging experiments
        weights = dict(confidence=0.20, evidence=0.20, novelty=0.25, testability=0.35)

    def _weighted_product(item):
        testability = max(item.testability_score, 0.01)
        return (
            (item.confidence_score ** weights["confidence"])
            * (item.evidence_score ** weights["evidence"])
            * (item.novelty_score ** weights["novelty"])
            * (testability ** weights["testability"])
        )

    for item in hypotheses:
        item.composite_score = _weighted_product(item)

    hypotheses.sort(key=lambda item: item.composite_score, reverse=True)
    return hypotheses[:top_n]
| # ── query-based (kept for interactive use) ───────────────────────── | |
def find_paths(
    self,
    source_id: str,
    target_id: str,
    max_hops: int = 3,
    max_paths: int = 20,
) -> list[Hypothesis]:
    """Find hypothesis paths between two concepts with evidence enrichment.

    Claim nodes and the ids in ``PATH_IGNORE_NODE_IDS`` are removed from a
    copy of the graph so they cannot appear as intermediates; the two
    endpoints are never removed, so a caller can still query them directly.

    Args:
        source_id: Start node id.
        target_id: End node id.
        max_hops: Maximum path length (edges) explored.
        max_paths: Maximum number of simple paths turned into hypotheses;
            values below zero are treated as zero.

    Returns:
        Hypotheses of type "path" built by
        ``_build_hypotheses_from_paths``; an empty list when either
        endpoint is missing from the graph or the search fails.
    """
    from itertools import islice

    if source_id not in self.G or target_id not in self.G:
        return []

    endpoints = {source_id, target_id}
    claim_nodes = {nid for nid, n in self._index.items() if "claim" in n.domain_tags}
    intermediate_exclude = claim_nodes - endpoints
    # Also strip vague umbrella hubs from the search subgraph so paths
    # never include them as intermediates.
    intermediate_exclude |= (PATH_IGNORE_NODE_IDS - endpoints)

    subgraph = self.G.copy()
    subgraph.remove_nodes_from(intermediate_exclude)
    if source_id not in subgraph or target_id not in subgraph:
        return []

    try:
        # Simple-path enumeration can be exponential, so stop pulling from
        # the generator once max_paths have been collected instead of
        # materialising every path and slicing afterwards.
        raw_paths = list(islice(
            nx.all_simple_paths(subgraph, source_id, target_id, cutoff=max_hops),
            max(max_paths, 0),
        ))
    except nx.NetworkXError:
        return []

    return self._build_hypotheses_from_paths(raw_paths, "path")
def bridge_discovery(
    self,
    concept_id: str,
    target_domain: str,
    max_hops: int = 3,
    max_results: int = 20,
) -> list[Hypothesis]:
    """Find cross-domain connections from one concept into a target domain.

    Every non-claim node tagged ``target_domain`` that is reachable from
    ``concept_id`` within ``max_hops`` yields one hypothesis built from the
    shortest path to it.

    Args:
        concept_id: Start node id.
        target_domain: Domain tag the targets must carry.
        max_hops: Path-length cutoff for the shortest-path search.
        max_results: Maximum number of hypotheses returned.

    Returns:
        Hypotheses of type "bridge", best composite score first. An empty
        list when the concept is not in the graph, the domain has no nodes,
        or the search fails.
    """
    if concept_id not in self.G:
        return []

    target_nodes = {
        nid for nid, data in self.G.nodes(data=True)
        if target_domain in data.get("domain_tags", [])
    }
    if not target_nodes:
        return []

    try:
        reachable = nx.single_source_shortest_path(
            self.G, concept_id, cutoff=max_hops
        )
    except nx.NetworkXError:
        return []

    # Stand-in for graph nodes without an index entry; built once instead of
    # once per candidate.
    placeholder = ConceptNode(id="", preferred_name="")
    candidates = {
        nid for nid in reachable
        if nid in target_nodes and nid != concept_id
        and "claim" not in self._index.get(nid, placeholder).domain_tags
    }

    # Graph nodes may be absent from the index (the filter above allows it),
    # so fall back to the raw id rather than raising KeyError.
    source_node = self._index.get(concept_id)
    source_name = source_node.preferred_name if source_node else concept_id

    hypotheses = []
    for target_id in candidates:
        links = self._enrich_path(reachable[target_id])
        if not links:
            continue

        conf = self._compute_confidence_score(links)
        nov = self._compute_novelty_score(links)
        evi = self._compute_evidence_score(links)
        test, test_reason = self._compute_testability_score(links)
        claim_ids = [l.claim_id for l in links if l.claim_id]

        target_node = self._index.get(target_id)
        h = Hypothesis(
            hypothesis_type="bridge",
            source_id=concept_id,
            source_name=source_name,
            target_id=target_id,
            target_name=target_node.preferred_name if target_node else target_id,
            path=links,
            confidence_score=conf,
            novelty_score=nov,
            evidence_score=evi,
            testability_score=test,
            supporting_claims=claim_ids,
            testability_reason=test_reason,
        )
        h.explanation = self._generate_explanation(h)
        h.composite_score = self._composite_score(h)
        hypotheses.append(h)

    hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
    return hypotheses[:max_results]
def discover_hypotheses(
    self,
    concept_id: str,
    max_hops: int = 3,
    max_results: int = 30,
    exclude_domains: Optional[set[str]] = None,
) -> list[Hypothesis]:
    """Find hypotheses radiating from a single concept to all reachable domains.

    A reachable node becomes a target when, after removing the excluded
    domains, it still has at least one domain tag that the source concept
    does not have.

    Args:
        concept_id: Start node id.
        max_hops: Path-length cutoff for the shortest-path search.
        max_results: Maximum number of hypotheses returned.
        exclude_domains: Domain tags ignored when comparing source and
            target. ``None`` or an empty set both mean ``{"claim"}``.

    Returns:
        Post-processed hypotheses of type "discover", best composite score
        first. An empty list when the concept is not in the graph or the
        search fails.
    """
    if concept_id not in self.G:
        return []

    exclude = exclude_domains or {"claim"}
    source_node = self._index.get(concept_id)
    source_domains = set(source_node.domain_tags) - exclude if source_node else set()
    # The source may be in the graph but not in the index; use its id as the
    # display name in that case instead of raising KeyError later.
    source_name = source_node.preferred_name if source_node else concept_id

    try:
        reachable = nx.single_source_shortest_path(self.G, concept_id, cutoff=max_hops)
    except nx.NetworkXError:
        return []

    hypotheses = []
    for target_id, raw_path in reachable.items():
        if target_id == concept_id:
            continue
        target_node = self._index.get(target_id)
        if not target_node:
            continue
        target_domains = set(target_node.domain_tags) - exclude
        # Skip targets that add no domain beyond those of the source.
        if not target_domains or target_domains <= source_domains:
            continue

        links = self._enrich_path(raw_path)
        if not links:
            continue

        conf = self._compute_confidence_score(links)
        nov = self._compute_novelty_score(links)
        evi = self._compute_evidence_score(links)
        test, test_reason = self._compute_testability_score(links)
        claim_ids = [l.claim_id for l in links if l.claim_id]

        h = Hypothesis(
            hypothesis_type="discover",
            source_id=concept_id,
            source_name=source_name,
            target_id=target_id,
            target_name=target_node.preferred_name,
            path=links,
            confidence_score=conf,
            novelty_score=nov,
            evidence_score=evi,
            testability_score=test,
            supporting_claims=claim_ids,
            testability_reason=test_reason,
        )
        h.explanation = self._generate_explanation(h)
        h.composite_score = self._composite_score(h)
        hypotheses.append(h)

    hypotheses = self.post_process(hypotheses)
    hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
    return hypotheses[:max_results]
def find_trending(
    self,
    since_year: int = 2020,
    min_claims: int = 3,
    direction: str = "strengthening",
    max_results: int = 30,
    slope_threshold: float = 0.3,
    emerging_year: int = 2025,
) -> list[dict]:
    """Find concept pairs with strengthening/weakening evidence over time.

    Claims are grouped by (subject_id, object_id). For each group the
    number of claims per publication year (from ``since_year`` on) is
    counted and a slope over those yearly counts is computed with
    ``_simple_slope``.

    Args:
        since_year: Ignore claims published before this year.
        min_claims: Minimum number of dated claims (from ``since_year``
            on) a pair needs to be considered.
        direction: "strengthening" keeps pairs whose slope exceeds
            ``slope_threshold``; "weakening" keeps pairs whose slope is
            below ``-slope_threshold``; "emerging" keeps pairs whose latest
            claim year is at least ``emerging_year``. Any other value
            applies no filter.
        max_results: Maximum number of results returned.
        slope_threshold: Absolute slope a pair must exceed for the
            strengthening/weakening filters.
        emerging_year: Earliest "latest claim year" accepted by the
            "emerging" filter.

    Returns:
        Dicts with keys concept_a, concept_b, concept_a_id, concept_b_id,
        year_counts, slope, direction and n_claims (all claims of the pair,
        dated or not), ordered by absolute slope, largest first.
    """
    from collections import Counter

    # Group claims by (subject, object)
    claim_groups: dict[tuple[str, str], list[dict]] = {}
    for node in self._index.values():
        if "claim" not in node.domain_tags:
            continue
        meta = node.metadata
        sid = meta.get("subject_id", "")
        oid = meta.get("object_id", "")
        if not sid or not oid:
            continue
        claim_groups.setdefault((sid, oid), []).append(meta)

    results = []
    for (sid, oid), claims in claim_groups.items():
        years = []
        for c in claims:
            y = c.get("source_paper", {}).get("year")
            if y and y >= since_year:
                years.append(y)
        if len(years) < min_claims:
            continue

        year_counts = Counter(years)
        ys = sorted(year_counts.keys())
        cs = [year_counts[y] for y in ys]
        slope = _simple_slope(ys, cs)

        if direction == "strengthening" and slope <= slope_threshold:
            continue
        if direction == "weakening" and slope >= -slope_threshold:
            continue
        if direction == "emerging" and max(ys) < emerging_year:
            continue

        src_node = self._index.get(sid)
        tgt_node = self._index.get(oid)
        results.append({
            "concept_a": src_node.preferred_name if src_node else sid,
            "concept_b": tgt_node.preferred_name if tgt_node else oid,
            "concept_a_id": sid,
            "concept_b_id": oid,
            "year_counts": {str(y): year_counts[y] for y in ys},
            "slope": round(slope, 3),
            "direction": direction,
            "n_claims": len(claims),
        })

    results.sort(key=lambda r: abs(r["slope"]), reverse=True)
    return results[:max_results]
def contradiction_detection(
    self,
    domain_filter: Optional[str] = None,
    max_results: int = 50,
) -> list[Contradiction]:
    """Find pairs of claims that assert opposite things about the same concept pair.

    Claims are grouped by (subject_id, object_id); every unordered pair of
    claims inside a group is scored with ``self._check_contradiction`` and
    kept when the severity is positive.

    Args:
        domain_filter: When given, only claims whose subject or object node
            carries this domain tag are considered.
        max_results: Maximum number of contradictions returned.

    Returns:
        Contradictions ordered by severity, highest first.
    """
    grouped: dict[tuple[str, str], list[ConceptNode]] = {}
    for node in self._index.values():
        if "claim" not in node.domain_tags:
            continue
        meta = node.metadata
        sid = meta.get("subject_id", "")
        oid = meta.get("object_id", "")
        if not (sid and oid):
            continue

        if domain_filter:
            endpoint_tags = set()
            for endpoint in (self._index.get(sid), self._index.get(oid)):
                if endpoint:
                    endpoint_tags.update(endpoint.domain_tags)
            if domain_filter not in endpoint_tags:
                continue

        grouped.setdefault((sid, oid), []).append(node)

    found = []
    for (sid, oid), claims in grouped.items():
        # A single-claim group yields no pairs, so the loops simply do nothing.
        for pos, first in enumerate(claims):
            for second in claims[pos + 1:]:
                meta_for, meta_against = first.metadata, second.metadata
                severity = self._check_contradiction(meta_for, meta_against)
                if not severity > 0:
                    continue
                found.append(Contradiction(
                    concept_a_id=sid,
                    concept_a_name=meta_for.get("subject_name", sid),
                    concept_b_id=oid,
                    concept_b_name=meta_for.get("object_name", oid),
                    claim_for_id=first.id,
                    claim_for_predicate=meta_for.get("predicate", ""),
                    claim_for_text=meta_for.get("raw_text", ""),
                    claim_against_id=second.id,
                    claim_against_predicate=meta_against.get("predicate", ""),
                    claim_against_text=meta_against.get("raw_text", ""),
                    severity=severity,
                ))

    found.sort(key=lambda item: item.severity, reverse=True)
    return found[:max_results]
def gap_detection(
    self,
    domain_a: str,
    domain_b: Optional[str] = None,
    max_results: int = 50,
) -> list[Gap]:
    """Find concept pairs 2 hops apart with no direct edge.

    For each non-claim node of ``domain_a`` the two-step neighbourhood
    (edges followed in either direction, claim nodes not expanded) is
    intersected with the non-claim nodes of ``domain_b``. A pair is
    reported when no direct edge exists and the directed shortest path
    from A to B has at most 3 nodes.

    Args:
        domain_a: Domain tag of the first concept.
        domain_b: Domain tag of the second concept; defaults to ``domain_a``.
        max_results: Maximum number of gaps returned.

    Returns:
        Gaps with cross-domain pairs first, then by ascending distance.
    """
    if domain_b is None:
        domain_b = domain_a

    nodes_a = {
        nid for nid, data in self.G.nodes(data=True)
        if domain_a in data.get("domain_tags", [])
        and "claim" not in data.get("domain_tags", [])
    }
    nodes_b = {
        nid for nid, data in self.G.nodes(data=True)
        if domain_b in data.get("domain_tags", [])
        and "claim" not in data.get("domain_tags", [])
    }

    def _is_claim(node_id):
        # Nodes missing from the index are judged via an empty placeholder.
        return "claim" in self._index.get(node_id, ConceptNode(id="", preferred_name="")).domain_tags

    gaps = []
    visited_pairs = set()
    for a_id in nodes_a:
        if a_id not in self.G:
            continue

        first_ring = set(self.G.successors(a_id)) | set(self.G.predecessors(a_id))
        second_ring = set()
        for mid in first_ring:
            if _is_claim(mid):
                continue
            second_ring.update(self.G.successors(mid))
            second_ring.update(self.G.predecessors(mid))
        second_ring -= {a_id}
        second_ring -= first_ring

        for b_id in second_ring & nodes_b:
            pair = tuple(sorted([a_id, b_id]))
            if pair in visited_pairs:
                continue
            # Marked before the checks below, so a rejected pair is not retried.
            visited_pairs.add(pair)

            if self.G.has_edge(a_id, b_id) or self.G.has_edge(b_id, a_id):
                continue
            try:
                route = nx.shortest_path(self.G, a_id, b_id)
            except (nx.NetworkXNoPath, nx.NetworkXError):
                continue
            if len(route) > 3:
                continue

            via = [n for n in route[1:-1] if not _is_claim(n)]
            a_node = self._index.get(a_id)
            b_node = self._index.get(b_id)
            gaps.append(Gap(
                concept_a_id=a_id,
                concept_a_name=a_node.preferred_name if a_node else a_id,
                concept_b_id=b_id,
                concept_b_name=b_node.preferred_name if b_node else b_id,
                distance=len(route) - 1,
                connecting_concepts=via,
                domain_a=domain_a,
                domain_b=domain_b,
                potential_relation=self._infer_relation(route),
            ))

    gaps.sort(key=lambda g: (0 if g.domain_a != g.domain_b else 1, g.distance))
    return gaps[:max_results]
| # ── name resolution ──────────────────────────────────────────────── | |
def resolve_name(self, query: str) -> Optional[str]:
    """Resolve a name to a concept ID. Returns None if not found.

    Matching is attempted in order of decreasing strictness and the first
    stage that produces a hit wins:

    1. exact ``preferred_name`` match
    2. case-insensitive ``preferred_name`` match
    3. case-insensitive alias match
    4. substring match in either direction against the preferred name or
       an alias; among several hits the node with the shortest preferred
       name is returned

    Empty names and aliases are ignored in stage 4: the empty string is a
    substring of every query, so a node with a blank name would otherwise
    be returned for any query that nothing else matches.
    """
    if not query:
        return None

    nodes = list(self._index.values())

    for node in nodes:
        if node.preferred_name == query:
            return node.id

    query_lower = query.lower()
    for node in nodes:
        if node.preferred_name.lower() == query_lower:
            return node.id

    for node in nodes:
        if any(alias.lower() == query_lower for alias in node.aliases):
            return node.id

    def _overlaps(text: str) -> bool:
        lowered = text.lower()
        return bool(lowered) and (query_lower in lowered or lowered in query_lower)

    candidates = [
        node for node in nodes
        if _overlaps(node.preferred_name) or any(_overlaps(a) for a in node.aliases)
    ]
    if not candidates:
        return None
    # min() returns the first of equally short names, matching a stable sort.
    return min(candidates, key=lambda n: len(n.preferred_name)).id
| # ── internal helpers ─────────────────────────────────────────────── | |
| def _sample_domain_nodes(self, domain: str, max_n: int) -> list[str]: | |
| """Sample up to max_n non-claim nodes from a domain, preferring nodes with edges.""" | |
| nodes = [ | |
| nid for nid, data in self.G.nodes(data=True) | |
| if domain in data.get("domain_tags", []) | |
| and "claim" not in data.get("domain_tags", []) | |
| and nid not in PATH_IGNORE_NODE_IDS | |
| ] | |
| # sort by degree (more connected = more useful as seed) | |
| nodes.sort(key=lambda n: self.G.degree(n), reverse=True) | |
| return nodes[:max_n] | |
| def _build_hypotheses_from_paths( | |
| self, raw_paths: list[list[str]], hyp_type: str | |
| ) -> list[Hypothesis]: | |
| """Build Hypothesis objects from raw node-ID paths.""" | |
| hypotheses = [] | |
| for raw_path in raw_paths: | |
| links = self._enrich_path(raw_path) | |
| if not links: | |
| continue | |
| conf = self._compute_confidence_score(links) | |
| nov = self._compute_novelty_score(links) | |
| evi = self._compute_evidence_score(links) | |
| test, test_reason = self._compute_testability_score(links) | |
| claim_ids = [l.claim_id for l in links if l.claim_id] | |
| h = Hypothesis( | |
| hypothesis_type=hyp_type, | |
| source_id=raw_path[0], | |
| source_name=self._index[raw_path[0]].preferred_name, | |
| target_id=raw_path[-1], | |
| target_name=self._index[raw_path[-1]].preferred_name, | |
| path=links, | |
| confidence_score=conf, | |
| novelty_score=nov, | |
| evidence_score=evi, | |
| testability_score=test, | |
| supporting_claims=claim_ids, | |
| testability_reason=test_reason, | |
| ) | |
| h.explanation = self._generate_explanation(h) | |
| h.composite_score = self._composite_score(h) | |
| hypotheses.append(h) | |
| hypotheses.sort(key=lambda h: h.composite_score, reverse=True) | |
| return hypotheses | |
| def _enrich_path(self, raw_path: list[str]) -> list[HypothesisLink]: | |
| """Convert a raw node-ID path into rich HypothesisLink objects.""" | |
| links = [] | |
| for i in range(len(raw_path) - 1): | |
| src_id, tgt_id = raw_path[i], raw_path[i + 1] | |
| if not self.G.has_edge(src_id, tgt_id): | |
| continue | |
| edge_data = self.G.edges[src_id, tgt_id] | |
| src_node = self._index.get(src_id) | |
| tgt_node = self._index.get(tgt_id) | |
| claim_id = edge_data.get("metadata", {}).get("claim_id", "") | |
| claim_node = self._index.get(claim_id) if claim_id else None | |
| evidence = {} | |
| paper = {} | |
| raw_text = "" | |
| if claim_node and claim_node.metadata: | |
| meta = claim_node.metadata | |
| evidence = meta.get("evidence", {}) | |
| paper = meta.get("source_paper", {}) | |
| raw_text = meta.get("raw_text", "") | |
| links.append(HypothesisLink( | |
| from_id=src_id, | |
| from_name=src_node.preferred_name if src_node else src_id, | |
| to_id=tgt_id, | |
| to_name=tgt_node.preferred_name if tgt_node else tgt_id, | |
| relation_type=edge_data.get("relation_type", "unknown"), | |
| confidence=edge_data.get("confidence", 0.5), | |
| claim_id=claim_id, | |
| raw_text=raw_text, | |
| evidence=evidence, | |
| source_paper=paper, | |
| )) | |
| return links | |
| # ── scoring ──────────────────────────────────────────────────────── | |
def compute_frequency_boost(self, claim_meta: dict) -> float:
    """Frequency boost based on independent PRIMARY study replication.

    Returns 1.2 for three or more primary papers, 1.0 for one or two, and
    0.5 for none.

    The count comes from ``claim_meta["primary_supporting_papers"]`` when
    that key holds a list. Otherwise it is rebuilt from
    ``self._claims_by_triple``: the distinct PMIDs of all claims sharing the
    same (subject_id, predicate, object_id) whose study type is not in
    ``_REVIEW_TYPES``.
    """
    def _boost_for(n_primary: int) -> float:
        if n_primary >= 3:
            return 1.2
        if n_primary >= 1:
            return 1.0
        return 0.5

    # Fast path: canonical claim carries primary-PMID list
    merged = claim_meta.get("primary_supporting_papers")
    if isinstance(merged, list):
        return _boost_for(len(merged))

    # Fallback: scan all claims with the same SPO, filter reviews
    triple = (
        claim_meta.get("subject_id", ""),
        claim_meta.get("predicate", ""),
        claim_meta.get("object_id", ""),
    )
    distinct_pmids = set()
    for claim in self._claims_by_triple.get(triple, []):
        if claim.get("evidence", {}).get("study_type", "") in _REVIEW_TYPES:
            continue
        pmid = claim.get("source_paper", {}).get("pmid", "")
        if pmid:
            distinct_pmids.add(pmid)
    return _boost_for(len(distinct_pmids))
def compute_temporal_decay(claim_meta: dict, reference_year: int = 2026) -> float:
    """Temporal decay: newer primary studies get higher weight.

    Claims whose study type is in ``_REVIEW_TYPES`` always get 1.0. Primary
    studies lose 3% per year of age relative to ``reference_year``, with a
    floor of 0.7; a missing or zero year yields 0.85.

    NOTE(review): there is no ``self`` parameter although this is invoked as
    ``self.compute_temporal_decay(...)`` elsewhere in the class — presumably
    a ``@staticmethod`` decorator sits on a line not visible here; confirm.
    """
    study_type = claim_meta.get("evidence", {}).get("study_type", "")
    if study_type in _REVIEW_TYPES:
        return 1.0

    published = claim_meta.get("source_paper", {}).get("year", 0)
    # 0.85 is the value used for an unknown year.
    return max(0.7, 1.0 - 0.03 * (reference_year - published)) if published else 0.85
| def _compute_confidence_score(self, path: list[HypothesisLink]) -> float: | |
| """Confidence = geometric mean of per-link scores, with weak-link penalty. | |
| Per-link score = edge.confidence × freq_boost × temporal_decay | |
| (edge.confidence already includes study_type weighting from | |
| phase4_optimize.apply_evidence_weighting and the claim-level | |
| statistical quality signals from claim_extractor._estimate_confidence) | |
| Aggregate: geometric mean (one weak link crushes the path) | |
| + weakest-link penalty (×0.7 when min_edge < 0.1) | |
| Single source of truth for each multiplier: | |
| - study_type → phase4_optimize.WEIGHT_MAP (canonical, idempotent) | |
| - p_value/sample_size/replicability → claim_extractor._estimate_confidence | |
| - freq across primary PMIDs → compute_frequency_boost | |
| - publication recency → compute_temporal_decay | |
| """ | |
| if not path: | |
| return 0.0 | |
| import math | |
| scores = [] | |
| min_conf = float("inf") | |
| for link in path: | |
| raw = max(link.confidence, 1e-3) # tiny floor for log() | |
| min_conf = min(min_conf, raw) | |
| full_meta = { | |
| "evidence": link.evidence, | |
| "source_paper": link.source_paper, | |
| "subject_id": link.from_id, | |
| "predicate": link.relation_type, | |
| "object_id": link.to_id, | |
| } | |
| freq_boost = self.compute_frequency_boost(full_meta) | |
| temp_decay = self.compute_temporal_decay(full_meta) | |
| s = raw * freq_boost * temp_decay | |
| scores.append(min(s, 1.0)) | |
| log_sum = sum(math.log(max(s, 1e-6)) for s in scores) | |
| gm = math.exp(log_sum / len(scores)) | |
| if min_conf < 0.1: | |
| gm *= 0.7 | |
| return max(min(gm, 1.0), 0.0) | |
| def _compute_novelty_score(self, path: list[HypothesisLink]) -> float: | |
| """Score how novel/surprising a hypothesis is. | |
| Lower = more expected (direct known relationship), Higher = more surprising. | |
| """ | |
| score = 0.3 # base | |
| # hop bonus: longer paths = more novel connections | |
| score += 0.1 * min(len(path) - 1, 3) | |
| # cross-domain bonus: connecting different domains is more novel | |
| domains_seen = set() | |
| for link in path: | |
| src = self._index.get(link.from_id) | |
| tgt = self._index.get(link.to_id) | |
| if src: | |
| domains_seen.update(src.domain_tags) | |
| if tgt: | |
| domains_seen.update(tgt.domain_tags) | |
| domains_seen.discard("claim") | |
| n_domains = len(domains_seen) | |
| if n_domains >= 3: | |
| score += 0.15 | |
| elif n_domains >= 2: | |
| score += 0.10 | |
| # rare relation bonus: non-generic relations are more novel | |
| rare_count = sum(1 for l in path if l.relation_type not in COMMON_RELATIONS) | |
| score += 0.05 * min(rare_count, 3) | |
| # evidence diversity: more papers = better supported, less novel | |
| # fewer papers = more speculative, more novel | |
| pmids = {l.source_paper.get("pmid", "") for l in path if l.source_paper.get("pmid")} | |
| if len(pmids) == 0: | |
| score += 0.10 # no paper support = speculative but novel | |
| elif len(pmids) == 1: | |
| score += 0.05 # single source = weak replication | |
| return min(score, 1.0) | |
| def _compute_evidence_score(self, path: list[HypothesisLink]) -> float: | |
| """Score evidence quality: traceability and text availability. | |
| DOES NOT use p_value/sample_size/effect_size — those signals already | |
| flow into edge.confidence via claim_extractor._estimate_confidence | |
| and are aggregated by _compute_confidence_score. Counting them again | |
| here was double-dipping. | |
| This score asks a different question: "How well-anchored is the | |
| evidence in source documents?" — which complements confidence's | |
| "How statistically strong is the evidence?". Path-level: most | |
| well-extracted edges score 0.6-0.8; we reserve >0.9 for paths whose | |
| every step has rich provenance. | |
| """ | |
| _REVIEW_TYPES = {"narrative_review", "review"} | |
| scores = [] | |
| for link in path: | |
| study_type = (link.evidence.get("study_type") or "").lower() | |
| s = 0.2 if study_type in _REVIEW_TYPES else 0.3 | |
| if link.raw_text and len(link.raw_text) > 20: | |
| s += 0.20 | |
| if link.claim_id: | |
| s += 0.15 | |
| if link.source_paper.get("pmid"): | |
| s += 0.15 | |
| if link.evidence.get("study_type"): | |
| s += 0.10 | |
| scores.append(min(s, 1.0)) | |
| return self._geometric_mean(scores) | |
def _compute_testability_score(self, path: list[HypothesisLink]) -> tuple[float, str]:
    """Score how testable a hypothesis is with NeuroClaw imaging experiments.

    The raw text, endpoint names and relation type of every link are joined,
    lower-cased and scanned for keywords. If no modality from
    TESTABLE_MODALITIES is mentioned the result is a flat 0.15. Otherwise
    the score starts at 0.25 and is boosted for:
    - each matched modality (at most three are counted)
    - sMRI, dMRI/fMRI and PET specifically
    - named brain regions (extra when two or more distinct regions appear)
    - diseases from the target-dataset list
    - keywords from DL_MODEL_KEYWORDS

    Region keywords are matched as substrings. An occurrence that exists
    only as part of a longer region keyword (for example "cuneus" inside
    "precuneus") is not counted as a separate region, so a single mention
    cannot earn the two-region bonus on its own.

    Args:
        path: Links of the hypothesis chain.

    Returns:
        (score, reason_string); the score is capped at 1.0.
    """
    all_text = " ".join(
        link.raw_text + " " + link.from_name + " " + link.to_name + " " + link.relation_type
        for link in path
    ).lower()

    # A modality counts once, as soon as any of its keywords appears.
    matched_modalities = [
        modality
        for modality, keywords in TESTABLE_MODALITIES.items()
        if any(kw.lower() in all_text for kw in keywords)
    ]
    if not matched_modalities:
        return 0.15, "no imaging modality detected"

    score = 0.25  # base for having a modality

    # modality bonus (more = more testable angles)
    score += 0.10 * min(len(matched_modalities), 3)

    # heavy bonus for sMRI features (volume/thickness)
    if "sMRI" in matched_modalities:
        score += 0.15

    # heavy bonus for connectivity features (input to GNN models)
    if "dMRI" in matched_modalities or "fMRI" in matched_modalities:
        score += 0.15

    # bonus for PET
    if "PET" in matched_modalities:
        score += 0.10

    # bonus for brain region specificity (testable with atlas parcellation)
    brain_region_keywords = [
        "cortex", "hippocampus", "amygdala", "thalamus",
        "cerebellum", "striatum", "insula", "gyrus",
        "caudate", "putamen", "pallidum", "accumbens",
        "precuneus", "cuneus", "lingual", "fusiform",
        "parahippocampal", "entorhinal", "parietal",
        "frontal", "temporal", "occipital",
    ]
    regions_found = []
    for kw in brain_region_keywords:
        if kw not in all_text:
            continue
        # Blank out longer region keywords that contain this one, then
        # require the keyword to still be present: "precuneus" alone must
        # not also register as "cuneus".
        masked = all_text
        for longer in brain_region_keywords:
            if longer != kw and kw in longer:
                masked = masked.replace(longer, " ")
        if kw in masked:
            regions_found.append(kw)
    if regions_found:
        score += 0.10  # atlas-based ROI analysis
        if len(regions_found) >= 2:
            score += 0.05  # pair of regions = connectivity hypothesis

    # bonus for diseases present in target datasets
    dataset_diseases = [
        "alzheimer", "parkinson", "depression", "schizophrenia", "adhd",
        "autism", "epilepsy", "multiple sclerosis", "anxiety", "bipolar",
        "dementia", "mci", "mild cognitive",
    ]
    matched_diseases = [d for d in dataset_diseases if d in all_text]
    if matched_diseases:
        score += 0.05

    # bonus for DL-model-compatible features
    if any(kw.lower() in all_text for kw in DL_MODEL_KEYWORDS):
        score += 0.05

    # build reason string
    reason = f"modalities: {', '.join(matched_modalities)}"
    if regions_found:
        reason += f" | brain regions: {', '.join(regions_found[:4])}"
    if matched_diseases:
        reason += f" | diseases: {', '.join(matched_diseases[:3])}"

    return min(score, 1.0), reason
| def _composite_score(self, h: Hypothesis) -> float: | |
| """Weighted geometric mean of the 4 score components. | |
| Geometric: a hypothesis is only as good as its weakest dimension. | |
| A path with great evidence but 0 testability is worthless to us. | |
| Matches the linear fitness in evolution_engine._score_fitness | |
| (same weights, different aggregation — fitness adds convergence / | |
| diversity / length modifiers not relevant here). | |
| """ | |
| c = max(h.confidence_score, 0.01) | |
| e = max(h.evidence_score, 0.01) | |
| n = max(h.novelty_score, 0.01) | |
| t = max(h.testability_score, 0.01) | |
| score = (c ** 0.20) * (e ** 0.20) * (n ** 0.25) * (t ** 0.35) | |
| if self._has_only_review_evidence(h): | |
| score *= 0.7 | |
| return score | |
| def _has_only_review_evidence(h: Hypothesis) -> bool: | |
| """True if every link in the path comes from a review/narrative_review.""" | |
| _REVIEW_TYPES = {"narrative_review", "review"} | |
| if not h.path: | |
| return False | |
| for link in h.path: | |
| study_type = (link.evidence.get("study_type") or "").lower() | |
| if study_type and study_type not in _REVIEW_TYPES: | |
| return False | |
| return True | |
| def _check_contradiction(self, m1: dict, m2: dict) -> float: | |
| """Check if two claims contradict each other. Returns severity 0-1.""" | |
| p1 = m1.get("predicate", "") | |
| p2 = m2.get("predicate", "") | |
| n1 = m1.get("negated", False) | |
| n2 = m2.get("negated", False) | |
| if p1 == p2 and n1 != n2: | |
| return 1.0 | |
| if (p1, p2) in OPPOSING_PREDICATES: | |
| return 0.8 | |
| if p1 == p2 and not n1 and not n2: | |
| d1 = m1.get("evidence", {}).get("direction", "") | |
| d2 = m2.get("evidence", {}).get("direction", "") | |
| if d1 and d2 and d1 != d2: | |
| return 0.6 | |
| return 0.0 | |
| def _infer_relation(self, path: list[str]) -> str: | |
| """Infer a potential relation from a path's edge types.""" | |
| relations = [] | |
| for i in range(len(path) - 1): | |
| if self.G.has_edge(path[i], path[i + 1]): | |
| rt = self.G.edges[path[i], path[i + 1]].get("relation_type", "") | |
| if rt and rt not in ("about", "is_a", "part_of"): | |
| relations.append(rt) | |
| if relations: | |
| for r in relations: | |
| if r not in COMMON_RELATIONS: | |
| return r | |
| return relations[0] | |
| return "associated_with" | |
| def _generate_explanation(self, h: Hypothesis) -> str: | |
| """Generate a human-readable explanation for a hypothesis.""" | |
| path_str = " --> ".join( | |
| f"{l.from_name} --[{l.relation_type}]--> {l.to_name}" for l in h.path | |
| ) | |
| if not path_str: | |
| return "" | |
| pmids = {l.source_paper.get("pmid", "") for l in h.path if l.source_paper.get("pmid")} | |
| key_finding = "" | |
| for l in h.path: | |
| if l.raw_text: | |
| key_finding = l.raw_text[:150] | |
| if len(l.raw_text) > 150: | |
| key_finding += "..." | |
| break | |
| lines = [ | |
| f"Hypothesis: {h.source_name} may relate to {h.target_name} via {len(h.path)}-hop path.", | |
| f"Path: {path_str}", | |
| f"Evidence: {len(h.supporting_claims)} claims from {len(pmids)} papers", | |
| ] | |
| if key_finding: | |
| lines.append(f"Key finding: '{key_finding}'") | |
| if h.testability_reason: | |
| lines.append(f"Testability: {h.testability_reason}") | |
| lines.append( | |
| f"Confidence: {h.confidence_score:.2f} | " | |
| f"Novelty: {h.novelty_score:.2f} | " | |
| f"Evidence: {h.evidence_score:.2f} | " | |
| f"Testability: {h.testability_score:.2f}" | |
| ) | |
| return "\n".join(lines) | |
| def _geometric_mean(values: list[float]) -> float: | |
| if not values: | |
| return 0.0 | |
| product = math.prod(values) | |
| return product ** (1.0 / len(values)) | |
| def _simple_slope(xs: list[int], ys: list[int]) -> float: | |
| """Simple linear regression slope without numpy.""" | |
| n = len(xs) | |
| if n < 2: | |
| return 0.0 | |
| mean_x = sum(xs) / n | |
| mean_y = sum(ys) / n | |
| num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys)) | |
| den = sum((x - mean_x) ** 2 for x in xs) | |
| if den == 0: | |
| return 0.0 | |
| return num / den | |