diff --git "a/core/knowledge_graph/src/hypothesis_engine.py" "b/core/knowledge_graph/src/hypothesis_engine.py"
new file mode 100644--- /dev/null
+++ "b/core/knowledge_graph/src/hypothesis_engine.py"
@@ -0,0 +1,2644 @@
+"""Hypothesis engine: batch-generate, persist, and rank testable hypotheses.
+
+Phase 3 of the NeuroClaw discovery loop:
+  1. batch_generate() — traverse the graph to produce hypotheses at scale
+  2. save / load — persist hypotheses to JSON
+  3. rank_hypotheses() — sort by novelty, evidence, testability, confidence
+  4. (Phase 5-6) hypotheses become executable NeuroClaw analysis tasks
+
+Usage:
+    from core.knowledge_graph import load_graph, HypothesisEngine
+
+    kg = load_graph()
+    engine = HypothesisEngine(kg)
+
+    # batch generate across all domain pairs
+    hypotheses = engine.batch_generate()
+    engine.save_hypotheses(hypotheses, "data/hypotheses.json")
+
+    # or load and re-rank
+    hypotheses = engine.load_hypotheses("data/hypotheses.json")
+    ranked = engine.rank_hypotheses(hypotheses)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import math
+import re
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Optional
+
+import networkx as nx
+
+from .graph_manager import KnowledgeGraph
+from .schema import ConceptNode
+
+logger = logging.getLogger(__name__)
+
+# ── data structures ────────────────────────────────────────────────────
+
+@dataclass
+class HypothesisLink:
+    """A single step in a hypothesis chain."""
+    from_id: str
+    from_name: str
+    to_id: str
+    to_name: str
+    relation_type: str
+    confidence: float
+    claim_id: str = ""
+    raw_text: str = ""
+    evidence: dict = field(default_factory=dict)
+    source_paper: dict = field(default_factory=dict)
+
+
+@dataclass
+class Hypothesis:
+    """A generated hypothesis with full evidence chain."""
+    id: str = ""
+    hypothesis_type: str = ""  # "path", "bridge", "gap", "contradiction"
+    source_id: str = ""
+    source_name: str = ""
+    target_id: str = ""
+    target_name: str = ""
+    path: list[HypothesisLink] = field(default_factory=list)
+    confidence_score: float = 0.0
+    novelty_score: float = 0.0
+    evidence_score: float = 0.0
+    testability_score: float = 0.0
+    composite_score: float = 0.0
+    supporting_claims: list[str] = field(default_factory=list)
+    explanation: str = ""
+    testability_reason: str = ""
+    metadata: dict = field(default_factory=dict)
+    critic_score: float = 0.0
+    critic_feedback: list[dict] = field(default_factory=list)
+    critic_rounds: int = 0
+    evolve_score: float = 0.0
+
+    def to_dict(self) -> dict:
+        d = asdict(self)
+        return d
+
+    @classmethod
+    def from_dict(cls, d: dict) -> Hypothesis:
+        d = d.copy()
+        if "path" in d and isinstance(d["path"], list):
+            d["path"] = [HypothesisLink(**p) if isinstance(p, dict) else p for p in d["path"]]
+        return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__})
+
+
+@dataclass
+class Contradiction:
+    """A pair of conflicting claims."""
+    concept_a_id: str = ""
+    concept_a_name: str = ""
+    concept_b_id: str = ""
+    concept_b_name: str = ""
+    claim_for_id: str = ""
+    claim_for_predicate: str = ""
+    claim_for_text: str = ""
+    claim_against_id: str = ""
+    claim_against_predicate: str = ""
+    claim_against_text: str = ""
+    severity: float = 0.0
+
+
+@dataclass
+class Gap:
+    """An unexplored relationship between two concepts."""
+    concept_a_id: str = ""
+    concept_a_name: str = ""
+    concept_b_id: str = ""
+    concept_b_name: str = ""
+    distance: int = 0
+    connecting_concepts: list[str] = field(default_factory=list)
+    domain_a: str = ""
+    domain_b: str = ""
+    potential_relation: str = ""
+
+
+# ── constants ──────────────────────────────────────────────────────────
+
+OPPOSING_PREDICATES = {
+    ("increases", "reduces"),
+    ("reduces", "increases"),
+    ("causes", "inhibits"),
+    ("inhibits", "causes"),
+    ("treats", "contraindicated_for"),
+    ("contraindicated_for", "treats"),
+    ("activates", "inhibits"),
+    ("inhibits", "activates"),
+}
+
+# Review-only study types (no independent empirical evidence).
+# Used by compute_frequency_boost and compute_temporal_decay. Edge-level
+# weighting by study_type lives in phase4_optimize.apply_evidence_weighting.
+_REVIEW_TYPES = {"review", "narrative_review", "systematic_review"}
+
+COMMON_RELATIONS = {"is_a", "part_of", "associated_with", "about", "is_associated_with"}
+
+# Noisy entity name patterns — hypotheses involving these are low quality.
+# Two categories:
+#   (a) process-word ≠ entity: nominalized verbs/states ("loss", "progression")
+#       that pop up as bridge nodes but carry no biological content.
+#   (b) generic containers: vague collective terms ("tissue volumes", "Family")
+#       that don't refer to a specific measurable thing.
+_NOISE_WORDS = frozenset({
+    # original set
+    "unseen", "risk", "effect", "level", "status", "change", "type",
+    "group", "factor", "model", "method", "unknown", "other", "none",
+    "miscellaneous", "various", "difference", "increase", "decrease",
+    # nominalized processes/states (category a)
+    "loss", "progression", "reduction", "elevation", "alteration",
+    "disruption", "dysfunction", "impairment", "deterioration",
+    "improvement", "recovery", "response", "onset", "activation",
+    "inhibition", "regulation", "modulation", "stimulation",
+    "expression", "function", "functions",
+    # generic containers (category b)
+    "family", "members", "phenomenon", "phenomena", "processes",
+    "mechanisms", "pathways", "symptoms", "manifestations",
+    "volumes", "volume",
+    # life events / demographics that are not biological entities
+    "stress", "life", "events", "exposure", "outcome", "outcomes",
+    "quality",
+})
+
+NOISE_PATTERNS = [
+    re.compile(r"^[A-Z][a-z]?$"),                                  # 1-2 letter: "Id", "Ca", "Mg"
+    re.compile(r"^[A-Z][a-z]{2,4}$"),                              # Short mixed-case: "Tics", "Risk"
+    re.compile(r"^\d+$"),                                           # Pure numbers
+]
+
+# (C-1) Generic-phrase patterns for INTERMEDIATE nodes. The token-based
+# `_NOISE_WORDS` filter misses phrases like "functional connectivity" or
+# "neural activity" because no individual word is in the noise list, but
+# the WHOLE phrase carries no measurable content. We only block these when
+# they appear as INTERMEDIATE nodes (paths can legitimately end in
+# "functional connectivity" as an outcome metric).
+_GENERIC_INTERMEDIATE_PATTERNS = [
+    re.compile(r"^(abnormal|altered|impaired|reduced|increased|disrupted|aberrant)?\s*"
+               r"(brain|neural|neuronal|cortical|cerebral)\s+"
+               r"(activity|activation|function|functioning|connectivity|"
+               r"network|networks|signaling|metabolism|response|responses)$",
+               re.I),
+    re.compile(r"^(functional|structural|anatomical|effective)\s+"
+               r"(connectivity|network|networks|integrity|abnormalit(y|ies))$", re.I),
+    re.compile(r"^(disease|symptom|clinical|treatment|therapeutic)\s+"
+               r"(progression|outcome|outcomes|response|severity|burden|stage|staging)$", re.I),
+    re.compile(r"^(common|typical|specific|various|different)\s+"
+               r"(features|patterns|mechanisms|processes)$", re.I),
+    re.compile(r"^(neuro)?(degeneration|inflammation|protection|plasticity|genesis|imaging)$",
+               re.I),
+    re.compile(r"^(grey|gray|white)\s+matter$", re.I),
+    re.compile(r"^(cognitive|behavioral|emotional|motor|sensory)\s+"
+               r"(deficit|deficits|dysfunction|impairment|abnormalit(y|ies))$", re.I),
+]
+
+# (C-3) Target-name patterns that LOOK like outcomes (so they pass
+# _is_dataset_outcome's keyword fallback) but are actually too broad to
+# drive a DL experiment. We block these even if their domain says
+# disease/cognitive_function.
+_TARGET_TOO_BROAD_PATTERNS = [
+    # bare umbrella nouns (single token)
+    re.compile(r"^(skill|skills|ability|abilities|outcome|outcomes|"
+               r"symptom|symptoms|manifestation|manifestations|"
+               r"phenomenon|phenomena|finding|findings|"
+               r"deficit|deficits|impairment|impairments|"
+               r"function|functions|functioning|behavior|behaviors|"
+               r"capability|capabilities|condition|conditions|"
+               r"disease|diseases|disorder|disorders|syndrome|syndromes|"
+               r"focus|integration|balance|knowledge|autonomy|"
+               r"performance|adaptation|resilience|vulnerability|"
+               r"recovery|progression|mechanism|process)$", re.I),
+    # broad-category disease umbrellas (when these are the literal target,
+    # they're too generic — but specific subtypes like "Alzheimer Disease"
+    # don't match these patterns)
+    re.compile(r"^(neurological|psychiatric|mental|cognitive|behavioral|"
+               r"neurodegenerative|cardiovascular)\s+"
+               r"(disease|diseases|disorder|disorders|condition|conditions)$", re.I),
+    re.compile(r"^(human\s+)?(disease|diseases|disorder|disorders)$", re.I),
+    re.compile(r"^(brain|mental|psychiatric|psychological)\s+health$", re.I),
+    re.compile(r"^clinical\s+(features|outcome|outcomes|presentation|status)$", re.I),
+    # "X deficits/impairments" patterns (too vague as targets)
+    re.compile(r"^(motor|cognitive|neurocognitive|functional|social|"
+               r"verbal|visual|sensory|emotional|behavioral)\s+"
+               r"(deficit|deficits|impairment|impairments|dysfunction|"
+               r"disability|decline|deterioration)$", re.I),
+]
+
+# Vague relation types that add little signal
+VAGUE_RELATIONS = {"is_associated_with", "associated_with", "about"}
+
+# CognitiveAtlas / MeSH concept ids that are top-degree generic hubs
+# in the KG. The audit found these at degrees 700-9000+, with names that
+# are real English words (not caught by _NOISE_WORDS) but referring to
+# extremely abstract umbrella concepts:
+#
+#   COGAT trm_4a3fd79d0a891  "memory"      degree 2248
+#   COGAT trm_4a3fd79d0a80f  "logic"       degree 2052
+#   COGAT trm_5159c80c1dd24  "loss"        degree 1034
+#   COGAT trm_4a3fd79d09741  "activation"  degree  840
+#   COGAT trm_4a3fd79d0afcf  "risk"        degree  722
+#   COGAT trm_4a3fd79d0b2a8  "stress"      degree  139
+#   MSH:D001921              "Brain"       degree 9157
+#   MSH:D009474              "Neurons"     degree 1354
+#
+# Hypotheses with these as intermediate nodes or endpoints are too vague
+# to drive a downstream DL experiment ("FPN -> memory" is not testable
+# because we don't know which memory subsystem). Filtered in post_process.
+PATH_IGNORE_NODE_IDS = frozenset({
+    "COGAT_CONCEPT:trm_4a3fd79d0a891",   # memory
+    "COGAT_CONCEPT:trm_4a3fd79d0a80f",   # logic
+    "COGAT_CONCEPT:trm_5159c80c1dd24",   # loss
+    "COGAT_CONCEPT:trm_4a3fd79d09741",   # activation
+    "COGAT_CONCEPT:trm_4a3fd79d0afcf",   # risk
+    "COGAT_CONCEPT:trm_4a3fd79d0b2a8",   # stress
+    "MSH:D001921",                        # Brain (umbrella)
+    "MSH:D009474",                        # Neurons (umbrella)
+})
+
+# Disease/category mega-hubs that are valid as hypothesis endpoints
+# ("predict Alzheimer" is fine) but NOT as intermediate transit nodes
+# ("A → Alzheimer → B" is just "A relates to AD, AD relates to B" — no
+# discovery value). Audit found 37.8% of hypotheses transit through these.
+INTERMEDIATE_ONLY_IGNORE_IDS = frozenset({
+    "COGAT_DISORDER:dso_5419",            # schizophrenia (degree 1005)
+    "MSH:D009103",                         # Multiple Sclerosis (816)
+    "COGAT_DISORDER:dso_3312",            # bipolar disorder (703)
+    "MSH:D000544",                         # Alzheimer Disease (746)
+    "MSH:D004827",                         # Epilepsy (750)
+    "MSH:D010300",                         # Parkinson Disease (709)
+    "COGAT_DISORDER:dso_0060041",         # autism spectrum disorder (613)
+    "MSH:D001289",                         # ADHD (601)
+    "MSH:D003863",                         # Depression (577)
+    "MSH:D001523",                         # Mental Disorders (489)
+})
+
+DIRECTIONAL_RELATIONS = {
+    "causes", "treats", "increases", "reduces", "modulates",
+    "activates", "inhibits", "is_biomarker_of", "is_risk_factor_for",
+    "predicts", "distinguishes", "mediates",
+    # Brain decoding directional predicates
+    "evokes", "decoded_from", "elicits",
+}
+
+# domain pairs worth exploring — aligned with NeuroClaw imaging experiments
+# target datasets: UKB (T1w/dMRI/rfMRI/SWI), ADNI (T1w/PET/fMRI/DTI), HCP-YA (T1w/T2w/fMRI/dMRI/MEG)
+# experiment models: BrainGNN, NeuroStorm, SVM, XGBoost on raw images + handcrafted features
+#
+# Design principle: target should be a dataset OUTCOME (what we want to predict),
+# source should be a MEASURABLE feature (what the dataset provides as input).
+# - UKB outcomes: fluid intelligence, neuroticism, dementia diagnosis, motor tests
+# - ADNI outcomes: MCI→AD conversion, CDR-SB, cognitive composite
+# - HCP outcomes: fluid/crystallized IQ, emotion recognition, personality traits
+#
+# Allowed sources (what we can measure): neuroanatomy (MRI regions), connectivity
+# networks, gene, biomarker (CSF/PET), drug (for intervention studies).
+# Allowed targets (what we predict): disease (diagnostic labels), cognitive_function
+# (the OUTCOMES — includes behavior, personality, affect).
+DEFAULT_DOMAIN_PAIRS = [
+    # core: measurable features → clinical/behavioral OUTCOMES
+    ("neuroanatomy", "disease"),             # MRI → diagnosis
+    ("neuroanatomy", "cognitive_function"),  # MRI → cognition/behavior
+    ("connectivity", "disease"),             # dMRI/fMRI connectivity → diagnosis
+    ("connectivity", "cognitive_function"),  # connectivity → cognition
+    # genetics → outcomes (UKB 500k WGS)
+    ("gene", "disease"),
+    ("gene", "cognitive_function"),          # GWAS → behavior/IQ
+    # fluid biomarkers → outcomes (ADNI CSF, blood)
+    ("biomarker", "disease"),
+    ("biomarker", "cognitive_function"),
+    # drug → outcomes (ADNI pharmaceutical arms)
+    ("drug", "disease"),
+    ("drug", "cognitive_function"),
+    # cross-outcome (comorbidity, transdiagnostic)
+    ("disease", "disease"),
+    ("cognitive_function", "disease"),       # e.g. anxiety → MS diagnosis risk
+    ("disease", "cognitive_function"),       # e.g. AD → processing speed decline
+]
+
+# Domains that are NOT directly measurable from brain imaging
+# These hypotheses will be filtered out in post_process
+NON_MEASURABLE_BIOMARKER_TYPES = {
+    "neurotransmitter",   # needs specialized PET tracers (e.g., 11C-raclopride for DA)
+    "protein",            # needs tissue biopsy or CSF
+    "enzyme",             # needs molecular assays
+    "receptor",           # needs specialized PET (e.g., 11C-PIB for Aβ, but that's biomarker domain)
+    # fluid biomarkers — not available in UKB/HCP-YA, only ADNI CSF subset
+    "csf_biomarker",
+    "blood_biomarker",
+    "saliva_biomarker",
+    "tear_biomarker",
+}
+
+# Specific entity name patterns that are NOT directly measurable from imaging
+_NON_MEASURABLE_PATTERNS = [
+    re.compile(r"(neurotransmitter|dopamine|serotonin|norepinephrine|gaba|glutamate|acetylcholine)\s+(level|concentration|release|synthesis)", re.I),
+    re.compile(r"(alpha|beta|gamma|delta|kappa)\s*synuclein\s*(pathology|aggregation|expression)", re.I),
+    re.compile(r"(amyloid|tau|phosphorylated)\s*(beta|protein|peptide)\s*(aggregation|production|clearance)", re.I),
+    re.compile(r"(enzyme|kinase|phosphatase)\s*(activity|expression)", re.I),
+    re.compile(r"(receptor|transporter)\s*(density|binding|expression)", re.I),
+    re.compile(r"(TNF|interleukin|IL-\d|cytokine|chemokine)\s*(alpha|beta|level|concentration|production)", re.I),
+    re.compile(r"CSF\s+(Aβ|amyloid|tau|p-tau|NFL|neurofilament)", re.I),
+    re.compile(r"(blood|plasma|serum)\s+(biomarker|marker|level|concentration)", re.I),
+    re.compile(r"(CSF|cerebrospinal fluid)\s+", re.I),
+    re.compile(r"(saliva|tear|urine)\s+(biomarker|marker|level)", re.I),
+    re.compile(r"(biopsy|tissue sample)", re.I),
+]
+
+# Non-neurological target domains — brain regions should not directly predict these
+_NON_NEUROLOGICAL_TARGETS = re.compile(
+    r"(urinary|incontinence|frequency|enuresis|bladder|renal|kidney|liver|"
+    r"gastrointestinal|cardiac|pulmonary|dermatol|orthopedic|musculoskeletal|"
+    r"fracture|sprain|tumor|cancer|carcinoma|leukemia|lymphoma)", re.I
+)
+
+# DATASET-OUTCOME whitelist — covers actual predicted variables in UKB/ADNI/HCP-YA
+# papers (see README "Dataset Outcomes" for references to typical prediction tasks).
+# Target must match one of these patterns to pass the post_process filter.
+# We also auto-accept any concept in the `disease` domain (clinical diagnosis
+# IS the most common outcome) and any MSH/CogAtlas concept in the
+# `cognitive_function` domain (behavior/cognition).
+#
+# Categories cover:
+# - Clinical diagnostic labels (Alzheimer, schizophrenia, MCI, etc.) — all 3 datasets
+# - AD staging / conversion (CN→MCI→AD, ATN) — ADNI
+# - Clinical scales (CDR, MMSE, ADAS-Cog, PHQ-9, MoCA, NPI) — ADNI + UKB
+# - Cognitive abilities (IQ, memory, attention, processing speed) — all 3
+# - Specific cognitive tests (PMAT, flanker, N-back, delay discounting) — HCP
+# - Personality (Big Five) — HCP + UKB
+# - Behavior/affect (anxiety, depression, aggression, risk-taking) — all 3
+# - Motor/sensory (grip strength, gait, reaction time, dexterity) — UKB + HCP
+# - Brain age / neurodegeneration markers — UKB + ADNI
+# - NeuroSTORM-evaluated phenotypes: MND, early psychosis (HCP-EP), ADHD200,
+#   COBRE, UCLA L5c, TCP psychiatric scales, fMRI task state classification
+# - Subject fingerprinting / re-identification
+_OUTCOME_KEYWORDS = re.compile(
+    r"("
+    # cognitive abilities — general
+    r"intelligence|cognition|cognitive\s+(function|ability|performance|deterioration|impairment|dysfunction|decline|test|assessment|composite|score)|"
+    r"memory|attention|executive|processing\s+speed|reasoning|language|"
+    r"fluency|perception|reaction\s+time|fluid\s+intelligence|"
+    r"crystallized\s+intelligence|working\s+memory|episodic\s+memory|"
+    r"semantic\s+memory|verbal\s+(memory|fluency|learning)|visuospatial|"
+    # specific HCP NIH Toolbox / cognitive tasks
+    r"pmat|flanker|card\s+sort|n-?back|list\s+sort|picture\s+sequence|"
+    r"pattern\s+comparison|picture\s+vocabulary|oral\s+reading|"
+    r"delay\s+discounting|risk[- ]taking|go[- ]no[- ]go|"
+    # HCP Penn CNB cognitive battery
+    r"penn\s+(word|matrix|line\s+orientation|continuous\s+performance|progressive\s+matrices|fear|emotion|cnb)|"
+    r"matrix\s+pattern|numeric\s+memory|prospective\s+memory|pairs\s+matching|"
+    r"trail\s+making|symbol\s+digit|boston\s+naming|animal\s+fluency|"
+    r"category\s+fluency|logical\s+memory|clock\s+drawing|ravlt|"
+    # HCP 7 task states (NeuroSTORM state classification)
+    r"emotion\s+task|gambling\s+task|language\s+task|motor\s+task|"
+    r"relational\s+task|social\s+task|working\s+memory\s+task|"
+    # clinical scales (ADNI/UKB/TCP/HCP)
+    r"\b(cdr|cdr-sb|mmse|moca|adas|adas-cog|npi|faq|gds|phq-?9|gad-?7|bai|hdrs|hrsd|hamd|ham-d|"
+    r"bdi|ymrs|panss|sans|saps|audit|asrs|pro|adi|srs|tci|neo-?ffi|asr|abcl|"
+    r"cidi|cidi-sf|eysenck|swemwbs|psqi|ftnd|ssaga|masq|promis|upsit)\b|"
+    r"adult\s+self\s+report|adult\s+behavior\s+checklist|"
+    # personality / affect
+    r"neuroticism|extraversion|agreeableness|conscientiousness|openness|"
+    r"personality|temperament|affect|mood|emotion|anxiety|depression|"
+    r"well-?being|satisfaction|life\s+satisfaction|psychological|stress\s+response|"
+    r"anxiety\s+sensitivity|cautiousness|"
+    r"affect\s+(positive|negative)|emotion\s+recognition|emotional\s+regulation|"
+    r"perceived\s+(stress|rejection|hostility)|anger|fear|sadness|"
+    # social functioning (HCP + UKB)
+    r"loneliness|social\s+(isolation|support|relationship|cognition)|"
+    r"meaning\s+and\s+purpose|instrumental\s+support|emotional\s+support|"
+    r"friendship|"
+    # behavior
+    r"behavior|aggression|impulsivity|addiction|substance|alcohol|smoking|"
+    r"tobacco|cannabis|cocaine|opiate|opioid|hallucinogen|"
+    r"drug\s+use|substance\s+use|sleep\s+quality|insomnia|"
+    # diagnoses / clinical outcomes — added NeuroSTORM-evaluated cohorts and ADNI stages
+    r"alzheimer|parkinson|schizophrenia|autism|adhd|bipolar|epilepsy|"
+    r"mci|mild\s+cognitive|dementia|psychosis|early\s+psychosis|stroke|post[- ]stroke|"
+    r"multiple\s+sclerosis|huntington|frontotemporal|lewy\s+body|"
+    r"motor\s+neuron\s+disease|mnd|als|"
+    r"transdiagnostic|psychiatric\s+disorder|mental\s+health\s+disorder|"
+    r"ocd|ptsd|phobia|panic|agoraphobia|somatoform|eating\s+disorder|"
+    # ADNI-specific diagnostic stages
+    r"\b(cn|smc|emci|lmci|ad\b|preclinical|at\b|atn|alzheimer\s+continuum)\b|"
+    r"significant\s+memory\s+concern|subjective\s+(memory|cognitive)\s+(concern|complaint|decline)|"
+    r"cognitively\s+(normal|unimpaired)|"
+    r"disorder|syndrome|diagnosis|onset|conversion|progression|severity|"
+    r"symptom|manifestation|prognosis|outcome|treatment\s+response|"
+    r"disease\s+(stage|staging|duration|burden)|"
+    # cardiovascular / metabolic diseases (UKB ICD-10)
+    r"myocardial\s+infarction|heart\s+failure|hypertension|atrial\s+fibrillation|"
+    r"coronary|cardiovascular\s+disease|diabetes|type\s*[12]\s+diabetes|"
+    r"chronic\s+kidney|fatty\s+liver|nafld|metabolic\s+syndrome|obesity|"
+    # AD-specific biomarker status
+    r"amyloid\s+(status|positivity|positive|negative|load|burden|suvr)|"
+    r"tau\s+(status|positivity|positive|tangle|pathology|burden|suvr)|"
+    r"atn\s+(profile|stage|classification)|"
+    r"neurodegeneration\s+(stage|status)|"
+    # brain age / aging
+    r"brain\s+age|brain-?age(-?gap)?|aging|age[- ]related|age\s+acceleration|"
+    # motor / sensory
+    r"grip\s+strength|gait|motor\s+coordination|motor\s+function|"
+    r"balance|tremor|dexterity|walking\s+speed|two[- ]minute\s+walk|endurance|"
+    r"visual\s+(acuity|field)|audition|hearing|olfaction|taste|pain|"
+    r"chronic\s+pain|musculoskeletal\s+pain|"
+    # mortality / longevity
+    r"mortality|all-?cause\s+death|survival|life\s+expectancy"
+    r")", re.I
+)
+
+# Target domains considered as valid dataset outcomes
+_OUTCOME_DOMAINS = {"disease", "cognitive_function"}
+
+# NeuroClaw testable modalities and their keywords
+# Aligned with UKB/ADNI/HCP-YA available data + deep learning models
+TESTABLE_MODALITIES = {
+    "sMRI": ["cortical thickness", "volume", "atrophy", "gray matter", "white matter",
+             "brain structure", "morphometry", "VBM", "FreeSurfer", "recon-all",
+             "brain region", "hippocampus", "amygdala", "thalamus", "caudate",
+             "putamen", "cerebellum", "insula", "cortex", "ventricle"],
+    "fMRI": ["functional connectivity", "BOLD", "activation", "resting-state",
+             "task-based", "network", "default mode", "fMRI", "brain response",
+             "neural activity", "brain activation"],
+    "dMRI": ["DTI", "diffusion", "fractional anisotropy", "tractography",
+             "white matter integrity", "structural connectivity", "FA", "MD",
+             "connectivity matrix", "fiber bundle", "white matter tract"],
+    "PET": ["PET", "tracer", "amyloid", "tau", "FDG", "SUVr", "binding potential",
+            "glucose metabolism", "florbetapir", "flortaucipir"],
+    "EEG": ["EEG", "ERP", "oscillation", "power spectrum", "alpha", "beta", "theta",
+            "delta", "gamma", "microstate", "coherence", "event-related"],
+    "organ_volume": ["organ volume", "liver volume", "kidney volume", "spleen volume",
+                     "MedSAM", "segmentation", "organ size"],
+}
+
+# Deep learning model keywords for testability scoring
+DL_MODEL_KEYWORDS = [
+    "BrainGNN", "NeuroStorm", "GNN", "graph neural", "region of interest", "ROI",
+    "connectivity matrix", "adjacency", "node feature", "graph convolution",
+    "deep learning", "CNN", "ResNet", "attention", "transformer",
+    "voxel", "patch", "whole-brain",
+]
+
+# ── Dataset-Available Variables ──────────────────────────────────────
+# Defines what can be measured in each dataset. Hypotheses must start
+# from these features and end at dataset-available outcomes.
+
+DATASET_FEATURES = {
+    "UKB": {
+        # sMRI (T1w): FreeSurfer-derived ROI measures
+        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_cortical_area":     {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_cortical_volume":   {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_voxel":             {"modality": "sMRI", "tool": "voxel",       "level": "voxel"},
+        # dMRI: diffusion metrics per tract
+        "dmri_fa":  {"modality": "dMRI", "tool": "TBSS", "level": "tract"},
+        "dmri_md":  {"modality": "dMRI", "tool": "TBSS", "level": "tract"},
+        "dmri_sc":  {"modality": "dMRI", "tool": "tractography", "level": "connectivity"},
+        # rfMRI: functional connectivity
+        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
+        # lesion segmentation
+        "lesion_volume": {"modality": "sMRI", "tool": "MedSAM", "level": "ROI"},
+        # non-imaging
+        "genetics":       {"modality": "genetics",    "tool": "WGS/GSA",     "level": "SNP"},
+        "environment":    {"modality": "environment",  "tool": "questionnaire","level": "variable"},
+        "physical":       {"modality": "physical",     "tool": "measurement",  "level": "variable"},
+        "hospitalization":{"modality": "clinical",     "tool": "ICD10",        "level": "outcome"},
+    },
+    "ADNI": {
+        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_voxel":             {"modality": "sMRI", "tool": "voxel",       "level": "voxel"},
+        "pet_amyloid": {"modality": "PET", "tool": "florbetapir",  "level": "ROI"},
+        "pet_tau":     {"modality": "PET", "tool": "flortaucipir", "level": "ROI"},
+        "pet_fdg":     {"modality": "PET", "tool": "FDG",          "level": "ROI"},
+        "fmri_fc":     {"modality": "fMRI", "tool": "task/resting", "level": "connectivity"},
+        "dti_fa":      {"modality": "dMRI", "tool": "DTI",          "level": "tract"},
+        "lesion_volume": {"modality": "sMRI", "tool": "MedSAM", "level": "ROI"},
+        "genetics":    {"modality": "genetics", "tool": "APOE/GWAS", "level": "SNP"},
+        "medication":  {"modality": "clinical", "tool": "medication_log", "level": "variable"},
+    },
+    "HCP_YA": {
+        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_myelin":            {"modality": "sMRI", "tool": "T1w/T2w",    "level": "ROI"},
+        "smri_voxel":             {"modality": "sMRI", "tool": "voxel",       "level": "voxel"},
+        "rfmri_fc":  {"modality": "fMRI", "tool": "rfMRI",    "level": "connectivity"},
+        "tfmri_task":{"modality": "fMRI", "tool": "task fMRI","level": "activation"},
+        "dmri_sc":   {"modality": "dMRI", "tool": "HARDI",    "level": "connectivity"},
+        "meg":       {"modality": "MEG",  "tool": "MEG",      "level": "connectivity"},
+    },
+    # NAS-available patient cohorts with preprocessed ROI time series.
+    # Phenotype CSVs live under Z:\Dataset\fMRI\phenotype and the dataset-
+    # specific rest csvs. All supply rfMRI volumes or ROI series; structural
+    # T1 is available for HCP-EP and HCP-Aging (the other four are rfMRI-only
+    # public releases).
+    "ABIDE": {
+        "rfmri_fc":     {"modality": "fMRI", "tool": "rfMRI",       "level": "connectivity"},
+        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI",       "level": "ROI"},
+    },
+    "ADHD200": {
+        "rfmri_fc":     {"modality": "fMRI", "tool": "rfMRI",       "level": "connectivity"},
+        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI",       "level": "ROI"},
+    },
+    "COBRE": {
+        "rfmri_fc":     {"modality": "fMRI", "tool": "rfMRI",       "level": "connectivity"},
+        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI",       "level": "ROI"},
+    },
+    "UCLA": {
+        # UCLA CNP — rest + 6 task contrasts, cross-diagnosis cohort.
+        "rfmri_fc":     {"modality": "fMRI", "tool": "rfMRI",       "level": "connectivity"},
+        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI",       "level": "ROI"},
+        "tfmri_task":   {"modality": "fMRI", "tool": "task fMRI",   "level": "activation"},
+    },
+    "HCP_EP": {
+        # HCP Early Psychosis — patient cohort, T1w + rfMRI cleaned.
+        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "rfmri_fc":     {"modality": "fMRI", "tool": "rfMRI",       "level": "connectivity"},
+        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI",       "level": "ROI"},
+    },
+    "HCP_AGING": {
+        # HCP-Aging — T1w + rfMRI REST1/REST2 + 3 task contrasts.
+        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "smri_myelin":             {"modality": "sMRI", "tool": "T1w/T2w",    "level": "ROI"},
+        "rfmri_fc":     {"modality": "fMRI", "tool": "rfMRI",       "level": "connectivity"},
+        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI",       "level": "ROI"},
+        "tfmri_task":   {"modality": "fMRI", "tool": "task fMRI",   "level": "activation"},
+    },
+    # ── Visual decoding (fMRI) ───────────────────────────────────────���──
+    # NSD & BOLD5000: image-stimulus visual task fMRI, no rest.
+    "NSD": {
+        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "tfmri_visual_voxel":      {"modality": "fMRI", "tool": "task fMRI",
+                                     "level": "voxel", "stimulus": "natural_image"},
+        "tfmri_visual_roi":        {"modality": "fMRI", "tool": "task fMRI",
+                                     "level": "ROI",   "stimulus": "natural_image"},
+    },
+    "BOLD5000": {
+        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
+        "tfmri_visual_voxel":      {"modality": "fMRI", "tool": "task fMRI",
+                                     "level": "voxel", "stimulus": "ImageNet_COCO_Scene"},
+        "tfmri_visual_roi":        {"modality": "fMRI", "tool": "task fMRI",
+                                     "level": "ROI",   "stimulus": "ImageNet_COCO_Scene"},
+    },
+    # ── Visual decoding (EEG) ───────────────────────────────────────────
+    "SEED_DV": {
+        "eeg_psd": {"modality": "EEG", "tool": "PSD", "level": "channel"},
+        "eeg_de":  {"modality": "EEG", "tool": "DE",  "level": "channel"},
+    },
+    # ── Emotion decoding (EEG + eye tracking) ───────────────────────────
+    "SEED": {
+        "eeg_de":       {"modality": "EEG", "tool": "DE",  "level": "channel"},
+        "eeg_psd":      {"modality": "EEG", "tool": "PSD", "level": "channel"},
+    },
+    "SEED_IV": {
+        "eeg_de":       {"modality": "EEG", "tool": "DE",  "level": "channel"},
+        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
+                         "level": "variable"},
+    },
+    "SEED_V": {
+        "eeg_de":       {"modality": "EEG", "tool": "DE",  "level": "channel"},
+        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
+                         "level": "variable"},
+    },
+    "SEED_VII": {
+        "eeg_de":       {"modality": "EEG", "tool": "DE",  "level": "channel"},
+        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
+                         "level": "variable"},
+    },
+    "SEED_GER": {
+        "eeg_de":       {"modality": "EEG", "tool": "DE",  "level": "channel"},
+        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
+                         "level": "variable"},
+    },
+    "SEED_FRA": {
+        "eeg_de":       {"modality": "EEG", "tool": "DE",  "level": "channel"},
+        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
+                         "level": "variable"},
+    },
+    # ── Vigilance decoding (EEG) ────────────────────────────────────────
+    "SEED_VIG": {
+        "eeg_de":       {"modality": "EEG", "tool": "DE",  "level": "channel"},
+        "eog":          {"modality": "EOG", "tool": "EOG", "level": "channel"},
+        "eye_movement": {"modality": "eye_tracking", "tool": "gaze/blink",
+                         "level": "variable"},
+    },
+}
+
+DATASET_OUTCOMES = {
+    "UKB": [
+        "disease_diagnosis",   # ICD10 codes
+        "mortality",           # death registry
+        "cognitive_score",     # touchscreen cognitive tests
+        "imaging_phenotype",   # derived imaging phenotypes
+    ],
+    "ADNI": [
+        "diagnosis",           # CN / MCI / AD
+        "conversion",          # MCI → AD conversion
+        "cognitive_decline",   # ADAS-Cog, MMSE decline
+        "biomarker_status",    # amyloid+/tau+ status
+    ],
+    "HCP_YA": [
+        "behavioral_score",    # NIH Toolbox
+        "cognitive_task",      # task fMRI performance
+        "personality",         # NEO-FFI
+    ],
+    # ABIDE — ASD vs controls, rest only.
+    "ABIDE": [
+        "diagnosis",           # ASD vs TD
+        "symptom_severity",    # ADOS, ADI-R, SRS
+        "cognitive_score",     # FIQ/VIQ/PIQ
+    ],
+    # ADHD200 — ADHD subtype vs TDC.
+    "ADHD200": [
+        "diagnosis",           # ADHD (combined/inattentive/hyperactive) vs TDC
+        "symptom_severity",    # ADHD-RS, Conners
+        "cognitive_score",     # WASI/WISC
+    ],
+    # COBRE — schizophrenia vs controls.
+    "COBRE": [
+        "diagnosis",           # schizophrenia vs HC
+        "symptom_severity",    # PANSS positive/negative/general
+        "cognitive_score",     # WAIS
+    ],
+    # UCLA CNP — schizophrenia/bipolar/ADHD vs controls.
+    "UCLA": [
+        "diagnosis",           # SCZ / BP / ADHD / HC
+        "symptom_severity",    # HAM-D, YMRS, ADHD-RS
+        "cognitive_task",      # 6 task contrasts
+    ],
+    # HCP-EP — early psychosis (FES + AR) vs HC.
+    "HCP_EP": [
+        "diagnosis",           # affective/non-affective psychosis vs HC
+        "symptom_severity",    # PANSS, SANS, YMRS
+        "cognitive_score",     # MATRICS Consensus Cognitive Battery
+    ],
+    # HCP-Aging — lifespan 36-100 yrs, healthy aging.
+    "HCP_AGING": [
+        "cognitive_decline",   # NIH Toolbox across age
+        "behavioral_score",    # same battery as HCP-YA
+        "cognitive_task",      # CARIT/FACENAME/VISMOTOR
+    ],
+    # ── Visual decoding outcomes ────────────────────────────────────────
+    "NSD": [
+        "image_category",         # COCO 80-class
+        "image_semantic",         # CLIP / language-model embedding
+        "stimulus_reconstruction",# pixel / latent reconstruction
+    ],
+    "BOLD5000": [
+        "image_category",         # ImageNet 1000-class / COCO / Scene
+        "scene_type",             # Scene 365-class
+        "image_semantic",
+    ],
+    "SEED_DV": [
+        "video_class",            # discrete video categories
+        "video_semantic",
+        "video_reconstruction",
+    ],
+    # ── Emotion decoding outcomes ───────────────────────────────────────
+    "SEED":     ["emotion_3class"],            # positive/neutral/negative
+    "SEED_IV":  ["emotion_4class"],            # happy/sad/fear/neutral
+    "SEED_V":   ["emotion_5class"],            # +disgust
+    "SEED_VII": ["emotion_7class", "emotion_continuous"],
+    "SEED_GER": ["emotion_3class"],
+    "SEED_FRA": ["emotion_3class"],
+    # ── Vigilance decoding outcomes ─────────────────────────────────────
+    "SEED_VIG": ["vigilance_continuous", "perclos"],
+}
+
+# Imaging feature templates — dynamically combined with AAL atlas regions
+# {region} is replaced with actual neuroanatomy node names at generation time
+IMAGING_FEATURE_TEMPLATES = {
+    # sMRI FreeSurfer ROI features
+    "cortical thickness of {region}":   {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
+                                          "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
+    "gray matter volume of {region}":   {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
+                                          "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
+    "subcortical volume of {region}":   {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
+                                          "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
+    "cortical area of {region}":        {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
+                                          "datasets": ["UKB", "HCP_YA", "HCP_AGING"]},
+    # dMRI tract features
+    "fractional anisotropy of {region}": {"modality": "dMRI", "tool": "TBSS", "level": "tract",
+                                           "datasets": ["UKB", "HCP_YA"]},
+    "mean diffusivity of {region}":      {"modality": "dMRI", "tool": "TBSS", "level": "tract",
+                                           "datasets": ["UKB", "HCP_YA"]},
+    # PET ROI features (ADNI)
+    "amyloid SUVR of {region}":          {"modality": "PET", "tool": "florbetapir", "level": "ROI",
+                                           "datasets": ["ADNI"]},
+    "tau SUVR of {region}":              {"modality": "PET", "tool": "flortaucipir", "level": "ROI",
+                                           "datasets": ["ADNI"]},
+    "FDG uptake of {region}":            {"modality": "PET", "tool": "FDG", "level": "ROI",
+                                           "datasets": ["ADNI"]},
+    # lesion segmentation
+    "lesion volume of {region}":          {"modality": "sMRI", "tool": "MedSAM", "level": "ROI",
+                                           "datasets": ["UKB", "ADNI"]},
+}
+
+# Connectivity feature templates — {a} and {b} are AAL regions
+CONNECTIVITY_FEATURE_TEMPLATES = {
+    "functional connectivity between {a} and {b}":    {"modality": "fMRI", "tool": "rfMRI",
+                                                        "level": "connectivity",
+                                                        "datasets": ["UKB", "ADNI", "HCP_YA",
+                                                                     "ABIDE", "ADHD200", "COBRE",
+                                                                     "UCLA", "HCP_EP", "HCP_AGING"]},
+    "effective connectivity from {a} to {b}":         {"modality": "fMRI", "tool": "DCM/GC",
+                                                        "level": "connectivity",
+                                                        "datasets": ["ADNI", "HCP_YA",
+                                                                     "UCLA", "HCP_EP", "HCP_AGING"]},
+    "structural connectivity between {a} and {b}":    {"modality": "dMRI", "tool": "tractography",
+                                                        "level": "connectivity",
+                                                        "datasets": ["UKB", "HCP_YA"]},
+}
+
+# Domain pairs for imaging-driven hypothesis generation
+# source domain → target domain, aligned with dataset modalities
+IMAGING_DOMAIN_PAIRS = [
+    # sMRI features → disease
+    ("neuroanatomy", "disease"),
+    # connectivity → disease
+    ("connectivity", "disease"),
+    # sMRI features → cognitive function
+    ("neuroanatomy", "cognitive_function"),
+    # gene → brain structure (UKB genetics + imaging)
+    ("gene", "neuroanatomy"),
+    # disease → drug (ADNI)
+    ("disease", "drug"),
+]
+
+# Brain decoding domain pairs (NSD / BOLD5000 / SEED family).
+# These are SEPARATE from IMAGING_DOMAIN_PAIRS because decoding hypotheses
+# reverse the usual direction: instead of "brain feature → clinical outcome",
+# they go "stimulus ↔ brain" or "brain → psychological-state label".
+DECODING_DOMAIN_PAIRS = [
+    # Encoding: stimulus drives brain response
+    ("visual_stimulus", "neuroanatomy"),
+    ("visual_stimulus", "imaging_feature"),
+    ("visual_stimulus", "connectivity"),
+    # Decoding: brain predicts stimulus identity
+    ("neuroanatomy",    "visual_stimulus"),
+    ("imaging_feature", "visual_stimulus"),
+    # EEG → emotion (SEED/SEED-IV/SEED-V/SEED-VII/SEED-GER/SEED-FRA)
+    ("imaging_feature", "emotion"),
+    ("neuroanatomy",    "emotion"),
+    # EEG → vigilance (SEED-VIG)
+    ("imaging_feature", "vigilance"),
+    ("neuroanatomy",    "vigilance"),
+]
+
+# AAL atlas regions used for imaging feature generation
+# Subset of neuroanatomy nodes from NN_AAL source
+_AAL_REGION_KEYWORDS = [
+    "Precentral", "Frontal_Sup", "Frontal_Mid", "Frontal_Inf", "Rolandic_Oper",
+    "Supp_Motor", "Olfactory", "Frontal_Sup_Med", "Frontal_Med_Orb",
+    "Rectus", "Insula", "Cingulate", "Hippocampus", "Parahippocampal",
+    "Amygdala", "Calcarine", "Cuneus", "Lingual", "Occipital",
+    "Fusiform", "Postcentral", "Parietal", "SupraMarginal", "Angular",
+    "Precuneus", "Paracentral", "Caudate", "Putamen", "Pallidum",
+    "Thalamus", "Heschl", "Temporal", "Temporal_Pole",
+]
+
+# ── engine ─────────────────────────────────────────────────────────────
+
+class HypothesisEngine:
+    """Batch-generate, persist, and rank testable hypotheses from a knowledge graph."""
+
+    def __init__(self, kg: KnowledgeGraph):
+        self.kg = kg
+        self.G = kg.G
+        self._index = kg._index
+        # Build claims index for frequency_boost: (subj, pred, obj) → [claim_meta, ...]
+        self._claims_by_triple: dict[tuple[str, str, str], list[dict]] = {}
+        for nid, node in self._index.items():
+            if "claim" not in node.domain_tags:
+                continue
+            meta = node.metadata
+            key = (meta.get("subject_id", ""), meta.get("predicate", ""), meta.get("object_id", ""))
+            if key[0] and key[2]:
+                self._claims_by_triple.setdefault(key, []).append(meta)
+
+    # ── batch generation ───────────────────────────────────────────────
+
+    def batch_generate(
+        self,
+        domain_pairs: Optional[list[tuple[str, str]]] = None,
+        max_hops: int = 3,
+        max_paths_per_pair: int = 5,
+        max_seeds_per_domain: int = 50,
+    ) -> list[Hypothesis]:
+        """Batch-generate hypotheses across the entire graph.
+
+        Strategy: for each domain pair, sample seed concepts from domain_a,
+        find paths to concepts in domain_b within max_hops hops.
+        """
+        if domain_pairs is None:
+            domain_pairs = DEFAULT_DOMAIN_PAIRS
+
+        all_hypotheses: list[Hypothesis] = []
+        seen_pairs: set[tuple[str, str]] = set()
+        _hyp_counter = 0
+
+        for dom_a, dom_b in domain_pairs:
+            logger.info(f"generating hypotheses: {dom_a} -> {dom_b}")
+
+            seeds_a = self._sample_domain_nodes(dom_a, max_seeds_per_domain)
+            targets_b = {
+                nid for nid, data in self.G.nodes(data=True)
+                if dom_b in data.get("domain_tags", [])
+                and "claim" not in data.get("domain_tags", [])
+                and nid not in PATH_IGNORE_NODE_IDS
+            }
+
+            for seed_id in seeds_a:
+                if seed_id not in self.G:
+                    continue
+
+                # BFS from seed
+                try:
+                    reachable = nx.single_source_shortest_path(
+                        self.G, seed_id, cutoff=max_hops
+                    )
+                except nx.NetworkXError:
+                    continue
+
+                # find targets in domain_b
+                candidates = [
+                    nid for nid in reachable
+                    if nid in targets_b and nid != seed_id
+                ]
+
+                pair_count = 0
+                for target_id in candidates:
+                    pair_key = tuple(sorted([seed_id, target_id]))
+                    if pair_key in seen_pairs:
+                        continue
+                    seen_pairs.add(pair_key)
+
+                    raw_path = reachable[target_id]
+                    links = self._enrich_path(raw_path)
+                    if not links:
+                        continue
+
+                    conf = self._compute_confidence_score(links)
+                    nov = self._compute_novelty_score(links)
+                    evi = self._compute_evidence_score(links)
+                    test, test_reason = self._compute_testability_score(links)
+                    claim_ids = [l.claim_id for l in links if l.claim_id]
+
+                    _hyp_counter += 1
+                    h = Hypothesis(
+                        id=f"HYP:{_hyp_counter:06d}",
+                        hypothesis_type="bridge",
+                        source_id=seed_id,
+                        source_name=self._index[seed_id].preferred_name,
+                        target_id=target_id,
+                        target_name=self._index[target_id].preferred_name,
+                        path=links,
+                        confidence_score=conf,
+                        novelty_score=nov,
+                        evidence_score=evi,
+                        testability_score=test,
+                        composite_score=0.0,  # set below
+                        supporting_claims=claim_ids,
+                        testability_reason=test_reason,
+                        metadata={"domain_a": dom_a, "domain_b": dom_b},
+                    )
+                    h.explanation = self._generate_explanation(h)
+                    h.composite_score = self._composite_score(h)
+                    all_hypotheses.append(h)
+
+                    pair_count += 1
+                    if pair_count >= max_paths_per_pair:
+                        break
+
+        logger.info(f"batch generation complete: {len(all_hypotheses)} hypotheses from {len(domain_pairs)} domain pairs")
+
+        all_hypotheses = self.post_process(all_hypotheses)
+        return all_hypotheses
+
+    def post_process(
+        self,
+        hypotheses: list[Hypothesis],
+        min_hops: int = 2,
+        filter_vague_relations: bool = True,
+        filter_non_measurable: bool = True,
+        max_hops_filter: int = 5,
+    ) -> list[Hypothesis]:
+        """Filter low-quality hypotheses after generation.
+
+        Filters:
+        1. Noisy entities — source/target name matches NOISE_PATTERNS
+        2. 1-hop hypotheses — too simple, just restates existing edges
+        3. Vague relations — all links are is_associated_with / associated_with / about
+        4. Non-measurable biomarkers — entities not directly measurable from brain imaging
+        5. Pure association chains — no directional predicates (causes/treats/increases/etc.)
+        6. Overly long paths — exceeds max_hops_filter (default 5) to reduce noise accumulation
+        """
+        before = len(hypotheses)
+        filtered = []
+
+        for h in hypotheses:
+            # filter noisy entities (source, target, and all intermediate nodes)
+            all_names = {h.source_name, h.target_name}
+            for link in h.path:
+                all_names.add(link.from_name)
+                all_names.add(link.to_name)
+            if any(self._is_noisy_entity(name) for name in all_names):
+                continue
+
+            # filter 1-hop (single direct edge = no discovery value)
+            if len(h.path) < min_hops:
+                continue
+
+            # filter all-vague-relations
+            if filter_vague_relations:
+                relation_types = {l.relation_type for l in h.path}
+                if relation_types and relation_types <= VAGUE_RELATIONS:
+                    continue
+
+            # filter single-PMID bridges (all hops cite the same paper = not a real bridge)
+            if len(h.path) >= 2:
+                pmids = set()
+                for link in h.path:
+                    pmid = link.source_paper.get("pmid", "") if isinstance(link.source_paper, dict) else ""
+                    if pmid:
+                        pmids.add(pmid)
+                if len(pmids) == 1:
+                    continue
+
+            # filter non-measurable biomarkers (not testable from imaging)
+            if filter_non_measurable:
+                if self._has_non_measurable_entity(h):
+                    continue
+
+            # filter biologically implausible paths (brain region → non-neurological target)
+            if self._has_implausible_path(h):
+                continue
+
+            # filter paths with weak evidence (target not mentioned in raw_text)
+            if self._has_weak_evidence(h):
+                continue
+
+            # filter paths where both ends of any edge are broad hubs
+            # ("Brain Diseases --causes--> Cognitive Dysfunction" is uninformative)
+            if self._has_hub_to_hub_edge(h):
+                continue
+
+            # filter paths touching any vague COGAT/MeSH umbrella hub
+            # (memory/logic/loss/activation/risk/stress/Brain/Neurons).
+            # These nodes are too abstract to drive a DL experiment whether
+            # they appear as source, target, or intermediate.
+            if self._touches_path_ignore_node(h):
+                continue
+
+            # filter paths that transit through disease mega-hubs as
+            # intermediate nodes (A → Disease → B is uninformative).
+            # These nodes are still valid as source/target endpoints.
+            if self._transits_intermediate_only_hub(h):
+                continue
+
+            # (C-1) filter paths whose INTERMEDIATE node is a generic
+            # phrase ("neural activity", "disease progression", "grey
+            # matter", ...). Endpoints are not checked here.
+            if self._has_intermediate_generic_phrase(h):
+                continue
+
+            # (C-2) filter paths whose directional density is too thin
+            # (3+ hops with < 50% directional relations = too vague to
+            # be a mechanism hypothesis).
+            if self._has_thin_directional_density(h):
+                continue
+
+            # filter: target must be a dataset outcome (diagnosis/cognition/behavior/
+            # personality/motor). Predicting "White Matter" or "Neurons" is not a
+            # hypothesis UKB/ADNI/HCP can directly test — those are imaging features
+            # used as INPUTS, not outcomes.
+            if not self._is_dataset_outcome(h):
+                continue
+
+            # (C-3) filter: target name is an umbrella concept ("skill",
+            # "disease", "neurological disorder", "clinical features")
+            # even though it passes the outcome keyword check. These
+            # can't anchor a concrete DL label.
+            if self._is_too_broad_target(h.target_name):
+                continue
+
+            # filter paths with no directional predicates (pure association chains)
+            if len(h.path) >= 2:
+                relation_types = {l.relation_type for l in h.path}
+                if not (relation_types & DIRECTIONAL_RELATIONS):
+                    continue
+
+            # filter paths that exceed max hop length (noise accumulation)
+            if len(h.path) > max_hops_filter:
+                continue
+
+            filtered.append(h)
+
+        # Deduplicate: for each (source, target) pair, keep top 2 by composite score
+        from collections import defaultdict
+        pair_groups = defaultdict(list)
+        for h in filtered:
+            key = (h.source_id, h.target_id)
+            pair_groups[key].append(h)
+
+        deduplicated = []
+        for key, group in pair_groups.items():
+            # Sort by composite score descending
+            group.sort(key=lambda x: x.composite_score, reverse=True)
+            # Keep top 2 (or 1 if only one exists)
+            deduplicated.extend(group[:2])
+
+        logger.info(f"post_process: {before} -> {len(filtered)} filtered -> {len(deduplicated)} deduplicated "
+                     f"(removed {before - len(deduplicated)} total)")
+        return deduplicated
+
+    def _has_non_measurable_entity(self, h: Hypothesis) -> bool:
+        """Check if hypothesis involves entities not measurable from brain imaging.
+
+        Filters out hypotheses where source or target is:
+        - A non-measurable domain (neurotransmitter levels, protein expression, etc.)
+        - Matches non-measurable entity name patterns (CSF markers, blood markers, etc.)
+        """
+        for node_name, node_id in [(h.source_name, h.source_id), (h.target_name, h.target_id)]:
+            # check domain tags
+            node = self._index.get(node_id)
+            if node:
+                domains = set(node.domain_tags) - {"claim"}
+                # allow neurotransmitter/protein as intermediate hops only if source or target is neuroanatomy
+                if domains & NON_MEASURABLE_BIOMARKER_TYPES:
+                    # check if the OTHER end is a brain region (then it's a valid "X affects brain" hypothesis)
+                    other_name = h.target_name if node_name == h.source_name else h.source_name
+                    other_id = h.target_id if node_name == h.source_name else h.source_id
+                    other_node = self._index.get(other_id)
+                    if other_node and "neuroanatomy" not in other_node.domain_tags:
+                        return True
+
+            # check name patterns
+            for pattern in _NON_MEASURABLE_PATTERNS:
+                if pattern.search(node_name):
+                    return True
+
+        return False
+
+    @staticmethod
+    def _is_noisy_entity(name: str) -> bool:
+        """Check if an entity name matches known noise patterns."""
+        if not name or len(name.strip()) == 0:
+            return True
+        name_clean = name.strip()
+        for pattern in NOISE_PATTERNS:
+            if pattern.match(name_clean):
+                return True
+        # check if name contains any noise word
+        words = set(re.split(r"[\s\-_,/]+", name_clean.lower()))
+        if words & _NOISE_WORDS:
+            return True
+        return False
+
+    @staticmethod
+    def _is_generic_intermediate(name: str) -> bool:
+        """(C-1) Phrase-level filter for intermediate node names that pass
+        token-level `_NOISE_WORDS` but are still too vague.
+
+        Examples that get blocked:
+          - "neural activity"  (no individual noise token)
+          - "functional connectivity" (legit metric but not a mechanism)
+          - "disease progression"
+          - "grey matter"  (umbrella)
+          - "cognitive deficit"
+
+        Only call on intermediate nodes — these phrases can be valid as
+        endpoints (e.g. "functional connectivity" as a target metric).
+        """
+        if not name:
+            return True
+        s = name.strip()
+        for pattern in _GENERIC_INTERMEDIATE_PATTERNS:
+            if pattern.match(s):
+                return True
+        return False
+
+    @staticmethod
+    def _is_too_broad_target(name: str) -> bool:
+        """(C-3) Block target names that pass the outcome keyword regex but
+        are umbrella concepts ("disease", "skill", "neurological disorder",
+        "clinical features"). A DL experiment can't be designed against
+        these — you don't know which subtype to label.
+        """
+        if not name:
+            return True
+        s = name.strip()
+        for pattern in _TARGET_TOO_BROAD_PATTERNS:
+            if pattern.match(s):
+                return True
+        return False
+
+    def _has_intermediate_generic_phrase(self, h: Hypothesis) -> bool:
+        """(C-1) Reject paths whose intermediate node is a generic phrase
+        like "neural activity" or "disease progression". Endpoints are
+        excluded from this check because some metrics (e.g. "functional
+        connectivity") legitimately appear as outcomes.
+        """
+        if len(h.path) < 2:
+            return False
+        intermediate_names: list[str] = []
+        for i, link in enumerate(h.path):
+            # link.from_name is intermediate when i >= 1
+            # link.to_name   is intermediate when i <  len(path) - 1
+            if i >= 1:
+                intermediate_names.append(link.from_name or "")
+            if i < len(h.path) - 1:
+                intermediate_names.append(link.to_name or "")
+        for name in intermediate_names:
+            if self._is_generic_intermediate(name):
+                return True
+        return False
+
+    def _has_thin_directional_density(self, h: Hypothesis) -> bool:
+        """(C-2) Reject paths where directional relations are too sparse.
+
+        Current rule (older): >= 1 directional anywhere = pass.
+        Problem: a 4-hop path with 1 directional + 3 vague edges still
+        looks like a real chain to scoring but is essentially a vague
+        association narrative.
+
+        New rule:
+          - 1-2 hop path: at least 1 directional (unchanged)
+          - 3+ hop path: at least half of the edges must be directional
+        """
+        n = len(h.path)
+        if n < 3:
+            return False
+        directional = sum(1 for l in h.path if l.relation_type in DIRECTIONAL_RELATIONS)
+        return directional * 2 < n   # < 50% directional
+
+    def _has_implausible_path(self, h: Hypothesis) -> bool:
+        """Check if hypothesis path has biologically implausible connections.
+
+        Filters paths where a brain region directly predicts a non-neurological
+        condition (e.g., amygdala → urinary incontinence) without a plausible
+        intermediate neurological mechanism.
+        """
+        # Check if source is a brain region and target is non-neurological
+        source_node = self._index.get(h.source_id)
+        target_node = self._index.get(h.target_id)
+
+        if not source_node or not target_node:
+            return False
+
+        source_is_brain = "neuroanatomy" in source_node.domain_tags
+        target_is_neuro = any(d in target_node.domain_tags for d in
+                              ["neuroanatomy", "disease", "cognitive_function",
+                               "biomarker", "gene", "drug", "neurotransmitter"])
+
+        # If source is brain region and target is non-neurological, check target name
+        if source_is_brain and not target_is_neuro:
+            if _NON_NEUROLOGICAL_TARGETS.search(h.target_name):
+                return True
+
+        # Also check intermediate nodes in the path
+        for link in h.path:
+            if _NON_NEUROLOGICAL_TARGETS.search(link.to_name):
+                # Check if the previous node is a brain region
+                prev_node = self._index.get(link.from_id)
+                if prev_node and "neuroanatomy" in prev_node.domain_tags:
+                    # Only filter if there's no disease intermediate
+                    has_disease_intermediate = any(
+                        "disease" in self._index.get(l.from_id, ConceptNode(id="", preferred_name="")).domain_tags
+                        for l in h.path[:h.path.index(link)]
+                    )
+                    if not has_disease_intermediate:
+                        return True
+
+        return False
+
+    def _has_hub_to_hub_edge(self, h: Hypothesis) -> bool:
+        """Reject paths containing any edge whose endpoints are both broad hubs.
+
+        Example: "Brain Diseases --causes--> Cognitive Dysfunction" — both ends
+        are top-level categories; the edge is too generic to be a mechanistic
+        step in a hypothesis.
+
+        Hub set is the top-N nodes by non-'about' degree, computed once and
+        cached. Uses a low bar (N=50) because hubs are self-evidently generic.
+        """
+        if not hasattr(self, "_hub_id_set"):
+            # Build once per engine instance
+            from collections import Counter
+            degree = Counter()
+            for u, v, data in self.G.edges(data=True):
+                if data.get("relation_type") != "about":
+                    degree[u] += 1
+                    degree[v] += 1
+            top = degree.most_common(50)
+            self._hub_id_set = {cid for cid, _ in top}
+
+        for link in h.path:
+            if link.from_id in self._hub_id_set and link.to_id in self._hub_id_set:
+                return True
+        return False
+
+    def _touches_path_ignore_node(self, h: Hypothesis) -> bool:
+        """Reject paths whose source, target, or any intermediate node is in
+        PATH_IGNORE_NODE_IDS (vague COGAT/MeSH umbrella hubs).
+
+        Catches concepts the token-based _is_noisy_entity misses because
+        the names ("memory", "logic", "Brain", "Neurons") are legitimate
+        English words but the KG concept id refers to an over-general
+        umbrella that's not testable.
+        """
+        if h.source_id in PATH_IGNORE_NODE_IDS:
+            return True
+        if h.target_id in PATH_IGNORE_NODE_IDS:
+            return True
+        for link in h.path:
+            if link.from_id in PATH_IGNORE_NODE_IDS:
+                return True
+            if link.to_id in PATH_IGNORE_NODE_IDS:
+                return True
+        return False
+
+    @staticmethod
+    def _transits_intermediate_only_hub(h: Hypothesis) -> bool:
+        """Reject paths that use disease mega-hubs as intermediate transit.
+
+        INTERMEDIATE_ONLY_IGNORE_IDS nodes are valid as source/target
+        (predicting Alzheimer is a real hypothesis) but not as middle
+        hops (A → Alzheimer → B is just "both relate to AD").
+        """
+        if len(h.path) < 2:
+            return False
+        for i, link in enumerate(h.path):
+            if i >= 1 and link.from_id in INTERMEDIATE_ONLY_IGNORE_IDS:
+                return True
+            if i < len(h.path) - 1 and link.to_id in INTERMEDIATE_ONLY_IGNORE_IDS:
+                return True
+        return False
+
+    def _is_dataset_outcome(self, h: Hypothesis) -> bool:
+        """Check if target is a UKB/ADNI/HCP-testable outcome.
+
+        The goal of our hypotheses is to predict SOMETHING from brain imaging.
+        Valid targets:
+        - Clinical diagnoses (disease domain) — Alzheimer, MCI, schizophrenia, etc.
+        - Cognitive/behavioral/personality measures (cognitive_function domain)
+        - Brain decoding targets:
+            * neuroanatomy (for encoding: stimulus → brain activation)
+            * visual_stimulus (for decoding: brain → stimulus category)
+            * emotion (SEED family: EEG → affect label)
+            * vigilance (SEED-VIG: EEG → alertness)
+
+        Invalid targets:
+        - Molecular entities (gene, biomarker, drug, neurotransmitter) — these
+          may be predictors, not predicted quantities
+        - Overly generic disease categories (Brain Diseases, Mental Disorders) —
+          already filtered by hub-to-hub, but double-check by keyword.
+
+        Accepts target if EITHER:
+          a) target's domain is in _OUTCOME_DOMAINS ∪ decoding domains, OR
+          b) target name matches _OUTCOME_KEYWORDS regex (as fallback for
+             claim_extraction concepts whose domain may be uncertain)
+        """
+        target = self._index.get(h.target_id)
+        if target is None:
+            return False
+
+        domains = set(target.domain_tags)
+        # Accept: disease, cognitive_function, or decoding-target domains
+        outcome_domains = _OUTCOME_DOMAINS | {"visual_stimulus", "emotion", "vigilance"}
+        if domains & outcome_domains:
+            return True
+
+        # Accept: neuroanatomy targets when the hypothesis is a brain-decoding
+        # encoding path (stimulus → brain region). Excludes the clinical-
+        # prediction case where a target of 'White Matter' would be an input.
+        if "neuroanatomy" in domains:
+            source = self._index.get(h.source_id)
+            if source:
+                source_domains = set(source.domain_tags)
+                if source_domains & {"visual_stimulus", "emotion", "vigilance"}:
+                    return True
+
+        # Fallback: outcome keyword match (catches claim_extraction concepts
+        # that describe outcomes but have wrong domain tags)
+        if _OUTCOME_KEYWORDS.search(h.target_name):
+            return True
+
+        return False
+
+    def _has_weak_evidence(self, h: Hypothesis) -> bool:
+        """Check if hypothesis path has weak evidence (target not mentioned in raw_text).
+
+        For hypotheses where the target is a specific brain region, check if any hop's
+        raw_text actually mentions that region. If not, the path is likely spurious
+        (e.g., IL-1β → Internal Capsula where the evidence text talks about "grey matter"
+        but never mentions internal capsule).
+
+        Exception: paths anchored by curated functional facts (e.g. `evokes` from
+        visual_stimulus to a functional ROI) carry programmatic confidence, not
+        paper evidence — skip the raw_text requirement for them.
+        """
+        target_node = self._index.get(h.target_id)
+        if not target_node or "neuroanatomy" not in target_node.domain_tags:
+            return False
+
+        # Skip paths whose source is a visual_stimulus / emotion / vigilance node, or
+        # which contain at least one curated functional edge (evokes / decoded_from /
+        # elicits). These are seeded from neuroscience textbooks, not paper claims.
+        source_node = self._index.get(h.source_id)
+        if source_node:
+            decoding_domains = {"visual_stimulus", "emotion", "vigilance"}
+            if any(t in decoding_domains for t in source_node.domain_tags):
+                return False
+        if any(l.relation_type in {"evokes", "decoded_from", "elicits"} for l in h.path):
+            return False
+
+        # Extract key terms from target name (e.g., "Internal Capsula" → ["internal", "capsula"])
+        target_terms = set(re.findall(r'\b\w{4,}\b', h.target_name.lower()))
+        if not target_terms:
+            return False
+
+        # Check if any hop mentions the target region
+        for link in h.path:
+            raw = link.raw_text or link.evidence.get("raw_text", "") if isinstance(link.evidence, dict) else ""
+            if raw:
+                raw_lower = raw.lower()
+                # If any target term appears in raw_text, evidence is OK
+                if any(term in raw_lower for term in target_terms):
+                    return False
+
+        # No hop mentions the target region → weak evidence
+        logger.debug(f"weak evidence: {h.id} target '{h.target_name}' not mentioned in any raw_text")
+        return True
+
+    # ── imaging-driven batch generation ──────────────────────────────
+
+    def batch_generate_imaging(
+        self,
+        dataset: str = "UKB",
+        max_paths_per_pair: int = 5,
+        max_seeds: int = 50,
+        max_hops: int = 3,
+        include_connectivity: bool = True,
+    ) -> list[Hypothesis]:
+        """Generate hypotheses driven by imaging features available in a dataset.
+
+        Strategy:
+        1. Find AAL atlas neuroanatomy nodes in the graph as ROI seeds
+        2. For each ROI × imaging feature template, construct a feature name
+           (e.g., "cortical thickness of Hippocampus_L")
+        3. Find graph paths from each ROI to disease/cognitive_function nodes
+        4. Filter using expanded exclusion rules
+        5. Annotate each hypothesis with dataset metadata
+        """
+        dataset_key = dataset.upper().replace("-", "_")
+        if dataset_key not in DATASET_FEATURES:
+            raise ValueError(f"Unknown dataset: {dataset}. Available: {list(DATASET_FEATURES.keys())}")
+
+        ds_features = DATASET_FEATURES[dataset_key]
+        ds_outcomes = DATASET_OUTCOMES.get(dataset_key, [])
+
+        # 1. Find AAL atlas ROI nodes
+        aal_nodes = self._find_aal_regions(max_seeds)
+        if not aal_nodes:
+            logger.warning("No AAL atlas regions found in graph")
+            return []
+
+        logger.info(f"Found {len(aal_nodes)} AAL regions for imaging hypothesis generation")
+
+        # 2. Collect outcome nodes (disease, cognitive_function)
+        outcome_nodes = self._collect_outcome_nodes()
+        if not outcome_nodes:
+            logger.warning("No outcome nodes (disease/cognitive_function) found")
+            return []
+
+        # 3. Determine which imaging templates apply to this dataset
+        applicable_templates = {
+            name: meta for name, meta in IMAGING_FEATURE_TEMPLATES.items()
+            if dataset_key in meta["datasets"]
+        }
+
+        all_hypotheses: list[Hypothesis] = []
+        _hyp_counter = 0
+        seen_pairs: set[tuple[str, str]] = set()
+
+        # 4. Generate ROI-level imaging hypotheses
+        for region_id, region_name in aal_nodes.items():
+            for feat_template, feat_meta in applicable_templates.items():
+                feature_name = feat_template.replace("{region}", region_name)
+
+                # Find paths from this ROI to outcomes
+                try:
+                    reachable = nx.single_source_shortest_path(
+                        self.G, region_id, cutoff=max_hops
+                    )
+                except nx.NetworkXError:
+                    continue
+
+                candidates = [
+                    nid for nid in reachable
+                    if nid in outcome_nodes and nid != region_id
+                ]
+
+                pair_count = 0
+                for target_id in candidates:
+                    pair_key = (region_id, target_id, feat_template)
+                    if pair_key in seen_pairs:
+                        continue
+                    seen_pairs.add(pair_key)
+
+                    raw_path = reachable[target_id]
+                    links = self._enrich_path(raw_path)
+                    if not links:
+                        continue
+
+                    # Skip if path contains non-measurable entities
+                    if self._path_has_non_measurable(links):
+                        continue
+
+                    conf = self._compute_confidence_score(links)
+                    nov = self._compute_novelty_score(links)
+                    evi = self._compute_evidence_score(links)
+                    test, test_reason = self._compute_testability_score(links)
+                    # Boost testability for imaging-driven hypotheses
+                    test = min(test + 0.15, 1.0)
+                    claim_ids = [l.claim_id for l in links if l.claim_id]
+
+                    _hyp_counter += 1
+                    target_node = self._index.get(target_id)
+                    h = Hypothesis(
+                        id=f"HYP:IMG:{_hyp_counter:06d}",
+                        hypothesis_type="imaging",
+                        source_id=region_id,
+                        source_name=feature_name,
+                        target_id=target_id,
+                        target_name=target_node.preferred_name if target_node else target_id,
+                        path=links,
+                        confidence_score=conf,
+                        novelty_score=nov,
+                        evidence_score=evi,
+                        testability_score=test,
+                        composite_score=0.0,
+                        supporting_claims=claim_ids,
+                        testability_reason=test_reason,
+                        metadata={
+                            "dataset": dataset_key,
+                            "input_modality": feat_meta["modality"],
+                            "input_feature": feature_name,
+                            "input_level": feat_meta["level"],
+                            "input_tool": feat_meta["tool"],
+                            "input_region": region_name,
+                            "outcome_type": self._classify_outcome(target_node),
+                        },
+                    )
+                    h.explanation = self._generate_explanation(h)
+                    h.composite_score = self._composite_score(h)
+                    all_hypotheses.append(h)
+
+                    pair_count += 1
+                    if pair_count >= max_paths_per_pair:
+                        break
+
+        # 5. Generate connectivity-level hypotheses
+        if include_connectivity:
+            conn_templates = {
+                name: meta for name, meta in CONNECTIVITY_FEATURE_TEMPLATES.items()
+                if dataset_key in meta["datasets"]
+            }
+            if conn_templates:
+                hyps = self._generate_connectivity_hypotheses(
+                    aal_nodes, outcome_nodes, conn_templates,
+                    dataset_key, max_paths_per_pair, max_hops, _hyp_counter, seen_pairs,
+                )
+                _hyp_counter += len(hyps)
+                all_hypotheses.extend(hyps)
+
+        logger.info(
+            f"imaging batch generation ({dataset_key}): "
+            f"{len(all_hypotheses)} hypotheses from {len(aal_nodes)} regions"
+        )
+
+        all_hypotheses = self.post_process(all_hypotheses)
+        return all_hypotheses
+
+    def _find_aal_regions(self, max_n: int) -> dict[str, str]:
+        """Find AAL atlas neuroanatomy nodes. Returns {node_id: region_name}."""
+        candidates = {}
+        for nid, data in self.G.nodes(data=True):
+            if "neuroanatomy" not in data.get("domain_tags", []):
+                continue
+            name = data.get("preferred_name", "")
+            # Match against AAL region keywords
+            name_lower = name.lower()
+            for kw in _AAL_REGION_KEYWORDS:
+                if kw.lower() in name_lower:
+                    candidates[nid] = name
+                    break
+        # Sort by degree (more connected = richer paths)
+        sorted_items = sorted(
+            candidates.items(),
+            key=lambda item: self.G.degree(item[0]),
+            reverse=True,
+        )
+        return dict(sorted_items[:max_n])
+
+    def _collect_outcome_nodes(self) -> set[str]:
+        """Collect all disease + cognitive_function nodes as potential outcomes."""
+        outcome_ids = set()
+        for nid, data in self.G.nodes(data=True):
+            domains = set(data.get("domain_tags", []))
+            if "claim" in domains:
+                continue
+            if nid in PATH_IGNORE_NODE_IDS:
+                continue
+            if domains & {"disease", "cognitive_function"}:
+                outcome_ids.add(nid)
+        return outcome_ids
+
+    def _classify_outcome(self, node: Optional[ConceptNode]) -> str:
+        """Classify outcome node type for metadata."""
+        if not node:
+            return "unknown"
+        domains = set(node.domain_tags)
+        if "disease" in domains:
+            return "disease"
+        if "cognitive_function" in domains:
+            return "cognitive_function"
+        if "biomarker" in domains:
+            return "biomarker"
+        return "other"
+
+    def _path_has_non_measurable(self, links: list[HypothesisLink]) -> bool:
+        """Check if any intermediate node in the path is non-measurable."""
+        for link in links:
+            for name, nid in [(link.from_name, link.from_id), (link.to_name, link.to_id)]:
+                node = self._index.get(nid)
+                if node:
+                    domains = set(node.domain_tags) - {"claim"}
+                    if domains & NON_MEASURABLE_BIOMARKER_TYPES:
+                        return True
+                for pattern in _NON_MEASURABLE_PATTERNS:
+                    if pattern.search(name):
+                        return True
+        return False
+
+    def _generate_connectivity_hypotheses(
+        self,
+        aal_nodes: dict[str, str],
+        outcome_nodes: set[str],
+        conn_templates: dict,
+        dataset_key: str,
+        max_paths_per_pair: int,
+        max_hops: int,
+        hyp_counter_start: int,
+        seen_pairs: set,
+    ) -> list[Hypothesis]:
+        """Generate hypotheses for connectivity features (FC/EC/SC between region pairs)."""
+        hypotheses = []
+        counter = hyp_counter_start
+        region_ids = list(aal_nodes.keys())
+
+        # Sample region pairs (limit to avoid O(n^2) explosion)
+        max_pairs = min(len(region_ids) * 3, 200)
+        import random
+        if len(region_ids) > 20:
+            sampled_pairs = []
+            for _ in range(max_pairs):
+                a, b = random.sample(region_ids, 2)
+                sampled_pairs.append((a, b))
+        else:
+            sampled_pairs = [(a, b) for i, a in enumerate(region_ids) for b in region_ids[i+1:]]
+            sampled_pairs = sampled_pairs[:max_pairs]
+
+        for region_a_id, region_b_id in sampled_pairs:
+            name_a = aal_nodes[region_a_id]
+            name_b = aal_nodes[region_b_id]
+
+            for feat_template, feat_meta in conn_templates.items():
+                feature_name = feat_template.replace("{a}", name_a).replace("{b}", name_b)
+
+                # Find paths from region_a to outcomes (potentially through region_b)
+                try:
+                    reachable = nx.single_source_shortest_path(
+                        self.G, region_a_id, cutoff=max_hops
+                    )
+                except nx.NetworkXError:
+                    continue
+
+                candidates = [
+                    nid for nid in reachable
+                    if nid in outcome_nodes and nid != region_a_id
+                ]
+
+                pair_count = 0
+                for target_id in candidates:
+                    pair_key = (region_a_id, target_id, feat_template)
+                    if pair_key in seen_pairs:
+                        continue
+                    seen_pairs.add(pair_key)
+
+                    raw_path = reachable[target_id]
+                    links = self._enrich_path(raw_path)
+                    if not links:
+                        continue
+
+                    if self._path_has_non_measurable(links):
+                        continue
+
+                    conf = self._compute_confidence_score(links)
+                    nov = self._compute_novelty_score(links)
+                    evi = self._compute_evidence_score(links)
+                    test, test_reason = self._compute_testability_score(links)
+                    test = min(test + 0.15, 1.0)
+                    claim_ids = [l.claim_id for l in links if l.claim_id]
+
+                    counter += 1
+                    target_node = self._index.get(target_id)
+                    h = Hypothesis(
+                        id=f"HYP:IMG:{counter:06d}",
+                        hypothesis_type="imaging_connectivity",
+                        source_id=region_a_id,
+                        source_name=feature_name,
+                        target_id=target_id,
+                        target_name=target_node.preferred_name if target_node else target_id,
+                        path=links,
+                        confidence_score=conf,
+                        novelty_score=nov,
+                        evidence_score=evi,
+                        testability_score=test,
+                        composite_score=0.0,
+                        supporting_claims=claim_ids,
+                        testability_reason=test_reason,
+                        metadata={
+                            "dataset": dataset_key,
+                            "input_modality": feat_meta["modality"],
+                            "input_feature": feature_name,
+                            "input_level": feat_meta["level"],
+                            "input_tool": feat_meta["tool"],
+                            "input_region_a": name_a,
+                            "input_region_b": name_b,
+                            "input_region": f"{name_a} - {name_b}",
+                            "outcome_type": self._classify_outcome(target_node),
+                        },
+                    )
+                    h.explanation = self._generate_explanation(h)
+                    h.composite_score = self._composite_score(h)
+                    hypotheses.append(h)
+
+                    pair_count += 1
+                    if pair_count >= max_paths_per_pair:
+                        break
+
+        return hypotheses
+
+    # ── persistence ────────────────────────────────────────────────────
+
+    def save_hypotheses(self, hypotheses: list[Hypothesis], path: str | Path) -> None:
+        """Save hypotheses to JSON."""
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+        data = {
+            "n_hypotheses": len(hypotheses),
+            "hypotheses": [h.to_dict() for h in hypotheses],
+        }
+        path.write_text(json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8")
+        logger.info(f"saved {len(hypotheses)} hypotheses to {path}")
+
+    def load_hypotheses(self, path: str | Path) -> list[Hypothesis]:
+        """Load hypotheses from JSON."""
+        path = Path(path)
+        data = json.loads(path.read_text(encoding="utf-8"))
+        hypotheses = [Hypothesis.from_dict(h) for h in data["hypotheses"]]
+        logger.info(f"loaded {len(hypotheses)} hypotheses from {path}")
+        return hypotheses
+
+    # ── ranking ────────────────────────────────────────────────────────
+
+    def rank_hypotheses(
+        self,
+        hypotheses: list[Hypothesis],
+        weights: Optional[dict[str, float]] = None,
+        top_n: int = 100,
+        skip_post_process: bool = False,
+    ) -> list[Hypothesis]:
+        """Rank hypotheses by composite score (novelty, evidence, testability, confidence).
+
+        Args:
+            hypotheses: list of hypotheses to rank
+            weights: custom weights dict, keys: confidence, evidence, novelty, testability
+            top_n: return top N results
+            skip_post_process: if True, skip the post-processing filter
+        """
+        if not skip_post_process:
+            hypotheses = self.post_process(hypotheses)
+
+        if weights is None:
+            # testability weighted highest — must be verifiable with imaging experiments
+            weights = {
+                "confidence": 0.20,
+                "evidence": 0.20,
+                "novelty": 0.25,
+                "testability": 0.35,
+            }
+
+        for h in hypotheses:
+            h.composite_score = (
+                (h.confidence_score ** weights["confidence"])
+                * (h.evidence_score ** weights["evidence"])
+                * (h.novelty_score ** weights["novelty"])
+                * (max(h.testability_score, 0.01) ** weights["testability"])
+            )
+
+        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
+        return hypotheses[:top_n]
+
+    # ── query-based (kept for interactive use) ─────────────────────────
+
+    def find_paths(
+        self,
+        source_id: str,
+        target_id: str,
+        max_hops: int = 3,
+        max_paths: int = 20,
+    ) -> list[Hypothesis]:
+        """Find hypothesis paths between two concepts with evidence enrichment."""
+        if source_id not in self.G or target_id not in self.G:
+            return []
+
+        claim_nodes = {nid for nid, n in self._index.items() if "claim" in n.domain_tags}
+        intermediate_exclude = claim_nodes - {source_id, target_id}
+        # Also strip vague umbrella hubs from the search subgraph so paths
+        # never include them as intermediates. Endpoints are excluded from
+        # the strip so a caller can still query them directly.
+        intermediate_exclude |= (PATH_IGNORE_NODE_IDS - {source_id, target_id})
+
+        subgraph = self.G.copy()
+        subgraph.remove_nodes_from(intermediate_exclude)
+
+        if source_id not in subgraph or target_id not in subgraph:
+            return []
+
+        try:
+            raw_paths = list(nx.all_simple_paths(
+                subgraph, source_id, target_id, cutoff=max_hops
+            ))
+        except nx.NetworkXError:
+            return []
+
+        raw_paths = raw_paths[:max_paths]
+        return self._build_hypotheses_from_paths(raw_paths, "path")
+
+    def bridge_discovery(
+        self,
+        concept_id: str,
+        target_domain: str,
+        max_hops: int = 3,
+        max_results: int = 20,
+    ) -> list[Hypothesis]:
+        """Find cross-domain connections through intermediate claims."""
+        if concept_id not in self.G:
+            return []
+
+        target_nodes = {
+            nid for nid, data in self.G.nodes(data=True)
+            if target_domain in data.get("domain_tags", [])
+        }
+        if not target_nodes:
+            return []
+
+        try:
+            reachable = nx.single_source_shortest_path(
+                self.G, concept_id, cutoff=max_hops
+            )
+        except nx.NetworkXError:
+            return []
+
+        candidates = {
+            nid for nid in reachable
+            if nid in target_nodes and nid != concept_id
+            and "claim" not in self._index.get(nid, ConceptNode(id="", preferred_name="")).domain_tags
+        }
+
+        hypotheses = []
+        for target_id in candidates:
+            raw_path = reachable[target_id]
+            links = self._enrich_path(raw_path)
+            if not links:
+                continue
+
+            conf = self._compute_confidence_score(links)
+            nov = self._compute_novelty_score(links)
+            evi = self._compute_evidence_score(links)
+            test, test_reason = self._compute_testability_score(links)
+            claim_ids = [l.claim_id for l in links if l.claim_id]
+
+            h = Hypothesis(
+                hypothesis_type="bridge",
+                source_id=concept_id,
+                source_name=self._index[concept_id].preferred_name,
+                target_id=target_id,
+                target_name=self._index[target_id].preferred_name,
+                path=links,
+                confidence_score=conf,
+                novelty_score=nov,
+                evidence_score=evi,
+                testability_score=test,
+                supporting_claims=claim_ids,
+                testability_reason=test_reason,
+            )
+            h.explanation = self._generate_explanation(h)
+            h.composite_score = self._composite_score(h)
+            hypotheses.append(h)
+
+        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
+        return hypotheses[:max_results]
+
+    def discover_hypotheses(
+        self,
+        concept_id: str,
+        max_hops: int = 3,
+        max_results: int = 30,
+        exclude_domains: Optional[set[str]] = None,
+    ) -> list[Hypothesis]:
+        """Find hypotheses radiating from a single concept to all reachable domains."""
+        if concept_id not in self.G:
+            return []
+
+        exclude = exclude_domains or {"claim"}
+        source_node = self._index.get(concept_id)
+        source_domains = set(source_node.domain_tags) - exclude if source_node else set()
+
+        try:
+            reachable = nx.single_source_shortest_path(self.G, concept_id, cutoff=max_hops)
+        except nx.NetworkXError:
+            return []
+
+        candidates = []
+        for target_id, raw_path in reachable.items():
+            if target_id == concept_id:
+                continue
+            target_node = self._index.get(target_id)
+            if not target_node:
+                continue
+            target_domains = set(target_node.domain_tags) - exclude
+            if not target_domains or target_domains <= source_domains:
+                continue
+            candidates.append((target_id, raw_path))
+
+        hypotheses = []
+        for target_id, raw_path in candidates:
+            links = self._enrich_path(raw_path)
+            if not links:
+                continue
+            conf = self._compute_confidence_score(links)
+            nov = self._compute_novelty_score(links)
+            evi = self._compute_evidence_score(links)
+            test, test_reason = self._compute_testability_score(links)
+            claim_ids = [l.claim_id for l in links if l.claim_id]
+
+            h = Hypothesis(
+                hypothesis_type="discover",
+                source_id=concept_id,
+                source_name=self._index[concept_id].preferred_name,
+                target_id=target_id,
+                target_name=self._index[target_id].preferred_name,
+                path=links,
+                confidence_score=conf,
+                novelty_score=nov,
+                evidence_score=evi,
+                testability_score=test,
+                supporting_claims=claim_ids,
+                testability_reason=test_reason,
+            )
+            h.explanation = self._generate_explanation(h)
+            h.composite_score = self._composite_score(h)
+            hypotheses.append(h)
+
+        hypotheses = self.post_process(hypotheses)
+        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
+        return hypotheses[:max_results]
+
+    def find_trending(
+        self,
+        since_year: int = 2020,
+        min_claims: int = 3,
+        direction: str = "strengthening",
+        max_results: int = 30,
+    ) -> list[dict]:
+        """Find concept pairs with strengthening/weakening evidence over time.
+
+        Returns list of dicts with: concept_a, concept_b, years, slope, direction, claims.
+        """
+        from collections import Counter
+
+        # Group claims by (subject, object)
+        claim_groups: dict[tuple[str, str], list[dict]] = {}
+        for nid, node in self._index.items():
+            if "claim" not in node.domain_tags:
+                continue
+            meta = node.metadata
+            sid = meta.get("subject_id", "")
+            oid = meta.get("object_id", "")
+            if not sid or not oid:
+                continue
+            key = (sid, oid)
+            claim_groups.setdefault(key, []).append(meta)
+
+        results = []
+        for (sid, oid), claims in claim_groups.items():
+            years = []
+            for c in claims:
+                sp = c.get("source_paper", {})
+                y = sp.get("year")
+                if y and y >= since_year:
+                    years.append(y)
+
+            if len(years) < min_claims:
+                continue
+
+            year_counts = Counter(years)
+            ys = sorted(year_counts.keys())
+            cs = [year_counts[y] for y in ys]
+            slope = _simple_slope(ys, cs)
+
+            if direction == "strengthening" and slope <= 0.3:
+                continue
+            if direction == "weakening" and slope >= -0.3:
+                continue
+            if direction == "emerging" and max(ys) < 2025:
+                continue
+
+            src_node = self._index.get(sid)
+            tgt_node = self._index.get(oid)
+
+            results.append({
+                "concept_a": src_node.preferred_name if src_node else sid,
+                "concept_b": tgt_node.preferred_name if tgt_node else oid,
+                "concept_a_id": sid,
+                "concept_b_id": oid,
+                "year_counts": {str(y): year_counts[y] for y in ys},
+                "slope": round(slope, 3),
+                "direction": direction,
+                "n_claims": len(claims),
+            })
+
+        results.sort(key=lambda r: abs(r["slope"]), reverse=True)
+        return results[:max_results]
+
+    def contradiction_detection(
+        self,
+        domain_filter: Optional[str] = None,
+        max_results: int = 50,
+    ) -> list[Contradiction]:
+        """Find pairs of claims that assert opposite things about the same concept pair."""
+        claim_lookup: dict[tuple[str, str], list[ConceptNode]] = {}
+        for nid, node in self._index.items():
+            if "claim" not in node.domain_tags:
+                continue
+            meta = node.metadata
+            sid = meta.get("subject_id", "")
+            oid = meta.get("object_id", "")
+            if not sid or not oid:
+                continue
+
+            if domain_filter:
+                src_node = self._index.get(sid)
+                tgt_node = self._index.get(oid)
+                domains = set()
+                if src_node:
+                    domains.update(src_node.domain_tags)
+                if tgt_node:
+                    domains.update(tgt_node.domain_tags)
+                if domain_filter not in domains:
+                    continue
+
+            key = (sid, oid)
+            claim_lookup.setdefault(key, []).append(node)
+
+        contradictions = []
+        for (sid, oid), claims in claim_lookup.items():
+            if len(claims) < 2:
+                continue
+            for i in range(len(claims)):
+                for j in range(i + 1, len(claims)):
+                    c1, c2 = claims[i], claims[j]
+                    m1, m2 = c1.metadata, c2.metadata
+                    severity = self._check_contradiction(m1, m2)
+                    if severity > 0:
+                        contradictions.append(Contradiction(
+                            concept_a_id=sid,
+                            concept_a_name=m1.get("subject_name", sid),
+                            concept_b_id=oid,
+                            concept_b_name=m1.get("object_name", oid),
+                            claim_for_id=c1.id,
+                            claim_for_predicate=m1.get("predicate", ""),
+                            claim_for_text=m1.get("raw_text", ""),
+                            claim_against_id=c2.id,
+                            claim_against_predicate=m2.get("predicate", ""),
+                            claim_against_text=m2.get("raw_text", ""),
+                            severity=severity,
+                        ))
+
+        contradictions.sort(key=lambda c: c.severity, reverse=True)
+        return contradictions[:max_results]
+
+    def gap_detection(
+        self,
+        domain_a: str,
+        domain_b: Optional[str] = None,
+        max_results: int = 50,
+    ) -> list[Gap]:
+        """Find concept pairs 2 hops apart with no direct edge."""
+        if domain_b is None:
+            domain_b = domain_a
+
+        nodes_a = {
+            nid for nid, data in self.G.nodes(data=True)
+            if domain_a in data.get("domain_tags", [])
+            and "claim" not in data.get("domain_tags", [])
+        }
+        nodes_b = {
+            nid for nid, data in self.G.nodes(data=True)
+            if domain_b in data.get("domain_tags", [])
+            and "claim" not in data.get("domain_tags", [])
+        }
+
+        gaps = []
+        seen = set()
+
+        for a_id in nodes_a:
+            if a_id not in self.G:
+                continue
+            hop1 = set(self.G.successors(a_id)) | set(self.G.predecessors(a_id))
+            hop2 = set()
+            for n1 in hop1:
+                if "claim" in self._index.get(n1, ConceptNode(id="", preferred_name="")).domain_tags:
+                    continue
+                hop2.update(self.G.successors(n1))
+                hop2.update(self.G.predecessors(n1))
+
+            hop2 -= {a_id}
+            hop2 -= hop1
+
+            for b_id in hop2 & nodes_b:
+                pair = tuple(sorted([a_id, b_id]))
+                if pair in seen:
+                    continue
+                seen.add(pair)
+
+                if self.G.has_edge(a_id, b_id) or self.G.has_edge(b_id, a_id):
+                    continue
+
+                try:
+                    path = nx.shortest_path(self.G, a_id, b_id)
+                except (nx.NetworkXNoPath, nx.NetworkXError):
+                    continue
+
+                if len(path) > 3:
+                    continue
+
+                connecting = [n for n in path[1:-1]
+                              if "claim" not in self._index.get(n, ConceptNode(id="", preferred_name="")).domain_tags]
+
+                a_node = self._index.get(a_id)
+                b_node = self._index.get(b_id)
+
+                gaps.append(Gap(
+                    concept_a_id=a_id,
+                    concept_a_name=a_node.preferred_name if a_node else a_id,
+                    concept_b_id=b_id,
+                    concept_b_name=b_node.preferred_name if b_node else b_id,
+                    distance=len(path) - 1,
+                    connecting_concepts=connecting,
+                    domain_a=domain_a,
+                    domain_b=domain_b,
+                    potential_relation=self._infer_relation(path),
+                ))
+
+        gaps.sort(key=lambda g: (0 if g.domain_a != g.domain_b else 1, g.distance))
+        return gaps[:max_results]
+
+    # ── name resolution ────────────────────────────────────────────────
+
+    def resolve_name(self, query: str) -> Optional[str]:
+        """Resolve a name to a concept ID. Returns None if not found."""
+        if not query:
+            return None
+
+        for node in self._index.values():
+            if node.preferred_name == query:
+                return node.id
+
+        query_lower = query.lower()
+        for node in self._index.values():
+            if node.preferred_name.lower() == query_lower:
+                return node.id
+
+        for node in self._index.values():
+            for alias in node.aliases:
+                if alias.lower() == query_lower:
+                    return node.id
+
+        candidates = []
+        for node in self._index.values():
+            name_lower = node.preferred_name.lower()
+            if query_lower in name_lower or name_lower in query_lower:
+                candidates.append(node)
+                continue
+            for alias in node.aliases:
+                if query_lower in alias.lower() or alias.lower() in query_lower:
+                    candidates.append(node)
+                    break
+
+        if len(candidates) == 1:
+            return candidates[0].id
+        elif len(candidates) > 1:
+            candidates.sort(key=lambda n: len(n.preferred_name))
+            return candidates[0].id
+
+        return None
+
+    # ── internal helpers ───────────────────────────────────────────────
+
+    def _sample_domain_nodes(self, domain: str, max_n: int) -> list[str]:
+        """Sample up to max_n non-claim nodes from a domain, preferring nodes with edges."""
+        nodes = [
+            nid for nid, data in self.G.nodes(data=True)
+            if domain in data.get("domain_tags", [])
+            and "claim" not in data.get("domain_tags", [])
+            and nid not in PATH_IGNORE_NODE_IDS
+        ]
+        # sort by degree (more connected = more useful as seed)
+        nodes.sort(key=lambda n: self.G.degree(n), reverse=True)
+        return nodes[:max_n]
+
+    def _build_hypotheses_from_paths(
+        self, raw_paths: list[list[str]], hyp_type: str
+    ) -> list[Hypothesis]:
+        """Build Hypothesis objects from raw node-ID paths."""
+        hypotheses = []
+        for raw_path in raw_paths:
+            links = self._enrich_path(raw_path)
+            if not links:
+                continue
+
+            conf = self._compute_confidence_score(links)
+            nov = self._compute_novelty_score(links)
+            evi = self._compute_evidence_score(links)
+            test, test_reason = self._compute_testability_score(links)
+            claim_ids = [l.claim_id for l in links if l.claim_id]
+
+            h = Hypothesis(
+                hypothesis_type=hyp_type,
+                source_id=raw_path[0],
+                source_name=self._index[raw_path[0]].preferred_name,
+                target_id=raw_path[-1],
+                target_name=self._index[raw_path[-1]].preferred_name,
+                path=links,
+                confidence_score=conf,
+                novelty_score=nov,
+                evidence_score=evi,
+                testability_score=test,
+                supporting_claims=claim_ids,
+                testability_reason=test_reason,
+            )
+            h.explanation = self._generate_explanation(h)
+            h.composite_score = self._composite_score(h)
+            hypotheses.append(h)
+
+        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
+        return hypotheses
+
+    def _enrich_path(self, raw_path: list[str]) -> list[HypothesisLink]:
+        """Convert a raw node-ID path into rich HypothesisLink objects."""
+        links = []
+        for i in range(len(raw_path) - 1):
+            src_id, tgt_id = raw_path[i], raw_path[i + 1]
+            if not self.G.has_edge(src_id, tgt_id):
+                continue
+
+            edge_data = self.G.edges[src_id, tgt_id]
+            src_node = self._index.get(src_id)
+            tgt_node = self._index.get(tgt_id)
+
+            claim_id = edge_data.get("metadata", {}).get("claim_id", "")
+            claim_node = self._index.get(claim_id) if claim_id else None
+
+            evidence = {}
+            paper = {}
+            raw_text = ""
+
+            if claim_node and claim_node.metadata:
+                meta = claim_node.metadata
+                evidence = meta.get("evidence", {})
+                paper = meta.get("source_paper", {})
+                raw_text = meta.get("raw_text", "")
+
+            links.append(HypothesisLink(
+                from_id=src_id,
+                from_name=src_node.preferred_name if src_node else src_id,
+                to_id=tgt_id,
+                to_name=tgt_node.preferred_name if tgt_node else tgt_id,
+                relation_type=edge_data.get("relation_type", "unknown"),
+                confidence=edge_data.get("confidence", 0.5),
+                claim_id=claim_id,
+                raw_text=raw_text,
+                evidence=evidence,
+                source_paper=paper,
+            ))
+
+        return links
+
+    # ── scoring ────────────────────────────────────────────────────────
+
+    def compute_frequency_boost(self, claim_meta: dict) -> float:
+        """Frequency boost based on independent PRIMARY study replication.
+
+        Prefers the merged `primary_supporting_papers` list set by
+        `phase4_optimize.merge_duplicate_claims` (already filtered for
+        non-review study types). Falls back to rebuilding from the
+        pre-merge index, matching the same filter logic.
+        """
+        # Fast path: canonical claim carries primary-PMID list
+        primary = claim_meta.get("primary_supporting_papers")
+        if primary is not None and isinstance(primary, list):
+            n = len(primary)
+            if n >= 3:
+                return 1.2
+            elif n >= 1:
+                return 1.0
+            else:
+                return 0.5
+
+        # Fallback: scan all claims with the same SPO, filter reviews
+        key = (
+            claim_meta.get("subject_id", ""),
+            claim_meta.get("predicate", ""),
+            claim_meta.get("object_id", ""),
+        )
+        all_claims = self._claims_by_triple.get(key, [])
+        primary_pmids = set()
+        for c in all_claims:
+            st = c.get("evidence", {}).get("study_type", "")
+            if st not in _REVIEW_TYPES:
+                pmid = c.get("source_paper", {}).get("pmid", "")
+                if pmid:
+                    primary_pmids.add(pmid)
+
+        if len(primary_pmids) >= 3:
+            return 1.2
+        elif len(primary_pmids) >= 1:
+            return 1.0
+        else:
+            return 0.5
+
+    @staticmethod
+    def compute_temporal_decay(claim_meta: dict, reference_year: int = 2026) -> float:
+        """Temporal decay: newer primary studies get higher weight.
+
+        Reviews get no time bonus (1.0). Primary studies decay 3% per year, floor 0.7.
+        """
+        st = claim_meta.get("evidence", {}).get("study_type", "")
+        if st in _REVIEW_TYPES:
+            return 1.0
+        year = claim_meta.get("source_paper", {}).get("year", 0)
+        if not year:
+            return 0.85  # unknown year, neutral
+        age = reference_year - year
+        return max(0.7, 1.0 - 0.03 * age)
+
+    def _compute_confidence_score(self, path: list[HypothesisLink]) -> float:
+        """Confidence = geometric mean of per-link scores, with weak-link penalty.
+
+        Per-link score = edge.confidence × freq_boost × temporal_decay
+          (edge.confidence already includes study_type weighting from
+          phase4_optimize.apply_evidence_weighting and the claim-level
+          statistical quality signals from claim_extractor._estimate_confidence)
+
+        Aggregate: geometric mean (one weak link crushes the path)
+          + weakest-link penalty (×0.7 when min_edge < 0.1)
+
+        Single source of truth for each multiplier:
+        - study_type → phase4_optimize.WEIGHT_MAP (canonical, idempotent)
+        - p_value/sample_size/replicability → claim_extractor._estimate_confidence
+        - freq across primary PMIDs → compute_frequency_boost
+        - publication recency → compute_temporal_decay
+        """
+        if not path:
+            return 0.0
+
+        import math
+
+        scores = []
+        min_conf = float("inf")
+        for link in path:
+            raw = max(link.confidence, 1e-3)  # tiny floor for log()
+            min_conf = min(min_conf, raw)
+
+            full_meta = {
+                "evidence": link.evidence,
+                "source_paper": link.source_paper,
+                "subject_id": link.from_id,
+                "predicate": link.relation_type,
+                "object_id": link.to_id,
+            }
+            freq_boost = self.compute_frequency_boost(full_meta)
+            temp_decay = self.compute_temporal_decay(full_meta)
+
+            s = raw * freq_boost * temp_decay
+            scores.append(min(s, 1.0))
+
+        log_sum = sum(math.log(max(s, 1e-6)) for s in scores)
+        gm = math.exp(log_sum / len(scores))
+
+        if min_conf < 0.1:
+            gm *= 0.7
+
+        return max(min(gm, 1.0), 0.0)
+
+    def _compute_novelty_score(self, path: list[HypothesisLink]) -> float:
+        """Score how novel/surprising a hypothesis is.
+
+        Lower = more expected (direct known relationship), Higher = more surprising.
+        """
+        score = 0.3  # base
+
+        # hop bonus: longer paths = more novel connections
+        score += 0.1 * min(len(path) - 1, 3)
+
+        # cross-domain bonus: connecting different domains is more novel
+        domains_seen = set()
+        for link in path:
+            src = self._index.get(link.from_id)
+            tgt = self._index.get(link.to_id)
+            if src:
+                domains_seen.update(src.domain_tags)
+            if tgt:
+                domains_seen.update(tgt.domain_tags)
+        domains_seen.discard("claim")
+        n_domains = len(domains_seen)
+        if n_domains >= 3:
+            score += 0.15
+        elif n_domains >= 2:
+            score += 0.10
+
+        # rare relation bonus: non-generic relations are more novel
+        rare_count = sum(1 for l in path if l.relation_type not in COMMON_RELATIONS)
+        score += 0.05 * min(rare_count, 3)
+
+        # evidence diversity: more papers = better supported, less novel
+        # fewer papers = more speculative, more novel
+        pmids = {l.source_paper.get("pmid", "") for l in path if l.source_paper.get("pmid")}
+        if len(pmids) == 0:
+            score += 0.10  # no paper support = speculative but novel
+        elif len(pmids) == 1:
+            score += 0.05  # single source = weak replication
+
+        return min(score, 1.0)
+
+    def _compute_evidence_score(self, path: list[HypothesisLink]) -> float:
+        """Score evidence quality: traceability and text availability.
+
+        DOES NOT use p_value/sample_size/effect_size — those signals already
+        flow into edge.confidence via claim_extractor._estimate_confidence
+        and are aggregated by _compute_confidence_score. Counting them again
+        here was double-dipping.
+
+        This score asks a different question: "How well-anchored is the
+        evidence in source documents?" — which complements confidence's
+        "How statistically strong is the evidence?". Path-level: most
+        well-extracted edges score 0.6-0.8; we reserve >0.9 for paths whose
+        every step has rich provenance.
+        """
+        _REVIEW_TYPES = {"narrative_review", "review"}
+        scores = []
+        for link in path:
+            study_type = (link.evidence.get("study_type") or "").lower()
+            s = 0.2 if study_type in _REVIEW_TYPES else 0.3
+
+            if link.raw_text and len(link.raw_text) > 20:
+                s += 0.20
+            if link.claim_id:
+                s += 0.15
+            if link.source_paper.get("pmid"):
+                s += 0.15
+            if link.evidence.get("study_type"):
+                s += 0.10
+
+            scores.append(min(s, 1.0))
+
+        return self._geometric_mean(scores)
+
+    def _compute_testability_score(self, path: list[HypothesisLink]) -> tuple[float, str]:
+        """Score how testable a hypothesis is with NeuroClaw imaging experiments.
+
+        Boosts for:
+        - Brain region features directly measurable from sMRI (volume, thickness)
+        - Connectivity features (functional/structural) for GNN models
+        - Modalities available in UKB/ADNI/HCP-YA
+        - Deep learning model compatibility (BrainGNN, NeuroStorm)
+        - Target diseases present in datasets (AD, PD, depression, etc.)
+
+        Returns (score, reason_string).
+        """
+        all_text = " ".join(
+            l.raw_text + " " + l.from_name + " " + l.to_name + " " + l.relation_type
+            for l in path
+        ).lower()
+
+        # check which modalities are mentioned
+        matched_modalities = []
+        for modality, keywords in TESTABLE_MODALITIES.items():
+            for kw in keywords:
+                if kw.lower() in all_text:
+                    matched_modalities.append(modality)
+                    break
+
+        if not matched_modalities:
+            return 0.15, "no imaging modality detected"
+
+        score = 0.25  # base for having a modality
+
+        # modality bonus (more = more testable angles)
+        score += 0.10 * min(len(matched_modalities), 3)
+
+        # heavy bonus for sMRI features (volume/thickness — directly measurable in all 3 datasets)
+        if "sMRI" in matched_modalities:
+            score += 0.15
+
+        # heavy bonus for connectivity features (input to BrainGNN/GNN models)
+        if "dMRI" in matched_modalities or "fMRI" in matched_modalities:
+            score += 0.15
+
+        # bonus for PET (available in ADNI, key for AD research)
+        if "PET" in matched_modalities:
+            score += 0.10
+
+        # bonus for brain region specificity (testable with atlas parcellation)
+        brain_region_keywords = ["cortex", "hippocampus", "amygdala", "thalamus",
+                                 "cerebellum", "striatum", "insula", "gyrus",
+                                 "caudate", "putamen", "pallidum", "accumbens",
+                                 "precuneus", "cuneus", "lingual", "fusiform",
+                                 "parahippocampal", "entorhinal", "parietal",
+                                 "frontal", "temporal", "occipital"]
+        regions_found = [kw for kw in brain_region_keywords if kw in all_text]
+        if regions_found:
+            score += 0.10  # atlas-based ROI analysis
+            if len(regions_found) >= 2:
+                score += 0.05  # pair of regions = connectivity hypothesis
+
+        # bonus for diseases present in target datasets
+        dataset_diseases = [
+            "alzheimer", "parkinson", "depression", "schizophrenia", "adhd",
+            "autism", "epilepsy", "multiple sclerosis", "anxiety", "bipolar",
+            "dementia", "mci", "mild cognitive",
+        ]
+        if any(d in all_text for d in dataset_diseases):
+            score += 0.05
+
+        # bonus for DL-model-compatible features (graph structure, ROI, connectivity matrix)
+        if any(kw.lower() in all_text for kw in DL_MODEL_KEYWORDS):
+            score += 0.05
+
+        # build reason string
+        modalities_str = ", ".join(matched_modalities)
+        reason = f"modalities: {modalities_str}"
+        if regions_found:
+            reason += f" | brain regions: {', '.join(regions_found[:4])}"
+        if any(d in all_text for d in dataset_diseases):
+            matched_diseases = [d for d in dataset_diseases if d in all_text]
+            reason += f" | diseases: {', '.join(matched_diseases[:3])}"
+
+        return min(score, 1.0), reason
+
+    def _composite_score(self, h: Hypothesis) -> float:
+        """Weighted geometric mean of the 4 score components.
+
+        Geometric: a hypothesis is only as good as its weakest dimension.
+        A path with great evidence but 0 testability is worthless to us.
+
+        Matches the linear fitness in evolution_engine._score_fitness
+        (same weights, different aggregation — fitness adds convergence /
+        diversity / length modifiers not relevant here).
+        """
+        c = max(h.confidence_score, 0.01)
+        e = max(h.evidence_score, 0.01)
+        n = max(h.novelty_score, 0.01)
+        t = max(h.testability_score, 0.01)
+        score = (c ** 0.20) * (e ** 0.20) * (n ** 0.25) * (t ** 0.35)
+
+        if self._has_only_review_evidence(h):
+            score *= 0.7
+
+        return score
+
+    @staticmethod
+    def _has_only_review_evidence(h: Hypothesis) -> bool:
+        """True if every link in the path comes from a review/narrative_review."""
+        _REVIEW_TYPES = {"narrative_review", "review"}
+        if not h.path:
+            return False
+        for link in h.path:
+            study_type = (link.evidence.get("study_type") or "").lower()
+            if study_type and study_type not in _REVIEW_TYPES:
+                return False
+        return True
+
+    def _check_contradiction(self, m1: dict, m2: dict) -> float:
+        """Check if two claims contradict each other. Returns severity 0-1."""
+        p1 = m1.get("predicate", "")
+        p2 = m2.get("predicate", "")
+        n1 = m1.get("negated", False)
+        n2 = m2.get("negated", False)
+
+        if p1 == p2 and n1 != n2:
+            return 1.0
+
+        if (p1, p2) in OPPOSING_PREDICATES:
+            return 0.8
+
+        if p1 == p2 and not n1 and not n2:
+            d1 = m1.get("evidence", {}).get("direction", "")
+            d2 = m2.get("evidence", {}).get("direction", "")
+            if d1 and d2 and d1 != d2:
+                return 0.6
+
+        return 0.0
+
+    def _infer_relation(self, path: list[str]) -> str:
+        """Infer a potential relation from a path's edge types."""
+        relations = []
+        for i in range(len(path) - 1):
+            if self.G.has_edge(path[i], path[i + 1]):
+                rt = self.G.edges[path[i], path[i + 1]].get("relation_type", "")
+                if rt and rt not in ("about", "is_a", "part_of"):
+                    relations.append(rt)
+
+        if relations:
+            for r in relations:
+                if r not in COMMON_RELATIONS:
+                    return r
+            return relations[0]
+        return "associated_with"
+
+    def _generate_explanation(self, h: Hypothesis) -> str:
+        """Generate a human-readable explanation for a hypothesis."""
+        path_str = " --> ".join(
+            f"{l.from_name} --[{l.relation_type}]--> {l.to_name}" for l in h.path
+        )
+        if not path_str:
+            return ""
+
+        pmids = {l.source_paper.get("pmid", "") for l in h.path if l.source_paper.get("pmid")}
+        key_finding = ""
+        for l in h.path:
+            if l.raw_text:
+                key_finding = l.raw_text[:150]
+                if len(l.raw_text) > 150:
+                    key_finding += "..."
+                break
+
+        lines = [
+            f"Hypothesis: {h.source_name} may relate to {h.target_name} via {len(h.path)}-hop path.",
+            f"Path: {path_str}",
+            f"Evidence: {len(h.supporting_claims)} claims from {len(pmids)} papers",
+        ]
+        if key_finding:
+            lines.append(f"Key finding: '{key_finding}'")
+        if h.testability_reason:
+            lines.append(f"Testability: {h.testability_reason}")
+        lines.append(
+            f"Confidence: {h.confidence_score:.2f} | "
+            f"Novelty: {h.novelty_score:.2f} | "
+            f"Evidence: {h.evidence_score:.2f} | "
+            f"Testability: {h.testability_score:.2f}"
+        )
+        return "\n".join(lines)
+
+    @staticmethod
+    def _geometric_mean(values: list[float]) -> float:
+        if not values:
+            return 0.0
+        product = math.prod(values)
+        return product ** (1.0 / len(values))
+
+
+def _simple_slope(xs: list[int], ys: list[int]) -> float:
+    """Simple linear regression slope without numpy."""
+    n = len(xs)
+    if n < 2:
+        return 0.0
+    mean_x = sum(xs) / n
+    mean_y = sum(ys) / n
+    num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
+    den = sum((x - mean_x) ** 2 for x in xs)
+    if den == 0:
+        return 0.0
+    return num / den