"""Hypothesis engine: batch-generate, persist, and rank testable hypotheses.

Phase 3 of the NeuroClaw discovery loop:
  1. batch_generate() — traverse the graph to produce hypotheses at scale
  2. save / load — persist hypotheses to JSON
  3. rank_hypotheses() — sort by novelty, evidence, testability, confidence
  4. (Phase 5-6) hypotheses become executable NeuroClaw analysis tasks

Usage:
    from core.knowledge_graph import load_graph, HypothesisEngine

    kg = load_graph()
    engine = HypothesisEngine(kg)

    # batch generate across all domain pairs
    hypotheses = engine.batch_generate()
    engine.save_hypotheses(hypotheses, "data/hypotheses.json")

    # or load and re-rank
    hypotheses = engine.load_hypotheses("data/hypotheses.json")
    ranked = engine.rank_hypotheses(hypotheses)
"""

from __future__ import annotations

import json
import logging
import math
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Optional

import networkx as nx

from .graph_manager import KnowledgeGraph
from .schema import ConceptNode

logger = logging.getLogger(__name__)

# ── data structures ────────────────────────────────────────────────────

@dataclass
class HypothesisLink:
    """A single step in a hypothesis chain.

    Describes one step from a "from" node to a "to" node. The first six
    fields are required; the remaining ones default to empty values.
    """
    # "From" endpoint of the step: identifier and display name.
    from_id: str
    from_name: str
    # "To" endpoint of the step: identifier and display name.
    to_id: str
    to_name: str
    # Label of the relation joining the two endpoints.
    relation_type: str
    # NOTE(review): range is not enforced here — presumably 0..1; confirm with the producer.
    confidence: float
    # Optional provenance. Presumably the id and source text of the claim this
    # step was derived from — TODO confirm against the code that builds links.
    claim_id: str = ""
    raw_text: str = ""
    # Free-form dicts; their schemas are not defined in this class.
    evidence: dict = field(default_factory=dict)
    source_paper: dict = field(default_factory=dict)


@dataclass
class Hypothesis:
    """A generated hypothesis with full evidence chain.

    Every field has a default, so an instance can be rebuilt from a partial
    mapping via :meth:`from_dict`. ``path`` holds the chain of
    ``HypothesisLink`` steps.
    """
    id: str = ""
    hypothesis_type: str = ""  # "path", "bridge", "gap", "contradiction"
    source_id: str = ""
    source_name: str = ""
    target_id: str = ""
    target_name: str = ""
    path: list[HypothesisLink] = field(default_factory=list)
    confidence_score: float = 0.0
    novelty_score: float = 0.0
    evidence_score: float = 0.0
    testability_score: float = 0.0
    composite_score: float = 0.0
    supporting_claims: list[str] = field(default_factory=list)
    explanation: str = ""
    testability_reason: str = ""
    metadata: dict = field(default_factory=dict)
    critic_score: float = 0.0
    critic_feedback: list[dict] = field(default_factory=list)
    critic_rounds: int = 0
    evolve_score: float = 0.0

    def to_dict(self) -> dict:
        """Return a plain-dict copy of this hypothesis.

        ``dataclasses.asdict`` recurses into nested dataclasses, so every
        ``HypothesisLink`` in ``path`` is converted to a dict as well.
        """
        return asdict(self)

    @staticmethod
    def _link_from_dict(step: dict) -> HypothesisLink:
        """Build one ``HypothesisLink`` from a mapping, ignoring unknown keys."""
        known = HypothesisLink.__dataclass_fields__
        return HypothesisLink(**{k: v for k, v in step.items() if k in known})

    @classmethod
    def from_dict(cls, d: dict) -> Hypothesis:
        """Rebuild a hypothesis from a mapping such as the output of ``to_dict``.

        Args:
            d: Mapping of field names to values. It is not modified. Keys that
                are not fields of this class are ignored.

        Returns:
            A new ``Hypothesis``. Dict entries of a list-valued ``path`` are
            converted to ``HypothesisLink`` (unknown link keys are ignored, the
            same tolerance applied to top-level keys); non-dict entries are
            kept as they are. A ``path`` of ``None`` falls back to the default
            empty list.

        Raises:
            TypeError: If a path step lacks a required ``HypothesisLink`` field.
        """
        known = cls.__dataclass_fields__
        kwargs = {k: v for k, v in d.items() if k in known}

        raw_path = kwargs.get("path")
        if raw_path is None:
            # Missing or null path: let the dataclass default ([]) apply so
            # callers can always iterate over ``path``.
            kwargs.pop("path", None)
        elif isinstance(raw_path, list):
            kwargs["path"] = [
                cls._link_from_dict(step) if isinstance(step, dict) else step
                for step in raw_path
            ]
        return cls(**kwargs)


@dataclass
class Contradiction:
    """A pair of conflicting claims.

    Records two concepts together with two claims about them: one "for" and
    one "against", each stored as id, predicate and text. All fields default
    to empty values.
    """
    # The two concepts the claims are about (identifier and display name).
    concept_a_id: str = ""
    concept_a_name: str = ""
    concept_b_id: str = ""
    concept_b_name: str = ""
    # The "for" claim: id, predicate and text.
    claim_for_id: str = ""
    claim_for_predicate: str = ""
    claim_for_text: str = ""
    # The "against" claim: id, predicate and text.
    claim_against_id: str = ""
    claim_against_predicate: str = ""
    claim_against_text: str = ""
    # NOTE(review): the scale of severity is not defined here — confirm with the producer.
    severity: float = 0.0


@dataclass
class Gap:
    """An unexplored relationship between two concepts.

    All fields default to empty values.
    """
    # The two concepts (identifier and display name).
    concept_a_id: str = ""
    concept_a_name: str = ""
    concept_b_id: str = ""
    concept_b_name: str = ""
    # Presumably the graph distance between the two concepts — TODO confirm units with the producer.
    distance: int = 0
    # Presumably the concepts lying between A and B; whether these are ids or
    # names is not established here — verify against the code that fills it.
    connecting_concepts: list[str] = field(default_factory=list)
    # Domain label of each concept.
    domain_a: str = ""
    domain_b: str = ""
    # Presumably a suggested relation type for the missing link — TODO confirm.
    potential_relation: str = ""


# ── constants ──────────────────────────────────────────────────────────

# Each opposing pair is listed once; the comprehension emits it in both
# orientations so membership tests work regardless of argument order.
OPPOSING_PREDICATES = {
    oriented
    for left, right in (
        ("increases", "reduces"),
        ("causes", "inhibits"),
        ("treats", "contraindicated_for"),
        ("activates", "inhibits"),
    )
    for oriented in ((left, right), (right, left))
}

# Review-only study types (no independent empirical evidence).
# Used by compute_frequency_boost and compute_temporal_decay. Edge-level
# weighting by study_type lives in phase4_optimize.apply_evidence_weighting.
_REVIEW_TYPES = {
    "review",
    "narrative_review",
    "systematic_review",
}

COMMON_RELATIONS = {
    "is_a",
    "part_of",
    "associated_with",
    "about",
    "is_associated_with",
}

# Noisy entity name patterns — hypotheses involving these are low quality.
# Two categories:
#   (a) process-word ≠ entity: nominalized verbs/states ("loss", "progression")
#       that pop up as bridge nodes but carry no biological content.
#   (b) generic containers: vague collective terms ("tissue volumes", "Family")
#       that don't refer to a specific measurable thing.
_NOISE_WORDS = frozenset().union(
    # original set
    (
        "unseen", "risk", "effect", "level", "status", "change", "type",
        "group", "factor", "model", "method", "unknown", "other", "none",
        "miscellaneous", "various", "difference", "increase", "decrease",
    ),
    # category (a): nominalized processes/states
    (
        "loss", "progression", "reduction", "elevation", "alteration",
        "disruption", "dysfunction", "impairment", "deterioration",
        "improvement", "recovery", "response", "onset", "activation",
        "inhibition", "regulation", "modulation", "stimulation",
        "expression", "function", "functions",
    ),
    # category (b): generic containers
    (
        "family", "members", "phenomenon", "phenomena", "processes",
        "mechanisms", "pathways", "symptoms", "manifestations",
        "volumes", "volume",
    ),
    # life events / demographics that are not biological entities
    (
        "stress", "life", "events", "exposure", "outcome", "outcomes",
        "quality",
    ),
)

NOISE_PATTERNS = [
    re.compile(expr)
    for expr in (
        r"^[A-Z][a-z]?$",       # one capital, optionally one lowercase letter: "Id", "Ca", "Mg"
        r"^[A-Z][a-z]{2,4}$",   # one capital followed by 2-4 lowercase letters: "Tics", "Risk"
        r"^\d+$",               # digits only
    )
]

# (C-1) Generic-phrase patterns for INTERMEDIATE nodes. The token-based
# `_NOISE_WORDS` filter misses phrases like "functional connectivity" or
# "neural activity" because no individual word is in the noise list, but
# the WHOLE phrase carries no measurable content. We only block these when
# they appear as INTERMEDIATE nodes (paths can legitimately end in
# "functional connectivity" as an outcome metric).
_GENERIC_INTERMEDIATE_PATTERNS = [
    re.compile(expr, re.I)
    for expr in (
        # [qualifier] + brain-level noun + generic measure
        r"^(abnormal|altered|impaired|reduced|increased|disrupted|aberrant)?\s*"
        r"(brain|neural|neuronal|cortical|cerebral)\s+"
        r"(activity|activation|function|functioning|connectivity|"
        r"network|networks|signaling|metabolism|response|responses)$",
        # connectivity kind + generic noun
        r"^(functional|structural|anatomical|effective)\s+"
        r"(connectivity|network|networks|integrity|abnormalit(y|ies))$",
        # clinical umbrella phrases
        r"^(disease|symptom|clinical|treatment|therapeutic)\s+"
        r"(progression|outcome|outcomes|response|severity|burden|stage|staging)$",
        # vague adjective + vague plural
        r"^(common|typical|specific|various|different)\s+"
        r"(features|patterns|mechanisms|processes)$",
        # bare "(neuro)X" process words
        r"^(neuro)?(degeneration|inflammation|protection|plasticity|genesis|imaging)$",
        # grey/gray/white matter with no further qualifier
        r"^(grey|gray|white)\s+matter$",
        # functional domain + generic deficit word
        r"^(cognitive|behavioral|emotional|motor|sensory)\s+"
        r"(deficit|deficits|dysfunction|impairment|abnormalit(y|ies))$",
    )
]

# (C-3) Target-name patterns that LOOK like outcomes (so they pass
# _is_dataset_outcome's keyword fallback) but are actually too broad to
# drive a DL experiment. We block these even if their domain says
# disease/cognitive_function.
_TARGET_TOO_BROAD_PATTERNS = [
    re.compile(expr, re.I)
    for expr in (
        # bare umbrella nouns (single token)
        r"^(skill|skills|ability|abilities|outcome|outcomes|"
        r"symptom|symptoms|manifestation|manifestations|"
        r"phenomenon|phenomena|finding|findings|"
        r"deficit|deficits|impairment|impairments|"
        r"function|functions|functioning|behavior|behaviors|"
        r"capability|capabilities|condition|conditions|"
        r"disease|diseases|disorder|disorders|syndrome|syndromes|"
        r"focus|integration|balance|knowledge|autonomy|"
        r"performance|adaptation|resilience|vulnerability|"
        r"recovery|progression|mechanism|process)$",
        # broad-category disease umbrellas; a specific subtype such as
        # "Alzheimer Disease" does not match
        r"^(neurological|psychiatric|mental|cognitive|behavioral|"
        r"neurodegenerative|cardiovascular)\s+"
        r"(disease|diseases|disorder|disorders|condition|conditions)$",
        r"^(human\s+)?(disease|diseases|disorder|disorders)$",
        r"^(brain|mental|psychiatric|psychological)\s+health$",
        r"^clinical\s+(features|outcome|outcomes|presentation|status)$",
        # "X deficits/impairments" phrases (too vague as targets)
        r"^(motor|cognitive|neurocognitive|functional|social|"
        r"verbal|visual|sensory|emotional|behavioral)\s+"
        r"(deficit|deficits|impairment|impairments|dysfunction|"
        r"disability|decline|deterioration)$",
    )
]

# Vague relation types that add little signal
VAGUE_RELATIONS = {
    "is_associated_with",
    "associated_with",
    "about",
}

# CognitiveAtlas / MeSH concept ids that are top-degree generic hubs
# in the KG. The audit found these at degrees 700-9000+, with names that
# are real English words (not caught by _NOISE_WORDS) but referring to
# extremely abstract umbrella concepts:
#
#   COGAT trm_4a3fd79d0a891  "memory"      degree 2248
#   COGAT trm_4a3fd79d0a80f  "logic"       degree 2052
#   COGAT trm_5159c80c1dd24  "loss"        degree 1034
#   COGAT trm_4a3fd79d09741  "activation"  degree  840
#   COGAT trm_4a3fd79d0afcf  "risk"        degree  722
#   COGAT trm_4a3fd79d0b2a8  "stress"      degree  139
#   MSH:D001921              "Brain"       degree 9157
#   MSH:D009474              "Neurons"     degree 1354
#
# Hypotheses with these as intermediate nodes or endpoints are too vague
# to drive a downstream DL experiment ("FPN -> memory" is not testable
# because we don't know which memory subsystem). Filtered in post_process.
PATH_IGNORE_NODE_IDS = frozenset((
    # CognitiveAtlas concepts
    "COGAT_CONCEPT:trm_4a3fd79d0a891",  # memory
    "COGAT_CONCEPT:trm_4a3fd79d0a80f",  # logic
    "COGAT_CONCEPT:trm_5159c80c1dd24",  # loss
    "COGAT_CONCEPT:trm_4a3fd79d09741",  # activation
    "COGAT_CONCEPT:trm_4a3fd79d0afcf",  # risk
    "COGAT_CONCEPT:trm_4a3fd79d0b2a8",  # stress
    # MeSH umbrella terms
    "MSH:D001921",  # Brain
    "MSH:D009474",  # Neurons
))

# Disease/category mega-hubs that are valid as hypothesis endpoints
# ("predict Alzheimer" is fine) but NOT as intermediate transit nodes
# ("A → Alzheimer → B" is just "A relates to AD, AD relates to B" — no
# discovery value). Audit found 37.8% of hypotheses transit through these.
INTERMEDIATE_ONLY_IGNORE_IDS = frozenset((
    # name (audited degree)
    "COGAT_DISORDER:dso_5419",     # schizophrenia (1005)
    "MSH:D009103",                 # Multiple Sclerosis (816)
    "COGAT_DISORDER:dso_3312",     # bipolar disorder (703)
    "MSH:D000544",                 # Alzheimer Disease (746)
    "MSH:D004827",                 # Epilepsy (750)
    "MSH:D010300",                 # Parkinson Disease (709)
    "COGAT_DISORDER:dso_0060041",  # autism spectrum disorder (613)
    "MSH:D001289",                 # ADHD (601)
    "MSH:D003863",                 # Depression (577)
    "MSH:D001523",                 # Mental Disorders (489)
))

DIRECTIONAL_RELATIONS = {
    "causes",
    "treats",
    "increases",
    "reduces",
    "modulates",
    "activates",
    "inhibits",
    "is_biomarker_of",
    "is_risk_factor_for",
    "predicts",
    "distinguishes",
    "mediates",
} | {
    # Brain decoding directional predicates
    "evokes",
    "decoded_from",
    "elicits",
}

# domain pairs worth exploring — aligned with NeuroClaw imaging experiments
# target datasets: UKB (T1w/dMRI/rfMRI/SWI), ADNI (T1w/PET/fMRI/DTI), HCP-YA (T1w/T2w/fMRI/dMRI/MEG)
# experiment models: BrainGNN, NeuroStorm, SVM, XGBoost on raw images + handcrafted features
#
# Design principle: target should be a dataset OUTCOME (what we want to predict),
# source should be a MEASURABLE feature (what the dataset provides as input).
# - UKB outcomes: fluid intelligence, neuroticism, dementia diagnosis, motor tests
# - ADNI outcomes: MCI→AD conversion, CDR-SB, cognitive composite
# - HCP outcomes: fluid/crystallized IQ, emotion recognition, personality traits
#
# Allowed sources (what we can measure): neuroanatomy (MRI regions), connectivity
# networks, gene, biomarker (CSF/PET), drug (for intervention studies).
# Allowed targets (what we predict): disease (diagnostic labels), cognitive_function
# (the OUTCOMES — includes behavior, personality, affect).
DEFAULT_DOMAIN_PAIRS = [
    # core: every measurable feature domain paired with every outcome domain
    # (MRI regions, connectivity, genetics, fluid biomarkers, drugs)
    (feature_domain, outcome_domain)
    for feature_domain in ("neuroanatomy", "connectivity", "gene", "biomarker", "drug")
    for outcome_domain in ("disease", "cognitive_function")
] + [
    # cross-outcome (comorbidity, transdiagnostic)
    ("disease", "disease"),
    ("cognitive_function", "disease"),   # e.g. anxiety → MS diagnosis risk
    ("disease", "cognitive_function"),   # e.g. AD → processing speed decline
]

# Biomarker types that are NOT directly measurable from brain imaging.
# Hypotheses involving these types will be filtered out in post_process.
NON_MEASURABLE_BIOMARKER_TYPES = {
    "neurotransmitter",  # needs specialized PET tracers (e.g., 11C-raclopride for DA)
    "protein",           # needs tissue biopsy or CSF
    "enzyme",            # needs molecular assays
    "receptor",          # needs specialized PET (e.g., 11C-PIB for Aβ, but that's biomarker domain)
} | {
    # fluid biomarkers — not available in UKB/HCP-YA, only ADNI CSF subset
    "csf_biomarker",
    "blood_biomarker",
    "saliva_biomarker",
    "tear_biomarker",
}

# Specific entity name patterns that are NOT directly measurable from imaging
# Unanchored, case-insensitive patterns. Separators written as ``[\s-]*``
# accept the hyphenated spellings ("alpha-synuclein", "amyloid-beta",
# "TNF-alpha") as well as the space-separated or fused ones; a plain ``\s*``
# would silently miss the hyphenated forms.
_NON_MEASURABLE_PATTERNS = [
    re.compile(r"(neurotransmitter|dopamine|serotonin|norepinephrine|gaba|glutamate|acetylcholine)\s+(level|concentration|release|synthesis)", re.I),
    re.compile(r"(alpha|beta|gamma|delta|kappa)[\s-]*synuclein\s*(pathology|aggregation|expression)", re.I),
    re.compile(r"(amyloid|tau|phosphorylated)[\s-]*(beta|protein|peptide)\s*(aggregation|production|clearance)", re.I),
    re.compile(r"(enzyme|kinase|phosphatase)\s*(activity|expression)", re.I),
    re.compile(r"(receptor|transporter)\s*(density|binding|expression)", re.I),
    re.compile(r"(TNF|interleukin|IL-\d|cytokine|chemokine)[\s-]*(alpha|beta|level|concentration|production)", re.I),
    re.compile(r"CSF\s+(Aβ|amyloid|tau|p-tau|NFL|neurofilament)", re.I),
    re.compile(r"(blood|plasma|serum)\s+(biomarker|marker|level|concentration)", re.I),
    re.compile(r"(CSF|cerebrospinal fluid)\s+", re.I),
    re.compile(r"(saliva|tear|urine)\s+(biomarker|marker|level)", re.I),
    re.compile(r"(biopsy|tissue sample)", re.I),
]

# Non-neurological target domains — brain regions should not directly predict these
# One case-insensitive alternation over the terms below; each term matches as
# a substring (no anchors or word boundaries).
# NOTE(review): the bare term "frequency" also matches names such as
# "seizure frequency" — confirm that is intended.
_NON_NEUROLOGICAL_TARGETS = re.compile(
    "(" + "|".join((
        "urinary", "incontinence", "frequency", "enuresis", "bladder",
        "renal", "kidney", "liver",
        "gastrointestinal", "cardiac", "pulmonary", "dermatol",
        "orthopedic", "musculoskeletal",
        "fracture", "sprain",
        "tumor", "cancer", "carcinoma", "leukemia", "lymphoma",
    )) + ")",
    re.I,
)

# DATASET-OUTCOME whitelist — covers actual predicted variables in UKB/ADNI/HCP-YA
# papers (see README "Dataset Outcomes" for references to typical prediction tasks).
# Target must match one of these patterns to pass the post_process filter.
# We also auto-accept any concept in the `disease` domain (clinical diagnosis
# IS the most common outcome) and any MSH/CogAtlas concept in the
# `cognitive_function` domain (behavior/cognition).
#
# Categories cover:
# - Clinical diagnostic labels (Alzheimer, schizophrenia, MCI, etc.) — all 3 datasets
# - AD staging / conversion (CN→MCI→AD, ATN) — ADNI
# - Clinical scales (CDR, MMSE, ADAS-Cog, PHQ-9, MoCA, NPI) — ADNI + UKB
# - Cognitive abilities (IQ, memory, attention, processing speed) — all 3
# - Specific cognitive tests (PMAT, flanker, N-back, delay discounting) — HCP
# - Personality (Big Five) — HCP + UKB
# - Behavior/affect (anxiety, depression, aggression, risk-taking) — all 3
# - Motor/sensory (grip strength, gait, reaction time, dexterity) — UKB + HCP
# - Brain age / neurodegeneration markers — UKB + ADNI
# - NeuroSTORM-evaluated phenotypes: MND, early psychosis (HCP-EP), ADHD200,
#   COBRE, UCLA L5c, TCP psychiatric scales, fMRI task state classification
# - Subject fingerprinting / re-identification
# One case-insensitive alternation; most terms match as substrings. Short
# tokens that commonly occur inside unrelated words are wrapped in ``\b`` so
# that e.g. "als" does not fire on "signals"/"potentials" and "aging" does
# not fire on "imaging".
_OUTCOME_KEYWORDS = re.compile(
    r"("
    # cognitive abilities — general
    r"intelligence|cognition|cognitive\s+(function|ability|performance|deterioration|impairment|dysfunction|decline|test|assessment|composite|score)|"
    r"memory|attention|executive|processing\s+speed|reasoning|language|"
    r"fluency|perception|reaction\s+time|fluid\s+intelligence|"
    r"crystallized\s+intelligence|working\s+memory|episodic\s+memory|"
    r"semantic\s+memory|verbal\s+(memory|fluency|learning)|visuospatial|"
    # specific HCP NIH Toolbox / cognitive tasks
    r"pmat|flanker|card\s+sort|n-?back|list\s+sort|picture\s+sequence|"
    r"pattern\s+comparison|picture\s+vocabulary|oral\s+reading|"
    r"delay\s+discounting|risk[- ]taking|go[- ]no[- ]go|"
    # HCP Penn CNB cognitive battery
    r"penn\s+(word|matrix|line\s+orientation|continuous\s+performance|progressive\s+matrices|fear|emotion|cnb)|"
    r"matrix\s+pattern|numeric\s+memory|prospective\s+memory|pairs\s+matching|"
    r"trail\s+making|symbol\s+digit|boston\s+naming|animal\s+fluency|"
    r"category\s+fluency|logical\s+memory|clock\s+drawing|ravlt|"
    # HCP 7 task states (NeuroSTORM state classification)
    r"emotion\s+task|gambling\s+task|language\s+task|motor\s+task|"
    r"relational\s+task|social\s+task|working\s+memory\s+task|"
    # clinical scales (ADNI/UKB/TCP/HCP)
    r"\b(cdr|cdr-sb|mmse|moca|adas|adas-cog|npi|faq|gds|phq-?9|gad-?7|bai|hdrs|hrsd|hamd|ham-d|"
    r"bdi|ymrs|panss|sans|saps|audit|asrs|pro|adi|srs|tci|neo-?ffi|asr|abcl|"
    r"cidi|cidi-sf|eysenck|swemwbs|psqi|ftnd|ssaga|masq|promis|upsit)\b|"
    r"adult\s+self\s+report|adult\s+behavior\s+checklist|"
    # personality / affect
    r"neuroticism|extraversion|agreeableness|conscientiousness|openness|"
    r"personality|temperament|affect|mood|emotion|anxiety|depression|"
    r"well-?being|satisfaction|life\s+satisfaction|psychological|stress\s+response|"
    r"anxiety\s+sensitivity|cautiousness|"
    r"affect\s+(positive|negative)|emotion\s+recognition|emotional\s+regulation|"
    r"perceived\s+(stress|rejection|hostility)|anger|fear|sadness|"
    # social functioning (HCP + UKB)
    r"loneliness|social\s+(isolation|support|relationship|cognition)|"
    r"meaning\s+and\s+purpose|instrumental\s+support|emotional\s+support|"
    r"friendship|"
    # behavior
    r"behavior|aggression|impulsivity|addiction|substance|alcohol|smoking|"
    r"tobacco|cannabis|cocaine|opiate|opioid|hallucinogen|"
    r"drug\s+use|substance\s+use|sleep\s+quality|insomnia|"
    # diagnoses / clinical outcomes — added NeuroSTORM-evaluated cohorts and ADNI stages
    r"alzheimer|parkinson|schizophrenia|autism|adhd|bipolar|epilepsy|"
    r"mci|mild\s+cognitive|dementia|psychosis|early\s+psychosis|stroke|post[- ]stroke|"
    r"multiple\s+sclerosis|huntington|frontotemporal|lewy\s+body|"
    # "mnd"/"als" are word-bounded (non-capturing, so group numbering is
    # unchanged): unbounded "als" matches inside "signals", "potentials", ...
    r"motor\s+neuron\s+disease|\b(?:mnd|als)\b|"
    r"transdiagnostic|psychiatric\s+disorder|mental\s+health\s+disorder|"
    r"ocd|ptsd|phobia|panic|agoraphobia|somatoform|eating\s+disorder|"
    # ADNI-specific diagnostic stages
    r"\b(cn|smc|emci|lmci|ad\b|preclinical|at\b|atn|alzheimer\s+continuum)\b|"
    r"significant\s+memory\s+concern|subjective\s+(memory|cognitive)\s+(concern|complaint|decline)|"
    r"cognitively\s+(normal|unimpaired)|"
    r"disorder|syndrome|diagnosis|onset|conversion|progression|severity|"
    r"symptom|manifestation|prognosis|outcome|treatment\s+response|"
    r"disease\s+(stage|staging|duration|burden)|"
    # cardiovascular / metabolic diseases (UKB ICD-10)
    r"myocardial\s+infarction|heart\s+failure|hypertension|atrial\s+fibrillation|"
    r"coronary|cardiovascular\s+disease|diabetes|type\s*[12]\s+diabetes|"
    r"chronic\s+kidney|fatty\s+liver|nafld|metabolic\s+syndrome|obesity|"
    # AD-specific biomarker status
    r"amyloid\s+(status|positivity|positive|negative|load|burden|suvr)|"
    r"tau\s+(status|positivity|positive|tangle|pathology|burden|suvr)|"
    r"atn\s+(profile|stage|classification)|"
    r"neurodegeneration\s+(stage|status)|"
    # brain age / aging — "aging" is word-bounded so it cannot match the tail
    # of "imaging" / "neuroimaging" / "staging"
    r"brain\s+age|brain-?age(-?gap)?|\baging\b|age[- ]related|age\s+acceleration|"
    # motor / sensory
    r"grip\s+strength|gait|motor\s+coordination|motor\s+function|"
    r"balance|tremor|dexterity|walking\s+speed|two[- ]minute\s+walk|endurance|"
    r"visual\s+(acuity|field)|audition|hearing|olfaction|taste|pain|"
    r"chronic\s+pain|musculoskeletal\s+pain|"
    # mortality / longevity
    r"mortality|all-?cause\s+death|survival|life\s+expectancy"
    r")", re.I
)

# Target domains considered as valid dataset outcomes
_OUTCOME_DOMAINS = {
    "disease",
    "cognitive_function",
}

# NeuroClaw testable modalities and their keywords
# Aligned with UKB/ADNI/HCP-YA available data + deep learning models
# Maps each modality label to the keyword list associated with it.
TESTABLE_MODALITIES = dict(
    sMRI=[
        "cortical thickness", "volume", "atrophy", "gray matter", "white matter",
        "brain structure", "morphometry", "VBM", "FreeSurfer", "recon-all",
        "brain region", "hippocampus", "amygdala", "thalamus", "caudate",
        "putamen", "cerebellum", "insula", "cortex", "ventricle",
    ],
    fMRI=[
        "functional connectivity", "BOLD", "activation", "resting-state",
        "task-based", "network", "default mode", "fMRI", "brain response",
        "neural activity", "brain activation",
    ],
    dMRI=[
        "DTI", "diffusion", "fractional anisotropy", "tractography",
        "white matter integrity", "structural connectivity", "FA", "MD",
        "connectivity matrix", "fiber bundle", "white matter tract",
    ],
    PET=[
        "PET", "tracer", "amyloid", "tau", "FDG", "SUVr", "binding potential",
        "glucose metabolism", "florbetapir", "flortaucipir",
    ],
    EEG=[
        "EEG", "ERP", "oscillation", "power spectrum", "alpha", "beta", "theta",
        "delta", "gamma", "microstate", "coherence", "event-related",
    ],
    organ_volume=[
        "organ volume", "liver volume", "kidney volume", "spleen volume",
        "MedSAM", "segmentation", "organ size",
    ],
)

# Deep learning model keywords for testability scoring
DL_MODEL_KEYWORDS = [
    "BrainGNN",
    "NeuroStorm",
    "GNN",
    "graph neural",
    "region of interest",
    "ROI",
    "connectivity matrix",
    "adjacency",
    "node feature",
    "graph convolution",
    "deep learning",
    "CNN",
    "ResNet",
    "attention",
    "transformer",
    "voxel",
    "patch",
    "whole-brain",
]

# ── Dataset-Available Variables ──────────────────────────────────────
# Defines what can be measured in each dataset. Hypotheses must start
# from these features and end at dataset-available outcomes.

# dataset name -> feature key -> descriptor with "modality", "tool", "level"
# (visual-task features additionally carry "stimulus"). Every descriptor is a
# freshly built dict, so no two entries share an object.
DATASET_FEATURES = {
    "UKB": {
        # sMRI (T1w): FreeSurfer-derived ROI measures
        "smri_cortical_thickness": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_subcortical_volume": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_cortical_area": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_cortical_volume": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_voxel": dict(modality="sMRI", tool="voxel", level="voxel"),
        # dMRI: diffusion metrics per tract
        "dmri_fa": dict(modality="dMRI", tool="TBSS", level="tract"),
        "dmri_md": dict(modality="dMRI", tool="TBSS", level="tract"),
        "dmri_sc": dict(modality="dMRI", tool="tractography", level="connectivity"),
        # rfMRI: functional connectivity
        "rfmri_fc": dict(modality="fMRI", tool="rfMRI", level="connectivity"),
        # lesion segmentation
        "lesion_volume": dict(modality="sMRI", tool="MedSAM", level="ROI"),
        # non-imaging
        "genetics": dict(modality="genetics", tool="WGS/GSA", level="SNP"),
        "environment": dict(modality="environment", tool="questionnaire", level="variable"),
        "physical": dict(modality="physical", tool="measurement", level="variable"),
        "hospitalization": dict(modality="clinical", tool="ICD10", level="outcome"),
    },
    "ADNI": {
        "smri_cortical_thickness": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_subcortical_volume": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_voxel": dict(modality="sMRI", tool="voxel", level="voxel"),
        "pet_amyloid": dict(modality="PET", tool="florbetapir", level="ROI"),
        "pet_tau": dict(modality="PET", tool="flortaucipir", level="ROI"),
        "pet_fdg": dict(modality="PET", tool="FDG", level="ROI"),
        "fmri_fc": dict(modality="fMRI", tool="task/resting", level="connectivity"),
        "dti_fa": dict(modality="dMRI", tool="DTI", level="tract"),
        "lesion_volume": dict(modality="sMRI", tool="MedSAM", level="ROI"),
        "genetics": dict(modality="genetics", tool="APOE/GWAS", level="SNP"),
        "medication": dict(modality="clinical", tool="medication_log", level="variable"),
    },
    "HCP_YA": {
        "smri_cortical_thickness": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_myelin": dict(modality="sMRI", tool="T1w/T2w", level="ROI"),
        "smri_voxel": dict(modality="sMRI", tool="voxel", level="voxel"),
        "rfmri_fc": dict(modality="fMRI", tool="rfMRI", level="connectivity"),
        "tfmri_task": dict(modality="fMRI", tool="task fMRI", level="activation"),
        "dmri_sc": dict(modality="dMRI", tool="HARDI", level="connectivity"),
        "meg": dict(modality="MEG", tool="MEG", level="connectivity"),
    },
    # NAS-available patient cohorts with preprocessed ROI time series.
    # Phenotype CSVs live under Z:\Dataset\fMRI\phenotype and the dataset-
    # specific rest csvs. ABIDE, ADHD200 and COBRE list the same two rfMRI
    # features (connectivity + ROI time series).
    **{
        cohort: {
            "rfmri_fc": dict(modality="fMRI", tool="rfMRI", level="connectivity"),
            "rfmri_roi_ts": dict(modality="fMRI", tool="rfMRI", level="ROI"),
        }
        for cohort in ("ABIDE", "ADHD200", "COBRE")
    },
    "UCLA": {
        # UCLA CNP — rest + 6 task contrasts, cross-diagnosis cohort.
        "rfmri_fc": dict(modality="fMRI", tool="rfMRI", level="connectivity"),
        "rfmri_roi_ts": dict(modality="fMRI", tool="rfMRI", level="ROI"),
        "tfmri_task": dict(modality="fMRI", tool="task fMRI", level="activation"),
    },
    "HCP_EP": {
        # HCP Early Psychosis — patient cohort, T1w + rfMRI cleaned.
        "smri_cortical_thickness": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_subcortical_volume": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "rfmri_fc": dict(modality="fMRI", tool="rfMRI", level="connectivity"),
        "rfmri_roi_ts": dict(modality="fMRI", tool="rfMRI", level="ROI"),
    },
    "HCP_AGING": {
        # HCP-Aging — T1w + rfMRI REST1/REST2 + 3 task contrasts.
        "smri_cortical_thickness": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_subcortical_volume": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "smri_myelin": dict(modality="sMRI", tool="T1w/T2w", level="ROI"),
        "rfmri_fc": dict(modality="fMRI", tool="rfMRI", level="connectivity"),
        "rfmri_roi_ts": dict(modality="fMRI", tool="rfMRI", level="ROI"),
        "tfmri_task": dict(modality="fMRI", tool="task fMRI", level="activation"),
    },
    # ── Visual decoding (fMRI) ──────────────────────────────────────────
    # NSD & BOLD5000: image-stimulus visual task fMRI, no rest.
    "NSD": {
        "smri_cortical_thickness": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "tfmri_visual_voxel": dict(
            modality="fMRI", tool="task fMRI", level="voxel", stimulus="natural_image"
        ),
        "tfmri_visual_roi": dict(
            modality="fMRI", tool="task fMRI", level="ROI", stimulus="natural_image"
        ),
    },
    "BOLD5000": {
        "smri_cortical_thickness": dict(modality="sMRI", tool="FreeSurfer", level="ROI"),
        "tfmri_visual_voxel": dict(
            modality="fMRI", tool="task fMRI", level="voxel", stimulus="ImageNet_COCO_Scene"
        ),
        "tfmri_visual_roi": dict(
            modality="fMRI", tool="task fMRI", level="ROI", stimulus="ImageNet_COCO_Scene"
        ),
    },
    # ── Visual decoding (EEG) ───────────────────────────────────────────
    "SEED_DV": {
        "eeg_psd": dict(modality="EEG", tool="PSD", level="channel"),
        "eeg_de": dict(modality="EEG", tool="DE", level="channel"),
    },
    # ── Emotion decoding (EEG + eye tracking) ───────────────────────────
    "SEED": {
        "eeg_de": dict(modality="EEG", tool="DE", level="channel"),
        "eeg_psd": dict(modality="EEG", tool="PSD", level="channel"),
    },
    # These five SEED variants list the same two features.
    **{
        variant: {
            "eeg_de": dict(modality="EEG", tool="DE", level="channel"),
            "eye_movement": dict(
                modality="eye_tracking", tool="saccade/fixation", level="variable"
            ),
        }
        for variant in ("SEED_IV", "SEED_V", "SEED_VII", "SEED_GER", "SEED_FRA")
    },
    # ── Vigilance decoding (EEG) ────────────────────────────────────────
    "SEED_VIG": {
        "eeg_de": dict(modality="EEG", tool="DE", level="channel"),
        "eog": dict(modality="EOG", tool="EOG", level="channel"),
        "eye_movement": dict(modality="eye_tracking", tool="gaze/blink", level="variable"),
    },
}

# Outcome labels available per dataset.
# Keys are dataset identifiers (the same strings that appear in the
# "datasets" lists of the feature templates, e.g. "UKB", "ADNI", "HCP_YA").
# Each value is a list of outcome names; the trailing comment on an entry
# names the instrument or label set behind that outcome.
DATASET_OUTCOMES: dict[str, list[str]] = {
    "UKB": [
        "disease_diagnosis",   # ICD10 codes
        "mortality",           # death registry
        "cognitive_score",     # touchscreen cognitive tests
        "imaging_phenotype",   # derived imaging phenotypes
    ],
    "ADNI": [
        "diagnosis",           # CN / MCI / AD
        "conversion",          # MCI → AD conversion
        "cognitive_decline",   # ADAS-Cog, MMSE decline
        "biomarker_status",    # amyloid+/tau+ status
    ],
    "HCP_YA": [
        "behavioral_score",    # NIH Toolbox
        "cognitive_task",      # task fMRI performance
        "personality",         # NEO-FFI
    ],
    # ABIDE — ASD vs controls, rest only.
    "ABIDE": [
        "diagnosis",           # ASD vs TD
        "symptom_severity",    # ADOS, ADI-R, SRS
        "cognitive_score",     # FIQ/VIQ/PIQ
    ],
    # ADHD200 — ADHD subtype vs TDC.
    "ADHD200": [
        "diagnosis",           # ADHD (combined/inattentive/hyperactive) vs TDC
        "symptom_severity",    # ADHD-RS, Conners
        "cognitive_score",     # WASI/WISC
    ],
    # COBRE — schizophrenia vs controls.
    "COBRE": [
        "diagnosis",           # schizophrenia vs HC
        "symptom_severity",    # PANSS positive/negative/general
        "cognitive_score",     # WAIS
    ],
    # UCLA CNP — schizophrenia/bipolar/ADHD vs controls.
    "UCLA": [
        "diagnosis",           # SCZ / BP / ADHD / HC
        "symptom_severity",    # HAM-D, YMRS, ADHD-RS
        "cognitive_task",      # 6 task contrasts
    ],
    # HCP-EP — early psychosis (FES + AR) vs HC.
    "HCP_EP": [
        "diagnosis",           # affective/non-affective psychosis vs HC
        "symptom_severity",    # PANSS, SANS, YMRS
        "cognitive_score",     # MATRICS Consensus Cognitive Battery
    ],
    # HCP-Aging — lifespan 36-100 yrs, healthy aging.
    "HCP_AGING": [
        "cognitive_decline",   # NIH Toolbox across age
        "behavioral_score",    # same battery as HCP-YA
        "cognitive_task",      # CARIT/FACENAME/VISMOTOR
    ],
    # ── Visual decoding outcomes ────────────────────────────────────────
    "NSD": [
        "image_category",         # COCO 80-class
        "image_semantic",         # CLIP / language-model embedding
        "stimulus_reconstruction",# pixel / latent reconstruction
    ],
    "BOLD5000": [
        "image_category",         # ImageNet 1000-class / COCO / Scene
        "scene_type",             # Scene 365-class
        "image_semantic",
    ],
    "SEED_DV": [
        "video_class",            # discrete video categories
        "video_semantic",
        "video_reconstruction",
    ],
    # ── Emotion decoding outcomes ───────────────────────────────────────
    "SEED":     ["emotion_3class"],            # positive/neutral/negative
    "SEED_IV":  ["emotion_4class"],            # happy/sad/fear/neutral
    "SEED_V":   ["emotion_5class"],            # +disgust
    "SEED_VII": ["emotion_7class", "emotion_continuous"],
    "SEED_GER": ["emotion_3class"],
    "SEED_FRA": ["emotion_3class"],
    # ── Vigilance decoding outcomes ─────────────────────────────────────
    "SEED_VIG": ["vigilance_continuous", "perclos"],
}

# Imaging feature templates — dynamically combined with AAL atlas regions.
# {region} is replaced with actual neuroanatomy node names at generation time.
#
# Each key is a feature-name template; each value describes how the feature
# is obtained:
#   "modality" — imaging modality (sMRI / dMRI / PET)
#   "tool"     — pipeline or tracer that yields the measure
#   "level"    — spatial granularity of the measure (ROI / tract)
#   "datasets" — dataset identifiers listed as providing this feature
IMAGING_FEATURE_TEMPLATES: dict[str, dict] = {
    # sMRI FreeSurfer ROI features
    "cortical thickness of {region}":   {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                          "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
    "gray matter volume of {region}":   {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                          "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
    "subcortical volume of {region}":   {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                          "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
    "cortical area of {region}":        {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                          "datasets": ["UKB", "HCP_YA", "HCP_AGING"]},
    # dMRI tract features
    "fractional anisotropy of {region}": {"modality": "dMRI", "tool": "TBSS", "level": "tract",
                                           "datasets": ["UKB", "HCP_YA"]},
    "mean diffusivity of {region}":      {"modality": "dMRI", "tool": "TBSS", "level": "tract",
                                           "datasets": ["UKB", "HCP_YA"]},
    # PET ROI features (ADNI)
    "amyloid SUVR of {region}":          {"modality": "PET", "tool": "florbetapir", "level": "ROI",
                                           "datasets": ["ADNI"]},
    "tau SUVR of {region}":              {"modality": "PET", "tool": "flortaucipir", "level": "ROI",
                                           "datasets": ["ADNI"]},
    "FDG uptake of {region}":            {"modality": "PET", "tool": "FDG", "level": "ROI",
                                           "datasets": ["ADNI"]},
    # lesion segmentation
    "lesion volume of {region}":          {"modality": "sMRI", "tool": "MedSAM", "level": "ROI",
                                           "datasets": ["UKB", "ADNI"]},
}

# Connectivity feature templates — {a} and {b} are AAL regions.
# Values use the same schema as the single-region imaging templates
# ("modality", "tool", "level", "datasets"); "level" is always "connectivity".
# Note the wording: "between {a} and {b}" for the functional and structural
# templates, "from {a} to {b}" for the effective-connectivity template.
CONNECTIVITY_FEATURE_TEMPLATES: dict[str, dict] = {
    "functional connectivity between {a} and {b}":    {"modality": "fMRI", "tool": "rfMRI",
                                                        "level": "connectivity",
                                                        "datasets": ["UKB", "ADNI", "HCP_YA",
                                                                     "ABIDE", "ADHD200", "COBRE",
                                                                     "UCLA", "HCP_EP", "HCP_AGING"]},
    "effective connectivity from {a} to {b}":         {"modality": "fMRI", "tool": "DCM/GC",
                                                        "level": "connectivity",
                                                        "datasets": ["ADNI", "HCP_YA",
                                                                     "UCLA", "HCP_EP", "HCP_AGING"]},
    "structural connectivity between {a} and {b}":    {"modality": "dMRI", "tool": "tractography",
                                                        "level": "connectivity",
                                                        "datasets": ["UKB", "HCP_YA"]},
}

# Domain pairs for imaging-driven hypothesis generation.
# Each entry is (source domain, target domain), aligned with dataset
# modalities. The strings are domain tags as carried by graph nodes.
IMAGING_DOMAIN_PAIRS: list[tuple[str, str]] = [
    # sMRI features → disease
    ("neuroanatomy", "disease"),
    # connectivity → disease
    ("connectivity", "disease"),
    # sMRI features → cognitive function
    ("neuroanatomy", "cognitive_function"),
    # gene → brain structure (UKB genetics + imaging)
    ("gene", "neuroanatomy"),
    # disease → drug (ADNI)
    ("disease", "drug"),
]

# Brain decoding domain pairs (NSD / BOLD5000 / SEED family).
# These are SEPARATE from IMAGING_DOMAIN_PAIRS because decoding hypotheses
# reverse the usual direction: instead of "brain feature → clinical outcome",
# they go "stimulus ↔ brain" or "brain → psychological-state label".
# Each entry is (source domain, target domain).
DECODING_DOMAIN_PAIRS: list[tuple[str, str]] = [
    # Encoding: stimulus drives brain response
    ("visual_stimulus", "neuroanatomy"),
    ("visual_stimulus", "imaging_feature"),
    ("visual_stimulus", "connectivity"),
    # Decoding: brain predicts stimulus identity
    ("neuroanatomy",    "visual_stimulus"),
    ("imaging_feature", "visual_stimulus"),
    # EEG → emotion (SEED/SEED-IV/SEED-V/SEED-VII/SEED-GER/SEED-FRA)
    ("imaging_feature", "emotion"),
    ("neuroanatomy",    "emotion"),
    # EEG → vigilance (SEED-VIG)
    ("imaging_feature", "vigilance"),
    ("neuroanatomy",    "vigilance"),
]

# AAL atlas regions used for imaging feature generation.
# Subset of neuroanatomy nodes from NN_AAL source.
# NOTE(review): some keywords are prefixes of others ("Frontal_Sup" /
# "Frontal_Sup_Med", "Temporal" / "Temporal_Pole"). How these are matched
# against node names is not visible here — confirm that substring matching,
# if used, does not double-count regions.
_AAL_REGION_KEYWORDS: list[str] = [
    "Precentral", "Frontal_Sup", "Frontal_Mid", "Frontal_Inf", "Rolandic_Oper",
    "Supp_Motor", "Olfactory", "Frontal_Sup_Med", "Frontal_Med_Orb",
    "Rectus", "Insula", "Cingulate", "Hippocampus", "Parahippocampal",
    "Amygdala", "Calcarine", "Cuneus", "Lingual", "Occipital",
    "Fusiform", "Postcentral", "Parietal", "SupraMarginal", "Angular",
    "Precuneus", "Paracentral", "Caudate", "Putamen", "Pallidum",
    "Thalamus", "Heschl", "Temporal", "Temporal_Pole",
]

# ── engine ─────────────────────────────────────────────────────────────

class HypothesisEngine:
    """Batch-generate, persist, and rank testable hypotheses from a knowledge graph."""

    def __init__(self, kg: KnowledgeGraph):
        self.kg = kg
        self.G = kg.G
        self._index = kg._index
        # Build claims index for frequency_boost: (subj, pred, obj) → [claim_meta, ...]
        self._claims_by_triple: dict[tuple[str, str, str], list[dict]] = {}
        for nid, node in self._index.items():
            if "claim" not in node.domain_tags:
                continue
            meta = node.metadata
            key = (meta.get("subject_id", ""), meta.get("predicate", ""), meta.get("object_id", ""))
            if key[0] and key[2]:
                self._claims_by_triple.setdefault(key, []).append(meta)

    # ── batch generation ───────────────────────────────────────────────

    def batch_generate(
        self,
        domain_pairs: Optional[list[tuple[str, str]]] = None,
        max_hops: int = 3,
        max_paths_per_pair: int = 5,
        max_seeds_per_domain: int = 50,
    ) -> list[Hypothesis]:
        """Generate "bridge" hypotheses for every requested domain pair.

        For each ``(dom_a, dom_b)`` pair, seed nodes are sampled from
        ``dom_a``; from each seed the shortest paths within ``max_hops`` are
        computed, and every reached ``dom_b`` node becomes a hypothesis
        candidate. The raw list is passed through ``post_process`` before it
        is returned.

        Args:
            domain_pairs: ``(source_domain, target_domain)`` tuples; falls
                back to ``DEFAULT_DOMAIN_PAIRS`` when ``None``.
            max_hops: Path-length cutoff for the shortest-path search.
            max_paths_per_pair: Maximum number of hypotheses accepted per
                seed node (the counter restarts for each seed).
            max_seeds_per_domain: Number of seeds requested from
                ``_sample_domain_nodes`` for each source domain.

        Returns:
            The hypotheses that survive ``post_process``.
        """
        if domain_pairs is None:
            domain_pairs = DEFAULT_DOMAIN_PAIRS

        all_hypotheses: list[Hypothesis] = []
        # Unordered (seed, target) pairs already attempted. Shared across ALL
        # domain pairs, so A–B is tried at most once regardless of direction.
        visited_pairs: set[tuple[str, str]] = set()
        next_serial = 0

        for dom_a, dom_b in domain_pairs:
            logger.info(f"generating hypotheses: {dom_a} -> {dom_b}")

            seeds_a = self._sample_domain_nodes(dom_a, max_seeds_per_domain)

            # Eligible targets: tagged dom_b, not a claim node, not an ignored hub.
            targets_b = set()
            for node_id, attrs in self.G.nodes(data=True):
                tags = attrs.get("domain_tags", [])
                if dom_b not in tags or "claim" in tags:
                    continue
                if node_id in PATH_IGNORE_NODE_IDS:
                    continue
                targets_b.add(node_id)

            for seed_id in seeds_a:
                if seed_id not in self.G:
                    continue

                # Shortest path from the seed to every node within max_hops.
                try:
                    reachable = nx.single_source_shortest_path(
                        self.G, seed_id, cutoff=max_hops
                    )
                except nx.NetworkXError:
                    continue

                accepted_for_seed = 0
                for target_id, node_path in reachable.items():
                    if target_id == seed_id or target_id not in targets_b:
                        continue

                    pair_key = tuple(sorted((seed_id, target_id)))
                    if pair_key in visited_pairs:
                        continue
                    # Marked as visited before enrichment, so a pair whose
                    # path cannot be enriched is not retried later.
                    visited_pairs.add(pair_key)

                    links = self._enrich_path(node_path)
                    if not links:
                        continue

                    confidence = self._compute_confidence_score(links)
                    novelty = self._compute_novelty_score(links)
                    evidence = self._compute_evidence_score(links)
                    testability, testability_why = self._compute_testability_score(links)
                    claim_ids = [link.claim_id for link in links if link.claim_id]

                    next_serial += 1
                    hyp = Hypothesis(
                        id=f"HYP:{next_serial:06d}",
                        hypothesis_type="bridge",
                        source_id=seed_id,
                        source_name=self._index[seed_id].preferred_name,
                        target_id=target_id,
                        target_name=self._index[target_id].preferred_name,
                        path=links,
                        confidence_score=confidence,
                        novelty_score=novelty,
                        evidence_score=evidence,
                        testability_score=testability,
                        composite_score=0.0,  # filled in right after construction
                        supporting_claims=claim_ids,
                        testability_reason=testability_why,
                        metadata={"domain_a": dom_a, "domain_b": dom_b},
                    )
                    # Both derived fields need the fully populated hypothesis.
                    hyp.explanation = self._generate_explanation(hyp)
                    hyp.composite_score = self._composite_score(hyp)
                    all_hypotheses.append(hyp)

                    accepted_for_seed += 1
                    if accepted_for_seed >= max_paths_per_pair:
                        break

        logger.info(f"batch generation complete: {len(all_hypotheses)} hypotheses from {len(domain_pairs)} domain pairs")

        return self.post_process(all_hypotheses)

    def post_process(
        self,
        hypotheses: list[Hypothesis],
        min_hops: int = 2,
        filter_vague_relations: bool = True,
        filter_non_measurable: bool = True,
        max_hops_filter: int = 5,
    ) -> list[Hypothesis]:
        """Filter low-quality hypotheses after generation, then deduplicate.

        A hypothesis is dropped by the first filter that rejects it. Filters
        run in this order:
          1. Noisy entities — any node name on the path is noisy
          2. Too short — fewer than ``min_hops`` links (1-hop restates an edge)
          3. Vague relations — every link's relation is in VAGUE_RELATIONS
          4. Single-paper bridges — every hop cites a PMID and it is the same one
          5. Non-measurable biomarkers (optional, ``filter_non_measurable``)
          6. Biologically implausible paths
          7. Weak evidence — target region never mentioned in the evidence text
          8. Hub-to-hub edges
          9. Paths touching a PATH_IGNORE node
         10. Paths transiting an intermediate-only hub
         11. (C-1) Generic-phrase intermediate nodes
         12. (C-2) Thin directional density
         13. Target is not a dataset outcome
         14. (C-3) Target is an umbrella concept
         15. Pure association chains — no relation in DIRECTIONAL_RELATIONS
         16. Overly long paths — more than ``max_hops_filter`` links

        Survivors are grouped by ``(source_id, target_id)`` and the two with
        the highest ``composite_score`` in each group are kept.

        Args:
            hypotheses: Candidate hypotheses to filter.
            min_hops: Minimum number of links a path must have.
            filter_vague_relations: Enable filter 3.
            filter_non_measurable: Enable filter 5.
            max_hops_filter: Maximum number of links a path may have.

        Returns:
            The filtered, deduplicated hypotheses.
        """
        from collections import defaultdict

        before = len(hypotheses)
        filtered: list[Hypothesis] = []

        for h in hypotheses:
            hops = h.path
            n_hops = len(hops)

            # filter noisy entities (source, target, and all intermediate nodes)
            all_names = {h.source_name, h.target_name}
            for link in hops:
                all_names.add(link.from_name)
                all_names.add(link.to_name)
            if any(self._is_noisy_entity(name) for name in all_names):
                continue

            # filter 1-hop (single direct edge = no discovery value)
            if n_hops < min_hops:
                continue

            # Shared by the vague-relation and directional-predicate filters.
            relation_types = {link.relation_type for link in hops}

            # filter all-vague-relations
            if filter_vague_relations and relation_types and relation_types <= VAGUE_RELATIONS:
                continue

            # filter single-PMID bridges: ALL hops cite the same paper, so the
            # "bridge" is just one paper restated. A hop without a PMID
            # (e.g. a curated/ontology edge) means the path is NOT backed by
            # a single paper alone, so such paths are kept.
            if n_hops >= 2:
                hop_pmids = [
                    link.source_paper.get("pmid", "") if isinstance(link.source_paper, dict) else ""
                    for link in hops
                ]
                if all(hop_pmids) and len(set(hop_pmids)) == 1:
                    continue

            # filter non-measurable biomarkers (not testable from imaging)
            if filter_non_measurable and self._has_non_measurable_entity(h):
                continue

            # filter biologically implausible paths (brain region → non-neurological target)
            if self._has_implausible_path(h):
                continue

            # filter paths with weak evidence (target not mentioned in raw_text)
            if self._has_weak_evidence(h):
                continue

            # filter paths where both ends of any edge are broad hubs
            # ("Brain Diseases --causes--> Cognitive Dysfunction" is uninformative)
            if self._has_hub_to_hub_edge(h):
                continue

            # filter paths touching any vague COGAT/MeSH umbrella hub
            # (memory/logic/loss/activation/risk/stress/Brain/Neurons).
            # These nodes are too abstract to drive a DL experiment whether
            # they appear as source, target, or intermediate.
            if self._touches_path_ignore_node(h):
                continue

            # filter paths that transit through disease mega-hubs as
            # intermediate nodes (A → Disease → B is uninformative).
            # These nodes are still valid as source/target endpoints.
            if self._transits_intermediate_only_hub(h):
                continue

            # (C-1) filter paths whose INTERMEDIATE node is a generic
            # phrase ("neural activity", "disease progression", "grey
            # matter", ...). Endpoints are not checked here.
            if self._has_intermediate_generic_phrase(h):
                continue

            # (C-2) filter paths whose directional density is too thin
            # (3+ hops with < 50% directional relations = too vague to
            # be a mechanism hypothesis).
            if self._has_thin_directional_density(h):
                continue

            # filter: target must be a dataset outcome (diagnosis/cognition/behavior/
            # personality/motor). Predicting "White Matter" or "Neurons" is not a
            # hypothesis UKB/ADNI/HCP can directly test — those are imaging features
            # used as INPUTS, not outcomes.
            if not self._is_dataset_outcome(h):
                continue

            # (C-3) filter: target name is an umbrella concept ("skill",
            # "disease", "neurological disorder", "clinical features")
            # even though it passes the outcome keyword check. These
            # can't anchor a concrete DL label.
            if self._is_too_broad_target(h.target_name):
                continue

            # filter paths with no directional predicates (pure association chains)
            if n_hops >= 2 and not (relation_types & DIRECTIONAL_RELATIONS):
                continue

            # filter paths that exceed max hop length (noise accumulation)
            if n_hops > max_hops_filter:
                continue

            filtered.append(h)

        # Deduplicate: for each (source, target) pair, keep top 2 by composite score
        pair_groups: dict[tuple[str, str], list[Hypothesis]] = defaultdict(list)
        for h in filtered:
            pair_groups[(h.source_id, h.target_id)].append(h)

        deduplicated: list[Hypothesis] = []
        for group in pair_groups.values():
            # Stable sort: ties keep their generation order.
            group.sort(key=lambda hyp: hyp.composite_score, reverse=True)
            deduplicated.extend(group[:2])

        logger.info(f"post_process: {before} -> {len(filtered)} filtered -> {len(deduplicated)} deduplicated "
                     f"(removed {before - len(deduplicated)} total)")
        return deduplicated

    def _has_non_measurable_entity(self, h: Hypothesis) -> bool:
        """Check if hypothesis involves entities not measurable from brain imaging.

        An endpoint (source or target) is rejected when either:
        - its domain tags (ignoring "claim") intersect
          NON_MEASURABLE_BIOMARKER_TYPES AND the opposite endpoint is a known
          node that is not tagged "neuroanatomy" (a non-measurable entity
          paired with a brain region is still a valid "X affects brain"
          hypothesis), or
        - its name matches any pattern in _NON_MEASURABLE_PATTERNS.

        Args:
            h: Hypothesis whose source and target endpoints are inspected.

        Returns:
            True if the hypothesis should be filtered out.
        """
        # Pair each endpoint with the id of the OPPOSITE endpoint by position.
        # Identifying "the other end" by comparing names breaks when source
        # and target share a preferred name (the target would be compared
        # against itself).
        endpoints = (
            (h.source_name, h.source_id, h.target_id),
            (h.target_name, h.target_id, h.source_id),
        )
        for node_name, node_id, other_id in endpoints:
            # check domain tags
            node = self._index.get(node_id)
            if node:
                domains = set(node.domain_tags) - {"claim"}
                if domains & NON_MEASURABLE_BIOMARKER_TYPES:
                    other_node = self._index.get(other_id)
                    if other_node and "neuroanatomy" not in other_node.domain_tags:
                        return True

            # check name patterns
            if any(pattern.search(node_name) for pattern in _NON_MEASURABLE_PATTERNS):
                return True

        return False

    @staticmethod
    def _is_noisy_entity(name: str) -> bool:
        """Check if an entity name matches known noise patterns."""
        if not name or len(name.strip()) == 0:
            return True
        name_clean = name.strip()
        for pattern in NOISE_PATTERNS:
            if pattern.match(name_clean):
                return True
        # check if name contains any noise word
        words = set(re.split(r"[\s\-_,/]+", name_clean.lower()))
        if words & _NOISE_WORDS:
            return True
        return False

    @staticmethod
    def _is_generic_intermediate(name: str) -> bool:
        """(C-1) Phrase-level filter for intermediate node names that pass
        token-level `_NOISE_WORDS` but are still too vague.

        Examples that get blocked:
          - "neural activity"  (no individual noise token)
          - "functional connectivity" (legit metric but not a mechanism)
          - "disease progression"
          - "grey matter"  (umbrella)
          - "cognitive deficit"

        Only call on intermediate nodes — these phrases can be valid as
        endpoints (e.g. "functional connectivity" as a target metric).
        """
        if not name:
            return True
        s = name.strip()
        for pattern in _GENERIC_INTERMEDIATE_PATTERNS:
            if pattern.match(s):
                return True
        return False

    @staticmethod
    def _is_too_broad_target(name: str) -> bool:
        """(C-3) Block target names that pass the outcome keyword regex but
        are umbrella concepts ("disease", "skill", "neurological disorder",
        "clinical features"). A DL experiment can't be designed against
        these — you don't know which subtype to label.
        """
        if not name:
            return True
        s = name.strip()
        for pattern in _TARGET_TOO_BROAD_PATTERNS:
            if pattern.match(s):
                return True
        return False

    def _has_intermediate_generic_phrase(self, h: Hypothesis) -> bool:
        """(C-1) Reject paths whose intermediate node is a generic phrase
        like "neural activity" or "disease progression". Endpoints are
        excluded from this check because some metrics (e.g. "functional
        connectivity") legitimately appear as outcomes.
        """
        if len(h.path) < 2:
            return False
        intermediate_names: list[str] = []
        for i, link in enumerate(h.path):
            # link.from_name is intermediate when i >= 1
            # link.to_name   is intermediate when i <  len(path) - 1
            if i >= 1:
                intermediate_names.append(link.from_name or "")
            if i < len(h.path) - 1:
                intermediate_names.append(link.to_name or "")
        for name in intermediate_names:
            if self._is_generic_intermediate(name):
                return True
        return False

    def _has_thin_directional_density(self, h: Hypothesis) -> bool:
        """(C-2) Reject paths where directional relations are too sparse.

        Current rule (older): >= 1 directional anywhere = pass.
        Problem: a 4-hop path with 1 directional + 3 vague edges still
        looks like a real chain to scoring but is essentially a vague
        association narrative.

        New rule:
          - 1-2 hop path: at least 1 directional (unchanged)
          - 3+ hop path: at least half of the edges must be directional
        """
        n = len(h.path)
        if n < 3:
            return False
        directional = sum(1 for l in h.path if l.relation_type in DIRECTIONAL_RELATIONS)
        return directional * 2 < n   # < 50% directional

    def _has_implausible_path(self, h: Hypothesis) -> bool:
        """Check if hypothesis path has biologically implausible connections.

        Filters paths where a brain region directly predicts a non-neurological
        condition (e.g., amygdala → urinary incontinence) without a plausible
        intermediate neurological mechanism.
        """
        # Check if source is a brain region and target is non-neurological
        source_node = self._index.get(h.source_id)
        target_node = self._index.get(h.target_id)

        if not source_node or not target_node:
            return False

        source_is_brain = "neuroanatomy" in source_node.domain_tags
        target_is_neuro = any(d in target_node.domain_tags for d in
                              ["neuroanatomy", "disease", "cognitive_function",
                               "biomarker", "gene", "drug", "neurotransmitter"])

        # If source is brain region and target is non-neurological, check target name
        if source_is_brain and not target_is_neuro:
            if _NON_NEUROLOGICAL_TARGETS.search(h.target_name):
                return True

        # Also check intermediate nodes in the path
        for link in h.path:
            if _NON_NEUROLOGICAL_TARGETS.search(link.to_name):
                # Check if the previous node is a brain region
                prev_node = self._index.get(link.from_id)
                if prev_node and "neuroanatomy" in prev_node.domain_tags:
                    # Only filter if there's no disease intermediate
                    has_disease_intermediate = any(
                        "disease" in self._index.get(l.from_id, ConceptNode(id="", preferred_name="")).domain_tags
                        for l in h.path[:h.path.index(link)]
                    )
                    if not has_disease_intermediate:
                        return True

        return False

    def _has_hub_to_hub_edge(self, h: Hypothesis) -> bool:
        """Reject paths containing any edge whose endpoints are both broad hubs.

        Example: "Brain Diseases --causes--> Cognitive Dysfunction" — both ends
        are top-level categories; the edge is too generic to be a mechanistic
        step in a hypothesis.

        Hub set is the top-N nodes by non-'about' degree, computed once and
        cached. Uses a low bar (N=50) because hubs are self-evidently generic.
        """
        if not hasattr(self, "_hub_id_set"):
            # Build once per engine instance
            from collections import Counter
            degree = Counter()
            for u, v, data in self.G.edges(data=True):
                if data.get("relation_type") != "about":
                    degree[u] += 1
                    degree[v] += 1
            top = degree.most_common(50)
            self._hub_id_set = {cid for cid, _ in top}

        for link in h.path:
            if link.from_id in self._hub_id_set and link.to_id in self._hub_id_set:
                return True
        return False

    def _touches_path_ignore_node(self, h: Hypothesis) -> bool:
        """Reject paths that touch a PATH_IGNORE_NODE_IDS node anywhere.

        The source, the target and both ends of every link are checked.
        This catches vague COGAT/MeSH umbrella hubs that the token-based
        `_is_noisy_entity` misses: names like "memory", "logic", "Brain" or
        "Neurons" are legitimate English words, but the KG concept id refers
        to an over-general umbrella that is not testable.

        Returns:
            True if any node id on the hypothesis is in PATH_IGNORE_NODE_IDS.
        """
        if h.source_id in PATH_IGNORE_NODE_IDS or h.target_id in PATH_IGNORE_NODE_IDS:
            return True
        return any(
            link.from_id in PATH_IGNORE_NODE_IDS or link.to_id in PATH_IGNORE_NODE_IDS
            for link in h.path
        )

    @staticmethod
    def _transits_intermediate_only_hub(h: Hypothesis) -> bool:
        """Reject paths that use disease mega-hubs as intermediate transit.

        INTERMEDIATE_ONLY_IGNORE_IDS nodes are valid as source/target
        (predicting Alzheimer is a real hypothesis) but not as middle
        hops (A → Alzheimer → B is just "both relate to AD").
        """
        if len(h.path) < 2:
            return False
        for i, link in enumerate(h.path):
            if i >= 1 and link.from_id in INTERMEDIATE_ONLY_IGNORE_IDS:
                return True
            if i < len(h.path) - 1 and link.to_id in INTERMEDIATE_ONLY_IGNORE_IDS:
                return True
        return False

    def _is_dataset_outcome(self, h: Hypothesis) -> bool:
        """Check if target is a UKB/ADNI/HCP-testable outcome.

        The goal of our hypotheses is to predict SOMETHING from brain imaging.
        Valid targets:
        - Clinical diagnoses (disease domain) — Alzheimer, MCI, schizophrenia, etc.
        - Cognitive/behavioral/personality measures (cognitive_function domain)
        - Brain decoding targets:
            * neuroanatomy (for encoding: stimulus → brain activation)
            * visual_stimulus (for decoding: brain → stimulus category)
            * emotion (SEED family: EEG → affect label)
            * vigilance (SEED-VIG: EEG → alertness)

        Invalid targets:
        - Molecular entities (gene, biomarker, drug, neurotransmitter) — these
          may be predictors, not predicted quantities
        - Overly generic disease categories (Brain Diseases, Mental Disorders) —
          already filtered by hub-to-hub, but double-check by keyword.

        Accepts target if EITHER:
          a) target's domain is in _OUTCOME_DOMAINS ∪ decoding domains, OR
          b) target name matches _OUTCOME_KEYWORDS regex (as fallback for
             claim_extraction concepts whose domain may be uncertain)
        """
        target = self._index.get(h.target_id)
        if target is None:
            return False

        domains = set(target.domain_tags)
        # Accept: disease, cognitive_function, or decoding-target domains
        outcome_domains = _OUTCOME_DOMAINS | {"visual_stimulus", "emotion", "vigilance"}
        if domains & outcome_domains:
            return True

        # Accept: neuroanatomy targets when the hypothesis is a brain-decoding
        # encoding path (stimulus → brain region). Excludes the clinical-
        # prediction case where a target of 'White Matter' would be an input.
        if "neuroanatomy" in domains:
            source = self._index.get(h.source_id)
            if source:
                source_domains = set(source.domain_tags)
                if source_domains & {"visual_stimulus", "emotion", "vigilance"}:
                    return True

        # Fallback: outcome keyword match (catches claim_extraction concepts
        # that describe outcomes but have wrong domain tags)
        if _OUTCOME_KEYWORDS.search(h.target_name):
            return True

        return False

    def _has_weak_evidence(self, h: Hypothesis) -> bool:
        """Check if hypothesis path has weak evidence (target not mentioned in raw_text).

        For hypotheses where the target is a specific brain region, check if any hop's
        raw_text actually mentions that region. If not, the path is likely spurious
        (e.g., IL-1β → Internal Capsula where the evidence text talks about "grey matter"
        but never mentions internal capsule).

        Exception: paths anchored by curated functional facts (e.g. `evokes` from
        visual_stimulus to a functional ROI) carry programmatic confidence, not
        paper evidence — skip the raw_text requirement for them.
        """
        target_node = self._index.get(h.target_id)
        if not target_node or "neuroanatomy" not in target_node.domain_tags:
            return False

        # Skip paths whose source is a visual_stimulus / emotion / vigilance node, or
        # which contain at least one curated functional edge (evokes / decoded_from /
        # elicits). These are seeded from neuroscience textbooks, not paper claims.
        source_node = self._index.get(h.source_id)
        if source_node:
            decoding_domains = {"visual_stimulus", "emotion", "vigilance"}
            if any(t in decoding_domains for t in source_node.domain_tags):
                return False
        if any(l.relation_type in {"evokes", "decoded_from", "elicits"} for l in h.path):
            return False

        # Extract key terms from target name (e.g., "Internal Capsula" → ["internal", "capsula"])
        target_terms = set(re.findall(r'\b\w{4,}\b', h.target_name.lower()))
        if not target_terms:
            return False

        # Check if any hop mentions the target region
        for link in h.path:
            raw = link.raw_text or link.evidence.get("raw_text", "") if isinstance(link.evidence, dict) else ""
            if raw:
                raw_lower = raw.lower()
                # If any target term appears in raw_text, evidence is OK
                if any(term in raw_lower for term in target_terms):
                    return False

        # No hop mentions the target region → weak evidence
        logger.debug(f"weak evidence: {h.id} target '{h.target_name}' not mentioned in any raw_text")
        return True

    # ── imaging-driven batch generation ──────────────────────────────

    def batch_generate_imaging(
        self,
        dataset: str = "UKB",
        max_paths_per_pair: int = 5,
        max_seeds: int = 50,
        max_hops: int = 3,
        include_connectivity: bool = True,
    ) -> list[Hypothesis]:
        """Generate hypotheses driven by imaging features available in a dataset.

        Strategy:
        1. Find AAL atlas neuroanatomy nodes in the graph as ROI seeds
        2. For each ROI × imaging feature template, construct a feature name
           by substituting the region name for ``{region}`` in the template
        3. Find graph paths from each ROI to disease/cognitive_function nodes
        4. Drop paths that touch non-measurable entities
        5. Annotate each hypothesis with dataset metadata
        6. Optionally add connectivity-level hypotheses, then post-process

        Args:
            dataset: dataset name; upper-cased with "-" replaced by "_" before
                being looked up in DATASET_FEATURES.
            max_paths_per_pair: cap on hypotheses emitted per (ROI, template).
            max_seeds: maximum number of AAL regions used as seeds.
            max_hops: shortest-path search cutoff from each ROI.
            include_connectivity: also generate region-pair connectivity
                hypotheses for templates that apply to the dataset.

        Returns:
            The post-processed list of hypotheses; empty when no AAL regions
            or no outcome nodes exist in the graph.

        Raises:
            ValueError: if the normalized dataset key is not in DATASET_FEATURES.
        """
        dataset_key = dataset.upper().replace("-", "_")
        if dataset_key not in DATASET_FEATURES:
            raise ValueError(f"Unknown dataset: {dataset}. Available: {list(DATASET_FEATURES.keys())}")

        # 1. Find AAL atlas ROI nodes
        aal_nodes = self._find_aal_regions(max_seeds)
        if not aal_nodes:
            logger.warning("No AAL atlas regions found in graph")
            return []

        logger.info(f"Found {len(aal_nodes)} AAL regions for imaging hypothesis generation")

        # 2. Collect outcome nodes (disease, cognitive_function)
        outcome_nodes = self._collect_outcome_nodes()
        if not outcome_nodes:
            logger.warning("No outcome nodes (disease/cognitive_function) found")
            return []

        # 3. Determine which imaging templates apply to this dataset
        applicable_templates = {
            name: meta for name, meta in IMAGING_FEATURE_TEMPLATES.items()
            if dataset_key in meta["datasets"]
        }

        all_hypotheses: list[Hypothesis] = []
        _hyp_counter = 0
        # Keys are (region_id, target_id, feature_template); the set is shared
        # with the connectivity pass below.
        seen_pairs: set[tuple[str, str, str]] = set()

        # 4. Generate ROI-level imaging hypotheses
        for region_id, region_name in aal_nodes.items():
            if not applicable_templates:
                # Nothing to emit at ROI level; skip the graph searches.
                break

            # The reachable set depends only on the region, not on the feature
            # template, so it is computed once per region.
            try:
                reachable = nx.single_source_shortest_path(
                    self.G, region_id, cutoff=max_hops
                )
            except nx.NetworkXError:
                continue

            candidates = [
                nid for nid in reachable
                if nid in outcome_nodes and nid != region_id
            ]

            for feat_template, feat_meta in applicable_templates.items():
                feature_name = feat_template.replace("{region}", region_name)

                pair_count = 0
                for target_id in candidates:
                    pair_key = (region_id, target_id, feat_template)
                    if pair_key in seen_pairs:
                        continue
                    seen_pairs.add(pair_key)

                    raw_path = reachable[target_id]
                    links = self._enrich_path(raw_path)
                    if not links:
                        continue

                    # Skip if path contains non-measurable entities
                    if self._path_has_non_measurable(links):
                        continue

                    conf = self._compute_confidence_score(links)
                    nov = self._compute_novelty_score(links)
                    evi = self._compute_evidence_score(links)
                    test, test_reason = self._compute_testability_score(links)
                    # Boost testability for imaging-driven hypotheses
                    test = min(test + 0.15, 1.0)
                    claim_ids = [l.claim_id for l in links if l.claim_id]

                    _hyp_counter += 1
                    target_node = self._index.get(target_id)
                    h = Hypothesis(
                        id=f"HYP:IMG:{_hyp_counter:06d}",
                        hypothesis_type="imaging",
                        source_id=region_id,
                        source_name=feature_name,
                        target_id=target_id,
                        target_name=target_node.preferred_name if target_node else target_id,
                        path=links,
                        confidence_score=conf,
                        novelty_score=nov,
                        evidence_score=evi,
                        testability_score=test,
                        composite_score=0.0,
                        supporting_claims=claim_ids,
                        testability_reason=test_reason,
                        metadata={
                            "dataset": dataset_key,
                            "input_modality": feat_meta["modality"],
                            "input_feature": feature_name,
                            "input_level": feat_meta["level"],
                            "input_tool": feat_meta["tool"],
                            "input_region": region_name,
                            "outcome_type": self._classify_outcome(target_node),
                        },
                    )
                    h.explanation = self._generate_explanation(h)
                    h.composite_score = self._composite_score(h)
                    all_hypotheses.append(h)

                    pair_count += 1
                    if pair_count >= max_paths_per_pair:
                        break

        # 5. Generate connectivity-level hypotheses
        if include_connectivity:
            conn_templates = {
                name: meta for name, meta in CONNECTIVITY_FEATURE_TEMPLATES.items()
                if dataset_key in meta["datasets"]
            }
            if conn_templates:
                # The running counter is passed on so connectivity hypothesis
                # IDs continue after the ROI-level ones.
                hyps = self._generate_connectivity_hypotheses(
                    aal_nodes, outcome_nodes, conn_templates,
                    dataset_key, max_paths_per_pair, max_hops, _hyp_counter, seen_pairs,
                )
                _hyp_counter += len(hyps)
                all_hypotheses.extend(hyps)

        logger.info(
            f"imaging batch generation ({dataset_key}): "
            f"{len(all_hypotheses)} hypotheses from {len(aal_nodes)} regions"
        )

        all_hypotheses = self.post_process(all_hypotheses)
        return all_hypotheses

    def _find_aal_regions(self, max_n: int) -> dict[str, str]:
        """Return up to *max_n* AAL-like neuroanatomy nodes as {node_id: name}.

        A node qualifies when it is tagged ``neuroanatomy`` and its preferred
        name contains (case-insensitively) any entry of _AAL_REGION_KEYWORDS.
        The result is ordered by graph degree, highest first.
        """
        matched: dict[str, str] = {}
        for node_id, attrs in self.G.nodes(data=True):
            if "neuroanatomy" not in attrs.get("domain_tags", []):
                continue
            region_name = attrs.get("preferred_name", "")
            lowered = region_name.lower()
            # One keyword hit is enough to accept the node.
            if any(keyword.lower() in lowered for keyword in _AAL_REGION_KEYWORDS):
                matched[node_id] = region_name

        # Better-connected regions come first (more connected = richer paths).
        by_degree = sorted(
            matched,
            key=lambda node_id: self.G.degree(node_id),
            reverse=True,
        )
        return {node_id: matched[node_id] for node_id in by_degree[:max_n]}

    def _collect_outcome_nodes(self) -> set[str]:
        """Return IDs of nodes usable as outcomes (disease or cognitive_function).

        Claim nodes and nodes listed in PATH_IGNORE_NODE_IDS are never included.
        """
        wanted = {"disease", "cognitive_function"}
        collected = set()
        for node_id, attrs in self.G.nodes(data=True):
            tags = set(attrs.get("domain_tags", []))
            if "claim" in tags or node_id in PATH_IGNORE_NODE_IDS:
                continue
            if not tags.isdisjoint(wanted):
                collected.add(node_id)
        return collected

    def _classify_outcome(self, node: Optional[ConceptNode]) -> str:
        """Classify outcome node type for metadata."""
        if not node:
            return "unknown"
        domains = set(node.domain_tags)
        if "disease" in domains:
            return "disease"
        if "cognitive_function" in domains:
            return "cognitive_function"
        if "biomarker" in domains:
            return "biomarker"
        return "other"

    def _path_has_non_measurable(self, links: list[HypothesisLink]) -> bool:
        """Report whether any endpoint of any link is a non-measurable entity.

        Both ends of every link are inspected. An endpoint is non-measurable
        when its indexed node carries a domain tag (other than "claim") found
        in NON_MEASURABLE_BIOMARKER_TYPES, or when its name matches one of the
        _NON_MEASURABLE_PATTERNS regexes. The name check runs even for
        endpoints that are absent from the index.
        """
        for link in links:
            endpoints = (
                (link.from_name, link.from_id),
                (link.to_name, link.to_id),
            )
            for label, node_id in endpoints:
                concept = self._index.get(node_id)
                if concept:
                    tags = set(concept.domain_tags) - {"claim"}
                    if tags & NON_MEASURABLE_BIOMARKER_TYPES:
                        return True
                if any(rx.search(label) for rx in _NON_MEASURABLE_PATTERNS):
                    return True
        return False

    def _generate_connectivity_hypotheses(
        self,
        aal_nodes: dict[str, str],
        outcome_nodes: set[str],
        conn_templates: dict,
        dataset_key: str,
        max_paths_per_pair: int,
        max_hops: int,
        hyp_counter_start: int,
        seen_pairs: set,
    ) -> list[Hypothesis]:
        """Generate hypotheses for connectivity features (FC/EC/SC between region pairs).

        Region pairs are enumerated exhaustively for 20 or fewer regions and
        drawn at random otherwise (unseeded, so runs are not reproducible and
        a pair may be drawn more than once). Graph paths are searched from
        region A only; region B contributes to the feature name and metadata.

        Args:
            aal_nodes: {node_id: region_name} seed regions.
            outcome_nodes: node IDs accepted as path targets.
            conn_templates: {template: meta}; templates contain ``{a}`` and
                ``{b}`` placeholders, meta supplies "modality", "level", "tool".
            dataset_key: normalized dataset name stored in metadata.
            max_paths_per_pair: cap on hypotheses per (region pair, template).
            max_hops: shortest-path search cutoff.
            hyp_counter_start: last hypothesis number already used; IDs
                continue from the next value.
            seen_pairs: shared set of (region_a_id, target_id, template) keys;
                it is mutated. A later pair with the same region A and
                template skips targets that were already recorded.

        Returns:
            List of "imaging_connectivity" hypotheses (not post-processed).
        """
        import random

        hypotheses = []
        counter = hyp_counter_start
        region_ids = list(aal_nodes.keys())

        # Sample region pairs (limit to avoid O(n^2) explosion)
        max_pairs = min(len(region_ids) * 3, 200)
        if len(region_ids) > 20:
            sampled_pairs = []
            for _ in range(max_pairs):
                a, b = random.sample(region_ids, 2)
                sampled_pairs.append((a, b))
        else:
            sampled_pairs = [(a, b) for i, a in enumerate(region_ids) for b in region_ids[i+1:]]
            sampled_pairs = sampled_pairs[:max_pairs]

        # Shortest-path results depend only on region A, which recurs across
        # pairs and templates; cache them. None marks a failed search.
        reach_cache: dict = {}

        for region_a_id, region_b_id in sampled_pairs:
            name_a = aal_nodes[region_a_id]
            name_b = aal_nodes[region_b_id]

            if region_a_id not in reach_cache:
                try:
                    found = nx.single_source_shortest_path(
                        self.G, region_a_id, cutoff=max_hops
                    )
                except nx.NetworkXError:
                    reach_cache[region_a_id] = None
                else:
                    reach_cache[region_a_id] = (
                        found,
                        [
                            nid for nid in found
                            if nid in outcome_nodes and nid != region_a_id
                        ],
                    )

            cached = reach_cache[region_a_id]
            if cached is None:
                continue
            reachable, candidates = cached

            for feat_template, feat_meta in conn_templates.items():
                feature_name = feat_template.replace("{a}", name_a).replace("{b}", name_b)

                pair_count = 0
                for target_id in candidates:
                    pair_key = (region_a_id, target_id, feat_template)
                    if pair_key in seen_pairs:
                        continue
                    seen_pairs.add(pair_key)

                    raw_path = reachable[target_id]
                    links = self._enrich_path(raw_path)
                    if not links:
                        continue

                    if self._path_has_non_measurable(links):
                        continue

                    conf = self._compute_confidence_score(links)
                    nov = self._compute_novelty_score(links)
                    evi = self._compute_evidence_score(links)
                    test, test_reason = self._compute_testability_score(links)
                    # Same testability boost as ROI-level imaging hypotheses.
                    test = min(test + 0.15, 1.0)
                    claim_ids = [l.claim_id for l in links if l.claim_id]

                    counter += 1
                    target_node = self._index.get(target_id)
                    h = Hypothesis(
                        id=f"HYP:IMG:{counter:06d}",
                        hypothesis_type="imaging_connectivity",
                        source_id=region_a_id,
                        source_name=feature_name,
                        target_id=target_id,
                        target_name=target_node.preferred_name if target_node else target_id,
                        path=links,
                        confidence_score=conf,
                        novelty_score=nov,
                        evidence_score=evi,
                        testability_score=test,
                        composite_score=0.0,
                        supporting_claims=claim_ids,
                        testability_reason=test_reason,
                        metadata={
                            "dataset": dataset_key,
                            "input_modality": feat_meta["modality"],
                            "input_feature": feature_name,
                            "input_level": feat_meta["level"],
                            "input_tool": feat_meta["tool"],
                            "input_region_a": name_a,
                            "input_region_b": name_b,
                            "input_region": f"{name_a} - {name_b}",
                            "outcome_type": self._classify_outcome(target_node),
                        },
                    )
                    h.explanation = self._generate_explanation(h)
                    h.composite_score = self._composite_score(h)
                    hypotheses.append(h)

                    pair_count += 1
                    if pair_count >= max_paths_per_pair:
                        break

        return hypotheses

    # ── persistence ────────────────────────────────────────────────────

    def save_hypotheses(self, hypotheses: list[Hypothesis], path: str | Path) -> None:
        """Write *hypotheses* to *path* as indented UTF-8 JSON.

        Missing parent directories are created. The file holds an object with
        "n_hypotheses" (the count) and "hypotheses" (each item's ``to_dict()``).
        """
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        serialized = [hyp.to_dict() for hyp in hypotheses]
        payload = {"n_hypotheses": len(hypotheses), "hypotheses": serialized}
        # Serialize fully before opening the file so a failure cannot leave a
        # truncated document behind.
        text = json.dumps(payload, indent=2, ensure_ascii=False)
        with path.open("w", encoding="utf-8") as handle:
            handle.write(text)

        logger.info(f"saved {len(hypotheses)} hypotheses to {path}")

    def load_hypotheses(self, path: str | Path) -> list[Hypothesis]:
        """Read a UTF-8 JSON file and rebuild its "hypotheses" entries.

        Each entry is passed through ``Hypothesis.from_dict``.
        """
        path = Path(path)
        with path.open(encoding="utf-8") as handle:
            payload = json.load(handle)

        hypotheses = []
        for record in payload["hypotheses"]:
            hypotheses.append(Hypothesis.from_dict(record))

        logger.info(f"loaded {len(hypotheses)} hypotheses from {path}")
        return hypotheses

    # ── ranking ────────────────────────────────────────────────────────

    def rank_hypotheses(
        self,
        hypotheses: list[Hypothesis],
        weights: Optional[dict[str, float]] = None,
        top_n: int = 100,
        skip_post_process: bool = False,
    ) -> list[Hypothesis]:
        """Rank hypotheses by composite score (novelty, evidence, testability, confidence).

        Args:
            hypotheses: list of hypotheses to rank
            weights: custom weights dict, keys: confidence, evidence, novelty, testability
            top_n: return top N results
            skip_post_process: if True, skip the post-processing filter
        """
        if not skip_post_process:
            hypotheses = self.post_process(hypotheses)

        if weights is None:
            # testability weighted highest — must be verifiable with imaging experiments
            weights = {
                "confidence": 0.20,
                "evidence": 0.20,
                "novelty": 0.25,
                "testability": 0.35,
            }

        for h in hypotheses:
            h.composite_score = (
                (h.confidence_score ** weights["confidence"])
                * (h.evidence_score ** weights["evidence"])
                * (h.novelty_score ** weights["novelty"])
                * (max(h.testability_score, 0.01) ** weights["testability"])
            )

        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
        return hypotheses[:top_n]

    # ── query-based (kept for interactive use) ─────────────────────────

    def find_paths(
        self,
        source_id: str,
        target_id: str,
        max_hops: int = 3,
        max_paths: int = 20,
    ) -> list[Hypothesis]:
        """Find hypothesis paths between two concepts with evidence enrichment.

        Simple paths of at most ``max_hops`` edges are searched in a copy of
        the graph from which claim nodes and PATH_IGNORE_NODE_IDS have been
        removed (the two endpoints are always kept).

        Args:
            source_id: start node ID.
            target_id: end node ID.
            max_hops: cutoff passed to the simple-path search.
            max_paths: number of raw paths to keep, in enumeration order.

        Returns:
            Hypotheses of type "path"; empty if either endpoint is missing
            from the graph or the search raises ``nx.NetworkXError``.
        """
        from itertools import islice

        if source_id not in self.G or target_id not in self.G:
            return []

        claim_nodes = {nid for nid, n in self._index.items() if "claim" in n.domain_tags}
        intermediate_exclude = claim_nodes - {source_id, target_id}
        # Also strip vague umbrella hubs from the search subgraph so paths
        # never include them as intermediates. Endpoints are excluded from
        # the strip so a caller can still query them directly.
        intermediate_exclude |= (PATH_IGNORE_NODE_IDS - {source_id, target_id})

        subgraph = self.G.copy()
        subgraph.remove_nodes_from(intermediate_exclude)

        if source_id not in subgraph or target_id not in subgraph:
            return []

        try:
            path_iter = nx.all_simple_paths(
                subgraph, source_id, target_id, cutoff=max_hops
            )
            if max_paths is None or max_paths >= 0:
                # Stop enumerating as soon as enough paths are collected; the
                # number of simple paths can grow exponentially.
                raw_paths = list(islice(path_iter, max_paths))
            else:
                # Negative limits keep their original list-slicing meaning.
                raw_paths = list(path_iter)[:max_paths]
        except nx.NetworkXError:
            return []

        return self._build_hypotheses_from_paths(raw_paths, "path")

    def bridge_discovery(
        self,
        concept_id: str,
        target_domain: str,
        max_hops: int = 3,
        max_results: int = 20,
    ) -> list[Hypothesis]:
        """Find cross-domain connections from one concept into a target domain.

        Args:
            concept_id: source node ID.
            target_domain: domain tag that target nodes must carry.
            max_hops: shortest-path search cutoff.
            max_results: maximum number of hypotheses returned.

        Returns:
            "bridge" hypotheses sorted by composite score, highest first;
            empty if the concept is not in the graph, no node carries the
            target domain, or the path search raises ``nx.NetworkXError``.
        """
        if concept_id not in self.G:
            return []

        target_nodes = {
            nid for nid, data in self.G.nodes(data=True)
            if target_domain in data.get("domain_tags", [])
        }
        if not target_nodes:
            return []

        try:
            reachable = nx.single_source_shortest_path(
                self.G, concept_id, cutoff=max_hops
            )
        except nx.NetworkXError:
            return []

        def _is_claim(nid: str) -> bool:
            # Nodes missing from the index are treated as non-claims.
            node = self._index.get(nid)
            return node is not None and "claim" in node.domain_tags

        # Keep the search's own ordering instead of an unordered set, so that
        # ties in composite score resolve the same way on every run.
        candidates = [
            nid for nid in reachable
            if nid in target_nodes and nid != concept_id and not _is_claim(nid)
        ]

        # Fall back to the raw ID when a graph node has no index entry, rather
        # than raising KeyError.
        source_node = self._index.get(concept_id)
        source_name = source_node.preferred_name if source_node else concept_id

        hypotheses = []
        for target_id in candidates:
            raw_path = reachable[target_id]
            links = self._enrich_path(raw_path)
            if not links:
                continue

            conf = self._compute_confidence_score(links)
            nov = self._compute_novelty_score(links)
            evi = self._compute_evidence_score(links)
            test, test_reason = self._compute_testability_score(links)
            claim_ids = [l.claim_id for l in links if l.claim_id]

            target_node = self._index.get(target_id)
            h = Hypothesis(
                hypothesis_type="bridge",
                source_id=concept_id,
                source_name=source_name,
                target_id=target_id,
                target_name=target_node.preferred_name if target_node else target_id,
                path=links,
                confidence_score=conf,
                novelty_score=nov,
                evidence_score=evi,
                testability_score=test,
                supporting_claims=claim_ids,
                testability_reason=test_reason,
            )
            h.explanation = self._generate_explanation(h)
            h.composite_score = self._composite_score(h)
            hypotheses.append(h)

        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
        return hypotheses[:max_results]

    def discover_hypotheses(
        self,
        concept_id: str,
        max_hops: int = 3,
        max_results: int = 30,
        exclude_domains: Optional[set[str]] = None,
    ) -> list[Hypothesis]:
        """Find hypotheses radiating from a single concept to all reachable domains.

        A reachable node becomes a target when, after removing the excluded
        domain tags, it still has at least one tag and its tags are not a
        subset of the source's tags.

        Args:
            concept_id: source node ID.
            max_hops: shortest-path search cutoff.
            max_results: maximum number of hypotheses returned.
            exclude_domains: domain tags to ignore; None or an empty set falls
                back to ``{"claim"}``.

        Returns:
            Post-processed "discover" hypotheses sorted by composite score,
            highest first; empty if the concept is not in the graph or the
            path search raises ``nx.NetworkXError``.
        """
        if concept_id not in self.G:
            return []

        exclude = exclude_domains or {"claim"}
        source_node = self._index.get(concept_id)
        source_domains = set(source_node.domain_tags) - exclude if source_node else set()
        # A graph node without an index entry is tolerated above, so it must
        # not raise KeyError when its display name is needed either.
        source_name = source_node.preferred_name if source_node else concept_id

        try:
            reachable = nx.single_source_shortest_path(self.G, concept_id, cutoff=max_hops)
        except nx.NetworkXError:
            return []

        candidates = []
        for target_id, raw_path in reachable.items():
            if target_id == concept_id:
                continue
            target_node = self._index.get(target_id)
            if not target_node:
                continue
            target_domains = set(target_node.domain_tags) - exclude
            if not target_domains or target_domains <= source_domains:
                continue
            candidates.append((target_id, target_node, raw_path))

        hypotheses = []
        for target_id, target_node, raw_path in candidates:
            links = self._enrich_path(raw_path)
            if not links:
                continue
            conf = self._compute_confidence_score(links)
            nov = self._compute_novelty_score(links)
            evi = self._compute_evidence_score(links)
            test, test_reason = self._compute_testability_score(links)
            claim_ids = [l.claim_id for l in links if l.claim_id]

            h = Hypothesis(
                hypothesis_type="discover",
                source_id=concept_id,
                source_name=source_name,
                target_id=target_id,
                target_name=target_node.preferred_name,
                path=links,
                confidence_score=conf,
                novelty_score=nov,
                evidence_score=evi,
                testability_score=test,
                supporting_claims=claim_ids,
                testability_reason=test_reason,
            )
            h.explanation = self._generate_explanation(h)
            h.composite_score = self._composite_score(h)
            hypotheses.append(h)

        hypotheses = self.post_process(hypotheses)
        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
        return hypotheses[:max_results]

    def find_trending(
        self,
        since_year: int = 2020,
        min_claims: int = 3,
        direction: str = "strengthening",
        max_results: int = 30,
    ) -> list[dict]:
        """Find concept pairs with strengthening/weakening evidence over time.

        Returns list of dicts with: concept_a, concept_b, years, slope, direction, claims.
        """
        from collections import Counter

        # Group claims by (subject, object)
        claim_groups: dict[tuple[str, str], list[dict]] = {}
        for nid, node in self._index.items():
            if "claim" not in node.domain_tags:
                continue
            meta = node.metadata
            sid = meta.get("subject_id", "")
            oid = meta.get("object_id", "")
            if not sid or not oid:
                continue
            key = (sid, oid)
            claim_groups.setdefault(key, []).append(meta)

        results = []
        for (sid, oid), claims in claim_groups.items():
            years = []
            for c in claims:
                sp = c.get("source_paper", {})
                y = sp.get("year")
                if y and y >= since_year:
                    years.append(y)

            if len(years) < min_claims:
                continue

            year_counts = Counter(years)
            ys = sorted(year_counts.keys())
            cs = [year_counts[y] for y in ys]
            slope = _simple_slope(ys, cs)

            if direction == "strengthening" and slope <= 0.3:
                continue
            if direction == "weakening" and slope >= -0.3:
                continue
            if direction == "emerging" and max(ys) < 2025:
                continue

            src_node = self._index.get(sid)
            tgt_node = self._index.get(oid)

            results.append({
                "concept_a": src_node.preferred_name if src_node else sid,
                "concept_b": tgt_node.preferred_name if tgt_node else oid,
                "concept_a_id": sid,
                "concept_b_id": oid,
                "year_counts": {str(y): year_counts[y] for y in ys},
                "slope": round(slope, 3),
                "direction": direction,
                "n_claims": len(claims),
            })

        results.sort(key=lambda r: abs(r["slope"]), reverse=True)
        return results[:max_results]

    def contradiction_detection(
        self,
        domain_filter: Optional[str] = None,
        max_results: int = 50,
    ) -> list[Contradiction]:
        """Find pairs of claims that assert opposite things about the same concept pair."""
        claim_lookup: dict[tuple[str, str], list[ConceptNode]] = {}
        for nid, node in self._index.items():
            if "claim" not in node.domain_tags:
                continue
            meta = node.metadata
            sid = meta.get("subject_id", "")
            oid = meta.get("object_id", "")
            if not sid or not oid:
                continue

            if domain_filter:
                src_node = self._index.get(sid)
                tgt_node = self._index.get(oid)
                domains = set()
                if src_node:
                    domains.update(src_node.domain_tags)
                if tgt_node:
                    domains.update(tgt_node.domain_tags)
                if domain_filter not in domains:
                    continue

            key = (sid, oid)
            claim_lookup.setdefault(key, []).append(node)

        contradictions = []
        for (sid, oid), claims in claim_lookup.items():
            if len(claims) < 2:
                continue
            for i in range(len(claims)):
                for j in range(i + 1, len(claims)):
                    c1, c2 = claims[i], claims[j]
                    m1, m2 = c1.metadata, c2.metadata
                    severity = self._check_contradiction(m1, m2)
                    if severity > 0:
                        contradictions.append(Contradiction(
                            concept_a_id=sid,
                            concept_a_name=m1.get("subject_name", sid),
                            concept_b_id=oid,
                            concept_b_name=m1.get("object_name", oid),
                            claim_for_id=c1.id,
                            claim_for_predicate=m1.get("predicate", ""),
                            claim_for_text=m1.get("raw_text", ""),
                            claim_against_id=c2.id,
                            claim_against_predicate=m2.get("predicate", ""),
                            claim_against_text=m2.get("raw_text", ""),
                            severity=severity,
                        ))

        contradictions.sort(key=lambda c: c.severity, reverse=True)
        return contradictions[:max_results]

    def gap_detection(
        self,
        domain_a: str,
        domain_b: Optional[str] = None,
        max_results: int = 50,
    ) -> list[Gap]:
        """Find concept pairs 2 hops apart with no direct edge."""
        if domain_b is None:
            domain_b = domain_a

        nodes_a = {
            nid for nid, data in self.G.nodes(data=True)
            if domain_a in data.get("domain_tags", [])
            and "claim" not in data.get("domain_tags", [])
        }
        nodes_b = {
            nid for nid, data in self.G.nodes(data=True)
            if domain_b in data.get("domain_tags", [])
            and "claim" not in data.get("domain_tags", [])
        }

        gaps = []
        seen = set()

        for a_id in nodes_a:
            if a_id not in self.G:
                continue
            hop1 = set(self.G.successors(a_id)) | set(self.G.predecessors(a_id))
            hop2 = set()
            for n1 in hop1:
                if "claim" in self._index.get(n1, ConceptNode(id="", preferred_name="")).domain_tags:
                    continue
                hop2.update(self.G.successors(n1))
                hop2.update(self.G.predecessors(n1))

            hop2 -= {a_id}
            hop2 -= hop1

            for b_id in hop2 & nodes_b:
                pair = tuple(sorted([a_id, b_id]))
                if pair in seen:
                    continue
                seen.add(pair)

                if self.G.has_edge(a_id, b_id) or self.G.has_edge(b_id, a_id):
                    continue

                try:
                    path = nx.shortest_path(self.G, a_id, b_id)
                except (nx.NetworkXNoPath, nx.NetworkXError):
                    continue

                if len(path) > 3:
                    continue

                connecting = [n for n in path[1:-1]
                              if "claim" not in self._index.get(n, ConceptNode(id="", preferred_name="")).domain_tags]

                a_node = self._index.get(a_id)
                b_node = self._index.get(b_id)

                gaps.append(Gap(
                    concept_a_id=a_id,
                    concept_a_name=a_node.preferred_name if a_node else a_id,
                    concept_b_id=b_id,
                    concept_b_name=b_node.preferred_name if b_node else b_id,
                    distance=len(path) - 1,
                    connecting_concepts=connecting,
                    domain_a=domain_a,
                    domain_b=domain_b,
                    potential_relation=self._infer_relation(path),
                ))

        gaps.sort(key=lambda g: (0 if g.domain_a != g.domain_b else 1, g.distance))
        return gaps[:max_results]

    # ── name resolution ────────────────────────────────────────────────

    def resolve_name(self, query: str) -> Optional[str]:
        """Resolve a free-text name to a concept ID.

        Matching is attempted in four tiers, each scanning the whole index
        before the next tier is tried:

        1. exact, case-sensitive match on ``preferred_name``
        2. case-insensitive match on ``preferred_name``
        3. case-insensitive match on any alias
        4. substring overlap (either direction) with the preferred name or
           an alias; among several candidates the one with the shortest
           preferred name wins, ties broken by index order

        Args:
            query: Name to look up.

        Returns:
            The matching node's ``id``, or None when ``query`` is empty or
            nothing matches.
        """
        if not query:
            return None

        nodes = list(self._index.values())

        for node in nodes:
            if node.preferred_name == query:
                return node.id

        query_lower = query.lower()
        for node in nodes:
            if node.preferred_name.lower() == query_lower:
                return node.id

        for node in nodes:
            if any(alias.lower() == query_lower for alias in node.aliases):
                return node.id

        def _overlaps(label: str) -> bool:
            # An empty label is a substring of every query; without this guard
            # a node with a blank name/alias would match everything and then
            # win the shortest-name tie-break below.
            label_lower = label.lower()
            if not label_lower:
                return False
            return query_lower in label_lower or label_lower in query_lower

        candidates = [
            node for node in nodes
            if _overlaps(node.preferred_name)
            or any(_overlaps(alias) for alias in node.aliases)
        ]
        if not candidates:
            return None

        # min() returns the first minimal item, matching a stable sort + [0].
        return min(candidates, key=lambda n: len(n.preferred_name)).id

    # ── internal helpers ───────────────────────────────────────────────

    def _sample_domain_nodes(self, domain: str, max_n: int) -> list[str]:
        """Sample up to max_n non-claim nodes from a domain, preferring nodes with edges."""
        nodes = [
            nid for nid, data in self.G.nodes(data=True)
            if domain in data.get("domain_tags", [])
            and "claim" not in data.get("domain_tags", [])
            and nid not in PATH_IGNORE_NODE_IDS
        ]
        # sort by degree (more connected = more useful as seed)
        nodes.sort(key=lambda n: self.G.degree(n), reverse=True)
        return nodes[:max_n]

    def _build_hypotheses_from_paths(
        self, raw_paths: list[list[str]], hyp_type: str
    ) -> list[Hypothesis]:
        """Build Hypothesis objects from raw node-ID paths."""
        hypotheses = []
        for raw_path in raw_paths:
            links = self._enrich_path(raw_path)
            if not links:
                continue

            conf = self._compute_confidence_score(links)
            nov = self._compute_novelty_score(links)
            evi = self._compute_evidence_score(links)
            test, test_reason = self._compute_testability_score(links)
            claim_ids = [l.claim_id for l in links if l.claim_id]

            h = Hypothesis(
                hypothesis_type=hyp_type,
                source_id=raw_path[0],
                source_name=self._index[raw_path[0]].preferred_name,
                target_id=raw_path[-1],
                target_name=self._index[raw_path[-1]].preferred_name,
                path=links,
                confidence_score=conf,
                novelty_score=nov,
                evidence_score=evi,
                testability_score=test,
                supporting_claims=claim_ids,
                testability_reason=test_reason,
            )
            h.explanation = self._generate_explanation(h)
            h.composite_score = self._composite_score(h)
            hypotheses.append(h)

        hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
        return hypotheses

    def _enrich_path(self, raw_path: list[str]) -> list[HypothesisLink]:
        """Convert a raw node-ID path into rich HypothesisLink objects."""
        links = []
        for i in range(len(raw_path) - 1):
            src_id, tgt_id = raw_path[i], raw_path[i + 1]
            if not self.G.has_edge(src_id, tgt_id):
                continue

            edge_data = self.G.edges[src_id, tgt_id]
            src_node = self._index.get(src_id)
            tgt_node = self._index.get(tgt_id)

            claim_id = edge_data.get("metadata", {}).get("claim_id", "")
            claim_node = self._index.get(claim_id) if claim_id else None

            evidence = {}
            paper = {}
            raw_text = ""

            if claim_node and claim_node.metadata:
                meta = claim_node.metadata
                evidence = meta.get("evidence", {})
                paper = meta.get("source_paper", {})
                raw_text = meta.get("raw_text", "")

            links.append(HypothesisLink(
                from_id=src_id,
                from_name=src_node.preferred_name if src_node else src_id,
                to_id=tgt_id,
                to_name=tgt_node.preferred_name if tgt_node else tgt_id,
                relation_type=edge_data.get("relation_type", "unknown"),
                confidence=edge_data.get("confidence", 0.5),
                claim_id=claim_id,
                raw_text=raw_text,
                evidence=evidence,
                source_paper=paper,
            ))

        return links

    # ── scoring ────────────────────────────────────────────────────────

    def compute_frequency_boost(self, claim_meta: dict) -> float:
        """Return a replication multiplier from the number of primary studies.

        The count comes from ``claim_meta["primary_supporting_papers"]`` when
        that value is a list. Otherwise it is rebuilt from
        ``self._claims_by_triple`` for the claim's (subject_id, predicate,
        object_id): distinct non-empty PMIDs of claims whose study type is
        not in ``_REVIEW_TYPES``.

        Returns:
            1.2 for three or more primary papers, 1.0 for one or two,
            0.5 for none.
        """
        merged = claim_meta.get("primary_supporting_papers")
        if isinstance(merged, list):
            n_primary = len(merged)
        else:
            triple = (
                claim_meta.get("subject_id", ""),
                claim_meta.get("predicate", ""),
                claim_meta.get("object_id", ""),
            )
            primary_claims = (
                claim for claim in self._claims_by_triple.get(triple, [])
                if claim.get("evidence", {}).get("study_type", "") not in _REVIEW_TYPES
            )
            pmid_iter = (
                claim.get("source_paper", {}).get("pmid", "")
                for claim in primary_claims
            )
            # Distinct, non-empty PMIDs only.
            n_primary = len({pmid for pmid in pmid_iter if pmid})

        if n_primary >= 3:
            return 1.2
        if n_primary >= 1:
            return 1.0
        return 0.5

    @staticmethod
    def compute_temporal_decay(claim_meta: dict, reference_year: int = 2026) -> float:
        """Temporal decay: newer primary studies get higher weight.

        Args:
            claim_meta: Claim metadata; reads ``evidence.study_type`` and
                ``source_paper.year``.
            reference_year: Year against which the publication age is
                measured.

        Returns:
            1.0 for study types in ``_REVIEW_TYPES`` (no time effect);
            0.85 when the year is missing, zero or not parseable as an int;
            otherwise ``1.0 - 0.03 * age`` with a floor of 0.7, where age is
            never negative so the multiplier cannot exceed 1.0.
        """
        study_type = claim_meta.get("evidence", {}).get("study_type", "")
        if study_type in _REVIEW_TYPES:
            return 1.0

        raw_year = claim_meta.get("source_paper", {}).get("year", 0)
        try:
            # Years loaded from JSON may arrive as strings ("2021").
            year = int(raw_year)
        except (TypeError, ValueError):
            year = 0
        if not year:
            return 0.85  # unknown year, neutral

        # A paper dated after reference_year must not turn the decay into a
        # boost above 1.0, so the age is clamped at zero.
        age = max(reference_year - year, 0)
        return max(0.7, 1.0 - 0.03 * age)

    def _compute_confidence_score(self, path: list[HypothesisLink]) -> float:
        """Confidence = geometric mean of per-link scores, with weak-link penalty.

        Per-link score = edge.confidence × freq_boost × temporal_decay
          (edge.confidence already includes study_type weighting from
          phase4_optimize.apply_evidence_weighting and the claim-level
          statistical quality signals from claim_extractor._estimate_confidence)

        Aggregate: geometric mean (one weak link crushes the path)
          + weakest-link penalty (×0.7 when min_edge < 0.1)

        Single source of truth for each multiplier:
        - study_type → phase4_optimize.WEIGHT_MAP (canonical, idempotent)
        - p_value/sample_size/replicability → claim_extractor._estimate_confidence
        - freq across primary PMIDs → compute_frequency_boost
        - publication recency → compute_temporal_decay
        """
        if not path:
            return 0.0

        import math

        scores = []
        min_conf = float("inf")
        for link in path:
            raw = max(link.confidence, 1e-3)  # tiny floor for log()
            min_conf = min(min_conf, raw)

            full_meta = {
                "evidence": link.evidence,
                "source_paper": link.source_paper,
                "subject_id": link.from_id,
                "predicate": link.relation_type,
                "object_id": link.to_id,
            }
            freq_boost = self.compute_frequency_boost(full_meta)
            temp_decay = self.compute_temporal_decay(full_meta)

            s = raw * freq_boost * temp_decay
            scores.append(min(s, 1.0))

        log_sum = sum(math.log(max(s, 1e-6)) for s in scores)
        gm = math.exp(log_sum / len(scores))

        if min_conf < 0.1:
            gm *= 0.7

        return max(min(gm, 1.0), 0.0)

    def _compute_novelty_score(self, path: list[HypothesisLink]) -> float:
        """Score how novel/surprising a hypothesis is.

        Lower = more expected (direct known relationship), Higher = more surprising.
        """
        score = 0.3  # base

        # hop bonus: longer paths = more novel connections
        score += 0.1 * min(len(path) - 1, 3)

        # cross-domain bonus: connecting different domains is more novel
        domains_seen = set()
        for link in path:
            src = self._index.get(link.from_id)
            tgt = self._index.get(link.to_id)
            if src:
                domains_seen.update(src.domain_tags)
            if tgt:
                domains_seen.update(tgt.domain_tags)
        domains_seen.discard("claim")
        n_domains = len(domains_seen)
        if n_domains >= 3:
            score += 0.15
        elif n_domains >= 2:
            score += 0.10

        # rare relation bonus: non-generic relations are more novel
        rare_count = sum(1 for l in path if l.relation_type not in COMMON_RELATIONS)
        score += 0.05 * min(rare_count, 3)

        # evidence diversity: more papers = better supported, less novel
        # fewer papers = more speculative, more novel
        pmids = {l.source_paper.get("pmid", "") for l in path if l.source_paper.get("pmid")}
        if len(pmids) == 0:
            score += 0.10  # no paper support = speculative but novel
        elif len(pmids) == 1:
            score += 0.05  # single source = weak replication

        return min(score, 1.0)

    def _compute_evidence_score(self, path: list[HypothesisLink]) -> float:
        """Score evidence quality: traceability and text availability.

        DOES NOT use p_value/sample_size/effect_size — those signals already
        flow into edge.confidence via claim_extractor._estimate_confidence
        and are aggregated by _compute_confidence_score. Counting them again
        here was double-dipping.

        This score asks a different question: "How well-anchored is the
        evidence in source documents?" — which complements confidence's
        "How statistically strong is the evidence?". Path-level: most
        well-extracted edges score 0.6-0.8; we reserve >0.9 for paths whose
        every step has rich provenance.
        """
        _REVIEW_TYPES = {"narrative_review", "review"}
        scores = []
        for link in path:
            study_type = (link.evidence.get("study_type") or "").lower()
            s = 0.2 if study_type in _REVIEW_TYPES else 0.3

            if link.raw_text and len(link.raw_text) > 20:
                s += 0.20
            if link.claim_id:
                s += 0.15
            if link.source_paper.get("pmid"):
                s += 0.15
            if link.evidence.get("study_type"):
                s += 0.10

            scores.append(min(s, 1.0))

        return self._geometric_mean(scores)

    def _compute_testability_score(self, path: list[HypothesisLink]) -> tuple[float, str]:
        """Score how testable a hypothesis is with NeuroClaw imaging experiments.

        All link text (raw text, endpoint names, relation types) is
        lower-cased and scanned for keywords by substring match. Without any
        modality keyword from TESTABLE_MODALITIES the result is
        ``(0.15, "no imaging modality detected")``. Otherwise the score
        starts at 0.25 and adds:

        - 0.10 per matched modality (at most 3 counted)
        - 0.15 if sMRI matched; 0.15 if dMRI or fMRI matched; 0.10 if PET matched
        - 0.10 if any brain-region keyword is present, +0.05 for two or more
        - 0.05 if a dataset disease keyword is present
        - 0.05 if a DL_MODEL_KEYWORDS entry is present

        Returns:
            ``(score, reason)`` with the score capped at 1.0 and a reason
            string listing matched modalities, up to 4 regions and up to 3
            diseases.
        """
        all_text = " ".join(
            " ".join((link.raw_text, link.from_name, link.to_name, link.relation_type))
            for link in path
        ).lower()

        # check which modalities are mentioned
        matched_modalities = [
            modality
            for modality, keywords in TESTABLE_MODALITIES.items()
            if any(kw.lower() in all_text for kw in keywords)
        ]

        if not matched_modalities:
            return 0.15, "no imaging modality detected"

        score = 0.25  # base for having a modality

        # modality bonus (more = more testable angles)
        score += 0.10 * min(len(matched_modalities), 3)

        # heavy bonus for sMRI features (volume/thickness)
        if "sMRI" in matched_modalities:
            score += 0.15

        # heavy bonus for connectivity features (input to GNN models)
        if "dMRI" in matched_modalities or "fMRI" in matched_modalities:
            score += 0.15

        # bonus for PET
        if "PET" in matched_modalities:
            score += 0.10

        # bonus for brain region specificity (testable with atlas parcellation)
        brain_region_keywords = ["cortex", "hippocampus", "amygdala", "thalamus",
                                 "cerebellum", "striatum", "insula", "gyrus",
                                 "caudate", "putamen", "pallidum", "accumbens",
                                 "precuneus", "cuneus", "lingual", "fusiform",
                                 "parahippocampal", "entorhinal", "parietal",
                                 "frontal", "temporal", "occipital"]
        # Match longest keywords first and mask each hit, so a keyword nested
        # inside a longer one ("cuneus" in "precuneus") is only counted when
        # it also occurs on its own. Otherwise one region would be reported
        # as two and wrongly earn the region-pair bonus.
        unmatched_text = all_text
        region_hits = set()
        for kw in sorted(brain_region_keywords, key=len, reverse=True):
            if kw in unmatched_text:
                region_hits.add(kw)
                unmatched_text = unmatched_text.replace(kw, " ")
        regions_found = [kw for kw in brain_region_keywords if kw in region_hits]
        if regions_found:
            score += 0.10  # atlas-based ROI analysis
            if len(regions_found) >= 2:
                score += 0.05  # pair of regions = connectivity hypothesis

        # bonus for diseases present in target datasets
        dataset_diseases = [
            "alzheimer", "parkinson", "depression", "schizophrenia", "adhd",
            "autism", "epilepsy", "multiple sclerosis", "anxiety", "bipolar",
            "dementia", "mci", "mild cognitive",
        ]
        matched_diseases = [d for d in dataset_diseases if d in all_text]
        if matched_diseases:
            score += 0.05

        # bonus for DL-model-compatible features
        if any(kw.lower() in all_text for kw in DL_MODEL_KEYWORDS):
            score += 0.05

        # build reason string
        reason = f"modalities: {', '.join(matched_modalities)}"
        if regions_found:
            reason += f" | brain regions: {', '.join(regions_found[:4])}"
        if matched_diseases:
            reason += f" | diseases: {', '.join(matched_diseases[:3])}"

        return min(score, 1.0), reason

    def _composite_score(self, h: Hypothesis) -> float:
        """Weighted geometric mean of the 4 score components.

        Geometric: a hypothesis is only as good as its weakest dimension.
        A path with great evidence but 0 testability is worthless to us.

        Matches the linear fitness in evolution_engine._score_fitness
        (same weights, different aggregation — fitness adds convergence /
        diversity / length modifiers not relevant here).
        """
        c = max(h.confidence_score, 0.01)
        e = max(h.evidence_score, 0.01)
        n = max(h.novelty_score, 0.01)
        t = max(h.testability_score, 0.01)
        score = (c ** 0.20) * (e ** 0.20) * (n ** 0.25) * (t ** 0.35)

        if self._has_only_review_evidence(h):
            score *= 0.7

        return score

    @staticmethod
    def _has_only_review_evidence(h: Hypothesis) -> bool:
        """True if every link in the path comes from a review/narrative_review."""
        _REVIEW_TYPES = {"narrative_review", "review"}
        if not h.path:
            return False
        for link in h.path:
            study_type = (link.evidence.get("study_type") or "").lower()
            if study_type and study_type not in _REVIEW_TYPES:
                return False
        return True

    def _check_contradiction(self, m1: dict, m2: dict) -> float:
        """Check if two claims contradict each other. Returns severity 0-1."""
        p1 = m1.get("predicate", "")
        p2 = m2.get("predicate", "")
        n1 = m1.get("negated", False)
        n2 = m2.get("negated", False)

        if p1 == p2 and n1 != n2:
            return 1.0

        if (p1, p2) in OPPOSING_PREDICATES:
            return 0.8

        if p1 == p2 and not n1 and not n2:
            d1 = m1.get("evidence", {}).get("direction", "")
            d2 = m2.get("evidence", {}).get("direction", "")
            if d1 and d2 and d1 != d2:
                return 0.6

        return 0.0

    def _infer_relation(self, path: list[str]) -> str:
        """Infer a potential relation from a path's edge types."""
        relations = []
        for i in range(len(path) - 1):
            if self.G.has_edge(path[i], path[i + 1]):
                rt = self.G.edges[path[i], path[i + 1]].get("relation_type", "")
                if rt and rt not in ("about", "is_a", "part_of"):
                    relations.append(rt)

        if relations:
            for r in relations:
                if r not in COMMON_RELATIONS:
                    return r
            return relations[0]
        return "associated_with"

    def _generate_explanation(self, h: Hypothesis) -> str:
        """Generate a human-readable explanation for a hypothesis."""
        path_str = " --> ".join(
            f"{l.from_name} --[{l.relation_type}]--> {l.to_name}" for l in h.path
        )
        if not path_str:
            return ""

        pmids = {l.source_paper.get("pmid", "") for l in h.path if l.source_paper.get("pmid")}
        key_finding = ""
        for l in h.path:
            if l.raw_text:
                key_finding = l.raw_text[:150]
                if len(l.raw_text) > 150:
                    key_finding += "..."
                break

        lines = [
            f"Hypothesis: {h.source_name} may relate to {h.target_name} via {len(h.path)}-hop path.",
            f"Path: {path_str}",
            f"Evidence: {len(h.supporting_claims)} claims from {len(pmids)} papers",
        ]
        if key_finding:
            lines.append(f"Key finding: '{key_finding}'")
        if h.testability_reason:
            lines.append(f"Testability: {h.testability_reason}")
        lines.append(
            f"Confidence: {h.confidence_score:.2f} | "
            f"Novelty: {h.novelty_score:.2f} | "
            f"Evidence: {h.evidence_score:.2f} | "
            f"Testability: {h.testability_score:.2f}"
        )
        return "\n".join(lines)

    @staticmethod
    def _geometric_mean(values: list[float]) -> float:
        if not values:
            return 0.0
        product = math.prod(values)
        return product ** (1.0 / len(values))


def _simple_slope(xs: list[int], ys: list[int]) -> float:
    """Simple linear regression slope without numpy."""
    n = len(xs)
    if n < 2:
        return 0.0
    mean_x = sum(xs) / n
    mean_y = sum(ys) / n
    num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
    den = sum((x - mean_x) ** 2 for x in xs)
    if den == 0:
        return 0.0
    return num / den