# Snapshot metadata carried over from the hosting page: revision 6e7a2fd, file size 117,866 bytes.
"""Hypothesis engine: batch-generate, persist, and rank testable hypotheses.
Phase 3 of the NeuroClaw discovery loop:
1. batch_generate() — traverse the graph to produce hypotheses at scale
2. save / load — persist hypotheses to JSON
3. rank_hypotheses() — sort by novelty, evidence, testability, confidence
4. (Phase 5-6) hypotheses become executable NeuroClaw analysis tasks
Usage:
from core.knowledge_graph import load_graph, HypothesisEngine
kg = load_graph()
engine = HypothesisEngine(kg)
# batch generate across all domain pairs
hypotheses = engine.batch_generate()
engine.save_hypotheses(hypotheses, "data/hypotheses.json")
# or load and re-rank
hypotheses = engine.load_hypotheses("data/hypotheses.json")
ranked = engine.rank_hypotheses(hypotheses)
"""
from __future__ import annotations
import json
import logging
import math
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Optional
import networkx as nx
from .graph_manager import KnowledgeGraph
from .schema import ConceptNode
logger = logging.getLogger(__name__)
# ── data structures ────────────────────────────────────────────────────
@dataclass
class HypothesisLink:
    """A single step in a hypothesis chain.

    Each instance records one hop from a ``from_*`` concept to a ``to_*``
    concept together with the relation that joins them. The first six fields
    have no defaults and must be supplied; the remaining provenance fields
    default to empty values.
    """

    from_id: str  # identifier of the concept the step starts at
    from_name: str  # name accompanying ``from_id``
    to_id: str  # identifier of the concept the step ends at
    to_name: str  # name accompanying ``to_id``
    relation_type: str  # label of the relation joining the two concepts
    confidence: float  # NOTE(review): scale is not visible here; confirm the range
    claim_id: str = ""  # presumably the id of the backing claim; verify against callers
    raw_text: str = ""  # presumably the source text of the claim; verify against callers
    evidence: dict = field(default_factory=dict)  # fresh dict per instance; schema not shown here
    source_paper: dict = field(default_factory=dict)  # fresh dict per instance; schema not shown here
@dataclass
class Hypothesis:
    """A generated hypothesis with full evidence chain.

    Every field has a default, so ``Hypothesis()`` is a valid empty record.
    ``path`` holds the ordered ``HypothesisLink`` steps; the ``*_score``
    fields are plain floats that start at ``0.0``.
    """

    id: str = ""
    hypothesis_type: str = ""  # "path", "bridge", "gap", "contradiction"
    source_id: str = ""
    source_name: str = ""
    target_id: str = ""
    target_name: str = ""
    path: list[HypothesisLink] = field(default_factory=list)
    confidence_score: float = 0.0
    novelty_score: float = 0.0
    evidence_score: float = 0.0
    testability_score: float = 0.0
    composite_score: float = 0.0
    supporting_claims: list[str] = field(default_factory=list)
    explanation: str = ""
    testability_reason: str = ""
    metadata: dict = field(default_factory=dict)
    critic_score: float = 0.0
    critic_feedback: list[dict] = field(default_factory=list)
    critic_rounds: int = 0
    evolve_score: float = 0.0

    def to_dict(self) -> dict:
        """Return a plain-dict copy of this hypothesis.

        Uses ``dataclasses.asdict``, so nested ``HypothesisLink`` entries in
        ``path`` are converted to dicts as well.
        """
        return asdict(self)

    @classmethod
    def from_dict(cls, d: dict) -> Hypothesis:
        """Build a hypothesis from a dict such as the one ``to_dict`` returns.

        Keys that are not fields of this dataclass are ignored, and the input
        dict is not modified. When ``path`` is a list, dict entries are turned
        into ``HypothesisLink`` objects and non-dict entries are kept as they
        are.

        Raises:
            TypeError: if a link dict lacks one of the required
                ``HypothesisLink`` fields.
        """
        known = cls.__dataclass_fields__
        data = {key: value for key, value in d.items() if key in known}
        steps = data.get("path")
        if isinstance(steps, list):
            data["path"] = [
                cls._link_from_dict(step) if isinstance(step, dict) else step
                for step in steps
            ]
        return cls(**data)

    @staticmethod
    def _link_from_dict(raw: dict) -> HypothesisLink:
        """Build one link, ignoring keys ``HypothesisLink`` does not declare."""
        # Same tolerance as the top level: an extra key in a persisted link
        # must not make the whole hypothesis fail to load.
        link_fields = HypothesisLink.__dataclass_fields__
        return HypothesisLink(
            **{key: value for key, value in raw.items() if key in link_fields}
        )
@dataclass
class Contradiction:
    """A pair of conflicting claims.

    Stores the two concepts involved plus the id, predicate and text of the
    claim on each side. Every field has a default, so ``Contradiction()`` is
    a valid empty record.
    """

    # The two concepts the conflicting claims refer to.
    concept_a_id: str = ""
    concept_a_name: str = ""
    concept_b_id: str = ""
    concept_b_name: str = ""
    # Claim on the "for" side of the conflict.
    claim_for_id: str = ""
    claim_for_predicate: str = ""
    claim_for_text: str = ""
    # Claim on the "against" side of the conflict.
    claim_against_id: str = ""
    claim_against_predicate: str = ""
    claim_against_text: str = ""
    severity: float = 0.0  # NOTE(review): scale is not visible in this chunk; confirm
@dataclass
class Gap:
    """An unexplored relationship between two concepts.

    Every field has a default, so ``Gap()`` is a valid empty record.
    """

    # The two concepts that currently lack a direct relationship.
    concept_a_id: str = ""
    concept_a_name: str = ""
    concept_b_id: str = ""
    concept_b_name: str = ""
    distance: int = 0  # presumably the graph distance between the two concepts — TODO confirm
    connecting_concepts: list[str] = field(default_factory=list)  # fresh list per instance
    domain_a: str = ""  # domain label for concept A
    domain_b: str = ""  # domain label for concept B
    potential_relation: str = ""  # presumably a suggested relation type; verify against producer
# ── constants ──────────────────────────────────────────────────────────
# Ordered predicate pairs that contradict each other. Each base pair is
# stored in both directions so membership tests do not depend on which
# claim is seen first.
OPPOSING_PREDICATES = {
    ordered_pair
    for first, second in (
        ("increases", "reduces"),
        ("causes", "inhibits"),
        ("treats", "contraindicated_for"),
        ("activates", "inhibits"),
    )
    for ordered_pair in ((first, second), (second, first))
}
# Study types that are reviews only, i.e. carry no independent empirical
# evidence. Used by compute_frequency_boost and compute_temporal_decay;
# edge-level weighting by study_type lives in
# phase4_optimize.apply_evidence_weighting.
_REVIEW_TYPES = {
    "narrative_review",
    "review",
    "systematic_review",
}

# Relation types grouped under the name "common".
COMMON_RELATIONS = {
    "about",
    "associated_with",
    "is_a",
    "is_associated_with",
    "part_of",
}
# Words that mark an entity name as noise; hypotheses involving such names
# are low quality. Two main categories:
#   (a) process words, not entities: nominalized verbs/states ("loss",
#       "progression") that pop up as bridge nodes but carry no biological
#       content.
#   (b) generic containers: vague collective terms ("tissue volumes",
#       "Family") that don't refer to a specific measurable thing.
_NOISE_WORDS = frozenset().union(
    # original set
    (
        "unseen", "risk", "effect", "level", "status", "change", "type",
        "group", "factor", "model", "method", "unknown", "other", "none",
        "miscellaneous", "various", "difference", "increase", "decrease",
    ),
    # category (a): nominalized processes/states
    (
        "loss", "progression", "reduction", "elevation", "alteration",
        "disruption", "dysfunction", "impairment", "deterioration",
        "improvement", "recovery", "response", "onset", "activation",
        "inhibition", "regulation", "modulation", "stimulation",
        "expression", "function", "functions",
    ),
    # category (b): generic containers
    (
        "family", "members", "phenomenon", "phenomena", "processes",
        "mechanisms", "pathways", "symptoms", "manifestations",
        "volumes", "volume",
    ),
    # life events / demographics that are not biological entities
    (
        "stress", "life", "events", "exposure", "outcome", "outcomes",
        "quality",
    ),
)
# Whole-name regexes for noisy entity names, compiled case-sensitively.
NOISE_PATTERNS = [
    re.compile(name_regex)
    for name_regex in (
        r"^[A-Z][a-z]?$",      # one or two letters: "Id", "Ca", "Mg"
        r"^[A-Z][a-z]{2,4}$",  # short capitalized word: "Tics", "Risk"
        r"^\d+$",              # digits only
    )
]
# (C-1) Whole-phrase filter for INTERMEDIATE nodes. The token-based
# `_NOISE_WORDS` check cannot catch phrases such as "functional connectivity"
# or "neural activity": no single word is on the noise list, yet the phrase
# as a whole carries no measurable content. These are only meant to block
# INTERMEDIATE nodes, because a path may legitimately END in something like
# "functional connectivity" as an outcome metric.
_GENERIC_INTERMEDIATE_PATTERNS = [
    re.compile(phrase_regex, re.IGNORECASE)
    for phrase_regex in (
        # optional alteration adjective + brain-level noun + generic measure
        r"^(abnormal|altered|impaired|reduced|increased|disrupted|aberrant)?\s*"
        r"(brain|neural|neuronal|cortical|cerebral)\s+"
        r"(activity|activation|function|functioning|connectivity|"
        r"network|networks|signaling|metabolism|response|responses)$",
        # connectivity / network umbrella terms
        r"^(functional|structural|anatomical|effective)\s+"
        r"(connectivity|network|networks|integrity|abnormalit(y|ies))$",
        # clinical course umbrella terms
        r"^(disease|symptom|clinical|treatment|therapeutic)\s+"
        r"(progression|outcome|outcomes|response|severity|burden|stage|staging)$",
        # "common mechanisms"-style filler
        r"^(common|typical|specific|various|different)\s+"
        r"(features|patterns|mechanisms|processes)$",
        # bare process nouns, with or without the "neuro" prefix
        r"^(neuro)?(degeneration|inflammation|protection|plasticity|genesis|imaging)$",
        # tissue classes
        r"^(grey|gray|white)\s+matter$",
        # domain-level deficits
        r"^(cognitive|behavioral|emotional|motor|sensory)\s+"
        r"(deficit|deficits|dysfunction|impairment|abnormalit(y|ies))$",
    )
]
# (C-3) Target names that LOOK like outcomes (so they get through the keyword
# fallback of _is_dataset_outcome) but are too broad to drive a DL
# experiment. They are blocked even when the concept's domain is
# disease/cognitive_function.
_TARGET_TOO_BROAD_PATTERNS = [
    re.compile(target_regex, re.IGNORECASE)
    for target_regex in (
        # bare umbrella nouns (single token)
        r"^(skill|skills|ability|abilities|outcome|outcomes|"
        r"symptom|symptoms|manifestation|manifestations|"
        r"phenomenon|phenomena|finding|findings|"
        r"deficit|deficits|impairment|impairments|"
        r"function|functions|functioning|behavior|behaviors|"
        r"capability|capabilities|condition|conditions|"
        r"disease|diseases|disorder|disorders|syndrome|syndromes|"
        r"focus|integration|balance|knowledge|autonomy|"
        r"performance|adaptation|resilience|vulnerability|"
        r"recovery|progression|mechanism|process)$",
        # broad-category disease umbrellas: too generic when they are the
        # literal target — specific subtypes such as "Alzheimer Disease" do
        # not match these patterns
        r"^(neurological|psychiatric|mental|cognitive|behavioral|"
        r"neurodegenerative|cardiovascular)\s+"
        r"(disease|diseases|disorder|disorders|condition|conditions)$",
        r"^(human\s+)?(disease|diseases|disorder|disorders)$",
        r"^(brain|mental|psychiatric|psychological)\s+health$",
        r"^clinical\s+(features|outcome|outcomes|presentation|status)$",
        # "X deficits/impairments" phrases (too vague as targets)
        r"^(motor|cognitive|neurocognitive|functional|social|"
        r"verbal|visual|sensory|emotional|behavioral)\s+"
        r"(deficit|deficits|impairment|impairments|dysfunction|"
        r"disability|decline|deterioration)$",
    )
]
# Relation types too vague to add much signal.
VAGUE_RELATIONS = {
    "about",
    "associated_with",
    "is_associated_with",
}
# CognitiveAtlas / MeSH concept ids that act as top-degree generic hubs in
# the KG. The audit found them at degrees 700-9000+; their names are real
# English words (so _NOISE_WORDS does not catch them) but they denote
# extremely abstract umbrella concepts:
#
#   COGAT trm_4a3fd79d0a891  "memory"      degree 2248
#   COGAT trm_4a3fd79d0a80f  "logic"       degree 2052
#   COGAT trm_5159c80c1dd24  "loss"        degree 1034
#   COGAT trm_4a3fd79d09741  "activation"  degree 840
#   COGAT trm_4a3fd79d0afcf  "risk"        degree 722
#   COGAT trm_4a3fd79d0b2a8  "stress"      degree 139
#   MSH:D001921              "Brain"       degree 9157
#   MSH:D009474              "Neurons"     degree 1354
#
# A hypothesis with one of these as an intermediate node or endpoint is too
# vague to drive a downstream DL experiment ("FPN -> memory" is not testable
# because we don't know which memory subsystem). Filtered in post_process.
PATH_IGNORE_NODE_IDS = frozenset([
    # Cognitive Atlas concepts
    "COGAT_CONCEPT:trm_4a3fd79d0a891",  # memory
    "COGAT_CONCEPT:trm_4a3fd79d0a80f",  # logic
    "COGAT_CONCEPT:trm_5159c80c1dd24",  # loss
    "COGAT_CONCEPT:trm_4a3fd79d09741",  # activation
    "COGAT_CONCEPT:trm_4a3fd79d0afcf",  # risk
    "COGAT_CONCEPT:trm_4a3fd79d0b2a8",  # stress
    # MeSH umbrella terms
    "MSH:D001921",  # Brain
    "MSH:D009474",  # Neurons
])
# Disease/category mega-hubs. They are valid hypothesis ENDPOINTS ("predict
# Alzheimer" is fine) but not valid intermediate transit nodes: "A →
# Alzheimer → B" only says "A relates to AD, AD relates to B" and has no
# discovery value. The audit found 37.8% of hypotheses transit through these.
# Trailing numbers are the node degrees noted in the audit.
INTERMEDIATE_ONLY_IGNORE_IDS = frozenset((
    "COGAT_DISORDER:dso_5419",     # schizophrenia (degree 1005)
    "MSH:D009103",                 # Multiple Sclerosis (816)
    "MSH:D004827",                 # Epilepsy (750)
    "MSH:D000544",                 # Alzheimer Disease (746)
    "MSH:D010300",                 # Parkinson Disease (709)
    "COGAT_DISORDER:dso_3312",     # bipolar disorder (703)
    "COGAT_DISORDER:dso_0060041",  # autism spectrum disorder (613)
    "MSH:D001289",                 # ADHD (601)
    "MSH:D003863",                 # Depression (577)
    "MSH:D001523",                 # Mental Disorders (489)
))
# Relation types that are treated as directional.
DIRECTIONAL_RELATIONS = {
    "activates",
    "causes",
    "distinguishes",
    "increases",
    "inhibits",
    "is_biomarker_of",
    "is_risk_factor_for",
    "mediates",
    "modulates",
    "predicts",
    "reduces",
    "treats",
    # brain-decoding directional predicates
    "decoded_from",
    "elicits",
    "evokes",
}
# Domain pairs worth exploring — aligned with NeuroClaw imaging experiments.
# Target datasets: UKB (T1w/dMRI/rfMRI/SWI), ADNI (T1w/PET/fMRI/DTI),
#   HCP-YA (T1w/T2w/fMRI/dMRI/MEG)
# Experiment models: BrainGNN, NeuroStorm, SVM, XGBoost on raw images +
#   handcrafted features
#
# Design principle: the target is a dataset OUTCOME (what we want to
# predict); the source is a MEASURABLE feature (what the dataset provides as
# input).
#   - UKB outcomes: fluid intelligence, neuroticism, dementia diagnosis, motor tests
#   - ADNI outcomes: MCI→AD conversion, CDR-SB, cognitive composite
#   - HCP outcomes: fluid/crystallized IQ, emotion recognition, personality traits
#
# Allowed sources (what we can measure): neuroanatomy (MRI regions),
# connectivity networks, gene, biomarker (CSF/PET), drug (for intervention
# studies).
# Allowed targets (what we predict): disease (diagnostic labels) and
# cognitive_function (the OUTCOMES — includes behavior, personality, affect).
DEFAULT_DOMAIN_PAIRS = [
    # every measurable source domain paired with both outcome domains, in
    # source order: MRI regions, dMRI/fMRI connectivity, genetics (UKB 500k
    # WGS), fluid biomarkers (ADNI CSF, blood), drugs (ADNI pharmaceutical arms)
    (source_domain, outcome_domain)
    for source_domain in ("neuroanatomy", "connectivity", "gene", "biomarker", "drug")
    for outcome_domain in ("disease", "cognitive_function")
] + [
    # cross-outcome pairs (comorbidity, transdiagnostic)
    ("disease", "disease"),
    ("cognitive_function", "disease"),  # e.g. anxiety → MS diagnosis risk
    ("disease", "cognitive_function"),  # e.g. AD → processing speed decline
]
# Biomarker types that are NOT directly measurable from brain imaging.
# Hypotheses relying on them will be filtered out in post_process.
NON_MEASURABLE_BIOMARKER_TYPES = {
    # molecular entities
    *(
        "neurotransmitter",  # needs specialized PET tracers (e.g., 11C-raclopride for DA)
        "protein",           # needs tissue biopsy or CSF
        "enzyme",            # needs molecular assays
        "receptor",          # needs specialized PET (e.g., 11C-PIB for Aβ, but that's biomarker domain)
    ),
    # fluid biomarkers — not available in UKB/HCP-YA, only the ADNI CSF subset
    *(
        "csf_biomarker",
        "blood_biomarker",
        "saliva_biomarker",
        "tear_biomarker",
    ),
}
# Entity-name patterns for things that are NOT directly measurable from
# imaging (molecular levels, fluid assays, tissue samples). All patterns are
# case-insensitive and unanchored.
_NON_MEASURABLE_PATTERNS = [
    re.compile(r"(neurotransmitter|dopamine|serotonin|norepinephrine|gaba|glutamate|acetylcholine)\s+(level|concentration|release|synthesis)", re.I),
    re.compile(r"(alpha|beta|gamma|delta|kappa)\s*synuclein\s*(pathology|aggregation|expression)", re.I),
    re.compile(r"(amyloid|tau|phosphorylated)\s*(beta|protein|peptide)\s*(aggregation|production|clearance)", re.I),
    re.compile(r"(enzyme|kinase|phosphatase)\s*(activity|expression)", re.I),
    re.compile(r"(receptor|transporter)\s*(density|binding|expression)", re.I),
    re.compile(r"(TNF|interleukin|IL-\d|cytokine|chemokine)\s*(alpha|beta|level|concentration|production)", re.I),
    # "A\u03b2" is A + Greek small beta (amyloid-beta). The alternative had
    # been mis-decoded into a sequence that never occurs in real text; it is
    # written as a regex escape so the source stays ASCII and a transcoding
    # step cannot corrupt it again.
    re.compile(r"CSF\s+(A\u03b2|amyloid|tau|p-tau|NFL|neurofilament)", re.I),
    re.compile(r"(blood|plasma|serum)\s+(biomarker|marker|level|concentration)", re.I),
    # Any "CSF <something>" / "cerebrospinal fluid <something>" phrase.
    re.compile(r"(CSF|cerebrospinal fluid)\s+", re.I),
    re.compile(r"(saliva|tear|urine)\s+(biomarker|marker|level)", re.I),
    re.compile(r"(biopsy|tissue sample)", re.I),
]
# Non-neurological target domains — brain regions should not directly
# predict these. One case-insensitive, unanchored alternation over the terms
# below.
_NON_NEUROLOGICAL_TARGETS = re.compile(
    "("
    + "|".join((
        # urinary / renal / hepatic
        "urinary", "incontinence", "frequency", "enuresis", "bladder",
        "renal", "kidney", "liver",
        # other organ systems
        "gastrointestinal", "cardiac", "pulmonary", "dermatol",
        "orthopedic", "musculoskeletal",
        # injuries
        "fracture", "sprain",
        # oncology
        "tumor", "cancer", "carcinoma", "leukemia", "lymphoma",
    ))
    + ")",
    re.IGNORECASE,
)
# DATASET-OUTCOME whitelist — covers actual predicted variables in UKB/ADNI/HCP-YA
# papers (see README "Dataset Outcomes" for references to typical prediction tasks).
# Target must match one of these patterns to pass the post_process filter.
# We also auto-accept any concept in the `disease` domain (clinical diagnosis
# IS the most common outcome) and any MSH/CogAtlas concept in the
# `cognitive_function` domain (behavior/cognition).
#
# Categories cover:
# - Clinical diagnostic labels (Alzheimer, schizophrenia, MCI, etc.) — all 3 datasets
# - AD staging / conversion (CN→MCI→AD, ATN) — ADNI
# - Clinical scales (CDR, MMSE, ADAS-Cog, PHQ-9, MoCA, NPI) — ADNI + UKB
# - Cognitive abilities (IQ, memory, attention, processing speed) — all 3
# - Specific cognitive tests (PMAT, flanker, N-back, delay discounting) — HCP
# - Personality (Big Five) — HCP + UKB
# - Behavior/affect (anxiety, depression, aggression, risk-taking) — all 3
# - Motor/sensory (grip strength, gait, reaction time, dexterity) — UKB + HCP
# - Brain age / neurodegeneration markers — UKB + ADNI
# - NeuroSTORM-evaluated phenotypes: MND, early psychosis (HCP-EP), ADHD200,
#   COBRE, UCLA L5c, TCP psychiatric scales, fMRI task state classification
# - Subject fingerprinting / re-identification
#
# The pattern is a single case-insensitive alternation with no anchors; only
# the two abbreviation groups (clinical scales, ADNI stages) are wrapped in
# \b...\b.
# NOTE(review): the short bare alternatives outside those groups ("als",
# "mci", "mnd", "ocd", ...) carry no word boundary, so with a search-style
# match they also hit inside longer words — confirm this is intended.
_OUTCOME_KEYWORDS = re.compile(
    r"("
    # cognitive abilities — general
    r"intelligence|cognition|cognitive\s+(function|ability|performance|deterioration|impairment|dysfunction|decline|test|assessment|composite|score)|"
    r"memory|attention|executive|processing\s+speed|reasoning|language|"
    r"fluency|perception|reaction\s+time|fluid\s+intelligence|"
    r"crystallized\s+intelligence|working\s+memory|episodic\s+memory|"
    r"semantic\s+memory|verbal\s+(memory|fluency|learning)|visuospatial|"
    # specific HCP NIH Toolbox / cognitive tasks
    r"pmat|flanker|card\s+sort|n-?back|list\s+sort|picture\s+sequence|"
    r"pattern\s+comparison|picture\s+vocabulary|oral\s+reading|"
    r"delay\s+discounting|risk[- ]taking|go[- ]no[- ]go|"
    # HCP Penn CNB cognitive battery
    r"penn\s+(word|matrix|line\s+orientation|continuous\s+performance|progressive\s+matrices|fear|emotion|cnb)|"
    r"matrix\s+pattern|numeric\s+memory|prospective\s+memory|pairs\s+matching|"
    r"trail\s+making|symbol\s+digit|boston\s+naming|animal\s+fluency|"
    r"category\s+fluency|logical\s+memory|clock\s+drawing|ravlt|"
    # HCP 7 task states (NeuroSTORM state classification)
    r"emotion\s+task|gambling\s+task|language\s+task|motor\s+task|"
    r"relational\s+task|social\s+task|working\s+memory\s+task|"
    # clinical scales (ADNI/UKB/TCP/HCP)
    r"\b(cdr|cdr-sb|mmse|moca|adas|adas-cog|npi|faq|gds|phq-?9|gad-?7|bai|hdrs|hrsd|hamd|ham-d|"
    r"bdi|ymrs|panss|sans|saps|audit|asrs|pro|adi|srs|tci|neo-?ffi|asr|abcl|"
    r"cidi|cidi-sf|eysenck|swemwbs|psqi|ftnd|ssaga|masq|promis|upsit)\b|"
    r"adult\s+self\s+report|adult\s+behavior\s+checklist|"
    # personality / affect
    r"neuroticism|extraversion|agreeableness|conscientiousness|openness|"
    r"personality|temperament|affect|mood|emotion|anxiety|depression|"
    r"well-?being|satisfaction|life\s+satisfaction|psychological|stress\s+response|"
    r"anxiety\s+sensitivity|cautiousness|"
    r"affect\s+(positive|negative)|emotion\s+recognition|emotional\s+regulation|"
    r"perceived\s+(stress|rejection|hostility)|anger|fear|sadness|"
    # social functioning (HCP + UKB)
    r"loneliness|social\s+(isolation|support|relationship|cognition)|"
    r"meaning\s+and\s+purpose|instrumental\s+support|emotional\s+support|"
    r"friendship|"
    # behavior
    r"behavior|aggression|impulsivity|addiction|substance|alcohol|smoking|"
    r"tobacco|cannabis|cocaine|opiate|opioid|hallucinogen|"
    r"drug\s+use|substance\s+use|sleep\s+quality|insomnia|"
    # diagnoses / clinical outcomes — added NeuroSTORM-evaluated cohorts and ADNI stages
    r"alzheimer|parkinson|schizophrenia|autism|adhd|bipolar|epilepsy|"
    r"mci|mild\s+cognitive|dementia|psychosis|early\s+psychosis|stroke|post[- ]stroke|"
    r"multiple\s+sclerosis|huntington|frontotemporal|lewy\s+body|"
    r"motor\s+neuron\s+disease|mnd|als|"
    r"transdiagnostic|psychiatric\s+disorder|mental\s+health\s+disorder|"
    r"ocd|ptsd|phobia|panic|agoraphobia|somatoform|eating\s+disorder|"
    # ADNI-specific diagnostic stages
    r"\b(cn|smc|emci|lmci|ad\b|preclinical|at\b|atn|alzheimer\s+continuum)\b|"
    r"significant\s+memory\s+concern|subjective\s+(memory|cognitive)\s+(concern|complaint|decline)|"
    r"cognitively\s+(normal|unimpaired)|"
    r"disorder|syndrome|diagnosis|onset|conversion|progression|severity|"
    r"symptom|manifestation|prognosis|outcome|treatment\s+response|"
    r"disease\s+(stage|staging|duration|burden)|"
    # cardiovascular / metabolic diseases (UKB ICD-10)
    r"myocardial\s+infarction|heart\s+failure|hypertension|atrial\s+fibrillation|"
    r"coronary|cardiovascular\s+disease|diabetes|type\s*[12]\s+diabetes|"
    r"chronic\s+kidney|fatty\s+liver|nafld|metabolic\s+syndrome|obesity|"
    # AD-specific biomarker status
    r"amyloid\s+(status|positivity|positive|negative|load|burden|suvr)|"
    r"tau\s+(status|positivity|positive|tangle|pathology|burden|suvr)|"
    r"atn\s+(profile|stage|classification)|"
    r"neurodegeneration\s+(stage|status)|"
    # brain age / aging
    r"brain\s+age|brain-?age(-?gap)?|aging|age[- ]related|age\s+acceleration|"
    # motor / sensory
    r"grip\s+strength|gait|motor\s+coordination|motor\s+function|"
    r"balance|tremor|dexterity|walking\s+speed|two[- ]minute\s+walk|endurance|"
    r"visual\s+(acuity|field)|audition|hearing|olfaction|taste|pain|"
    r"chronic\s+pain|musculoskeletal\s+pain|"
    # mortality / longevity
    r"mortality|all-?cause\s+death|survival|life\s+expectancy"
    r")", re.I
)
# Target domains considered as valid dataset outcomes.
# A hypothesis target whose domain tags intersect this set is accepted by
# HypothesisEngine._is_dataset_outcome (which additionally unions in the
# brain-decoding domains "visual_stimulus", "emotion" and "vigilance").
_OUTCOME_DOMAINS = {"disease", "cognitive_function"}
# NeuroClaw testable modalities and their keywords.
# Aligned with UKB/ADNI/HCP-YA available data + deep learning models.
# Maps a modality label to the free-text keywords associated with it.
# NOTE(review): several keywords are short or generic ("FA", "MD", "PET",
# "alpha", "network", "volume"). The matching code is not visible in this
# section; confirm it applies word boundaries / case rules that avoid
# accidental hits inside unrelated text.
TESTABLE_MODALITIES = {
    "sMRI": ["cortical thickness", "volume", "atrophy", "gray matter", "white matter",
             "brain structure", "morphometry", "VBM", "FreeSurfer", "recon-all",
             "brain region", "hippocampus", "amygdala", "thalamus", "caudate",
             "putamen", "cerebellum", "insula", "cortex", "ventricle"],
    "fMRI": ["functional connectivity", "BOLD", "activation", "resting-state",
             "task-based", "network", "default mode", "fMRI", "brain response",
             "neural activity", "brain activation"],
    "dMRI": ["DTI", "diffusion", "fractional anisotropy", "tractography",
             "white matter integrity", "structural connectivity", "FA", "MD",
             "connectivity matrix", "fiber bundle", "white matter tract"],
    "PET": ["PET", "tracer", "amyloid", "tau", "FDG", "SUVr", "binding potential",
            "glucose metabolism", "florbetapir", "flortaucipir"],
    "EEG": ["EEG", "ERP", "oscillation", "power spectrum", "alpha", "beta", "theta",
            "delta", "gamma", "microstate", "coherence", "event-related"],
    "organ_volume": ["organ volume", "liver volume", "kidney volume", "spleen volume",
                     "MedSAM", "segmentation", "organ size"],
}
# Deep learning model keywords for testability scoring.
# NOTE(review): "connectivity matrix" also appears under
# TESTABLE_MODALITIES["dMRI"], and "attention" / "patch" are ordinary words
# outside the DL sense; confirm the scorer does not double-count or
# over-match on these.
DL_MODEL_KEYWORDS = [
    "BrainGNN", "NeuroStorm", "GNN", "graph neural", "region of interest", "ROI",
    "connectivity matrix", "adjacency", "node feature", "graph convolution",
    "deep learning", "CNN", "ResNet", "attention", "transformer",
    "voxel", "patch", "whole-brain",
]
# ── Dataset-Available Variables ──────────────────────────────────────
# Defines what can be measured in each dataset. Hypotheses must start
# from these features and end at dataset-available outcomes.
#
# Layout: dataset name -> feature key -> descriptor dict with the keys
# "modality", "tool" and "level" (the visual-decoding task-fMRI entries
# additionally carry "stimulus"). The dataset names here are the same set
# of keys used by DATASET_OUTCOMES.
DATASET_FEATURES = {
    "UKB": {
        # sMRI (T1w): FreeSurfer-derived ROI measures
        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_cortical_area": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_cortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_voxel": {"modality": "sMRI", "tool": "voxel", "level": "voxel"},
        # dMRI: diffusion metrics per tract
        "dmri_fa": {"modality": "dMRI", "tool": "TBSS", "level": "tract"},
        "dmri_md": {"modality": "dMRI", "tool": "TBSS", "level": "tract"},
        "dmri_sc": {"modality": "dMRI", "tool": "tractography", "level": "connectivity"},
        # rfMRI: functional connectivity
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        # lesion segmentation
        "lesion_volume": {"modality": "sMRI", "tool": "MedSAM", "level": "ROI"},
        # non-imaging
        "genetics": {"modality": "genetics", "tool": "WGS/GSA", "level": "SNP"},
        "environment": {"modality": "environment", "tool": "questionnaire","level": "variable"},
        "physical": {"modality": "physical", "tool": "measurement", "level": "variable"},
        "hospitalization":{"modality": "clinical", "tool": "ICD10", "level": "outcome"},
    },
    "ADNI": {
        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_voxel": {"modality": "sMRI", "tool": "voxel", "level": "voxel"},
        "pet_amyloid": {"modality": "PET", "tool": "florbetapir", "level": "ROI"},
        "pet_tau": {"modality": "PET", "tool": "flortaucipir", "level": "ROI"},
        "pet_fdg": {"modality": "PET", "tool": "FDG", "level": "ROI"},
        "fmri_fc": {"modality": "fMRI", "tool": "task/resting", "level": "connectivity"},
        "dti_fa": {"modality": "dMRI", "tool": "DTI", "level": "tract"},
        "lesion_volume": {"modality": "sMRI", "tool": "MedSAM", "level": "ROI"},
        "genetics": {"modality": "genetics", "tool": "APOE/GWAS", "level": "SNP"},
        "medication": {"modality": "clinical", "tool": "medication_log", "level": "variable"},
    },
    "HCP_YA": {
        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_myelin": {"modality": "sMRI", "tool": "T1w/T2w", "level": "ROI"},
        "smri_voxel": {"modality": "sMRI", "tool": "voxel", "level": "voxel"},
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        "tfmri_task":{"modality": "fMRI", "tool": "task fMRI","level": "activation"},
        "dmri_sc": {"modality": "dMRI", "tool": "HARDI", "level": "connectivity"},
        "meg": {"modality": "MEG", "tool": "MEG", "level": "connectivity"},
    },
    # NAS-available patient cohorts with preprocessed ROI time series.
    # Phenotype CSVs live under Z:\Dataset\fMRI\phenotype and the dataset-
    # specific rest csvs. All supply rfMRI volumes or ROI series; structural
    # T1 is available for HCP-EP and HCP-Aging (the other four are rfMRI-only
    # public releases).
    "ABIDE": {
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI", "level": "ROI"},
    },
    "ADHD200": {
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI", "level": "ROI"},
    },
    "COBRE": {
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI", "level": "ROI"},
    },
    "UCLA": {
        # UCLA CNP — rest + 6 task contrasts, cross-diagnosis cohort.
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI", "level": "ROI"},
        "tfmri_task": {"modality": "fMRI", "tool": "task fMRI", "level": "activation"},
    },
    "HCP_EP": {
        # HCP Early Psychosis — patient cohort, T1w + rfMRI cleaned.
        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI", "level": "ROI"},
    },
    "HCP_AGING": {
        # HCP-Aging — T1w + rfMRI REST1/REST2 + 3 task contrasts.
        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_subcortical_volume": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "smri_myelin": {"modality": "sMRI", "tool": "T1w/T2w", "level": "ROI"},
        "rfmri_fc": {"modality": "fMRI", "tool": "rfMRI", "level": "connectivity"},
        "rfmri_roi_ts": {"modality": "fMRI", "tool": "rfMRI", "level": "ROI"},
        "tfmri_task": {"modality": "fMRI", "tool": "task fMRI", "level": "activation"},
    },
    # ── Visual decoding (fMRI) ──────────────────────────────────────────
    # NSD & BOLD5000: image-stimulus visual task fMRI, no rest.
    "NSD": {
        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "tfmri_visual_voxel": {"modality": "fMRI", "tool": "task fMRI",
                               "level": "voxel", "stimulus": "natural_image"},
        "tfmri_visual_roi": {"modality": "fMRI", "tool": "task fMRI",
                             "level": "ROI", "stimulus": "natural_image"},
    },
    "BOLD5000": {
        "smri_cortical_thickness": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI"},
        "tfmri_visual_voxel": {"modality": "fMRI", "tool": "task fMRI",
                               "level": "voxel", "stimulus": "ImageNet_COCO_Scene"},
        "tfmri_visual_roi": {"modality": "fMRI", "tool": "task fMRI",
                             "level": "ROI", "stimulus": "ImageNet_COCO_Scene"},
    },
    # ── Visual decoding (EEG) ───────────────────────────────────────────
    "SEED_DV": {
        "eeg_psd": {"modality": "EEG", "tool": "PSD", "level": "channel"},
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
    },
    # ── Emotion decoding (EEG + eye tracking) ───────────────────────────
    "SEED": {
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
        "eeg_psd": {"modality": "EEG", "tool": "PSD", "level": "channel"},
    },
    "SEED_IV": {
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
                         "level": "variable"},
    },
    "SEED_V": {
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
                         "level": "variable"},
    },
    "SEED_VII": {
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
                         "level": "variable"},
    },
    "SEED_GER": {
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
                         "level": "variable"},
    },
    "SEED_FRA": {
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
        "eye_movement": {"modality": "eye_tracking", "tool": "saccade/fixation",
                         "level": "variable"},
    },
    # ── Vigilance decoding (EEG) ────────────────────────────────────────
    "SEED_VIG": {
        "eeg_de": {"modality": "EEG", "tool": "DE", "level": "channel"},
        "eog": {"modality": "EOG", "tool": "EOG", "level": "channel"},
        "eye_movement": {"modality": "eye_tracking", "tool": "gaze/blink",
                         "level": "variable"},
    },
}
# Outcome (prediction-target) labels available in each dataset.
# Layout: dataset name -> list of outcome label strings. The dataset names
# are the same set of keys used by DATASET_FEATURES.
DATASET_OUTCOMES = {
    "UKB": [
        "disease_diagnosis",  # ICD10 codes
        "mortality",  # death registry
        "cognitive_score",  # touchscreen cognitive tests
        "imaging_phenotype",  # derived imaging phenotypes
    ],
    "ADNI": [
        "diagnosis",  # CN / MCI / AD
        "conversion",  # MCI → AD conversion
        "cognitive_decline",  # ADAS-Cog, MMSE decline
        "biomarker_status",  # amyloid+/tau+ status
    ],
    "HCP_YA": [
        "behavioral_score",  # NIH Toolbox
        "cognitive_task",  # task fMRI performance
        "personality",  # NEO-FFI
    ],
    # ABIDE — ASD vs controls, rest only.
    "ABIDE": [
        "diagnosis",  # ASD vs TD
        "symptom_severity",  # ADOS, ADI-R, SRS
        "cognitive_score",  # FIQ/VIQ/PIQ
    ],
    # ADHD200 — ADHD subtype vs TDC.
    "ADHD200": [
        "diagnosis",  # ADHD (combined/inattentive/hyperactive) vs TDC
        "symptom_severity",  # ADHD-RS, Conners
        "cognitive_score",  # WASI/WISC
    ],
    # COBRE — schizophrenia vs controls.
    "COBRE": [
        "diagnosis",  # schizophrenia vs HC
        "symptom_severity",  # PANSS positive/negative/general
        "cognitive_score",  # WAIS
    ],
    # UCLA CNP — schizophrenia/bipolar/ADHD vs controls.
    "UCLA": [
        "diagnosis",  # SCZ / BP / ADHD / HC
        "symptom_severity",  # HAM-D, YMRS, ADHD-RS
        "cognitive_task",  # 6 task contrasts
    ],
    # HCP-EP — early psychosis (FES + AR) vs HC.
    "HCP_EP": [
        "diagnosis",  # affective/non-affective psychosis vs HC
        "symptom_severity",  # PANSS, SANS, YMRS
        "cognitive_score",  # MATRICS Consensus Cognitive Battery
    ],
    # HCP-Aging — lifespan 36-100 yrs, healthy aging.
    "HCP_AGING": [
        "cognitive_decline",  # NIH Toolbox across age
        "behavioral_score",  # same battery as HCP-YA
        "cognitive_task",  # CARIT/FACENAME/VISMOTOR
    ],
    # ── Visual decoding outcomes ────────────────────────────────────────
    "NSD": [
        "image_category",  # COCO 80-class
        "image_semantic",  # CLIP / language-model embedding
        "stimulus_reconstruction",# pixel / latent reconstruction
    ],
    "BOLD5000": [
        "image_category",  # ImageNet 1000-class / COCO / Scene
        "scene_type",  # Scene 365-class
        "image_semantic",
    ],
    "SEED_DV": [
        "video_class",  # discrete video categories
        "video_semantic",
        "video_reconstruction",
    ],
    # ── Emotion decoding outcomes ───────────────────────────────────────
    "SEED": ["emotion_3class"],  # positive/neutral/negative
    "SEED_IV": ["emotion_4class"],  # happy/sad/fear/neutral
    "SEED_V": ["emotion_5class"],  # +disgust
    "SEED_VII": ["emotion_7class", "emotion_continuous"],
    "SEED_GER": ["emotion_3class"],
    "SEED_FRA": ["emotion_3class"],
    # ── Vigilance decoding outcomes ─────────────────────────────────────
    "SEED_VIG": ["vigilance_continuous", "perclos"],
}
# Imaging feature templates — dynamically combined with AAL atlas regions.
# {region} is replaced with actual neuroanatomy node names at generation time.
#
# Layout: template string -> descriptor with "modality", "tool", "level" and
# "datasets" (the dataset names the template is declared usable with).
# NOTE(review): some "datasets" lists do not line up with DATASET_FEATURES:
#   * "cortical area" lists HCP_YA and HCP_AGING, but only UKB declares
#     smri_cortical_area there;
#   * the FA / MD templates list HCP_YA (which declares only dmri_sc) and
#     omit ADNI (which declares dti_fa).
# Confirm which table is authoritative before relying on either.
IMAGING_FEATURE_TEMPLATES = {
    # sMRI FreeSurfer ROI features
    "cortical thickness of {region}": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                       "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
    "gray matter volume of {region}": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                       "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
    "subcortical volume of {region}": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                       "datasets": ["UKB", "ADNI", "HCP_YA", "HCP_EP", "HCP_AGING"]},
    "cortical area of {region}": {"modality": "sMRI", "tool": "FreeSurfer", "level": "ROI",
                                  "datasets": ["UKB", "HCP_YA", "HCP_AGING"]},
    # dMRI tract features
    "fractional anisotropy of {region}": {"modality": "dMRI", "tool": "TBSS", "level": "tract",
                                          "datasets": ["UKB", "HCP_YA"]},
    "mean diffusivity of {region}": {"modality": "dMRI", "tool": "TBSS", "level": "tract",
                                     "datasets": ["UKB", "HCP_YA"]},
    # PET ROI features (ADNI)
    "amyloid SUVR of {region}": {"modality": "PET", "tool": "florbetapir", "level": "ROI",
                                 "datasets": ["ADNI"]},
    "tau SUVR of {region}": {"modality": "PET", "tool": "flortaucipir", "level": "ROI",
                             "datasets": ["ADNI"]},
    "FDG uptake of {region}": {"modality": "PET", "tool": "FDG", "level": "ROI",
                               "datasets": ["ADNI"]},
    # lesion segmentation
    "lesion volume of {region}": {"modality": "sMRI", "tool": "MedSAM", "level": "ROI",
                                  "datasets": ["UKB", "ADNI"]},
}
# Connectivity feature templates — {a} and {b} are AAL regions.
# Layout mirrors IMAGING_FEATURE_TEMPLATES: template string -> descriptor
# with "modality", "tool", "level" and the "datasets" it is declared for.
CONNECTIVITY_FEATURE_TEMPLATES = {
    "functional connectivity between {a} and {b}": {"modality": "fMRI", "tool": "rfMRI",
                                                    "level": "connectivity",
                                                    "datasets": ["UKB", "ADNI", "HCP_YA",
                                                                 "ABIDE", "ADHD200", "COBRE",
                                                                 "UCLA", "HCP_EP", "HCP_AGING"]},
    "effective connectivity from {a} to {b}": {"modality": "fMRI", "tool": "DCM/GC",
                                               "level": "connectivity",
                                               "datasets": ["ADNI", "HCP_YA",
                                                            "UCLA", "HCP_EP", "HCP_AGING"]},
    "structural connectivity between {a} and {b}": {"modality": "dMRI", "tool": "tractography",
                                                    "level": "connectivity",
                                                    "datasets": ["UKB", "HCP_YA"]},
}
# Domain pairs for imaging-driven hypothesis generation.
# Each entry is (source domain, target domain), aligned with dataset modalities.
# NOTE(review): HypothesisEngine._is_dataset_outcome accepts a target by domain
# only for "disease", "cognitive_function" and the decoding domains. Targets
# from the ("gene", "neuroanatomy") and ("disease", "drug") pairs would
# therefore pass post_process only through the outcome-keyword fallback —
# confirm that is intended.
IMAGING_DOMAIN_PAIRS = [
    # sMRI features → disease
    ("neuroanatomy", "disease"),
    # connectivity → disease
    ("connectivity", "disease"),
    # sMRI features → cognitive function
    ("neuroanatomy", "cognitive_function"),
    # gene → brain structure (UKB genetics + imaging)
    ("gene", "neuroanatomy"),
    # disease → drug (ADNI)
    ("disease", "drug"),
]
# Brain decoding domain pairs (NSD / BOLD5000 / SEED family).
# These are SEPARATE from IMAGING_DOMAIN_PAIRS because decoding hypotheses
# reverse the usual direction: instead of "brain feature → clinical outcome",
# they go "stimulus → brain" or "brain → psychological-state label".
# Each entry is (source domain, target domain).
DECODING_DOMAIN_PAIRS = [
    # Encoding: stimulus drives brain response
    ("visual_stimulus", "neuroanatomy"),
    ("visual_stimulus", "imaging_feature"),
    ("visual_stimulus", "connectivity"),
    # Decoding: brain predicts stimulus identity
    ("neuroanatomy", "visual_stimulus"),
    ("imaging_feature", "visual_stimulus"),
    # EEG → emotion (SEED/SEED-IV/SEED-V/SEED-VII/SEED-GER/SEED-FRA)
    ("imaging_feature", "emotion"),
    ("neuroanatomy", "emotion"),
    # EEG → vigilance (SEED-VIG)
    ("imaging_feature", "vigilance"),
    ("neuroanatomy", "vigilance"),
]
# AAL atlas regions used for imaging feature generation.
# Subset of neuroanatomy nodes from NN_AAL source.
# NOTE(review): some entries are prefixes of others ("Temporal" /
# "Temporal_Pole", "Frontal_Sup" / "Frontal_Sup_Med"); if these are matched
# as substrings the shorter one also hits the longer region — confirm intended.
_AAL_REGION_KEYWORDS = [
    "Precentral", "Frontal_Sup", "Frontal_Mid", "Frontal_Inf", "Rolandic_Oper",
    "Supp_Motor", "Olfactory", "Frontal_Sup_Med", "Frontal_Med_Orb",
    "Rectus", "Insula", "Cingulate", "Hippocampus", "Parahippocampal",
    "Amygdala", "Calcarine", "Cuneus", "Lingual", "Occipital",
    "Fusiform", "Postcentral", "Parietal", "SupraMarginal", "Angular",
    "Precuneus", "Paracentral", "Caudate", "Putamen", "Pallidum",
    "Thalamus", "Heschl", "Temporal", "Temporal_Pole",
]
# ── engine ─────────────────────────────────────────────────────────────
class HypothesisEngine:
"""Batch-generate, persist, and rank testable hypotheses from a knowledge graph."""
def __init__(self, kg: KnowledgeGraph):
    """Attach the engine to *kg* and pre-index its claim nodes by triple.

    Stores the graph wrapper (``self.kg``), its graph object (``self.G``)
    and its concept index (``self._index``), then builds
    ``self._claims_by_triple``: a mapping from
    ``(subject_id, predicate, object_id)`` to the list of metadata dicts of
    every claim-tagged node carrying that triple (used for frequency_boost).

    Args:
        kg: Knowledge graph exposing ``G`` and ``_index``.
    """
    self.kg = kg
    self.G = kg.G
    self._index = kg._index

    # (subj, pred, obj) -> [claim_meta, ...]
    self._claims_by_triple: dict[tuple[str, str, str], list[dict]] = {}
    by_triple = self._claims_by_triple
    for concept in self._index.values():
        if "claim" in concept.domain_tags:
            claim_meta = concept.metadata
            triple = (
                claim_meta.get("subject_id", ""),
                claim_meta.get("predicate", ""),
                claim_meta.get("object_id", ""),
            )
            subject_id, _, object_id = triple
            # A claim missing either endpoint cannot be tied to an edge.
            if subject_id and object_id:
                by_triple.setdefault(triple, []).append(claim_meta)
# ── batch generation ───────────────────────────────────────────────────
def batch_generate(
    self,
    domain_pairs: Optional[list[tuple[str, str]]] = None,
    max_hops: int = 3,
    max_paths_per_pair: int = 5,
    max_seeds_per_domain: int = 50,
) -> list[Hypothesis]:
    """Generate bridge hypotheses for every requested domain pair.

    For each ``(dom_a, dom_b)`` pair, seed nodes are sampled from ``dom_a``
    and a shortest-path search bounded by ``max_hops`` is run from each seed.
    Every reached node tagged ``dom_b`` (excluding claim nodes and
    PATH_IGNORE_NODE_IDS) becomes a candidate target. The candidate path is
    enriched into links, scored, and wrapped in a ``Hypothesis``. The
    collected list is passed through ``post_process`` before being returned.

    Args:
        domain_pairs: ``(source_domain, target_domain)`` pairs; defaults to
            DEFAULT_DOMAIN_PAIRS when ``None``.
        max_hops: Cutoff handed to the shortest-path search.
        max_paths_per_pair: Maximum number of hypotheses emitted per seed
            node (the counter is reset for every seed).
        max_seeds_per_domain: Number of seeds requested from
            ``_sample_domain_nodes`` for each source domain.

    Returns:
        The post-processed list of hypotheses.
    """
    domain_pairs = DEFAULT_DOMAIN_PAIRS if domain_pairs is None else domain_pairs

    all_hypotheses: list[Hypothesis] = []
    # Unordered (seed, target) pairs already considered, shared across all
    # domain pairs so A-B and B-A are only attempted once.
    visited_pairs: set[tuple[str, str]] = set()
    _hyp_counter = 0

    for dom_a, dom_b in domain_pairs:
        logger.info(f"generating hypotheses: {dom_a} -> {dom_b}")
        seed_ids = self._sample_domain_nodes(dom_a, max_seeds_per_domain)

        # Eligible targets: tagged dom_b, not a claim node, not an ignored hub.
        eligible_targets = set()
        for node_id, attrs in self.G.nodes(data=True):
            tags = attrs.get("domain_tags", [])
            if dom_b in tags and "claim" not in tags and node_id not in PATH_IGNORE_NODE_IDS:
                eligible_targets.add(node_id)

        for seed_id in seed_ids:
            if seed_id not in self.G:
                continue
            # BFS from seed
            try:
                reachable = nx.single_source_shortest_path(
                    self.G, seed_id, cutoff=max_hops
                )
            except nx.NetworkXError:
                continue

            emitted_for_seed = 0
            for target_id, node_path in reachable.items():
                if target_id == seed_id or target_id not in eligible_targets:
                    continue
                pair_key = tuple(sorted([seed_id, target_id]))
                if pair_key in visited_pairs:
                    continue
                # Marked as seen before enrichment, so a pair whose path
                # cannot be enriched is not retried later.
                visited_pairs.add(pair_key)

                links = self._enrich_path(node_path)
                if not links:
                    continue

                confidence = self._compute_confidence_score(links)
                novelty = self._compute_novelty_score(links)
                evidence = self._compute_evidence_score(links)
                testability, testability_why = self._compute_testability_score(links)
                claim_ids = [link.claim_id for link in links if link.claim_id]

                _hyp_counter += 1
                hypothesis = Hypothesis(
                    id=f"HYP:{_hyp_counter:06d}",
                    hypothesis_type="bridge",
                    source_id=seed_id,
                    source_name=self._index[seed_id].preferred_name,
                    target_id=target_id,
                    target_name=self._index[target_id].preferred_name,
                    path=links,
                    confidence_score=confidence,
                    novelty_score=novelty,
                    evidence_score=evidence,
                    testability_score=testability,
                    composite_score=0.0,  # set below
                    supporting_claims=claim_ids,
                    testability_reason=testability_why,
                    metadata={"domain_a": dom_a, "domain_b": dom_b},
                )
                hypothesis.explanation = self._generate_explanation(hypothesis)
                hypothesis.composite_score = self._composite_score(hypothesis)
                all_hypotheses.append(hypothesis)

                emitted_for_seed += 1
                if emitted_for_seed >= max_paths_per_pair:
                    break

    logger.info(f"batch generation complete: {len(all_hypotheses)} hypotheses from {len(domain_pairs)} domain pairs")
    return self.post_process(all_hypotheses)
def post_process(
    self,
    hypotheses: list[Hypothesis],
    min_hops: int = 2,
    filter_vague_relations: bool = True,
    filter_non_measurable: bool = True,
    max_hops_filter: int = 5,
) -> list[Hypothesis]:
    """Drop low-quality hypotheses, then deduplicate per (source, target).

    A hypothesis is rejected by the first of these checks that fires, applied
    in this order:
      1. Noisy entities: any source/target/path node name is noisy.
      2. Too short: fewer than ``min_hops`` links.
      3. Vague relations (optional): every relation type is in VAGUE_RELATIONS.
      4. Single-PMID bridge: path of 2+ links whose non-empty PMIDs collapse
         to exactly one value.
      5. Non-measurable endpoints (optional).
      6. Biologically implausible path.
      7. Weak evidence.
      8. Hub-to-hub edge.
      9. Path touches a PATH_IGNORE_NODE_IDS node.
      10. Path transits an intermediate-only hub.
      11. (C-1) Generic phrase as intermediate node.
      12. (C-2) Thin directional density.
      13. Target is not a dataset outcome.
      14. (C-3) Target name is an umbrella concept.
      15. No directional relation at all (paths of 2+ links).
      16. Longer than ``max_hops_filter`` links.

    Survivors are grouped by ``(source_id, target_id)`` and the two with the
    highest ``composite_score`` are kept per group.

    NOTE(review): check 4 also fires when only one hop carries a PMID and the
    others carry none — confirm that is the intended meaning of "single-PMID".

    Args:
        hypotheses: Candidates to filter.
        min_hops: Minimum number of links a path must have.
        filter_vague_relations: Enable check 3.
        filter_non_measurable: Enable check 5.
        max_hops_filter: Maximum number of links a path may have.

    Returns:
        The filtered, deduplicated hypotheses.
    """
    # Checks 6-12: plain "reject if True" predicates, run in this order.
    path_checks = (
        self._has_implausible_path,
        self._has_weak_evidence,
        self._has_hub_to_hub_edge,
        self._touches_path_ignore_node,
        self._transits_intermediate_only_hub,
        self._has_intermediate_generic_phrase,
        self._has_thin_directional_density,
    )

    def is_rejected(h) -> bool:
        hops = h.path
        hop_count = len(hops)

        # 1. noisy entities (source, target, and all intermediate nodes)
        names = {h.source_name, h.target_name}
        for hop in hops:
            names.update((hop.from_name, hop.to_name))
        if any(self._is_noisy_entity(name) for name in names):
            return True

        # 2. too few hops (single direct edge = no discovery value)
        if hop_count < min_hops:
            return True

        # 3. all-vague relations
        if filter_vague_relations:
            relations = {hop.relation_type for hop in hops}
            if relations and relations <= VAGUE_RELATIONS:
                return True

        # 4. single-PMID bridge
        if hop_count >= 2:
            pmids = set()
            for hop in hops:
                paper = hop.source_paper
                pmid = paper.get("pmid", "") if isinstance(paper, dict) else ""
                if pmid:
                    pmids.add(pmid)
            if len(pmids) == 1:
                return True

        # 5. non-measurable biomarkers (not testable from imaging)
        if filter_non_measurable and self._has_non_measurable_entity(h):
            return True

        # 6-12. structural path checks
        if any(check(h) for check in path_checks):
            return True

        # 13. target must be a dataset outcome; imaging features such as
        # "White Matter" are inputs, not outcomes.
        if not self._is_dataset_outcome(h):
            return True

        # 14. umbrella target names cannot anchor a concrete DL label
        if self._is_too_broad_target(h.target_name):
            return True

        # 15. pure association chain (no directional predicate)
        if hop_count >= 2:
            relations = {hop.relation_type for hop in hops}
            if not (relations & DIRECTIONAL_RELATIONS):
                return True

        # 16. overly long path (noise accumulation)
        return hop_count > max_hops_filter

    before = len(hypotheses)
    filtered = [h for h in hypotheses if not is_rejected(h)]

    # Deduplicate: for each (source, target) pair, keep top 2 by composite score
    by_pair = {}
    for h in filtered:
        by_pair.setdefault((h.source_id, h.target_id), []).append(h)

    deduplicated = []
    for group in by_pair.values():
        ranked = sorted(group, key=lambda x: x.composite_score, reverse=True)
        deduplicated.extend(ranked[:2])

    logger.info(f"post_process: {before} -> {len(filtered)} filtered -> {len(deduplicated)} deduplicated "
                f"(removed {before - len(deduplicated)} total)")
    return deduplicated
def _has_non_measurable_entity(self, h: Hypothesis) -> bool:
    """Return True if an endpoint of *h* is not measurable from brain imaging.

    Each endpoint (source, then target) is rejected when either:
      - its domain tags (ignoring "claim") intersect
        NON_MEASURABLE_BIOMARKER_TYPES and the opposite endpoint is in the
        index without a "neuroanatomy" tag. A non-measurable endpoint is
        tolerated when the other end is a brain region ("X affects brain"),
        or when the other end is missing from the index; or
      - its name matches any pattern in _NON_MEASURABLE_PATTERNS.

    The opposite endpoint is selected by position (source vs target), not by
    comparing names, so two endpoints that share a name but have different
    ids are still checked against each other.

    Args:
        h: Hypothesis whose source and target are inspected.

    Returns:
        True when the hypothesis should be filtered out.
    """
    endpoints = (
        (h.source_name, h.source_id, h.target_id),
        (h.target_name, h.target_id, h.source_id),
    )
    for name, node_id, opposite_id in endpoints:
        # check domain tags
        node = self._index.get(node_id)
        if node:
            domains = set(node.domain_tags) - {"claim"}
            if domains & NON_MEASURABLE_BIOMARKER_TYPES:
                opposite = self._index.get(opposite_id)
                if opposite and "neuroanatomy" not in opposite.domain_tags:
                    return True
        # check name patterns (a missing name is treated as empty text)
        text = name or ""
        if any(pattern.search(text) for pattern in _NON_MEASURABLE_PATTERNS):
            return True
    return False
@staticmethod
def _is_noisy_entity(name: str) -> bool:
    """Return True when *name* is empty/blank or looks like a noise entity.

    A stripped name is noisy if any NOISE_PATTERNS entry matches at its
    start, or if any of its lower-cased tokens (split on whitespace,
    ``-``, ``_``, ``,`` and ``/``) is in _NOISE_WORDS.
    """
    cleaned = name.strip() if name else ""
    if not cleaned:
        return True
    if any(pattern.match(cleaned) for pattern in NOISE_PATTERNS):
        return True
    # token-level check: does the name contain any noise word?
    tokens = set(re.split(r"[\s\-_,/]+", cleaned.lower()))
    return bool(tokens & _NOISE_WORDS)
@staticmethod
def _is_generic_intermediate(name: str) -> bool:
    """(C-1) Phrase-level check for intermediate node names that are too vague.

    Complements the token-level `_NOISE_WORDS` test: a name can contain no
    individual noise token and still be a generic phrase. Examples that get
    blocked:
      - "neural activity"
      - "functional connectivity" (legit metric but not a mechanism)
      - "disease progression"
      - "grey matter" (umbrella)
      - "cognitive deficit"

    Only call this on intermediate nodes — the same phrases can be valid as
    endpoints (e.g. "functional connectivity" as a target metric).

    Returns:
        True for an empty name, or when any _GENERIC_INTERMEDIATE_PATTERNS
        entry matches at the start of the stripped name.
    """
    if not name:
        return True
    stripped = name.strip()
    return any(pattern.match(stripped) for pattern in _GENERIC_INTERMEDIATE_PATTERNS)
@staticmethod
def _is_too_broad_target(name: str) -> bool:
    """(C-3) Return True when a target name is an umbrella concept.

    Names such as "disease", "skill", "neurological disorder" or
    "clinical features" pass the outcome keyword regex, yet a DL experiment
    cannot be designed against them — there is no way to know which subtype
    to label.

    Returns:
        True for an empty name, or when any _TARGET_TOO_BROAD_PATTERNS
        entry matches at the start of the stripped name.
    """
    if not name:
        return True
    stripped = name.strip()
    return any(pattern.match(stripped) for pattern in _TARGET_TOO_BROAD_PATTERNS)
def _has_intermediate_generic_phrase(self, h: Hypothesis) -> bool:
    """(C-1) Return True if any intermediate node of the path is a generic phrase.

    Intermediate nodes are the ``to_name`` of every link except the last and
    the ``from_name`` of every link except the first; a missing name is
    treated as the empty string. The path's own endpoints are not checked,
    because some metrics (e.g. "functional connectivity") legitimately
    appear as outcomes. Paths with fewer than two links have no intermediate
    node and always pass.
    """
    hops = h.path
    if len(hops) < 2:
        return False
    inner_names = [hop.to_name or "" for hop in hops[:-1]]
    inner_names += [hop.from_name or "" for hop in hops[1:]]
    return any(self._is_generic_intermediate(name) for name in inner_names)
def _has_thin_directional_density(self, h: Hypothesis) -> bool:
    """(C-2) Return True when too few of the path's relations are directional.

    Rationale: a 4-hop path with one directional and three vague edges still
    scores like a real chain but is essentially a vague association
    narrative.

    Rule:
      - 1-2 hop path: never rejected here.
      - 3+ hop path: rejected unless at least half of the edges have a
        relation type in DIRECTIONAL_RELATIONS.
    """
    hop_count = len(h.path)
    if hop_count >= 3:
        directional_hops = [
            hop for hop in h.path if hop.relation_type in DIRECTIONAL_RELATIONS
        ]
        return 2 * len(directional_hops) < hop_count  # < 50% directional
    return False
def _has_implausible_path(self, h: Hypothesis) -> bool:
    """Return True if the path contains a biologically implausible step.

    Two situations are flagged:
      1. The source is tagged "neuroanatomy", the target carries none of the
         neuro-relevant domain tags, and the target name matches
         _NON_NEUROLOGICAL_TARGETS (e.g. amygdala -> urinary incontinence).
      2. Some link leads from a "neuroanatomy" node to a node whose name
         matches _NON_NEUROLOGICAL_TARGETS, and no strictly earlier link
         starts at a "disease" node (no disease mechanism in between).

    If either endpoint is missing from the index, nothing is flagged.

    Args:
        h: Hypothesis to inspect.

    Returns:
        True when the hypothesis should be filtered out.
    """
    source_node = self._index.get(h.source_id)
    target_node = self._index.get(h.target_id)
    if not source_node or not target_node:
        return False

    neuro_domains = (
        "neuroanatomy", "disease", "cognitive_function",
        "biomarker", "gene", "drug", "neurotransmitter",
    )
    source_is_brain = "neuroanatomy" in source_node.domain_tags
    target_is_neuro = any(d in target_node.domain_tags for d in neuro_domains)

    # Case 1: brain-region source predicting a non-neurological target.
    if source_is_brain and not target_is_neuro:
        if _NON_NEUROLOGICAL_TARGETS.search(h.target_name or ""):
            return True

    # Case 2: walk the path once, front to back. Nodes absent from the index
    # contribute no domain tags.
    for link in h.path:
        from_node = self._index.get(link.from_id)
        from_tags = from_node.domain_tags if from_node else ()
        if "neuroanatomy" in from_tags and _NON_NEUROLOGICAL_TARGETS.search(link.to_name or ""):
            # No earlier hop started at a disease node (we would have
            # returned below), so this brain -> non-neurological step has no
            # disease intermediate.
            return True
        if "disease" in from_tags:
            # Every later hop now has a disease node upstream, so none of
            # them can be flagged.
            return False
    return False
def _has_hub_to_hub_edge(self, h: Hypothesis) -> bool:
    """Return True if any link of the path joins two broad hub nodes.

    Example: "Brain Diseases --causes--> Cognitive Dysfunction" — both ends
    are top-level categories, so the edge is too generic to be a mechanistic
    step.

    The hub set is the 50 nodes with the highest degree when edges whose
    ``relation_type`` is "about" are ignored. It is computed on first use and
    cached on the instance as ``_hub_id_set``; it is not refreshed if the
    graph changes afterwards.
    """
    try:
        hub_ids = self._hub_id_set
    except AttributeError:
        # First call on this engine instance: build and cache the hub set.
        from collections import Counter
        degree = Counter()
        for u, v, data in self.G.edges(data=True):
            if data.get("relation_type") != "about":
                degree.update((u, v))
        hub_ids = {node_id for node_id, _ in degree.most_common(50)}
        self._hub_id_set = hub_ids
    return any(
        link.from_id in hub_ids and link.to_id in hub_ids
        for link in h.path
    )
def _touches_path_ignore_node(self, h: Hypothesis) -> bool:
    """Return True if the source, target, or any path node is an ignored hub.

    PATH_IGNORE_NODE_IDS holds vague COGAT/MeSH umbrella concepts. Their
    names ("memory", "logic", "Brain", "Neurons") are legitimate English
    words, so the token-based ``_is_noisy_entity`` does not catch them; the
    check here is by concept id instead.
    """
    if h.source_id in PATH_IGNORE_NODE_IDS or h.target_id in PATH_IGNORE_NODE_IDS:
        return True
    return any(
        link.from_id in PATH_IGNORE_NODE_IDS or link.to_id in PATH_IGNORE_NODE_IDS
        for link in h.path
    )
@staticmethod
def _transits_intermediate_only_hub(h: Hypothesis) -> bool:
    """Return True if a disease mega-hub appears as a middle hop of the path.

    INTERMEDIATE_ONLY_IGNORE_IDS nodes are valid as source/target
    (predicting Alzheimer is a real hypothesis) but not as transit nodes
    (A → Alzheimer → B is just "both relate to AD"). Paths with fewer than
    two links have no middle hop and always pass.
    """
    hops = h.path
    if len(hops) < 2:
        return False
    last_pos = len(hops) - 1
    for pos, hop in enumerate(hops):
        # from_id is a middle node for every link but the first;
        # to_id is a middle node for every link but the last.
        starts_at_hub = pos >= 1 and hop.from_id in INTERMEDIATE_ONLY_IGNORE_IDS
        ends_at_hub = pos < last_pos and hop.to_id in INTERMEDIATE_ONLY_IGNORE_IDS
        if starts_at_hub or ends_at_hub:
            return True
    return False
def _is_dataset_outcome(self, h: Hypothesis) -> bool:
"""Check if target is a UKB/ADNI/HCP-testable outcome.
The goal of our hypotheses is to predict SOMETHING from brain imaging.
Valid targets:
- Clinical diagnoses (disease domain) β Alzheimer, MCI, schizophrenia, etc.
- Cognitive/behavioral/personality measures (cognitive_function domain)
- Brain decoding targets:
* neuroanatomy (for encoding: stimulus β brain activation)
* visual_stimulus (for decoding: brain β stimulus category)
* emotion (SEED family: EEG β affect label)
* vigilance (SEED-VIG: EEG β alertness)
Invalid targets:
- Molecular entities (gene, biomarker, drug, neurotransmitter) β these
may be predictors, not predicted quantities
- Overly generic disease categories (Brain Diseases, Mental Disorders) β
already filtered by hub-to-hub, but double-check by keyword.
Accepts target if EITHER:
a) target's domain is in _OUTCOME_DOMAINS βͺ decoding domains, OR
b) target name matches _OUTCOME_KEYWORDS regex (as fallback for
claim_extraction concepts whose domain may be uncertain)
"""
target = self._index.get(h.target_id)
if target is None:
return False
domains = set(target.domain_tags)
# Accept: disease, cognitive_function, or decoding-target domains
outcome_domains = _OUTCOME_DOMAINS | {"visual_stimulus", "emotion", "vigilance"}
if domains & outcome_domains:
return True
# Accept: neuroanatomy targets when the hypothesis is a brain-decoding
# encoding path (stimulus β brain region). Excludes the clinical-
# prediction case where a target of 'White Matter' would be an input.
if "neuroanatomy" in domains:
source = self._index.get(h.source_id)
if source:
source_domains = set(source.domain_tags)
if source_domains & {"visual_stimulus", "emotion", "vigilance"}:
return True
# Fallback: outcome keyword match (catches claim_extraction concepts
# that describe outcomes but have wrong domain tags)
if _OUTCOME_KEYWORDS.search(h.target_name):
return True
return False
def _has_weak_evidence(self, h: Hypothesis) -> bool:
"""Check if hypothesis path has weak evidence (target not mentioned in raw_text).
For hypotheses where the target is a specific brain region, check if any hop's
raw_text actually mentions that region. If not, the path is likely spurious
(e.g., IL-1Ξ² β Internal Capsula where the evidence text talks about "grey matter"
but never mentions internal capsule).
Exception: paths anchored by curated functional facts (e.g. `evokes` from
visual_stimulus to a functional ROI) carry programmatic confidence, not
paper evidence β skip the raw_text requirement for them.
"""
target_node = self._index.get(h.target_id)
if not target_node or "neuroanatomy" not in target_node.domain_tags:
return False
# Skip paths whose source is a visual_stimulus / emotion / vigilance node, or
# which contain at least one curated functional edge (evokes / decoded_from /
# elicits). These are seeded from neuroscience textbooks, not paper claims.
source_node = self._index.get(h.source_id)
if source_node:
decoding_domains = {"visual_stimulus", "emotion", "vigilance"}
if any(t in decoding_domains for t in source_node.domain_tags):
return False
if any(l.relation_type in {"evokes", "decoded_from", "elicits"} for l in h.path):
return False
# Extract key terms from target name (e.g., "Internal Capsula" β ["internal", "capsula"])
target_terms = set(re.findall(r'\b\w{4,}\b', h.target_name.lower()))
if not target_terms:
return False
# Check if any hop mentions the target region
for link in h.path:
raw = link.raw_text or link.evidence.get("raw_text", "") if isinstance(link.evidence, dict) else ""
if raw:
raw_lower = raw.lower()
# If any target term appears in raw_text, evidence is OK
if any(term in raw_lower for term in target_terms):
return False
# No hop mentions the target region β weak evidence
logger.debug(f"weak evidence: {h.id} target '{h.target_name}' not mentioned in any raw_text")
return True
# ── imaging-driven batch generation ──────────────────────────────
def batch_generate_imaging(
    self,
    dataset: str = "UKB",
    max_paths_per_pair: int = 5,
    max_seeds: int = 50,
    max_hops: int = 3,
    include_connectivity: bool = True,
) -> list[Hypothesis]:
    """Generate hypotheses driven by imaging features available in a dataset.

    Strategy:
        1. Find AAL atlas neuroanatomy nodes in the graph as ROI seeds.
        2. For each ROI x imaging feature template, construct a feature name
           (e.g. "cortical thickness of Hippocampus_L").
        3. Find graph paths from each ROI to disease/cognitive_function nodes.
        4. Drop paths containing non-measurable entities.
        5. Annotate each hypothesis with dataset metadata, optionally add
           connectivity hypotheses, then run ``self.post_process``.

    Args:
        dataset: dataset name; upper-cased and ``-`` replaced by ``_``
            before lookup in ``DATASET_FEATURES``.
        max_paths_per_pair: cap on hypotheses kept per (ROI, template).
        max_seeds: maximum number of ROI seed nodes.
        max_hops: path-length cutoff for the shortest-path search.
        include_connectivity: also generate region-pair connectivity
            hypotheses when templates exist for the dataset.

    Returns:
        The post-processed list of hypotheses (empty when no ROI or
        outcome nodes are found).

    Raises:
        ValueError: if the normalised dataset key is not in
            ``DATASET_FEATURES``.
    """
    dataset_key = dataset.upper().replace("-", "_")
    if dataset_key not in DATASET_FEATURES:
        raise ValueError(f"Unknown dataset: {dataset}. Available: {list(DATASET_FEATURES.keys())}")

    # 1. Find AAL atlas ROI nodes
    aal_nodes = self._find_aal_regions(max_seeds)
    if not aal_nodes:
        logger.warning("No AAL atlas regions found in graph")
        return []
    logger.info(f"Found {len(aal_nodes)} AAL regions for imaging hypothesis generation")

    # 2. Collect outcome nodes (disease, cognitive_function)
    outcome_nodes = self._collect_outcome_nodes()
    if not outcome_nodes:
        logger.warning("No outcome nodes (disease/cognitive_function) found")
        return []

    # 3. Determine which imaging templates apply to this dataset
    applicable_templates = {
        name: meta for name, meta in IMAGING_FEATURE_TEMPLATES.items()
        if dataset_key in meta["datasets"]
    }

    all_hypotheses: list[Hypothesis] = []
    _hyp_counter = 0
    # Keys are (region_id, target_id, feature_template).
    seen_pairs: set[tuple[str, str, str]] = set()

    # 4. Generate ROI-level imaging hypotheses
    for region_id, region_name in aal_nodes.items():
        if not applicable_templates:
            break  # nothing to generate; skip the graph searches entirely

        # The reachable set depends only on the ROI, not on the template,
        # so it is computed once per region.
        try:
            reachable = nx.single_source_shortest_path(
                self.G, region_id, cutoff=max_hops
            )
        except nx.NetworkXError:
            continue
        candidates = [
            nid for nid in reachable
            if nid in outcome_nodes and nid != region_id
        ]

        for feat_template, feat_meta in applicable_templates.items():
            feature_name = feat_template.replace("{region}", region_name)
            pair_count = 0
            for target_id in candidates:
                pair_key = (region_id, target_id, feat_template)
                if pair_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)

                raw_path = reachable[target_id]
                links = self._enrich_path(raw_path)
                if not links:
                    continue
                # Skip if path contains non-measurable entities
                if self._path_has_non_measurable(links):
                    continue

                conf = self._compute_confidence_score(links)
                nov = self._compute_novelty_score(links)
                evi = self._compute_evidence_score(links)
                test, test_reason = self._compute_testability_score(links)
                # Boost testability for imaging-driven hypotheses
                test = min(test + 0.15, 1.0)
                claim_ids = [l.claim_id for l in links if l.claim_id]

                _hyp_counter += 1
                target_node = self._index.get(target_id)
                h = Hypothesis(
                    id=f"HYP:IMG:{_hyp_counter:06d}",
                    hypothesis_type="imaging",
                    source_id=region_id,
                    source_name=feature_name,
                    target_id=target_id,
                    target_name=target_node.preferred_name if target_node else target_id,
                    path=links,
                    confidence_score=conf,
                    novelty_score=nov,
                    evidence_score=evi,
                    testability_score=test,
                    composite_score=0.0,
                    supporting_claims=claim_ids,
                    testability_reason=test_reason,
                    metadata={
                        "dataset": dataset_key,
                        "input_modality": feat_meta["modality"],
                        "input_feature": feature_name,
                        "input_level": feat_meta["level"],
                        "input_tool": feat_meta["tool"],
                        "input_region": region_name,
                        "outcome_type": self._classify_outcome(target_node),
                    },
                )
                h.explanation = self._generate_explanation(h)
                h.composite_score = self._composite_score(h)
                all_hypotheses.append(h)

                pair_count += 1
                if pair_count >= max_paths_per_pair:
                    break

    # 5. Generate connectivity-level hypotheses
    if include_connectivity:
        conn_templates = {
            name: meta for name, meta in CONNECTIVITY_FEATURE_TEMPLATES.items()
            if dataset_key in meta["datasets"]
        }
        if conn_templates:
            hyps = self._generate_connectivity_hypotheses(
                aal_nodes, outcome_nodes, conn_templates,
                dataset_key, max_paths_per_pair, max_hops, _hyp_counter, seen_pairs,
            )
            _hyp_counter += len(hyps)
            all_hypotheses.extend(hyps)

    logger.info(
        f"imaging batch generation ({dataset_key}): "
        f"{len(all_hypotheses)} hypotheses from {len(aal_nodes)} regions"
    )
    all_hypotheses = self.post_process(all_hypotheses)
    return all_hypotheses
def _find_aal_regions(self, max_n: int) -> dict[str, str]:
"""Find AAL atlas neuroanatomy nodes. Returns {node_id: region_name}."""
candidates = {}
for nid, data in self.G.nodes(data=True):
if "neuroanatomy" not in data.get("domain_tags", []):
continue
name = data.get("preferred_name", "")
# Match against AAL region keywords
name_lower = name.lower()
for kw in _AAL_REGION_KEYWORDS:
if kw.lower() in name_lower:
candidates[nid] = name
break
# Sort by degree (more connected = richer paths)
sorted_items = sorted(
candidates.items(),
key=lambda item: self.G.degree(item[0]),
reverse=True,
)
return dict(sorted_items[:max_n])
def _collect_outcome_nodes(self) -> set[str]:
"""Collect all disease + cognitive_function nodes as potential outcomes."""
outcome_ids = set()
for nid, data in self.G.nodes(data=True):
domains = set(data.get("domain_tags", []))
if "claim" in domains:
continue
if nid in PATH_IGNORE_NODE_IDS:
continue
if domains & {"disease", "cognitive_function"}:
outcome_ids.add(nid)
return outcome_ids
def _classify_outcome(self, node: Optional[ConceptNode]) -> str:
"""Classify outcome node type for metadata."""
if not node:
return "unknown"
domains = set(node.domain_tags)
if "disease" in domains:
return "disease"
if "cognitive_function" in domains:
return "cognitive_function"
if "biomarker" in domains:
return "biomarker"
return "other"
def _path_has_non_measurable(self, links: list[HypothesisLink]) -> bool:
"""Check if any intermediate node in the path is non-measurable."""
for link in links:
for name, nid in [(link.from_name, link.from_id), (link.to_name, link.to_id)]:
node = self._index.get(nid)
if node:
domains = set(node.domain_tags) - {"claim"}
if domains & NON_MEASURABLE_BIOMARKER_TYPES:
return True
for pattern in _NON_MEASURABLE_PATTERNS:
if pattern.search(name):
return True
return False
def _generate_connectivity_hypotheses(
    self,
    aal_nodes: dict[str, str],
    outcome_nodes: set[str],
    conn_templates: dict,
    dataset_key: str,
    max_paths_per_pair: int,
    max_hops: int,
    hyp_counter_start: int,
    seen_pairs: set,
) -> list[Hypothesis]:
    """Generate hypotheses for connectivity features (FC/EC/SC between region pairs).

    Region pairs are either all unordered pairs (20 regions or fewer) or
    random draws via ``random.sample`` (more than 20 regions), capped at
    ``min(3 * n_regions, 200)`` pairs.  Graph paths are searched from the
    first region of each pair only; the second region contributes to the
    feature name and metadata.

    Args:
        aal_nodes: ``{node_id: region_name}`` ROI seeds.
        outcome_nodes: ids accepted as path targets.
        conn_templates: feature templates containing ``{a}`` / ``{b}``
            placeholders, mapped to their metadata dicts.
        dataset_key: normalised dataset key stored in the metadata.
        max_paths_per_pair: cap on hypotheses per (pair, template).
        max_hops: cutoff for the shortest-path search.
        hyp_counter_start: last id number already used; ids continue
            from ``hyp_counter_start + 1``.
        seen_pairs: shared de-duplication set, mutated in place.  Keys are
            ``(region_a_id, target_id, template)`` - region B is not part
            of the key.

    Returns:
        List of generated ``imaging_connectivity`` hypotheses.
    """
    import random

    hypotheses = []
    counter = hyp_counter_start
    region_ids = list(aal_nodes.keys())

    # Sample region pairs (limit to avoid O(n^2) explosion)
    max_pairs = min(len(region_ids) * 3, 200)
    if len(region_ids) > 20:
        sampled_pairs = []
        for _ in range(max_pairs):
            a, b = random.sample(region_ids, 2)
            sampled_pairs.append((a, b))
    else:
        sampled_pairs = [(a, b) for i, a in enumerate(region_ids) for b in region_ids[i+1:]]
        sampled_pairs = sampled_pairs[:max_pairs]

    # The reachable set depends only on region A and max_hops, so it is
    # computed once per distinct region instead of once per (pair, template).
    # A value of None records that the search failed for that region.
    reach_cache: dict[str, Optional[dict]] = {}

    for region_a_id, region_b_id in sampled_pairs:
        name_a = aal_nodes[region_a_id]
        name_b = aal_nodes[region_b_id]

        if region_a_id not in reach_cache:
            try:
                reach_cache[region_a_id] = nx.single_source_shortest_path(
                    self.G, region_a_id, cutoff=max_hops
                )
            except nx.NetworkXError:
                reach_cache[region_a_id] = None
        reachable = reach_cache[region_a_id]
        if reachable is None:
            continue
        candidates = [
            nid for nid in reachable
            if nid in outcome_nodes and nid != region_a_id
        ]

        for feat_template, feat_meta in conn_templates.items():
            feature_name = feat_template.replace("{a}", name_a).replace("{b}", name_b)
            pair_count = 0
            for target_id in candidates:
                pair_key = (region_a_id, target_id, feat_template)
                if pair_key in seen_pairs:
                    continue
                seen_pairs.add(pair_key)

                raw_path = reachable[target_id]
                links = self._enrich_path(raw_path)
                if not links:
                    continue
                if self._path_has_non_measurable(links):
                    continue

                conf = self._compute_confidence_score(links)
                nov = self._compute_novelty_score(links)
                evi = self._compute_evidence_score(links)
                test, test_reason = self._compute_testability_score(links)
                # Same imaging testability boost as the ROI-level hypotheses.
                test = min(test + 0.15, 1.0)
                claim_ids = [l.claim_id for l in links if l.claim_id]

                counter += 1
                target_node = self._index.get(target_id)
                h = Hypothesis(
                    id=f"HYP:IMG:{counter:06d}",
                    hypothesis_type="imaging_connectivity",
                    source_id=region_a_id,
                    source_name=feature_name,
                    target_id=target_id,
                    target_name=target_node.preferred_name if target_node else target_id,
                    path=links,
                    confidence_score=conf,
                    novelty_score=nov,
                    evidence_score=evi,
                    testability_score=test,
                    composite_score=0.0,
                    supporting_claims=claim_ids,
                    testability_reason=test_reason,
                    metadata={
                        "dataset": dataset_key,
                        "input_modality": feat_meta["modality"],
                        "input_feature": feature_name,
                        "input_level": feat_meta["level"],
                        "input_tool": feat_meta["tool"],
                        "input_region_a": name_a,
                        "input_region_b": name_b,
                        "input_region": f"{name_a} - {name_b}",
                        "outcome_type": self._classify_outcome(target_node),
                    },
                )
                h.explanation = self._generate_explanation(h)
                h.composite_score = self._composite_score(h)
                hypotheses.append(h)

                pair_count += 1
                if pair_count >= max_paths_per_pair:
                    break
    return hypotheses
# ── persistence ────────────────────────────────────────────────────
def save_hypotheses(self, hypotheses: list[Hypothesis], path: str | Path) -> None:
    """Write hypotheses to a UTF-8 JSON file.

    The file holds an object with ``n_hypotheses`` (the count) and
    ``hypotheses`` (each item's ``to_dict()``).  Missing parent directories
    are created.

    Args:
        hypotheses: hypotheses to serialise.
        path: destination file path.
    """
    out_file = Path(path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "n_hypotheses": len(hypotheses),
        "hypotheses": [hyp.to_dict() for hyp in hypotheses],
    }
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    out_file.write_text(serialized, encoding="utf-8")
    logger.info(f"saved {len(hypotheses)} hypotheses to {out_file}")
def load_hypotheses(self, path: str | Path) -> list[Hypothesis]:
    """Read hypotheses back from a JSON file.

    Args:
        path: file whose top-level ``"hypotheses"`` list holds dicts
            accepted by ``Hypothesis.from_dict``.

    Returns:
        The reconstructed hypotheses, in file order.
    """
    src_file = Path(path)
    with src_file.open(encoding="utf-8") as fh:
        payload = json.load(fh)
    loaded = [Hypothesis.from_dict(entry) for entry in payload["hypotheses"]]
    logger.info(f"loaded {len(loaded)} hypotheses from {src_file}")
    return loaded
# ── ranking ────────────────────────────────────────────────────────
def rank_hypotheses(
    self,
    hypotheses: list[Hypothesis],
    weights: Optional[dict[str, float]] = None,
    top_n: int = 100,
    skip_post_process: bool = False,
) -> list[Hypothesis]:
    """Re-score hypotheses with a weighted geometric product and rank them.

    Each hypothesis's ``composite_score`` is overwritten with
    ``confidence**w_c * evidence**w_e * novelty**w_n * max(testability, 0.01)**w_t``.
    The list is then sorted in place by that score, highest first.

    Args:
        hypotheses: hypotheses to rank.
        weights: exponents keyed by ``confidence``, ``evidence``,
            ``novelty``, ``testability``.  Defaults to
            0.20 / 0.20 / 0.25 / 0.35 (testability weighted highest).
        top_n: number of results to return.
        skip_post_process: when False, ``self.post_process`` filters the
            list before scoring.

    Returns:
        The ``top_n`` highest-scoring hypotheses.
    """
    if not skip_post_process:
        hypotheses = self.post_process(hypotheses)

    if weights is None:
        # Testability dominates: a hypothesis must be verifiable with
        # imaging experiments.
        weights = {
            "confidence": 0.20,
            "evidence": 0.20,
            "novelty": 0.25,
            "testability": 0.35,
        }

    def weighted_product(hyp):
        conf_term = hyp.confidence_score ** weights["confidence"]
        evid_term = hyp.evidence_score ** weights["evidence"]
        nov_term = hyp.novelty_score ** weights["novelty"]
        # Floor testability so a zero does not wipe out the whole product.
        test_term = max(hyp.testability_score, 0.01) ** weights["testability"]
        return conf_term * evid_term * nov_term * test_term

    for hyp in hypotheses:
        hyp.composite_score = weighted_product(hyp)

    hypotheses.sort(key=lambda hyp: hyp.composite_score, reverse=True)
    return hypotheses[:top_n]
# ── query-based (kept for interactive use) ─────────────────────────
def find_paths(
    self,
    source_id: str,
    target_id: str,
    max_hops: int = 3,
    max_paths: int = 20,
) -> list[Hypothesis]:
    """Find hypothesis paths between two concepts with evidence enrichment.

    Claim nodes and PATH_IGNORE_NODE_IDS hubs are removed from a copy of
    the graph before the search, so they can never appear as intermediate
    nodes.  The two endpoints are exempt from that removal, so a caller can
    still query them directly.

    Args:
        source_id: start concept id.
        target_id: end concept id.
        max_hops: cutoff passed to ``nx.all_simple_paths``.
        max_paths: maximum number of simple paths to enrich; values below
            zero are treated as zero.

    Returns:
        Hypotheses built by ``_build_hypotheses_from_paths`` with type
        ``"path"``; empty when either endpoint is absent from the graph or
        the search raises ``nx.NetworkXError``.
    """
    from itertools import islice

    if source_id not in self.G or target_id not in self.G:
        return []

    claim_nodes = {nid for nid, n in self._index.items() if "claim" in n.domain_tags}
    intermediate_exclude = claim_nodes - {source_id, target_id}
    # Also strip vague umbrella hubs from the search subgraph so paths
    # never include them as intermediates.
    intermediate_exclude |= (PATH_IGNORE_NODE_IDS - {source_id, target_id})

    subgraph = self.G.copy()
    subgraph.remove_nodes_from(intermediate_exclude)
    if source_id not in subgraph or target_id not in subgraph:
        return []

    try:
        # Pull only the first max_paths paths from the generator: the
        # number of simple paths can grow exponentially, so materialising
        # all of them before truncating is needlessly expensive.
        raw_paths = list(islice(
            nx.all_simple_paths(subgraph, source_id, target_id, cutoff=max_hops),
            max(max_paths, 0),
        ))
    except nx.NetworkXError:
        return []

    return self._build_hypotheses_from_paths(raw_paths, "path")
def bridge_discovery(
    self,
    concept_id: str,
    target_domain: str,
    max_hops: int = 3,
    max_results: int = 20,
) -> list[Hypothesis]:
    """Find cross-domain connections from one concept into a target domain.

    Every non-claim node tagged ``target_domain`` that is reachable from
    ``concept_id`` within ``max_hops`` becomes a candidate; its shortest
    path is enriched and scored.

    Args:
        concept_id: start concept id.
        target_domain: domain tag the targets must carry.
        max_hops: cutoff for the shortest-path search.
        max_results: maximum number of hypotheses returned.

    Returns:
        Up to ``max_results`` ``"bridge"`` hypotheses sorted by descending
        composite score; empty when the concept is not in the graph, no
        node carries the domain, or the search raises ``nx.NetworkXError``.
    """
    if concept_id not in self.G:
        return []

    target_nodes = {
        nid for nid, data in self.G.nodes(data=True)
        if target_domain in data.get("domain_tags", [])
    }
    if not target_nodes:
        return []

    try:
        reachable = nx.single_source_shortest_path(
            self.G, concept_id, cutoff=max_hops
        )
    except nx.NetworkXError:
        return []

    def is_claim(nid: str) -> bool:
        # Nodes missing from the index are treated as non-claims.
        node = self._index.get(nid)
        return node is not None and "claim" in node.domain_tags

    candidates = {
        nid for nid in reachable
        if nid in target_nodes and nid != concept_id and not is_claim(nid)
    }

    # Fall back to the raw id when a node is in the graph but not in the
    # index, matching what _enrich_path does for link names.
    source_node = self._index.get(concept_id)
    source_name = source_node.preferred_name if source_node else concept_id

    hypotheses = []
    for target_id in candidates:
        raw_path = reachable[target_id]
        links = self._enrich_path(raw_path)
        if not links:
            continue

        conf = self._compute_confidence_score(links)
        nov = self._compute_novelty_score(links)
        evi = self._compute_evidence_score(links)
        test, test_reason = self._compute_testability_score(links)
        claim_ids = [l.claim_id for l in links if l.claim_id]

        target_node = self._index.get(target_id)
        h = Hypothesis(
            hypothesis_type="bridge",
            source_id=concept_id,
            source_name=source_name,
            target_id=target_id,
            target_name=target_node.preferred_name if target_node else target_id,
            path=links,
            confidence_score=conf,
            novelty_score=nov,
            evidence_score=evi,
            testability_score=test,
            supporting_claims=claim_ids,
            testability_reason=test_reason,
        )
        h.explanation = self._generate_explanation(h)
        h.composite_score = self._composite_score(h)
        hypotheses.append(h)

    hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
    return hypotheses[:max_results]
def discover_hypotheses(
    self,
    concept_id: str,
    max_hops: int = 3,
    max_results: int = 30,
    exclude_domains: Optional[set[str]] = None,
) -> list[Hypothesis]:
    """Find hypotheses radiating from a single concept to other domains.

    A reachable node becomes a target when it is indexed and, after
    removing the excluded domains, has at least one domain tag that the
    source does not have.

    Args:
        concept_id: start concept id.
        max_hops: cutoff for the shortest-path search.
        max_results: maximum number of hypotheses returned.
        exclude_domains: domain tags ignored when comparing source and
            target domains; defaults to ``{"claim"}``.

    Returns:
        Up to ``max_results`` ``"discover"`` hypotheses, post-processed and
        sorted by descending composite score; empty when the concept is
        not in the graph or the search raises ``nx.NetworkXError``.
    """
    if concept_id not in self.G:
        return []

    exclude = exclude_domains or {"claim"}
    source_node = self._index.get(concept_id)
    source_domains = set(source_node.domain_tags) - exclude if source_node else set()
    # The source may be in the graph without being indexed; use its id as
    # the display name in that case instead of raising KeyError.
    source_name = source_node.preferred_name if source_node else concept_id

    try:
        reachable = nx.single_source_shortest_path(self.G, concept_id, cutoff=max_hops)
    except nx.NetworkXError:
        return []

    hypotheses = []
    for target_id, raw_path in reachable.items():
        if target_id == concept_id:
            continue
        target_node = self._index.get(target_id)
        if not target_node:
            continue
        target_domains = set(target_node.domain_tags) - exclude
        # Skip targets that add no domain beyond the source's own.
        if not target_domains or target_domains <= source_domains:
            continue

        links = self._enrich_path(raw_path)
        if not links:
            continue

        conf = self._compute_confidence_score(links)
        nov = self._compute_novelty_score(links)
        evi = self._compute_evidence_score(links)
        test, test_reason = self._compute_testability_score(links)
        claim_ids = [l.claim_id for l in links if l.claim_id]

        h = Hypothesis(
            hypothesis_type="discover",
            source_id=concept_id,
            source_name=source_name,
            target_id=target_id,
            target_name=target_node.preferred_name,
            path=links,
            confidence_score=conf,
            novelty_score=nov,
            evidence_score=evi,
            testability_score=test,
            supporting_claims=claim_ids,
            testability_reason=test_reason,
        )
        h.explanation = self._generate_explanation(h)
        h.composite_score = self._composite_score(h)
        hypotheses.append(h)

    hypotheses = self.post_process(hypotheses)
    hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
    return hypotheses[:max_results]
def find_trending(
    self,
    since_year: int = 2020,
    min_claims: int = 3,
    direction: str = "strengthening",
    max_results: int = 30,
) -> list[dict]:
    """Find concept pairs whose claim counts trend over publication years.

    Claims are grouped by ``(subject_id, object_id)``.  For each group the
    per-year claim counts from ``since_year`` onward are fitted with
    ``_simple_slope``; the group is kept according to ``direction``:

    * ``"strengthening"``: slope must exceed 0.3;
    * ``"weakening"``: slope must be below -0.3;
    * ``"emerging"``: the latest year must be 2025 or later.

    Args:
        since_year: earliest publication year counted.
        min_claims: minimum number of dated claims required per pair.
        direction: trend filter, see above.
        max_results: maximum number of rows returned.

    Returns:
        Dicts with ``concept_a``, ``concept_b``, ``concept_a_id``,
        ``concept_b_id``, ``year_counts``, ``slope``, ``direction`` and
        ``n_claims``, sorted by descending absolute slope.
    """
    from collections import Counter

    # Bucket claim metadata by its (subject, object) pair.
    grouped: dict[tuple[str, str], list[dict]] = {}
    for node in self._index.values():
        if "claim" not in node.domain_tags:
            continue
        meta = node.metadata
        subject_id = meta.get("subject_id", "")
        object_id = meta.get("object_id", "")
        if subject_id and object_id:
            grouped.setdefault((subject_id, object_id), []).append(meta)

    trends = []
    for (subject_id, object_id), claim_metas in grouped.items():
        reported_years = (m.get("source_paper", {}).get("year") for m in claim_metas)
        recent_years = [yr for yr in reported_years if yr and yr >= since_year]
        if len(recent_years) < min_claims:
            continue

        per_year = Counter(recent_years)
        ordered_years = sorted(per_year)
        counts = [per_year[yr] for yr in ordered_years]
        slope = _simple_slope(ordered_years, counts)

        rejected = (
            (direction == "strengthening" and slope <= 0.3)
            or (direction == "weakening" and slope >= -0.3)
            or (direction == "emerging" and max(ordered_years) < 2025)
        )
        if rejected:
            continue

        subject_node = self._index.get(subject_id)
        object_node = self._index.get(object_id)
        trends.append({
            "concept_a": subject_node.preferred_name if subject_node else subject_id,
            "concept_b": object_node.preferred_name if object_node else object_id,
            "concept_a_id": subject_id,
            "concept_b_id": object_id,
            "year_counts": {str(yr): per_year[yr] for yr in ordered_years},
            "slope": round(slope, 3),
            "direction": direction,
            "n_claims": len(claim_metas),
        })

    trends.sort(key=lambda row: abs(row["slope"]), reverse=True)
    return trends[:max_results]
def contradiction_detection(
    self,
    domain_filter: Optional[str] = None,
    max_results: int = 50,
) -> list[Contradiction]:
    """Find pairs of claims that conflict about the same concept pair.

    Claims are grouped by ``(subject_id, object_id)``; every unordered pair
    within a group is scored by ``self._check_contradiction`` and recorded
    when the severity is positive.

    Args:
        domain_filter: when given, keep only claims whose subject or
            object node carries this domain tag.
        max_results: maximum number of contradictions returned.

    Returns:
        Contradictions sorted by descending severity.
    """
    by_pair: dict[tuple[str, str], list[ConceptNode]] = {}
    for node in self._index.values():
        if "claim" not in node.domain_tags:
            continue
        meta = node.metadata
        sid = meta.get("subject_id", "")
        oid = meta.get("object_id", "")
        if not sid or not oid:
            continue
        if domain_filter:
            endpoint_domains = set()
            for endpoint in (self._index.get(sid), self._index.get(oid)):
                if endpoint:
                    endpoint_domains.update(endpoint.domain_tags)
            if domain_filter not in endpoint_domains:
                continue
        by_pair.setdefault((sid, oid), []).append(node)

    found = []
    for (sid, oid), group in by_pair.items():
        if len(group) < 2:
            continue
        # Compare each claim with every later claim in the same group.
        for pos, first in enumerate(group):
            for second in group[pos + 1:]:
                meta_for, meta_against = first.metadata, second.metadata
                severity = self._check_contradiction(meta_for, meta_against)
                if severity > 0:
                    found.append(Contradiction(
                        concept_a_id=sid,
                        concept_a_name=meta_for.get("subject_name", sid),
                        concept_b_id=oid,
                        concept_b_name=meta_for.get("object_name", oid),
                        claim_for_id=first.id,
                        claim_for_predicate=meta_for.get("predicate", ""),
                        claim_for_text=meta_for.get("raw_text", ""),
                        claim_against_id=second.id,
                        claim_against_predicate=meta_against.get("predicate", ""),
                        claim_against_text=meta_against.get("raw_text", ""),
                        severity=severity,
                    ))

    found.sort(key=lambda item: item.severity, reverse=True)
    return found[:max_results]
def gap_detection(
    self,
    domain_a: str,
    domain_b: Optional[str] = None,
    max_results: int = 50,
) -> list[Gap]:
    """Find concept pairs two hops apart that have no direct edge.

    For every non-claim node in ``domain_a`` the two-hop neighbourhood is
    explored (ignoring edge direction, never stepping through claim
    nodes).  Nodes of ``domain_b`` found there, with no direct edge in
    either direction and a directed shortest path of at most two edges,
    are reported as gaps.

    Args:
        domain_a: domain tag of the first concept.
        domain_b: domain tag of the second concept; defaults to
            ``domain_a``.
        max_results: maximum number of gaps returned.

    Returns:
        Gaps sorted with cross-domain pairs first, then by distance.
    """
    if domain_b is None:
        domain_b = domain_a

    def concept_ids_in(domain):
        # Non-claim graph nodes carrying the given domain tag.
        return {
            nid for nid, data in self.G.nodes(data=True)
            if domain in data.get("domain_tags", [])
            and "claim" not in data.get("domain_tags", [])
        }

    def is_claim(nid):
        return "claim" in self._index.get(nid, ConceptNode(id="", preferred_name="")).domain_tags

    nodes_a = concept_ids_in(domain_a)
    nodes_b = concept_ids_in(domain_b)

    gaps = []
    seen = set()
    for a_id in nodes_a:
        if a_id not in self.G:
            continue
        hop1 = set(self.G.successors(a_id)) | set(self.G.predecessors(a_id))
        hop2 = set()
        for n1 in hop1:
            if is_claim(n1):
                continue
            hop2.update(self.G.successors(n1))
            hop2.update(self.G.predecessors(n1))
        hop2 -= {a_id}
        hop2 -= hop1

        for b_id in hop2 & nodes_b:
            # Each unordered pair is examined once.
            pair = tuple(sorted([a_id, b_id]))
            if pair in seen:
                continue
            seen.add(pair)

            directly_linked = self.G.has_edge(a_id, b_id) or self.G.has_edge(b_id, a_id)
            if directly_linked:
                continue

            try:
                route = nx.shortest_path(self.G, a_id, b_id)
            except (nx.NetworkXNoPath, nx.NetworkXError):
                continue
            if len(route) > 3:
                continue

            bridging = [mid for mid in route[1:-1] if not is_claim(mid)]
            a_node = self._index.get(a_id)
            b_node = self._index.get(b_id)
            gaps.append(Gap(
                concept_a_id=a_id,
                concept_a_name=a_node.preferred_name if a_node else a_id,
                concept_b_id=b_id,
                concept_b_name=b_node.preferred_name if b_node else b_id,
                distance=len(route) - 1,
                connecting_concepts=bridging,
                domain_a=domain_a,
                domain_b=domain_b,
                potential_relation=self._infer_relation(route),
            ))

    gaps.sort(key=lambda g: (0 if g.domain_a != g.domain_b else 1, g.distance))
    return gaps[:max_results]
# ── name resolution ────────────────────────────────────────────────
def resolve_name(self, query: str) -> Optional[str]:
    """Resolve a name to a concept ID. Returns None if not found.

    Matching is attempted in decreasing order of strictness, returning the
    first hit:

    1. exact ``preferred_name`` match;
    2. case-insensitive ``preferred_name`` match;
    3. case-insensitive alias match;
    4. substring overlap (either direction) between the query and a
       preferred name or alias; among several such nodes the one with the
       shortest preferred name wins.

    Empty names and aliases are ignored in step 4: an empty string is a
    substring of every query and would otherwise match anything and then
    win the shortest-name tie-break.

    Args:
        query: name to look up.

    Returns:
        The matching node id, or None when ``query`` is empty or nothing
        matches.
    """
    if not query:
        return None

    nodes = list(self._index.values())

    for node in nodes:
        if node.preferred_name == query:
            return node.id

    query_lower = query.lower()
    for node in nodes:
        if node.preferred_name.lower() == query_lower:
            return node.id

    for node in nodes:
        if any(alias.lower() == query_lower for alias in node.aliases):
            return node.id

    def overlaps(text: str) -> bool:
        text_lower = text.lower()
        if not text_lower:
            return False
        return query_lower in text_lower or text_lower in query_lower

    candidates = [
        node for node in nodes
        if overlaps(node.preferred_name) or any(overlaps(alias) for alias in node.aliases)
    ]
    if not candidates:
        return None
    # min() returns the first minimal element, i.e. the same node a stable
    # sort by name length would put first.
    return min(candidates, key=lambda n: len(n.preferred_name)).id
# ── internal helpers ───────────────────────────────────────────────
def _sample_domain_nodes(self, domain: str, max_n: int) -> list[str]:
"""Sample up to max_n non-claim nodes from a domain, preferring nodes with edges."""
nodes = [
nid for nid, data in self.G.nodes(data=True)
if domain in data.get("domain_tags", [])
and "claim" not in data.get("domain_tags", [])
and nid not in PATH_IGNORE_NODE_IDS
]
# sort by degree (more connected = more useful as seed)
nodes.sort(key=lambda n: self.G.degree(n), reverse=True)
return nodes[:max_n]
def _build_hypotheses_from_paths(
    self, raw_paths: list[list[str]], hyp_type: str
) -> list[Hypothesis]:
    """Build Hypothesis objects from raw node-ID paths.

    Each path is enriched into links, scored (confidence, novelty,
    evidence, testability, composite) and given an explanation.  Paths
    that enrich to no links are dropped.

    Args:
        raw_paths: node-id sequences, first id = source, last id = target.
        hyp_type: value stored as ``hypothesis_type``.

    Returns:
        Hypotheses sorted by descending composite score.
    """
    hypotheses = []
    for raw_path in raw_paths:
        links = self._enrich_path(raw_path)
        if not links:
            continue

        conf = self._compute_confidence_score(links)
        nov = self._compute_novelty_score(links)
        evi = self._compute_evidence_score(links)
        test, test_reason = self._compute_testability_score(links)
        claim_ids = [l.claim_id for l in links if l.claim_id]

        # Endpoints can be in the graph without being indexed; fall back to
        # the raw id (as _enrich_path does for link names) instead of
        # raising KeyError.
        source_id, target_id = raw_path[0], raw_path[-1]
        source_node = self._index.get(source_id)
        target_node = self._index.get(target_id)

        h = Hypothesis(
            hypothesis_type=hyp_type,
            source_id=source_id,
            source_name=source_node.preferred_name if source_node else source_id,
            target_id=target_id,
            target_name=target_node.preferred_name if target_node else target_id,
            path=links,
            confidence_score=conf,
            novelty_score=nov,
            evidence_score=evi,
            testability_score=test,
            supporting_claims=claim_ids,
            testability_reason=test_reason,
        )
        h.explanation = self._generate_explanation(h)
        h.composite_score = self._composite_score(h)
        hypotheses.append(h)

    hypotheses.sort(key=lambda h: h.composite_score, reverse=True)
    return hypotheses
def _enrich_path(self, raw_path: list[str]) -> list[HypothesisLink]:
"""Convert a raw node-ID path into rich HypothesisLink objects."""
links = []
for i in range(len(raw_path) - 1):
src_id, tgt_id = raw_path[i], raw_path[i + 1]
if not self.G.has_edge(src_id, tgt_id):
continue
edge_data = self.G.edges[src_id, tgt_id]
src_node = self._index.get(src_id)
tgt_node = self._index.get(tgt_id)
claim_id = edge_data.get("metadata", {}).get("claim_id", "")
claim_node = self._index.get(claim_id) if claim_id else None
evidence = {}
paper = {}
raw_text = ""
if claim_node and claim_node.metadata:
meta = claim_node.metadata
evidence = meta.get("evidence", {})
paper = meta.get("source_paper", {})
raw_text = meta.get("raw_text", "")
links.append(HypothesisLink(
from_id=src_id,
from_name=src_node.preferred_name if src_node else src_id,
to_id=tgt_id,
to_name=tgt_node.preferred_name if tgt_node else tgt_id,
relation_type=edge_data.get("relation_type", "unknown"),
confidence=edge_data.get("confidence", 0.5),
claim_id=claim_id,
raw_text=raw_text,
evidence=evidence,
source_paper=paper,
))
return links
# ── scoring ────────────────────────────────────────────────────────
def compute_frequency_boost(self, claim_meta: dict) -> float:
    """Return a replication multiplier from the number of primary studies.

    The count comes from ``claim_meta["primary_supporting_papers"]`` when
    that key holds a list.  Otherwise it is rebuilt from
    ``self._claims_by_triple`` for the claim's
    ``(subject_id, predicate, object_id)``, counting distinct non-empty
    PMIDs of claims whose study type is not in ``_REVIEW_TYPES``.

    Args:
        claim_meta: claim metadata dict.

    Returns:
        1.2 for three or more primary papers, 1.0 for one or two, 0.5 for
        none.
    """
    def tier(n_primary: int) -> float:
        if n_primary >= 3:
            return 1.2
        if n_primary >= 1:
            return 1.0
        return 0.5

    # Fast path: the claim already carries its primary-paper list.
    merged = claim_meta.get("primary_supporting_papers")
    if isinstance(merged, list):
        return tier(len(merged))

    # Fallback: scan every claim sharing the same triple, skipping reviews.
    triple = (
        claim_meta.get("subject_id", ""),
        claim_meta.get("predicate", ""),
        claim_meta.get("object_id", ""),
    )
    primary_pmids = set()
    for claim in self._claims_by_triple.get(triple, []):
        if claim.get("evidence", {}).get("study_type", "") in _REVIEW_TYPES:
            continue
        pmid = claim.get("source_paper", {}).get("pmid", "")
        if pmid:
            primary_pmids.add(pmid)
    return tier(len(primary_pmids))
@staticmethod
def compute_temporal_decay(claim_meta: dict, reference_year: int = 2026) -> float:
    """Temporal decay: newer primary studies get higher weight.

    Args:
        claim_meta: claim metadata; ``evidence.study_type`` and
            ``source_paper.year`` are read.
        reference_year: year against which the paper's age is measured.

    Returns:
        * 1.0 for study types in ``_REVIEW_TYPES`` (no time effect);
        * 0.85 when the year is missing or zero;
        * otherwise ``1.0 - 0.03 * age`` clamped to ``[0.7, 1.0]``.  The
          upper clamp keeps a paper dated after ``reference_year`` from
          turning the decay into a boost above 1.0.
    """
    st = claim_meta.get("evidence", {}).get("study_type", "")
    if st in _REVIEW_TYPES:
        return 1.0
    year = claim_meta.get("source_paper", {}).get("year", 0)
    if not year:
        return 0.85  # unknown year, neutral
    age = reference_year - year
    return min(1.0, max(0.7, 1.0 - 0.03 * age))
def _compute_confidence_score(self, path: list[HypothesisLink]) -> float:
"""Confidence = geometric mean of per-link scores, with weak-link penalty.
Per-link score = edge.confidence Γ freq_boost Γ temporal_decay
(edge.confidence already includes study_type weighting from
phase4_optimize.apply_evidence_weighting and the claim-level
statistical quality signals from claim_extractor._estimate_confidence)
Aggregate: geometric mean (one weak link crushes the path)
+ weakest-link penalty (Γ0.7 when min_edge < 0.1)
Single source of truth for each multiplier:
- study_type β phase4_optimize.WEIGHT_MAP (canonical, idempotent)
- p_value/sample_size/replicability β claim_extractor._estimate_confidence
- freq across primary PMIDs β compute_frequency_boost
- publication recency β compute_temporal_decay
"""
if not path:
return 0.0
import math
scores = []
min_conf = float("inf")
for link in path:
raw = max(link.confidence, 1e-3) # tiny floor for log()
min_conf = min(min_conf, raw)
full_meta = {
"evidence": link.evidence,
"source_paper": link.source_paper,
"subject_id": link.from_id,
"predicate": link.relation_type,
"object_id": link.to_id,
}
freq_boost = self.compute_frequency_boost(full_meta)
temp_decay = self.compute_temporal_decay(full_meta)
s = raw * freq_boost * temp_decay
scores.append(min(s, 1.0))
log_sum = sum(math.log(max(s, 1e-6)) for s in scores)
gm = math.exp(log_sum / len(scores))
if min_conf < 0.1:
gm *= 0.7
return max(min(gm, 1.0), 0.0)
def _compute_novelty_score(self, path: list[HypothesisLink]) -> float:
"""Score how novel/surprising a hypothesis is.
Lower = more expected (direct known relationship), Higher = more surprising.
"""
score = 0.3 # base
# hop bonus: longer paths = more novel connections
score += 0.1 * min(len(path) - 1, 3)
# cross-domain bonus: connecting different domains is more novel
domains_seen = set()
for link in path:
src = self._index.get(link.from_id)
tgt = self._index.get(link.to_id)
if src:
domains_seen.update(src.domain_tags)
if tgt:
domains_seen.update(tgt.domain_tags)
domains_seen.discard("claim")
n_domains = len(domains_seen)
if n_domains >= 3:
score += 0.15
elif n_domains >= 2:
score += 0.10
# rare relation bonus: non-generic relations are more novel
rare_count = sum(1 for l in path if l.relation_type not in COMMON_RELATIONS)
score += 0.05 * min(rare_count, 3)
# evidence diversity: more papers = better supported, less novel
# fewer papers = more speculative, more novel
pmids = {l.source_paper.get("pmid", "") for l in path if l.source_paper.get("pmid")}
if len(pmids) == 0:
score += 0.10 # no paper support = speculative but novel
elif len(pmids) == 1:
score += 0.05 # single source = weak replication
return min(score, 1.0)
def _compute_evidence_score(self, path: list[HypothesisLink]) -> float:
"""Score evidence quality: traceability and text availability.
DOES NOT use p_value/sample_size/effect_size β those signals already
flow into edge.confidence via claim_extractor._estimate_confidence
and are aggregated by _compute_confidence_score. Counting them again
here was double-dipping.
This score asks a different question: "How well-anchored is the
evidence in source documents?" β which complements confidence's
"How statistically strong is the evidence?". Path-level: most
well-extracted edges score 0.6-0.8; we reserve >0.9 for paths whose
every step has rich provenance.
"""
_REVIEW_TYPES = {"narrative_review", "review"}
scores = []
for link in path:
study_type = (link.evidence.get("study_type") or "").lower()
s = 0.2 if study_type in _REVIEW_TYPES else 0.3
if link.raw_text and len(link.raw_text) > 20:
s += 0.20
if link.claim_id:
s += 0.15
if link.source_paper.get("pmid"):
s += 0.15
if link.evidence.get("study_type"):
s += 0.10
scores.append(min(s, 1.0))
return self._geometric_mean(scores)
def _compute_testability_score(self, path: list[HypothesisLink]) -> tuple[float, str]:
"""Score how testable a hypothesis is with NeuroClaw imaging experiments.
Boosts for:
- Brain region features directly measurable from sMRI (volume, thickness)
- Connectivity features (functional/structural) for GNN models
- Modalities available in UKB/ADNI/HCP-YA
- Deep learning model compatibility (BrainGNN, NeuroStorm)
- Target diseases present in datasets (AD, PD, depression, etc.)
Returns (score, reason_string).
"""
all_text = " ".join(
l.raw_text + " " + l.from_name + " " + l.to_name + " " + l.relation_type
for l in path
).lower()
# check which modalities are mentioned
matched_modalities = []
for modality, keywords in TESTABLE_MODALITIES.items():
for kw in keywords:
if kw.lower() in all_text:
matched_modalities.append(modality)
break
if not matched_modalities:
return 0.15, "no imaging modality detected"
score = 0.25 # base for having a modality
# modality bonus (more = more testable angles)
score += 0.10 * min(len(matched_modalities), 3)
# heavy bonus for sMRI features (volume/thickness β directly measurable in all 3 datasets)
if "sMRI" in matched_modalities:
score += 0.15
# heavy bonus for connectivity features (input to BrainGNN/GNN models)
if "dMRI" in matched_modalities or "fMRI" in matched_modalities:
score += 0.15
# bonus for PET (available in ADNI, key for AD research)
if "PET" in matched_modalities:
score += 0.10
# bonus for brain region specificity (testable with atlas parcellation)
brain_region_keywords = ["cortex", "hippocampus", "amygdala", "thalamus",
"cerebellum", "striatum", "insula", "gyrus",
"caudate", "putamen", "pallidum", "accumbens",
"precuneus", "cuneus", "lingual", "fusiform",
"parahippocampal", "entorhinal", "parietal",
"frontal", "temporal", "occipital"]
regions_found = [kw for kw in brain_region_keywords if kw in all_text]
if regions_found:
score += 0.10 # atlas-based ROI analysis
if len(regions_found) >= 2:
score += 0.05 # pair of regions = connectivity hypothesis
# bonus for diseases present in target datasets
dataset_diseases = [
"alzheimer", "parkinson", "depression", "schizophrenia", "adhd",
"autism", "epilepsy", "multiple sclerosis", "anxiety", "bipolar",
"dementia", "mci", "mild cognitive",
]
if any(d in all_text for d in dataset_diseases):
score += 0.05
# bonus for DL-model-compatible features (graph structure, ROI, connectivity matrix)
if any(kw.lower() in all_text for kw in DL_MODEL_KEYWORDS):
score += 0.05
# build reason string
modalities_str = ", ".join(matched_modalities)
reason = f"modalities: {modalities_str}"
if regions_found:
reason += f" | brain regions: {', '.join(regions_found[:4])}"
if any(d in all_text for d in dataset_diseases):
matched_diseases = [d for d in dataset_diseases if d in all_text]
reason += f" | diseases: {', '.join(matched_diseases[:3])}"
return min(score, 1.0), reason
def _composite_score(self, h: Hypothesis) -> float:
"""Weighted geometric mean of the 4 score components.
Geometric: a hypothesis is only as good as its weakest dimension.
A path with great evidence but 0 testability is worthless to us.
Matches the linear fitness in evolution_engine._score_fitness
(same weights, different aggregation β fitness adds convergence /
diversity / length modifiers not relevant here).
"""
c = max(h.confidence_score, 0.01)
e = max(h.evidence_score, 0.01)
n = max(h.novelty_score, 0.01)
t = max(h.testability_score, 0.01)
score = (c ** 0.20) * (e ** 0.20) * (n ** 0.25) * (t ** 0.35)
if self._has_only_review_evidence(h):
score *= 0.7
return score
@staticmethod
def _has_only_review_evidence(h: Hypothesis) -> bool:
"""True if every link in the path comes from a review/narrative_review."""
_REVIEW_TYPES = {"narrative_review", "review"}
if not h.path:
return False
for link in h.path:
study_type = (link.evidence.get("study_type") or "").lower()
if study_type and study_type not in _REVIEW_TYPES:
return False
return True
def _check_contradiction(self, m1: dict, m2: dict) -> float:
"""Check if two claims contradict each other. Returns severity 0-1."""
p1 = m1.get("predicate", "")
p2 = m2.get("predicate", "")
n1 = m1.get("negated", False)
n2 = m2.get("negated", False)
if p1 == p2 and n1 != n2:
return 1.0
if (p1, p2) in OPPOSING_PREDICATES:
return 0.8
if p1 == p2 and not n1 and not n2:
d1 = m1.get("evidence", {}).get("direction", "")
d2 = m2.get("evidence", {}).get("direction", "")
if d1 and d2 and d1 != d2:
return 0.6
return 0.0
def _infer_relation(self, path: list[str]) -> str:
"""Infer a potential relation from a path's edge types."""
relations = []
for i in range(len(path) - 1):
if self.G.has_edge(path[i], path[i + 1]):
rt = self.G.edges[path[i], path[i + 1]].get("relation_type", "")
if rt and rt not in ("about", "is_a", "part_of"):
relations.append(rt)
if relations:
for r in relations:
if r not in COMMON_RELATIONS:
return r
return relations[0]
return "associated_with"
def _generate_explanation(self, h: Hypothesis) -> str:
"""Generate a human-readable explanation for a hypothesis."""
path_str = " --> ".join(
f"{l.from_name} --[{l.relation_type}]--> {l.to_name}" for l in h.path
)
if not path_str:
return ""
pmids = {l.source_paper.get("pmid", "") for l in h.path if l.source_paper.get("pmid")}
key_finding = ""
for l in h.path:
if l.raw_text:
key_finding = l.raw_text[:150]
if len(l.raw_text) > 150:
key_finding += "..."
break
lines = [
f"Hypothesis: {h.source_name} may relate to {h.target_name} via {len(h.path)}-hop path.",
f"Path: {path_str}",
f"Evidence: {len(h.supporting_claims)} claims from {len(pmids)} papers",
]
if key_finding:
lines.append(f"Key finding: '{key_finding}'")
if h.testability_reason:
lines.append(f"Testability: {h.testability_reason}")
lines.append(
f"Confidence: {h.confidence_score:.2f} | "
f"Novelty: {h.novelty_score:.2f} | "
f"Evidence: {h.evidence_score:.2f} | "
f"Testability: {h.testability_score:.2f}"
)
return "\n".join(lines)
@staticmethod
def _geometric_mean(values: list[float]) -> float:
if not values:
return 0.0
product = math.prod(values)
return product ** (1.0 / len(values))
def _simple_slope(xs: list[int], ys: list[int]) -> float:
"""Simple linear regression slope without numpy."""
n = len(xs)
if n < 2:
return 0.0
mean_x = sum(xs) / n
mean_y = sum(ys) / n
num = sum((x - mean_x) * (y - mean_y) for x, y in zip(xs, ys))
den = sum((x - mean_x) ** 2 for x in xs)
if den == 0:
return 0.0
return num / den
|