technician1 committed on
Commit
5d3c45e
·
1 Parent(s): 2099485

Upload 12 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ books_large_p1.txt filter=lfs diff=lfs merge=lfs -text
37
+ books_large_p2.txt filter=lfs diff=lfs merge=lfs -text
38
+ ChatIPC.exe filter=lfs diff=lfs merge=lfs -text
39
+ dictionary.cpp filter=lfs diff=lfs merge=lfs -text
40
+ Implicational[[:space:]]propositional[[:space:]]calculus[[:space:]]-[[:space:]]Wikipedia.pdf filter=lfs diff=lfs merge=lfs -text
ChatIPC.cbp ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
2
+ <CodeBlocks_project_file>
3
+ <FileVersion major="1" minor="6" />
4
+ <Project>
5
+ <Option title="ChatIPC" />
6
+ <Option pch_mode="2" />
7
+ <Option compiler="gcc" />
8
+ <Build>
9
+ <Target title="Debug">
10
+ <Option output="bin/Debug/ChatIPC" prefix_auto="1" extension_auto="1" />
11
+ <Option object_output="obj/Debug/" />
12
+ <Option type="1" />
13
+ <Option compiler="gcc" />
14
+ <Compiler>
15
+ <Add option="-g" />
16
+ </Compiler>
17
+ </Target>
18
+ <Target title="Release">
19
+ <Option output="bin/Release/ChatIPC" prefix_auto="1" extension_auto="1" />
20
+ <Option object_output="obj/Release/" />
21
+ <Option type="1" />
22
+ <Option compiler="gcc" />
23
+ <Compiler>
24
+ <Add option="-O2" />
25
+ </Compiler>
26
+ <Linker>
27
+ <Add option="-s" />
28
+ </Linker>
29
+ </Target>
30
+ </Build>
31
+ <Compiler>
32
+ <Add option="-Wall" />
33
+ <Add option="-fexceptions" />
34
+ </Compiler>
35
+ <Unit filename="ChatIPC.cbp" />
36
+ <Unit filename="ChatIPC.cpp" />
37
+ <Unit filename="Implicational propositional calculus - Wikipedia.pdf" />
38
+ <Unit filename="dictionary.cpp" />
39
+ <Extensions>
40
+ <lib_finder disable_auto="1" />
41
+ </Extensions>
42
+ </Project>
43
+ </CodeBlocks_project_file>
ChatIPC.cpp ADDED
@@ -0,0 +1,1862 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // ChatIPC.cpp
2
+ // IPC is an abbreviation for Implicational Propositional Calculus.
3
+ // C++17 — standard library only (optional OpenMP parallelization).
4
+ // Chat mode: the program incrementally incorporates user inputs and the
5
+ // program's own responses into the implication graph and uses fast hashmaps
6
+ // + optional OpenMP to parallelize sentence processing. A small synthesis
7
+ // engine assembles responses from inferred implication chains (no hard-coded
8
+ // templates beyond minimal connective phrasing).
9
+
10
+ #include <iostream>
11
+ #include <fstream>
12
+ #include <sstream>
13
+ #include <string>
14
+ #include <vector>
15
+ #include <regex>
16
+ #include <unordered_set>
17
+ #include <unordered_map>
18
+ #include <set>
19
+ #include <queue>
20
+ #include <tuple>
21
+ #include <algorithm>
22
+ #include <cctype>
23
+ #include <locale>
24
+ #include <iomanip>
25
+ #include <functional>
26
+ #include <mutex>
27
+ #include <thread>
28
+ #include <atomic>
29
+ #include <chrono>
30
+ #include <utility>
31
+ #include <deque>
32
+
33
+ #ifdef _OPENMP
34
+ #include <omp.h>
35
+ #endif
36
+
37
+ using std::string;
38
+ using std::vector;
39
+ using std::smatch;
40
+ using std::regex;
41
+ using std::unordered_set;
42
+ using std::unordered_map;
43
+ using std::set;
44
+ using std::queue;
45
+ using std::tuple;
46
+ using std::get;
47
+ using std::size_t;
48
+ using std::pair;
49
+
50
+ // Debug control: set by command-line flag --debug or environment variable IMPL_DEBUG=1
51
+ static bool GLOBAL_DEBUG = false;
52
+ static int GLOBAL_THREADS = 0; // 0 means auto (use omp_get_max_threads() or hardware_concurrency)
53
+
54
+ #define DBG(msg) do { if (GLOBAL_DEBUG) std::cerr << "[DBG] " << __FILE__ << ":" << __LINE__ << " " << msg << std::endl; } while(0)
55
+ #define DBG_LINE() do { if (GLOBAL_DEBUG) std::cerr << "[DBG] " << __FILE__ << ":" << __LINE__ << std::endl; } while(0)
56
+
57
+ /* ----------------------------- Basic text utils ---------------------------- */
58
+
59
+ static inline string trim(const string &s) {
60
+ DBG_LINE();
61
+ size_t a = 0;
62
+ while (a < s.size() && std::isspace((unsigned char)s[a])) ++a;
63
+ size_t b = s.size();
64
+ while (b > a && std::isspace((unsigned char)s[b-1])) --b;
65
+ string r = s.substr(a, b - a);
66
+ DBG("trim -> '" << r << "'");
67
+ return r;
68
+ }
69
+ static inline string normalize_spaces(const string &s) {
70
+ DBG_LINE();
71
+ string out; out.reserve(s.size());
72
+ bool last_space = false;
73
+ for (unsigned char c : s) {
74
+ if (std::isspace(c)) {
75
+ if (!last_space) { out.push_back(' '); last_space = true; }
76
+ } else { out.push_back(c); last_space = false; }
77
+ }
78
+ string r = trim(out);
79
+ DBG("normalize_spaces -> '" << r << "'");
80
+ return r;
81
+ }
82
+ static inline string lower_copy(const string &s) {
83
+ DBG_LINE();
84
+ std::locale loc;
85
+ string r = s;
86
+ for (char &c : r) c = std::tolower((unsigned char)c);
87
+ DBG("lower_copy -> '" << r << "'");
88
+ return r;
89
+ }
90
+
91
+ /* split a phrase of antecedents joined by "and" or commas (conservative) */
92
+ static vector<string> split_antecedents(const string &s) {
93
+ DBG_LINE();
94
+ vector<string> out;
95
+ std::regex comma_re(R"(\s*,\s*)");
96
+ std::sregex_token_iterator it(s.begin(), s.end(), comma_re, -1), end;
97
+ for (; it != end; ++it) {
98
+ string part = trim(*it);
99
+ std::regex and_re(R"(\b(?:and|&|∧)\b)");
100
+ std::sregex_token_iterator it2(part.begin(), part.end(), and_re, -1), end2;
101
+ for (; it2 != end2; ++it2) {
102
+ string p2 = trim(*it2);
103
+ if (!p2.empty()) out.push_back(p2);
104
+ }
105
+ }
106
+ if (out.empty()) {
107
+ string t = trim(s);
108
+ if (!t.empty()) out.push_back(t);
109
+ }
110
+ DBG("split_antecedents on '" << s << "' -> " << out.size() << " parts");
111
+ return out;
112
+ }
113
+ static inline string node_norm(const string &x) {
114
+ DBG_LINE();
115
+ string r = normalize_spaces(trim(x));
116
+ DBG("node_norm -> '" << r << "'");
117
+ return r;
118
+ }
119
+
120
/* Edge type & helpers */
/* One directed implication A -> B, either extracted from a sentence or derived
 * from other edges. Remains an aggregate, so `Edge{a, b, form, line, sentence}`
 * initialization keeps working. */
struct Edge {
    string A;         // source (antecedent) node label
    string B;         // target (consequent) node label
    string form;      // description of matched pattern
    size_t line = 0;  // approximate line number; 0 when unknown (derived edges in this file pass 0)
    string sentence;  // sentence snippet
};
128
+ static inline string key_of_edge(const Edge &e) {
129
+ DBG_LINE();
130
+ string k = e.form + "||" + e.A + "||" + e.B + "||" + e.sentence;
131
+ DBG("key_of_edge -> '" << k << "'");
132
+ return k;
133
+ }
134
+ static size_t line_of_offset(const string &text, size_t offset) {
135
+ DBG_LINE();
136
+ if (offset > text.size()) offset = text.size();
137
+ size_t ln = 1;
138
+ for (size_t i = 0; i < offset; ++i) if (text[i] == '\n') ++ln;
139
+ DBG("line_of_offset -> " << ln);
140
+ return ln;
141
+ }
142
+
143
+ /* ------------------------------ Patterns holder --------------------------- */
144
+
145
+ struct Patterns {
146
+ // all regex objects from the original code
147
+ regex sym_re, sequent_re, lex_re, passive_re, ifthen_re, given_re, whenever_re, therefore_re, from_we_re;
148
+ regex follows_from_re, onlyif_re, onlywhen_re, unless_re, iff_re, suff_re, neces_re, nec_suf_re;
149
+ regex means_re, equiv_re, every_re, in_case_re, without_re, must_re, cannotboth_re, prevents_re, contradicts_re;
150
+ regex exceptwhen_re, either_re, aslongas_re, ifandwhen_re, insofar_re, necessitates_re, guarantees_re, requires_re;
151
+ regex impossible_if_re, prereq_re, no_re, causes_re, because_re, due_to_re, defined_re, exactlywhen_re, provided_re;
152
+ regex ifnot_re, definition_syn_re, otherwise_re, or_else_re, implies_nc_re, suff_notnec_re, nec_notsuff_re, neither_re;
153
+ regex barring_re, in_absence_re, conditional_on_re, subject_to_re, dependent_on_re, before_re, after_re, correlates_re;
154
+ regex probable_re, adverb_qual_re, not_converse_variants_re;
155
+
156
+ // new advanced/defeasible/counterfactual/statistical patterns
157
+ regex counterfactual_re; // "If it were the case that X, then Y"
158
+ regex subjunctive_re; // "Were X to happen, Y would ..."
159
+ regex defeasible_re; // "generally / normally / typically X implies Y"
160
+ regex default_re; // "X by default, then Y"
161
+ regex increases_prob_re; // "X increases the probability of Y"
162
+
163
+ // new: variable declaration pattern (e.g. "G and H are variables", "X is a variable")
164
+ regex variable_decl_re;
165
+ };
166
+
167
+ static Patterns make_patterns() {
168
+ DBG_LINE();
169
+ const auto IC = std::regex_constants::icase;
170
+ Patterns p{
171
+ // Make sure the order of regex initializers in make_patterns() matches the order of fields in the Patterns struct exactly;
172
+ // otherwise the aggregate initialization will mis-assign regexes.
173
+
174
+ // core
175
+ regex(R"(([^.!?;\n]{1,400}?)\s*(->|=>|⇒|→|⟹|⊢|⊨|<->|<=>|↔)\s*([^.!?;\n]{1,400}?)(?:[.!?;\n]|$))", IC),
176
+ regex(R"(([^⊢⊨\n]{1,300}?)\s*(?:⊢|⊨)\s*([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
177
+ regex(R"(([^.!?;\n]{1,350}?)\b(?:implies|implied|entails|yields|results\s+in|gives|produces|follows|causes|leads\s+to|prevents|precludes)\b(?:\s+(?:that|from))?\s*([^.!?;\n]{1,350}?)(?:[.!?;\n]|$))", IC),
178
+ regex(R"(([^.!?;\n]{1,350}?)\s+\b(?:is\s+implied\s+by|follows\s+from|is\s+derived\s+from|is\s+entailed\s+by|is\s+caused\s+by|is\s+due\s+to|is\s+the\s+result\s+of)\b\s+([^.!?;\n]{1,350}?)(?:[.!?;\n]|$))", IC),
179
+ regex(R"(\bif\s+(.{1,350}?)\s+(?:then\s+)?(.{1,350}?)(?:[.!?;\n]|$))", IC),
180
+ regex(R"(\b(?:given|assuming|provided|assuming\s+that|provided\s+that)\s+(?:that\s+)?(.{1,300}?)\s*,\s*([^.!?;\n]{1,350}?)(?:[.!?;\n]|$))", IC),
181
+ regex(R"(\bwhenever\s+(.{1,300}?)\s*,?\s*(?:then\s+)?([^.!?;\n]{1,350}?)(?:[.!?;\n]|$))", IC),
182
+ regex(R"(([^.!?;\n]{1,350}?)\s*(?:therefore|hence|thus|consequently|so|as\s+a\s+result)\s+([^.!?;\n]{1,350}?)(?:[.!?;\n]|$))", IC),
183
+ regex(R"(\bfrom\s+([^.!?;\n]{1,350}?)\s+(?:we|one|it)\s+(?:conclude|deduce|derive|obtain|get)\s+(?:that\s*)?([^.!?;\n]{1,350}?)(?:[.!?;\n]|$))", IC),
184
+
185
+ // more
186
+ regex(R"(([^.!?;\n]{1,350}?)\s+(?:follows\s+from|is\s+implied\s+by|is\s+derived\s+from)\s+([^.!?;\n]{1,350}?)(?:[.!?;\n]|$))", IC),
187
+ regex(R"(([^.!?;\n]{1,250}?)\s+only\s+if\s+([^.!?;\n]{1,250}?)(?:[.!?;\n]|$))", IC),
188
+ regex(R"(([^.!?;\n]{1,250}?)\s+only\s+when\s+([^.!?;\n]{1,250}?)(?:[.!?;\n]|$))", IC),
189
+ regex(R"(([^.!?;\n]{1,250}?)\s+unless\s+([^.!?;\n]{1,250}?)(?:[.!?;\n]|$))", IC),
190
+ regex(R"(([^.!?;\n]{1,300}?)\s+(?:if\s+and\s+only\s+if|iff|exactly\s+when|exactly\s+if)\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
191
+ regex(R"(([^.!?;\n]{1,300}?)\s+(?:is\s+)?(?:sufficient\s+for|suffices\s+for)\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
192
+ regex(R"(([^.!?;\n]{1,300}?)\s+(?:is\s+)?(?:necessary\s+for)\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
193
+ regex(R"(([^.!?;\n]{1,300}?)\s+(?:is\s+)?(?:necessary\s+and\s+sufficient|sufficient\s+and\s+necessary)\s+for\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
194
+
195
+ // extended
196
+ regex(R"(([^.!?;\n]{1,300}?)\s+(?:means\s+that|means|denotes|signifies|constitutes)\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
197
+ regex(R"(([^.!?;\n]{1,300}?)\s+(?:is\s+equivalent\s+to|equivalent\s+to|is\s+the\s+same\s+as)\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
198
+ regex(R"(\b(?:every|each|all|any)\s+([^.!?;\n]{1,120}?)\s+(?:is|are|must\s+be|is\s+necessarily)\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
199
+ regex(R"(\bin\s+case\s+(.{1,200}?)\s*,\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
200
+ regex(R"(\bwithout\s+(.{1,160}?)\s*,\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
201
+ regex(R"(([^.!?;\n]{1,200}?)\s+must\s+(?:be\s+)?(?:([^.!?;\n]{1,200}?))(?:[.!?;\n]|$))", IC),
202
+ regex(R"(([^.!?;\n]{1,160}?)\s+(?:cannot\s+both|are\s+mutually\s+exclusive|mutually\s+exclusive|cannot\s+both\s+be)\s+([^.!?;\n]{1,160}?)(?:[.!?;\n]|$))", IC),
203
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:prevents|preclude|precludes)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
204
+
205
+ // continued
206
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:contradicts|is\s+incompatible\s+with|conflicts\s+with)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
207
+ regex(R"(([^.!?;\n]{1,220}?)\s+except\s+when\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
208
+ regex(R"(\beither\s+(.{1,160}?)\s+or\s+(.{1,160}?)(?:\s*,?\s*(but\s+not\s+both))?(?:[.!?;\n]|$))", IC),
209
+ regex(R"(\bas\s+long\s+as\s+(.{1,200}?)\s*,?\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
210
+ regex(R"(\bif\s+and\s+when\s+(.{1,200}?)\s*,?\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
211
+ regex(R"(\binsofar\s+as\s+(.{1,200}?)\s*,?\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
212
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:necessitates|necessitate)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
213
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:guarantees|ensures)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
214
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:requires|needs|is\s+required\s+for)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
215
+
216
+ // rest
217
+ regex(R"(([^.!?;\n]{1,220}?)\s+is\s+impossible\s+if\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
218
+ regex(R"(([^.!?;\n]{1,200}?)\s+(?:is\s+a\s+)?prerequisite\s+for\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
219
+ regex(R"(\bno\s+([^.!?;\n]{1,120}?)\s+(?:are|are\s+ever|is|can|will|be)\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
220
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:causes|cause|lead?s?\s+to|results?\s+in|produces)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
221
+ regex(R"(([^.!?;\n]{1,220}?)\s+\b(?:because|since|as)\b\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
222
+ regex(R"(\b(?:due\s+to|because\s+of)\s+([^.!?;\n]{1,220}?)\s*,?\s*(?:then\s+)?([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
223
+ regex(R"(([^.!?;\n]{1,200}?)\s+(?:is\s+defined\s+as|is\s+defined\s+to\s+be|defined\s+as)\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
224
+ regex(R"(([^.!?;\n]{1,200}?)\s+exactly\s+when\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
225
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:provided|provided\s+that)\s+(?:that\s+)?([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
226
+ regex(R"(\bif\s+not\s+(.{1,200}?)\s*,?\s*(?:then\s+)?not\s+(.{1,200}?)(?:[.!?;\n]|$))", IC),
227
+ regex(R"(([^.!?;\n]{1,200}?)\s+(?:denotes|signifies|is\s+called|is\s+termed)\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
228
+ regex(R"(([^.!?;\n]{1,300}?)\s*,?\s*otherwise\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
229
+ regex(R"(([^.!?;\n]{1,300}?)\s*,?\s*(?:or\s+else)\s+([^.!?;\n]{1,300}?)(?:[.!?;\n]|$))", IC),
230
+ regex(R"(([^.!?;\n]{1,250}?)\s+(?:implies|entails|yields)\s+([^.!?;\n]{1,250}?)\s*(?:,\s*)?(?:but\s+not\s+conversely|not\s+conversely|but\s+not\s+the\s+other\s+way|though\s+not\s+the\s+converse))", IC),
231
+ regex(R"(([^.!?;\n]{1,250}?)\s+(?:is\s+)?(?:a\s+)?(?:sufficient\s+but\s+not\s+necessary|suffices\s+but\s+is\s+not\s+necessary)\s+for\s+([^.!?;\n]{1,250}?)(?:[.!?;\n]|$))", IC),
232
+ regex(R"(([^.!?;\n]{1,250}?)\s+(?:is\s+)?(?:a\s+)?(?:necessary\s+but\s+not\s+sufficient)\s+for\s+([^.!?;\n]{1,250}?)(?:[.!?;\n]|$))", IC),
233
+ regex(R"(([^.!?;\n]{1,250}?)\s+is\s+(?:neither\s+necessary\s+nor\s+sufficient)\s+for\s+([^.!?;\n]{1,250}?)(?:[.!?;\n]|$))", IC),
234
+ regex(R"((?:barring|except\s+for|save\s+for)\s+(.{1,200}?)\s*,?\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
235
+ regex(R"(\b(?:in\s+the\s+absence\s+of|in\s+absence\s+of)\s+(.{1,200}?)\s*,?\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
236
+ regex(R"(([^.!?;\n]{1,200}?)\s+(?:conditional\s+on|conditional\s+upon|conditional\s+that)\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
237
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:subject\s+to)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
238
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:depends\s+on|is\s+dependent\s+on)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
239
+ regex(R"(([^.!?;\n]{1,160}?)\s+before\s+([^.!?;\n]{1,160}?)(?:[.!?;\n]|$))", IC),
240
+ regex(R"(([^.!?;\n]{1,160}?)\s+after\s+([^.!?;\n]{1,160}?)(?:[.!?;\n]|$))", IC),
241
+ regex(R"(([^.!?;\n]{1,200}?)\s+(?:correlates\s+with|is\s+associated\s+with|is\s+linked\s+to|is\s+related\s+to)\s+([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
242
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:is\s+likely\s+to\s+|is\s+probable\s+that\s+|is\s+likely\s+that\s+|will\s+likely\s+|likely\s+to\s+)([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
243
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:probably|likely|usually|often|rarely|unlikely)\s+(?:implies|imply|leads\s+to|results\s+in|causes|is\s+associated\s+with|is\s+expected\s+to)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
244
+ regex(R"((?:not\s+conversely|but\s+not\s+conversely|not\s+the\s+converse|but\s+not\s+the\s+other\s+way|though\s+not\s+the\s+converse|not\s+vice\s+versa))", IC),
245
+
246
+ // counterfactual / subjunctive / defeasible / statistical patterns (new)
247
+ regex(R"(\bif\s+it\s+were\s+the\s+case\s+that\s+(.{1,200}?)\s*,\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
248
+ regex(R"(\bwere\s+(.{1,120}?)\s+to\s+(.{1,120}?)\s*,\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
249
+ regex(R"(\b(?:generally|normally|typically|in\s+general|as\s+a\s+rule|usually|most\s+often)\b\s+([^.!?;\n]{1,220}?)\s+(?:imply|implies|lead?s?\s+to|result?s?\s+in|cause|causes)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
250
+ regex(R"(([^.!?;\n]{1,200}?)\s+by\s+default\s*,\s*(?:then\s+)?([^.!?;\n]{1,200}?)(?:[.!?;\n]|$))", IC),
251
+ regex(R"(([^.!?;\n]{1,220}?)\s+(?:increases\s+the\s+probability\s+of|raises\s+the\s+likelihood\s+of|increases\s+likelihood\s+of)\s+([^.!?;\n]{1,220}?)(?:[.!?;\n]|$))", IC),
252
+
253
+ // variable
254
+ regex(R"((?:\b(?:let|assume|suppose|take|declare|define|consider)\b\s+)?((?:\b[A-Za-z]\b(?:\s*,\s*|\s+and\s+))*\b[A-Za-z]\b)\s+(?:are|is|be|be\s+treated\s+as|be\s+regarded\s+as|be\s+said\s+to\s+be|as)\s+(?:(?:a\s+)?variables?|(?:a\s+)?variable)(?:[.!?;\n]|$))", IC),
255
+ };
256
+ DBG("make_patterns: created patterns struct");
257
+ return p;
258
+ }
259
+
260
+ /* ------------------------------ Sentence splitting ------------------------ */
261
+
262
+ static vector<std::pair<string,size_t>> split_into_sentences(const string &text) {
263
+ DBG_LINE();
264
+ vector<std::pair<string,size_t>> out;
265
+ size_t pos = 0;
266
+ while (pos < text.size()) {
267
+ size_t maxlook = std::min(text.size(), pos + (size_t)1400);
268
+ size_t endpos = std::string::npos;
269
+ for (size_t i = pos; i < maxlook; ++i) {
270
+ char c = text[i];
271
+ if (c == '.' || c == '!' || c == '?' || c == ';' || c == '\n') { endpos = i + 1; break; }
272
+ }
273
+ if (endpos == std::string::npos) {
274
+ size_t i = pos;
275
+ while (i < text.size() && text[i] != '.' && text[i] != '!' && text[i] != '?' && text[i] != ';' && text[i] != '\n') ++i;
276
+ endpos = (i < text.size()) ? (i+1) : text.size();
277
+ }
278
+ string sentence = text.substr(pos, endpos - pos);
279
+ size_t sent_line = line_of_offset(text, pos);
280
+ out.emplace_back(sentence, sent_line);
281
+ pos = endpos;
282
+ }
283
+ DBG("split_into_sentences -> " << out.size() << " sentences");
284
+ return out;
285
+ }
286
+
287
+ /* --------------------------- Sentence processing -------------------------- */
288
+
289
+ static void apply_regex_iter(
290
+ const string &sentence,
291
+ const regex &r,
292
+ const std::function<void(const smatch&)> &cb)
293
+ {
294
+ DBG_LINE();
295
+ for (std::sregex_iterator it(sentence.begin(), sentence.end(), r), end; it != end; ++it) {
296
+ cb(*it);
297
+ }
298
+ }
299
+
300
+ static void process_sentence(
301
+ const string &sentence,
302
+ size_t sent_line,
303
+ const Patterns &p,
304
+ vector<Edge> &edges,
305
+ unordered_set<string> &seen,
306
+ unordered_set<string> &forbidden_inferred_rev)
307
+ {
308
+ DBG("process_sentence start line=" << sent_line << " sentence='" << sentence << "'");
309
+ auto record_edge = [&](string A_raw, string B_raw, const string &form) {
310
+ DBG_LINE();
311
+ string A = node_norm(A_raw);
312
+ string B = node_norm(B_raw);
313
+ if (A.empty() || B.empty()) return;
314
+ vector<string> As = split_antecedents(A);
315
+ vector<string> Bs = split_antecedents(B);
316
+ for (const string &a0 : As) {
317
+ for (const string &b0 : Bs) {
318
+ string a = node_norm(a0);
319
+ string b = node_norm(b0);
320
+ if (a.empty() || b.empty()) continue;
321
+ Edge e{a, b, form, sent_line, normalize_spaces(sentence)};
322
+ string k = key_of_edge(e);
323
+ if (seen.insert(k).second) edges.push_back(std::move(e));
324
+ }
325
+ }
326
+ };
327
+
328
+ // (core patterns and extended handlers) - same as original file
329
+ DBG("process_sentence: applying core patterns");
330
+ apply_regex_iter(sentence, p.sym_re, [&](const smatch &m){ record_edge(m.str(1), m.str(3), string("symbol ") + trim(m.str(2))); });
331
+ apply_regex_iter(sentence, p.sequent_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "sequent"); });
332
+ apply_regex_iter(sentence, p.lex_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "lexical implies/entails/causal"); });
333
+ apply_regex_iter(sentence, p.passive_re, [&](const smatch &m){ record_edge(m.str(2), m.str(1), "passive causal/implication (X -> Y)"); });
334
+ apply_regex_iter(sentence, p.ifthen_re, [&](const smatch &m){ string L=trim(m.str(1)), R=trim(m.str(2)); if(L.size()>1 && R.size()>1) record_edge(L, R, "if...then / conditional"); });
335
+ apply_regex_iter(sentence, p.given_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "given/assuming/provided"); });
336
+ apply_regex_iter(sentence, p.whenever_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "whenever (universal conditional)"); });
337
+ apply_regex_iter(sentence, p.therefore_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "therefore/hence/consequently"); });
338
+ apply_regex_iter(sentence, p.from_we_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "from ... we deduce"); });
339
+ apply_regex_iter(sentence, p.follows_from_re, [&](const smatch &m){ record_edge(m.str(2), m.str(1), "follows from (X -> Y)"); });
340
+ apply_regex_iter(sentence, p.onlyif_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "only if (Y -> X)"); });
341
+ apply_regex_iter(sentence, p.onlywhen_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "only when (Y -> X)"); });
342
+ apply_regex_iter(sentence, p.unless_re, [&](const smatch &m){ record_edge(string("not(")+m.str(2)+")", m.str(1), "unless (not(Q) -> P)"); });
343
+ apply_regex_iter(sentence, p.iff_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "iff / biconditional (A -> B)"); record_edge(m.str(2), m.str(1), "iff / biconditional (B -> A)"); });
344
+ apply_regex_iter(sentence, p.nec_suf_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "necessary and sufficient (A -> B)"); record_edge(m.str(2), m.str(1), "necessary and sufficient (B -> A)"); });
345
+ apply_regex_iter(sentence, p.suff_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "sufficient for (A -> B)"); });
346
+ apply_regex_iter(sentence, p.neces_re, [&](const smatch &m){ record_edge(m.str(2), m.str(1), "necessary for (B -> A)"); });
347
+
348
+ DBG("process_sentence: applying extended patterns");
349
+ apply_regex_iter(sentence, p.means_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "means/denotes/signifies/constitutes (A -> B)"); });
350
+ apply_regex_iter(sentence, p.equiv_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "equivalent (A -> B)"); record_edge(m.str(2), m.str(1), "equivalent (B -> A)"); });
351
+ apply_regex_iter(sentence, p.every_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "universal 'every/all' (class -> property)"); });
352
+ apply_regex_iter(sentence, p.in_case_re, [&](const smatch &m){ record_edge(m.str(1), m.str(2), "in case (conditional)"); });
353
+ apply_regex_iter(sentence, p.without_re, [&](const smatch &m){ record_edge(string("not(")+m.str(1)+")", m.str(2), "without (not(X) -> Y)"); });
354
+ apply_regex_iter(sentence, p.must_re, [&](const smatch &m){ string L=trim(m.str(1)), R=trim(m.str(2)); if(!L.empty() && !R.empty()) record_edge(L,R,"must / modal -> (X -> Y)"); });
355
+ apply_regex_iter(sentence, p.cannotboth_re, [&](const smatch &m){ string A=trim(m.str(1)), B=trim(m.str(2)); if(!A.empty()&&!B.empty()){ record_edge(A,string("not(")+B+")","mutually exclusive (A -> not(B))"); record_edge(B,string("not(")+A+")","mutually exclusive (B -> not(A))"); } });
356
+ apply_regex_iter(sentence, p.prevents_re, [&](const smatch &m){ record_edge(m.str(1), string("not(")+m.str(2)+")", "prevents / precludes (A -> not(B))"); });
357
+ apply_regex_iter(sentence, p.contradicts_re, [&](const smatch &m){ string A=trim(m.str(1)), B=trim(m.str(2)); if(!A.empty()&&!B.empty()){ record_edge(A,string("not(")+B+")","contradicts (A -> not(B))"); record_edge(B,string("not(")+A+")","contradicts (B -> not(A))"); } });
358
+ apply_regex_iter(sentence, p.exceptwhen_re, [&](const smatch &m){ record_edge(string("not(")+m.str(2)+")", m.str(1), "except when (not(X) -> Y)"); });
359
+
360
+ apply_regex_iter(sentence, p.variable_decl_re, [&](const smatch &m){ record_edge(m.str(1), string("is_variable"), "declares-variables"); });
361
+
362
+ // rest of pattern handlers (kept intact) --- debug trace entry at start and end
363
+ DBG("process_sentence: completed");
364
+ }
365
+
366
+ /* --------------------------- Graph building & inference ------------------- */
367
+
368
+ static void build_graph_from_edges(
369
+ const vector<Edge> &edges,
370
+ unordered_map<string,int> &id,
371
+ vector<string> &id2,
372
+ vector<vector<int>> &adj,
373
+ set<string> &explicit_edges,
374
+ unordered_map<string,string> &form_by_idpair)
375
+ {
376
+ DBG_LINE();
377
+ auto ensure = [&](const string &s)->int {
378
+ auto it = id.find(s);
379
+ if (it != id.end()) return it->second;
380
+ int idx = (int)id2.size();
381
+ id2.push_back(s);
382
+ id.emplace(s, idx);
383
+ DBG("ensure new node '" << s << "' -> id=" << idx);
384
+ return idx;
385
+ };
386
+
387
+ for (const auto &e : edges) {
388
+ int a = ensure(e.A), b = ensure(e.B);
389
+ if ((size_t)std::max(a,b) >= adj.size()) adj.resize(id2.size());
390
+ string key = std::to_string(a) + "->" + std::to_string(b);
391
+ if (explicit_edges.insert(key).second) {
392
+ adj[a].push_back(b);
393
+ form_by_idpair[key] = e.form;
394
+ }
395
+ }
396
+ DBG("build_graph_from_edges: nodes=" << id2.size() << " edges=" << explicit_edges.size());
397
+ }
398
+
399
+ static vector<Edge> build_contrapositives(const vector<Edge> &edges, unordered_set<string> &seen) {
400
+ DBG_LINE();
401
+ vector<Edge> out;
402
+ for (const auto &e : edges) {
403
+ string nB = string("not(") + e.B + ")";
404
+ string nA = string("not(") + e.A + ")";
405
+ Edge cp{nB, nA, string("contrapositive of: ") + e.form, 0, ""};
406
+ string k = key_of_edge(cp);
407
+ if (seen.insert(k).second) out.push_back(cp);
408
+ }
409
+ DBG("build_contrapositives -> " << out.size());
410
+ return out;
411
+ }
412
+
413
+ static vector<Edge> infer_transitives(
414
+ const vector<string> &id2,
415
+ const vector<vector<int>> &adj,
416
+ const set<string> &explicit_edges,
417
+ const unordered_map<string,string> &form_by_idpair,
418
+ const unordered_set<string> &forbidden_inferred_rev,
419
+ int maxDepth = 3)
420
+ {
421
+ DBG_LINE();
422
+ unordered_map<string, bool> is_weak_edge;
423
+ for (const auto &p : form_by_idpair) {
424
+ const string &form = p.second;
425
+ string lf = lower_copy(form);
426
+ bool weak = (lf.find("[weak]") != string::npos)
427
+ || (lf.find("probable") != string::npos)
428
+ || (lf.find("likely") != string::npos)
429
+ || (lf.find("probab") != string::npos)
430
+ || (lf.find("correlat") != string::npos)
431
+ || (lf.find("counterfactual") != string::npos)
432
+ || (lf.find("defeasible") != string::npos)
433
+ || (lf.find("default") != string::npos)
434
+ || (lf.find("statistical") != string::npos);
435
+ is_weak_edge[p.first] = weak;
436
+ }
437
+
438
+ vector<Edge> inferred;
439
+ set<string> inferred_keys;
440
+ int n = (int)id2.size();
441
+
442
+ for (int s = 0; s < n; ++s) {
443
+ vector<int> dist(n, -1);
444
+ std::queue<std::tuple<int,int,bool>> q;
445
+ dist[s] = 0;
446
+ for (int v : adj[s]) {
447
+ string key = std::to_string(s) + "->" + std::to_string(v);
448
+ bool w = is_weak_edge.count(key) ? is_weak_edge[key] : false;
449
+ dist[v] = 1;
450
+ q.push(std::make_tuple(v, 1, w));
451
+ }
452
+ while (!q.empty()) {
453
+ auto [u, d, path_has_weak] = q.front(); q.pop();
454
+ if (d >= 2 && d <= maxDepth) {
455
+ string A = id2[s], C = id2[u];
456
+ string A_norm = node_norm(A), C_norm = node_norm(C);
457
+ if (forbidden_inferred_rev.find(A_norm + "->" + C_norm) == forbidden_inferred_rev.end()) {
458
+ if (!path_has_weak) {
459
+ string form = "inferred (transitive length=" + std::to_string(d) + ")";
460
+ Edge ie{A, C, form, 0, ""};
461
+ string k = key_of_edge(ie);
462
+ if (explicit_edges.count(std::to_string(s) + "->" + std::to_string(u)) == 0 && inferred_keys.insert(k).second) {
463
+ inferred.push_back(ie);
464
+ }
465
+ }
466
+ }
467
+ }
468
+ if (d < maxDepth) {
469
+ for (int w : adj[u]) {
470
+ if (dist[w] == -1) {
471
+ dist[w] = d + 1;
472
+ string edgekey = std::to_string(u) + "->" + std::to_string(w);
473
+ bool edge_is_weak = is_weak_edge.count(edgekey) ? is_weak_edge[edgekey] : false;
474
+ bool new_path_weak = path_has_weak || edge_is_weak;
475
+ q.push(std::make_tuple(w, d+1, new_path_weak));
476
+ }
477
+ }
478
+ }
479
+ }
480
+ }
481
+ DBG("infer_transitives -> " << inferred.size());
482
+ return inferred;
483
+ }
484
+
485
+ /* ------------------------------- Reporting -------------------------------- */
486
+
487
+ static void output_report(
488
+ const vector<Edge> &edges,
489
+ const vector<Edge> &contrapositives,
490
+ const vector<Edge> &inferred,
491
+ const unordered_map<string,string> &form_by_idpair,
492
+ const vector<string> &id2,
493
+ const set<string> &explicit_edges,
494
+ const unordered_set<string> &forbidden_inferred_rev)
495
+ {
496
+ DBG_LINE();
497
+ // 1) Explicit edges
498
+ std::cout << "=== Explicit edges (" << edges.size() << ") ===\n\n";
499
+ for (size_t i = 0; i < edges.size(); ++i) {
500
+ const auto &e = edges[i];
501
+ std::cout << "[" << (i+1) << "] Line " << e.line << " Form: " << e.form << "\n";
502
+ std::cout << " " << "Antecedent: " << e.A << "\n";
503
+ std::cout << " " << "Consequent: " << e.B << "\n";
504
+ std::cout << " " << "Sentence: " << e.sentence << "\n\n";
505
+ }
506
+
507
+ // 2) Contrapositives
508
+ if (!contrapositives.empty()) {
509
+ std::cout << "=== Contrapositives (" << contrapositives.size() << ") ===\n\n";
510
+ for (size_t i = 0; i < contrapositives.size(); ++i) {
511
+ const auto &e = contrapositives[i];
512
+ std::cout << "[" << (i+1) << "] " << e.form << "\n";
513
+ std::cout << " " << e.A << " -> " << e.B << "\n\n";
514
+ }
515
+ }
516
+
517
+ // 3) Inferred transitive edges
518
+ if (!inferred.empty()) {
519
+ std::cout << "=== Inferred transitive edges (" << inferred.size() << ", depth<=3) ===\n\n";
520
+ for (size_t i = 0; i < inferred.size(); ++i) {
521
+ const auto &e = inferred[i];
522
+ std::cout << "[" << (i+1) << "] " << e.form << "\n";
523
+ std::cout << " " << e.A << " -> " << e.B << "\n\n";
524
+ }
525
+ }
526
+
527
+ // 4) Expanded weak-edge summary (grouped)
528
+ auto lower_form = [&](const string &f){ return lower_copy(f); };
529
+
530
+ size_t weak_count = 0;
531
+ unordered_map<string, vector<tuple<string,string,string>>> groups;
532
+ unordered_map<string,string> form_for_pair;
533
+
534
+ for (const auto &p : form_by_idpair) {
535
+ const string &pairkey = p.first; // "a->b" where a and b are numeric ids
536
+ const string &form = p.second;
537
+ string lf = lower_form(form);
538
+
539
+ bool is_weak = (lf.find("[weak]") != string::npos)
540
+ || (lf.find("probable") != string::npos)
541
+ || (lf.find("likely") != string::npos)
542
+ || (lf.find("probab") != string::npos)
543
+ || (lf.find("correlat") != string::npos)
544
+ || (lf.find("counterfactual") != string::npos)
545
+ || (lf.find("defeasib") != string::npos)
546
+ || (lf.find("default") != string::npos)
547
+ || (lf.find("statistical") != string::npos)
548
+ || (lf.find("increases probability") != string::npos)
549
+ || (lf.find("raises the likelihood") != string::npos)
550
+ || (lf.find("raises likelihood") != string::npos);
551
+
552
+ if (!is_weak) continue;
553
+ ++weak_count;
554
+
555
+ size_t possep = pairkey.find("->");
556
+ if (possep == string::npos) continue;
557
+ int a = 0, b = 0;
558
+ try {
559
+ a = std::stoi(pairkey.substr(0, possep));
560
+ b = std::stoi(pairkey.substr(possep+2));
561
+ } catch (...) { continue; }
562
+
563
+ string Aname = (a >= 0 && a < (int)id2.size()) ? id2[a] : ("<node:" + std::to_string(a) + ">");
564
+ string Bname = (b >= 0 && b < (int)id2.size()) ? id2[b] : ("<node:" + std::to_string(b) + ">");
565
+ string keyAB = Aname + "||" + Bname;
566
+ if (form_for_pair.find(keyAB) == form_for_pair.end()) form_for_pair[keyAB] = form;
567
+
568
+ if (lf.find("correlat") != string::npos) groups["correlational / associated"].emplace_back(Aname, Bname, form);
569
+ if (lf.find("probab") != string::npos || lf.find("likely") != string::npos) groups["probabilistic / likely"].emplace_back(Aname, Bname, form);
570
+ if (lf.find("counterfactual") != string::npos || lf.find("subjunctive") != string::npos) groups["counterfactual / subjunctive"].emplace_back(Aname, Bname, form);
571
+ if (lf.find("defeasib") != string::npos || lf.find("generally") != string::npos || lf.find("typically") != string::npos
572
+ || lf.find("normally") != string::npos || lf.find("usually") != string::npos) {
573
+ groups["defeasible / general rules"].emplace_back(Aname, Bname, form);
574
+ }
575
+ if (lf.find("default") != string::npos) groups["default rules"].emplace_back(Aname, Bname, form);
576
+ if (lf.find("statistical") != string::npos || lf.find("increases probability") != string::npos
577
+ || lf.find("raises the likelihood") != string::npos || lf.find("raises likelihood") != string::npos) {
578
+ groups["statistical / increases-likelihood"].emplace_back(Aname, Bname, form);
579
+ }
580
+ bool matched_any = false;
581
+ for (const auto &gpair : groups) {
582
+ if (!gpair.second.empty()) { matched_any = true; break; }
583
+ }
584
+ if (!matched_any) groups["other weak"].emplace_back(Aname, Bname, form);
585
+ }
586
+
587
+ if (weak_count > 0) {
588
+ std::cout << "=== Weak / Probabilistic / Correlational explicit edges (" << weak_count << ") ===\n\n";
589
+
590
+ vector<string> order = {
591
+ "probabilistic / likely",
592
+ "correlational / associated",
593
+ "counterfactual / subjunctive",
594
+ "defeasible / general rules",
595
+ "default rules",
596
+ "statistical / increases-likelihood",
597
+ "other weak"
598
+ };
599
+
600
+ for (const string &grp : order) {
601
+ auto it = groups.find(grp);
602
+ if (it == groups.end() || it->second.empty()) continue;
603
+ std::cout << " -- " << grp << " (" << it->second.size() << ")\n";
604
+ std::set<string> printed;
605
+ for (const auto &t : it->second) {
606
+ const string &Aname = std::get<0>(t);
607
+ const string &Bname = std::get<1>(t);
608
+ const string &form = std::get<2>(t);
609
+ string keyAB = Aname + "->" + Bname;
610
+ if (!printed.insert(keyAB).second) continue;
611
+ std::cout << " " << Aname << " -> " << Bname;
612
+ if (!form.empty()) std::cout << " Form: " << form;
613
+ std::cout << "\n";
614
+ }
615
+ std::cout << "\n";
616
+ }
617
+ }
618
+
619
+ // 5) Explicitly forbidden inferences
620
+ if (!forbidden_inferred_rev.empty()) {
621
+ std::cout << "=== Explicitly forbidden inferences (" << forbidden_inferred_rev.size() << ") ===\n\n";
622
+ size_t i = 1;
623
+ for (const auto &f : forbidden_inferred_rev) {
624
+ std::cout << "[" << (i++) << "] Forbidden inference: " << f << " (text explicitly disallows this converse)\n";
625
+ }
626
+ std::cout << "\n";
627
+ }
628
+ }
629
+
630
+ /* ------------------- Incremental processing + chat machinery ---------------- */
631
+
632
+ // External symbols defined in dictionary.cpp (embedded dictionary data).
633
+ extern unsigned char dictionary_json[]; // binary blob of JSON text
634
+ extern unsigned int dictionary_json_len; // its length
635
+
636
+ struct ChatMemory {
637
+ // thread-safe containers for conversation history and edges
638
+ std::mutex mtx;
639
+ vector<std::pair<string,string>> history; // pairs of (user, assistant)
640
+ vector<Edge> edges; // all explicit edges (including from input and conversations)
641
+ unordered_set<string> seen_keys; // dedup
642
+ unordered_set<string> forbidden_inferred_rev;
643
+
644
+ // graph caches
645
+ unordered_map<string,int> id; // node -> id
646
+ vector<string> id2; // id -> node
647
+ vector<vector<int>> adj; // adjacency
648
+ set<string> explicit_edges; // "a->b" numeric
649
+ unordered_map<string,string> form_by_idpair; // "a->b" -> form
650
+
651
+ Patterns patterns;
652
+
653
+ ChatMemory() : patterns(make_patterns()) { DBG("ChatMemory constructed"); }
654
+
655
+ // --- Begin: graph backtracking / attention / retrieval indices ---
656
+ // Reverse adjacency for fast incoming-edge traversal (same length as adj when indexed)
657
+ vector<vector<int>> rev_adj;
658
+
659
+ // Edge-index maps: for each node id, store indices into `edges` vector
660
+ vector<vector<int>> edges_from_node; // outgoing edge indices by node id
661
+ vector<vector<int>> edges_to_node; // incoming edge indices by node id
662
+
663
+ // Token -> node id index for fast retrieval (tokenized node labels)
664
+ unordered_map<string, vector<int>> token_index;
665
+
666
+ // Provenance / metadata for explicit edges: key_of_edge(edge) -> source label (e.g., "user:file:line" or "assistant")
667
+ unordered_map<string, string> edge_provenance;
668
+
669
+ // Compact correction log (human-readable)
670
+ vector<string> correction_log;
671
+
672
+ // Lightweight cache of last focus (keeps frequently-accessed node ids)
673
+ unordered_map<string, vector<int>> relevance_cache;
674
+
675
+ // mark (by node id) nodes that can reach a declared-variable sentinel
676
+ vector<char> can_reach_var_decl;
677
+
678
+ // dictionary (loaded lazily) + concurrency control and safety caps
679
+ std::unordered_map<std::string, std::string> dictionary; // loaded lazily
680
+ bool dict_loaded = false;
681
+ std::mutex dict_mtx; // make dictionary load thread-safe
682
+ int dict_depth = 2; // default (0 = no expansion); set via CLI or setter
683
+ double dict_similarity_threshold = 0.0; // keep 0.0 (always choose best) — adjust if desired
684
+
685
+ // Safety cap to avoid explosion while expanding definitions (adjustable)
686
+ static constexpr size_t MAX_DICT_TOKENS = 5000;
687
+
688
+ void set_dict_depth(int d) { dict_depth = std::max(0, d); }
689
+ int get_dict_depth() const { return dict_depth; }
690
+
691
// Parse one JSON string literal out of `s`.
//
// s   : text containing the literal.
// pos : on entry, index of the opening '"'; on return, one past the closing
//       '"' (or s.size() when the literal is unterminated).
// Returns the decoded contents.
//
// Escapes \" \\ \/ \b \f \n \r \t are translated. \uXXXX is decoded to UTF-8,
// including UTF-16 surrogate pairs (\uD83D\uDE00 -> one 4-byte sequence).
// Malformed or unpaired \u escapes, and \u0000, yield '?' — the value the
// previous implementation produced for every \u escape. An unknown escape
// character is copied through unchanged.
std::string parse_json_string(const std::string &s, size_t &pos) {
    ++pos; // skip opening '"'
    std::string out;

    // Read exactly four hex digits starting at `at`; -1 if missing or malformed.
    auto read_hex4 = [&s](size_t at) -> long {
        if (at + 4 > s.size()) return -1;
        long value = 0;
        for (size_t i = at; i < at + 4; ++i) {
            const char h = s[i];
            int digit;
            if (h >= '0' && h <= '9') digit = h - '0';
            else if (h >= 'a' && h <= 'f') digit = h - 'a' + 10;
            else if (h >= 'A' && h <= 'F') digit = h - 'A' + 10;
            else return -1;
            value = value * 16 + digit;
        }
        return value;
    };

    // Append the UTF-8 encoding of code point `cp` (cp <= 0x10FFFF).
    auto append_utf8 = [&out](unsigned long cp) {
        if (cp < 0x80) {
            out.push_back((char)cp);
        } else if (cp < 0x800) {
            out.push_back((char)(0xC0 | (cp >> 6)));
            out.push_back((char)(0x80 | (cp & 0x3F)));
        } else if (cp < 0x10000) {
            out.push_back((char)(0xE0 | (cp >> 12)));
            out.push_back((char)(0x80 | ((cp >> 6) & 0x3F)));
            out.push_back((char)(0x80 | (cp & 0x3F)));
        } else {
            out.push_back((char)(0xF0 | (cp >> 18)));
            out.push_back((char)(0x80 | ((cp >> 12) & 0x3F)));
            out.push_back((char)(0x80 | ((cp >> 6) & 0x3F)));
            out.push_back((char)(0x80 | (cp & 0x3F)));
        }
    };

    while (pos < s.size()) {
        const char c = s[pos++];
        if (c == '"') break;
        if (c == '\\' && pos < s.size()) {
            const char esc = s[pos++];
            switch (esc) {
                case '"':  out.push_back('"');  break;
                case '\\': out.push_back('\\'); break;
                case '/':  out.push_back('/');  break;
                case 'b':  out.push_back('\b'); break;
                case 'f':  out.push_back('\f'); break;
                case 'n':  out.push_back('\n'); break;
                case 'r':  out.push_back('\r'); break;
                case 't':  out.push_back('\t'); break;
                case 'u': {
                    const long first = read_hex4(pos);
                    // Consume the four characters whenever they exist, valid or not.
                    if (pos + 4 <= s.size()) pos += 4;
                    if (first <= 0) { out.push_back('?'); break; }

                    unsigned long cp = (unsigned long)first;
                    if (cp >= 0xD800 && cp <= 0xDBFF) {
                        // High surrogate: must be followed by \uDC00..\uDFFF.
                        long second = -1;
                        if (pos + 1 < s.size() && s[pos] == '\\' && s[pos + 1] == 'u') {
                            second = read_hex4(pos + 2);
                        }
                        if (second < 0xDC00 || second > 0xDFFF) { out.push_back('?'); break; }
                        cp = 0x10000 + ((cp - 0xD800) << 10) + ((unsigned long)second - 0xDC00);
                        pos += 6; // the "\uXXXX" of the low surrogate
                    } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
                        out.push_back('?'); // low surrogate without a preceding high one
                        break;
                    }
                    append_utf8(cp);
                    break;
                }
                default:
                    out.push_back(esc);
            }
        } else {
            out.push_back(c);
        }
    }
    return out;
}
723
+
724
+ // Load dictionary lazily from binary JSON blob (uses instance members)
725
+ // Thread-safe: multiple threads may call this concurrently; we serialize the first loader.
726
+ void load_dictionary_from_blob() {
727
+ // Fast-path: avoid locking if already loaded
728
+ if (dict_loaded) return;
729
+
730
+ std::lock_guard<std::mutex> lg(dict_mtx);
731
+ if (dict_loaded) return; // double-checked
732
+
733
+ // dictionary_json and dictionary_json_len are file-scope externs
734
+ if (dictionary_json == nullptr || dictionary_json_len == 0) {
735
+ dict_loaded = true;
736
+ return;
737
+ }
738
+
739
+ // Parse JSON from blob (keeps same minimal parser semantics)
740
+ string json((char*)dictionary_json, (size_t)dictionary_json_len);
741
+ size_t pos = 0, n = json.size();
742
+ while (pos < n) {
743
+ while (pos < n && json[pos] != '"') ++pos;
744
+ if (pos >= n) break;
745
+ string key = parse_json_string(json, pos);
746
+ while (pos < n && json[pos] != ':') ++pos;
747
+ if (pos >= n) break;
748
+ ++pos;
749
+ while (pos < n && std::isspace((unsigned char)json[pos])) ++pos;
750
+ if (pos < n && json[pos] == '"') {
751
+ string val = parse_json_string(json, pos);
752
+ string lk = lower_copy(key);
753
+ dictionary.emplace(lk, val);
754
+ } else {
755
+ while (pos < n && json[pos] != ',' && json[pos] != '}') ++pos;
756
+ }
757
+ }
758
+ dict_loaded = true;
759
+ }
760
+
761
+ // Tokenizer (keeps same semantics)
762
+ static vector<string> tokenize_words_static(const string &s) {
763
+ vector<string> out;
764
+ string buf;
765
+ string lc = lower_copy(s);
766
+ for (size_t i = 0; i <= lc.size(); ++i) {
767
+ char c = (i < lc.size() ? lc[i] : ' ');
768
+ if (std::isalnum((unsigned char)c)) buf.push_back(c);
769
+ else {
770
+ if (buf.size() >= 2) out.push_back(buf);
771
+ buf.clear();
772
+ }
773
+ }
774
+ return out;
775
+ }
776
+
777
+ // Expand seeds using dictionary definitions up to `depth` levels (instance method)
778
+ // Uses BFS-style queue, but imposes a global cap to avoid explosion.
779
+ // Thread-safety: this function calls load_dictionary_from_blob() which is serialized.
780
+ unordered_set<string> expand_tokens_with_dictionary(const unordered_set<string> &seeds, int depth) {
781
+ unordered_set<string> result = seeds;
782
+ if (depth <= 0) return result;
783
+ if (!dict_loaded) load_dictionary_from_blob();
784
+ if (dictionary.empty()) return result;
785
+
786
+ unordered_set<string> visited = seeds;
787
+ std::queue<pair<string,int>> q;
788
+ for (const auto &w : seeds) q.push({w, 0});
789
+
790
+ while (!q.empty()) {
791
+ auto [tok, d] = q.front(); q.pop();
792
+ if (d >= depth) continue;
793
+ auto it = dictionary.find(tok);
794
+ if (it == dictionary.end()) continue;
795
+
796
+ vector<string> tokens = tokenize_words_static(it->second);
797
+ for (auto &t : tokens) {
798
+ if (visited.insert(t).second) {
799
+ result.insert(t);
800
+ if (result.size() > MAX_DICT_TOKENS) {
801
+ // cap reached; stop further expansion for safety
802
+ return result;
803
+ }
804
+ q.push({t, d+1});
805
+ }
806
+ }
807
+ }
808
+ return result;
809
+ }
810
+
811
+ // Build map LHS -> edges (convenience)
812
+ unordered_map<string, vector<Edge>> build_edge_map_snapshot_local(const vector<Edge> &edges_snapshot) {
813
+ unordered_map<string, vector<Edge>> m;
814
+ m.reserve(edges_snapshot.size() * 2 + 10);
815
+ for (const Edge &e : edges_snapshot) {
816
+ string a = node_norm(e.A);
817
+ m[a].push_back(e);
818
+ }
819
+ return m;
820
+ }
821
+
822
+ // Precompute candidate token-sets for all LHS keys (instance method, parallelized)
823
+ void precompute_candidate_tokensets(
824
+ const unordered_map<string, vector<Edge>> &edge_map,
825
+ int depth,
826
+ vector<string> &out_keys,
827
+ vector<unordered_set<string>> &out_tokensets)
828
+ {
829
+ out_keys.clear();
830
+ out_tokensets.clear();
831
+ out_keys.reserve(edge_map.size());
832
+ for (const auto &p : edge_map) out_keys.push_back(p.first);
833
+
834
+ size_t m = out_keys.size();
835
+ out_tokensets.resize(m);
836
+
837
+ #ifdef _OPENMP
838
+ #pragma omp parallel for schedule(dynamic)
839
+ #endif
840
+ for (int i = 0; i < (int)m; ++i) {
841
+ const string &lhs = out_keys[i];
842
+ vector<string> toks = tokenize_words_static(lhs);
843
+ unordered_set<string> seeds;
844
+ for (auto &t : toks) seeds.insert(t);
845
+ if (depth > 0) out_tokensets[i] = expand_tokens_with_dictionary(seeds, depth);
846
+ else out_tokensets[i] = std::move(seeds);
847
+ }
848
+ }
849
+
850
// Jaccard similarity |A ∩ B| / |A ∪ B| of two token sets.
// Two empty sets count as identical (1.0); exactly one empty set gives 0.0.
static double jaccard_similarity_static(const std::unordered_set<std::string> &A, const std::unordered_set<std::string> &B) {
    const bool a_empty = A.empty();
    const bool b_empty = B.empty();
    if (a_empty && b_empty) return 1.0;
    if (a_empty || b_empty) return 0.0;

    // Walk the smaller set and probe the larger one.
    const bool a_is_larger = A.size() > B.size();
    const std::unordered_set<std::string> &probe = a_is_larger ? B : A;
    const std::unordered_set<std::string> &table = a_is_larger ? A : B;

    size_t shared = 0;
    for (const std::string &tok : probe) {
        if (table.count(tok) != 0) ++shared;
    }

    const size_t total = A.size() + B.size() - shared;
    if (total == 0) return 0.0;
    return (double)shared / (double)total;
}
861
+
862
+ // Find best candidate index (parallelized)
863
+ pair<int,double> find_best_candidate_index_for_value(
864
+ const unordered_set<string> &value_tokens,
865
+ const vector<unordered_set<string>> &candidate_tokensets)
866
+ {
867
+ int m = (int)candidate_tokensets.size();
868
+ if (m == 0) return {-1, 0.0};
869
+
870
+ int max_threads = 1;
871
+ #ifdef _OPENMP
872
+ max_threads = omp_get_max_threads();
873
+ #endif
874
+ vector<double> local_best(max_threads, -1.0);
875
+ vector<int> local_idx(max_threads, -1);
876
+
877
+ #ifdef _OPENMP
878
+ #pragma omp parallel
879
+ #endif
880
+ {
881
+ #ifdef _OPENMP
882
+ int tid = omp_get_thread_num();
883
+ #else
884
+ int tid = 0;
885
+ #endif
886
+ double lbest = -1.0;
887
+ int lidx = -1;
888
+ #ifdef _OPENMP
889
+ #pragma omp for schedule(static)
890
+ #endif
891
+ for (int i = 0; i < m; ++i) {
892
+ double sim = jaccard_similarity_static(value_tokens, candidate_tokensets[i]);
893
+ if (sim > lbest) { lbest = sim; lidx = i; }
894
+ }
895
+ local_best[tid] = lbest;
896
+ local_idx[tid] = lidx;
897
+ } // parallel
898
+
899
+ double best = -1.0; int best_i = -1;
900
+ for (int t = 0; t < (int)local_best.size(); ++t) {
901
+ if (local_best[t] > best) { best = local_best[t]; best_i = local_idx[t]; }
902
+ }
903
+ return {best_i, best};
904
+ }
905
+
906
+ // Build auxiliary indices from the current snapshot of id/id2/adj/edges.
907
+ // Must be called with mtx held or immediately after graph rebuild (we call it holding the lock).
908
+ void index_graph() {
909
+ // assumes id, id2, adj and edges are current snapshot
910
+ size_t n = id2.size();
911
+ rev_adj.assign(n, {});
912
+ edges_from_node.assign(n, {});
913
+ edges_to_node.assign(n, {});
914
+ token_index.clear();
915
+ relevance_cache.clear();
916
+
917
+ // build reverse adjacency and per-node edge lists
918
+ for (size_t ei = 0; ei < edges.size(); ++ei) {
919
+ const Edge &e = edges[ei];
920
+ auto itA = id.find(e.A);
921
+ auto itB = id.find(e.B);
922
+ if (itA == id.end() || itB == id.end()) continue;
923
+ int a = itA->second, b = itB->second;
924
+ if ((size_t)std::max(a,b) >= n) continue;
925
+ rev_adj[b].push_back(a);
926
+ edges_from_node[a].push_back((int)ei);
927
+ edges_to_node[b].push_back((int)ei);
928
+ }
929
+
930
+ // build token index (tokenize node labels into lowercased alpha-numeric tokens)
931
+ for (int nid = 0; nid < (int)id2.size(); ++nid) {
932
+ string node = lower_copy(id2[nid]);
933
+ string token;
934
+ for (size_t i = 0; i <= node.size(); ++i) {
935
+ char c = (i < node.size()) ? node[i] : ' ';
936
+ if (std::isalnum((unsigned char)c) || c == '_') token.push_back(c);
937
+ else {
938
+ if (token.size() >= 3) { token_index[token].push_back(nid); }
939
+ token.clear();
940
+ }
941
+ }
942
+ }
943
+
944
+ // compute which nodes can reach the "is_variable" sentinel by forward edges
945
+ // (equivalently: reverse-BFS from the 'is_variable' node through rev_adj)
946
+ can_reach_var_decl.assign(n, false);
947
+ auto it_var = id.find("is_variable");
948
+ if (it_var != id.end()) {
949
+ int varid = it_var->second;
950
+ std::queue<int> q;
951
+ can_reach_var_decl[varid] = true;
952
+ q.push(varid);
953
+ while (!q.empty()) {
954
+ int u = q.front(); q.pop();
955
+ for (int pred : rev_adj[u]) {
956
+ if (!can_reach_var_decl[pred]) {
957
+ can_reach_var_decl[pred] = true;
958
+ q.push(pred);
959
+ }
960
+ }
961
+ }
962
+ }
963
+ }
964
+
965
// One use of an implication while tracing how a value is rewritten.
// Plain aggregate: member order matters for brace initialization.
struct ApplicationStep {
    std::string from;      // value that matched the edge's left-hand side
    std::string to;        // right-hand side produced by the edge
    std::string form;      // copied from Edge::form
    size_t line;           // copied from Edge::line
    std::string sentence;  // copied from Edge::sentence
};

// Ordered steps leading from an original value to a final one.
using ApplicationChain = std::vector<ApplicationStep>;
976
+
977
+ // Non-recursive iterative computation of application chains for `start`.
978
+ // Produces same shape of output as the previous recursive routine but avoids
979
+ // deep recursion and uses explicit stack + memoization.
980
+ // edge_map: LHS -> vector<Edge>
981
+ // memo: per-thread memo map (value -> vector<ApplicationChain>) used to avoid recomputation
982
+ static vector<ApplicationChain> compute_chains_iterative(
983
+ const string &start,
984
+ const unordered_map<string, vector<Edge>> &edge_map,
985
+ unordered_map<string, vector<ApplicationChain>> &memo)
986
+ {
987
+ // If already memoized, return immediately
988
+ auto itmem = memo.find(start);
989
+ if (itmem != memo.end()) return itmem->second;
990
+
991
+ // Explicit DFS stack of (node, state)
992
+ // state 0 = enter, 1 = exit/process
993
+ vector<pair<string,int>> stack;
994
+ stack.emplace_back(start, 0);
995
+
996
+ // Visiting set to detect cycles
997
+ unordered_set<string> visiting;
998
+
999
+ while (!stack.empty()) {
1000
+ auto [node, state] = stack.back();
1001
+
1002
+ // memoized? pop and continue.
1003
+ if (memo.find(node) != memo.end()) { stack.pop_back(); continue; }
1004
+
1005
+ auto itmap = edge_map.find(node);
1006
+ if (state == 0) {
1007
+ // Enter node
1008
+ if (visiting.find(node) != visiting.end()) {
1009
+ // Cycle detected: treat as terminal (empty chains) to break cycle
1010
+ memo.emplace(node, vector<ApplicationChain>{});
1011
+ stack.pop_back();
1012
+ continue;
1013
+ }
1014
+ visiting.insert(node);
1015
+
1016
+ if (itmap == edge_map.end()) {
1017
+ // No outgoing edges => terminal marker (empty vector)
1018
+ memo.emplace(node, vector<ApplicationChain>{});
1019
+ visiting.erase(node);
1020
+ stack.pop_back();
1021
+ continue;
1022
+ }
1023
+
1024
+ // schedule exit processing after children are ensured
1025
+ stack.back().second = 1;
1026
+ // push children that are not yet memoized
1027
+ for (const Edge &e : itmap->second) {
1028
+ string B = node_norm(e.B);
1029
+ if (memo.find(B) == memo.end()) {
1030
+ stack.emplace_back(B, 0);
1031
+ }
1032
+ }
1033
+ } else { // state == 1 -> exit/process: build memo[node] from children memos
1034
+ vector<ApplicationChain> out;
1035
+ // itmap must be valid here
1036
+ for (const Edge &e : itmap->second) {
1037
+ string B = node_norm(e.B);
1038
+ ApplicationStep step{ node, B, e.form, e.line, e.sentence };
1039
+
1040
+ auto itB = memo.find(B);
1041
+ if (itB == memo.end() || itB->second.empty()) {
1042
+ // terminal next -> single-step chain
1043
+ ApplicationChain ch; ch.push_back(step); out.push_back(std::move(ch));
1044
+ } else {
1045
+ // extend each suffix
1046
+ for (const auto &suf : itB->second) {
1047
+ ApplicationChain ch; ch.reserve(1 + suf.size());
1048
+ ch.push_back(step);
1049
+ ch.insert(ch.end(), suf.begin(), suf.end());
1050
+ out.push_back(std::move(ch));
1051
+ }
1052
+ }
1053
+ }
1054
+ memo.emplace(node, std::move(out));
1055
+ visiting.erase(node);
1056
+ stack.pop_back();
1057
+ }
1058
+ }
1059
+
1060
+ auto itres = memo.find(start);
1061
+ if (itres == memo.end()) return vector<ApplicationChain>{};
1062
+ return itres->second;
1063
+ }
1064
+
1065
+ string apply_implications_to_prompt_report(
1066
+ const string &user_input,
1067
+ const vector<Edge> &edges_snapshot,
1068
+ const unordered_map<string,int> &id_snapshot,
1069
+ const vector<string> &id2_snapshot)
1070
+ {
1071
+ // --- Helper short aliases/types ---
1072
+ using StrSet = unordered_set<string>;
1073
+ struct AppliedRecord {
1074
+ Edge edge;
1075
+ vector<pair<string,string>> antecedent_matches; // (antecedent, matched_fact)
1076
+ };
1077
+
1078
+ // --- 1) Split prompt into normalized parts (available facts initial set) ---
1079
+ vector<string> prompt_parts;
1080
+ {
1081
+ auto sents = split_into_sentences(user_input);
1082
+ for (const auto &pr : sents) {
1083
+ string sentence = trim(pr.first);
1084
+ if (sentence.empty()) continue;
1085
+ auto ants = split_antecedents(sentence);
1086
+ for (const string &a : ants) {
1087
+ string n = node_norm(a);
1088
+ if (!n.empty()) prompt_parts.push_back(n);
1089
+ }
1090
+ }
1091
+ }
1092
+ if (prompt_parts.empty()) return string("");
1093
+
1094
+ // --- 2) Build per-edge antecedent list (edge_ants) and collect unique antecedent literals ---
1095
+ int E = (int)edges_snapshot.size();
1096
+ vector<vector<string>> edge_ants(E);
1097
+ StrSet all_ants;
1098
+ for (int i = 0; i < E; ++i) {
1099
+ const Edge &e = edges_snapshot[i];
1100
+ vector<string> ants = split_antecedents(e.A);
1101
+ for (auto &a : ants) {
1102
+ string an = node_norm(a);
1103
+ if (!an.empty()) { edge_ants[i].push_back(an); all_ants.insert(an); }
1104
+ }
1105
+ }
1106
+
1107
+ // --- 3) Precompute token sets for all antecedent literals and build token->antecedent index ---
1108
+ // Modular small helper: tokenization + optional dictionary expansion
1109
+ auto compute_tokens_for = [&](const string &label)->StrSet {
1110
+ vector<string> toks = tokenize_words_static(label);
1111
+ StrSet s; for (auto &t : toks) s.insert(t);
1112
+ if (dict_depth > 0 && !s.empty()) s = expand_tokens_with_dictionary(s, dict_depth);
1113
+ return s;
1114
+ };
1115
+
1116
+ // antecedent -> tokens
1117
+ unordered_map<string, StrSet> ant_tokens;
1118
+ ant_tokens.reserve(all_ants.size()*2);
1119
+
1120
+ // token -> antecedent list
1121
+ unordered_map<string, vector<string>> token_to_ants;
1122
+ token_to_ants.reserve(1024);
1123
+
1124
+ // parallel compute tokens for each antecedent
1125
+ vector<string> all_ants_vec; all_ants_vec.reserve(all_ants.size());
1126
+ for (auto &a : all_ants) all_ants_vec.push_back(a);
1127
+
1128
+ #ifdef _OPENMP
1129
+ #pragma omp parallel for schedule(dynamic)
1130
+ #endif
1131
+ for (int i = 0; i < (int)all_ants_vec.size(); ++i) {
1132
+ string an = all_ants_vec[i];
1133
+ StrSet toks = compute_tokens_for(an);
1134
+ // thread-local insertion into global maps must be synchronized
1135
+ // we will collect per-thread lists and merge serially to avoid locks
1136
+ // but for simplicity here we push into a temporary per-thread vector (we'll merge below)
1137
+ // store as pair in a vector; but to keep code compact, collect into a local buffer and merge
1138
+ }
1139
+ // Serial merge (compute_tokens_for repeated; acceptable given earlier OpenMP stub)
1140
+ for (const string &an : all_ants_vec) {
1141
+ StrSet toks = compute_tokens_for(an);
1142
+ ant_tokens.emplace(an, toks);
1143
+ for (const auto &tk : toks) token_to_ants[tk].push_back(an);
1144
+ }
1145
+
1146
+ // --- 4) Prepare available facts + tokens (initial facts are prompt parts) ---
1147
+ StrSet available_facts; available_facts.reserve(prompt_parts.size()*2);
1148
+ unordered_map<string, StrSet> fact_tokens; fact_tokens.reserve(prompt_parts.size()*2);
1149
+ for (const string &p : prompt_parts) {
1150
+ available_facts.insert(p);
1151
+ fact_tokens.emplace(p, compute_tokens_for(p));
1152
+ }
1153
+
1154
+ // --- 5) Build reverse map: antecedent -> edges indices (for exact antecedent literal) ---
1155
+ unordered_map<string, vector<int>> ant_to_edges;
1156
+ ant_to_edges.reserve(all_ants.size()*2);
1157
+ for (int i = 0; i < E; ++i) {
1158
+ for (const string &an : edge_ants[i]) ant_to_edges[an].push_back(i);
1159
+ }
1160
+
1161
+ // --- 6) Initialize per-edge pending counts and satisfied sets ---
1162
+ vector<int> pending(E, 0);
1163
+ vector<unordered_set<string>> satisfied(E); // which antecedent literals of that edge have been satisfied
1164
+ for (int i = 0; i < E; ++i) {
1165
+ // Use unique antecedent literals per edge
1166
+ StrSet uniq;
1167
+ for (const string &a : edge_ants[i]) uniq.insert(a);
1168
+ pending[i] = (int)uniq.size();
1169
+ // satisfied[i] starts empty
1170
+ }
1171
+
1172
+ // --- 7) Worklist algorithm: queue of newly-available facts to process ---
1173
+ std::deque<std::string> worklist;
1174
+ for (const string &p : prompt_parts) worklist.push_back(p);
1175
+
1176
+ // Applied records to report, and set of applied edge keys to avoid repetition
1177
+ vector<AppliedRecord> applied_sequence;
1178
+ unordered_set<string> applied_edge_keys; applied_edge_keys.reserve(1024);
1179
+
1180
+ // Local helper: attempt to match antecedent literal 'ant' with fact 'fact' (exact or similarity)
1181
+ auto antecedent_matches_fact = [&](const string &ant, const string &fact)->bool {
1182
+ if (ant == fact) return true; // exact match
1183
+ // fuzzy: compare token sets (both precomputed if present)
1184
+ auto itA = ant_tokens.find(ant);
1185
+ auto itF = fact_tokens.find(fact);
1186
+ StrSet a_toks = (itA != ant_tokens.end()) ? itA->second : compute_tokens_for(ant);
1187
+ StrSet f_toks = (itF != fact_tokens.end()) ? itF->second : compute_tokens_for(fact);
1188
+ if (a_toks.empty() || f_toks.empty()) return false;
1189
+ double sim = jaccard_similarity_static(a_toks, f_toks);
1190
+ return (sim >= dict_similarity_threshold && sim > 0.0);
1191
+ };
1192
+
1193
+ // Helper: process one fact (decrement pending counts for edges whose antecedent literals are matched)
1194
+ auto process_fact = [&](const string &fact){
1195
+ // gather candidate antecedents via token index to avoid scanning all antecedents
1196
+ StrSet candidates;
1197
+ auto itFt = fact_tokens.find(fact);
1198
+ if (itFt != fact_tokens.end()) {
1199
+ for (const string &tk : itFt->second) {
1200
+ auto it = token_to_ants.find(tk);
1201
+ if (it != token_to_ants.end()) {
1202
+ for (const string &ant : it->second) candidates.insert(ant);
1203
+ }
1204
+ }
1205
+ }
1206
+ // also include exact match as candidate
1207
+ if (all_ants.find(fact) != all_ants.end()) candidates.insert(fact);
1208
+
1209
+ // For each candidate antecedent, check similarity / exactness to this fact.
1210
+ for (const string &ant : candidates) {
1211
+ if (!antecedent_matches_fact(ant, fact)) continue;
1212
+ // for every edge that contains this antecedent, mark satisfied once
1213
+ auto it_edges = ant_to_edges.find(ant);
1214
+ if (it_edges == ant_to_edges.end()) continue;
1215
+ for (int ei : it_edges->second) {
1216
+ // if this antecedent already satisfied for this edge, skip
1217
+ if (satisfied[ei].find(ant) != satisfied[ei].end()) continue;
1218
+ // mark satisfied and decrement pending
1219
+ satisfied[ei].insert(ant);
1220
+ if (pending[ei] > 0) --pending[ei];
1221
+ // if pending becomes zero, fire edge (produce consequent)
1222
+ if (pending[ei] == 0) {
1223
+ const Edge &e = edges_snapshot[ei];
1224
+ string k = key_of_edge(e);
1225
+ if (applied_edge_keys.insert(k).second) {
1226
+ // record which antecedent matched which fact for provenance:
1227
+ AppliedRecord rec; rec.edge = e;
1228
+ // For each antecedent of this edge, find the fact (from available_facts) that matched it.
1229
+ for (const string &edge_ant : edge_ants[ei]) {
1230
+ // Try exact first then similarity search among available_facts
1231
+ string matched_fact;
1232
+ if (available_facts.find(edge_ant) != available_facts.end()) {
1233
+ matched_fact = edge_ant;
1234
+ } else {
1235
+ // linear search among available_facts but typically small; can be optimized further
1236
+ for (const string &af : available_facts) {
1237
+ if (antecedent_matches_fact(edge_ant, af)) { matched_fact = af; break; }
1238
+ }
1239
+ }
1240
+ if (matched_fact.empty()) matched_fact = string("<unknown>");
1241
+ rec.antecedent_matches.emplace_back(edge_ant, matched_fact);
1242
+ }
1243
+ // add consequent to available_facts and enqueue for processing if new
1244
+ string consequent = node_norm(e.B);
1245
+ if (available_facts.insert(consequent).second) {
1246
+ fact_tokens.emplace(consequent, compute_tokens_for(consequent));
1247
+ worklist.push_back(consequent);
1248
+ }
1249
+ applied_sequence.push_back(std::move(rec));
1250
+ }
1251
+ }
1252
+ } // for each edge containing ant
1253
+ } // for each candidate ant
1254
+ };
1255
+
1256
+ // --- 8) Main loop: process worklist until saturation (no new facts) ---
1257
+ while (!worklist.empty()) {
1258
+ string fact = std::move(worklist.front()); worklist.pop_front();
1259
+ // process_fact will examine token->antecedent candidates and fire edges as possible
1260
+ process_fact(fact);
1261
+ }
1262
+
1263
+ // --- 9) Build textual report with provenance (order edges were applied) ---
1264
+ std::ostringstream agg;
1265
+ agg << "=== Implication application (saturated forward-chaining) ===\n";
1266
+ if (applied_sequence.empty()) {
1267
+ agg << " (No implications could be applied from the prompt.)\n\n";
1268
+ return agg.str();
1269
+ }
1270
+ for (size_t i = 0; i < applied_sequence.size(); ++i) {
1271
+ const AppliedRecord &r = applied_sequence[i];
1272
+ agg << "[" << (i+1) << "] Applied: " << r.edge.A << " -> " << r.edge.B << "\n";
1273
+ agg << " Form: " << r.edge.form;
1274
+ if (r.edge.line > 0) agg << " (line " << r.edge.line << ")";
1275
+ agg << "\n";
1276
+ for (size_t j = 0; j < r.antecedent_matches.size(); ++j) {
1277
+ agg << " Antecedent " << (j+1) << ": \"" << r.antecedent_matches[j].first
1278
+ << "\" matched by available fact \"" << r.antecedent_matches[j].second << "\"\n";
1279
+ }
1280
+ if (!r.edge.sentence.empty()) agg << " Source sentence: " << normalize_spaces(r.edge.sentence) << "\n";
1281
+ agg << "\n";
1282
+ }
1283
+
1284
+ // list derived facts (those not present in the original prompt_parts)
1285
+ agg << "=== Derived facts ===\n";
1286
+ for (const auto &f : available_facts) {
1287
+ bool in_prompt = false;
1288
+ for (const string &p : prompt_parts) if (p == f) { in_prompt = true; break; }
1289
+ if (!in_prompt) agg << " - " << f << "\n";
1290
+ }
1291
+ agg << "\n";
1292
+ return agg.str();
1293
+ }
1294
+
1295
+ // Apply a simultaneous substitution mapping (schema variable -> concrete name)
1296
+ // and insert the instantiated edge into the KB (thread-safe).
1297
+ void instantiate_schema_edge(const Edge &schema_edge,
1298
+ const std::vector<std::pair<string,string>> &mapping_pairs,
1299
+ const string &provenance_note = "instantiation:auto")
1300
+ {
1301
+ // build substitution map (normalized)
1302
+ unordered_map<string,string> sub;
1303
+ for (auto &kv : mapping_pairs) sub[node_norm(kv.first)] = node_norm(kv.second);
1304
+
1305
+ // apply substitution to a label (conservative: whole-word replacement)
1306
+ auto apply_sub = [&](const string &label)->string {
1307
+ string out = label;
1308
+ // exact-match first
1309
+ string ln = node_norm(label);
1310
+ auto it = sub.find(ln);
1311
+ if (it != sub.end()) return it->second;
1312
+ // whole-word replace (regex) for occurrences within compound labels
1313
+ for (const auto &kv : sub) {
1314
+ std::regex pat(std::string("\\b") + kv.first + std::string("\\b"));
1315
+ out = std::regex_replace(out, pat, kv.second);
1316
+ }
1317
+ return node_norm(out);
1318
+ };
1319
+
1320
+ string Anew = apply_sub(schema_edge.A);
1321
+ string Bnew = apply_sub(schema_edge.B);
1322
+ if (Anew.empty() || Bnew.empty()) return;
1323
+
1324
+ Edge e{ Anew, Bnew, string("instantiated: ") + schema_edge.form, schema_edge.line, schema_edge.sentence };
1325
+ string k = key_of_edge(e);
1326
+ {
1327
+ std::lock_guard<std::mutex> lock(mtx);
1328
+ if (seen_keys.insert(k).second) {
1329
+ edges.push_back(e);
1330
+ edge_provenance[k] = provenance_note;
1331
+ // rebuild condensed graph indices and token index
1332
+ id.clear(); id2.clear(); adj.clear(); explicit_edges.clear(); form_by_idpair.clear();
1333
+ build_graph_from_edges(edges, id, id2, adj, explicit_edges, form_by_idpair);
1334
+ index_graph();
1335
+ }
1336
+ }
1337
+ }
1338
+
1339
+ // After ingesting a user text that may declare variable names (e.g. "G and H are variables"),
1340
+ // attempt to instantiate schema edges in the KB whose variables can be traced to declarations.
1341
+ void perform_auto_instantiations(const string &text) {
1342
+ // extract declared variables from text using pattern
1343
+ vector<string> declared_vars;
1344
+ apply_regex_iter(text, patterns.variable_decl_re, [&](const smatch &m){
1345
+ string list = trim(m.str(1));
1346
+ auto parts = split_antecedents(list);
1347
+ for (auto &p : parts) {
1348
+ string np = node_norm(p);
1349
+ if (!np.empty()) declared_vars.push_back(np);
1350
+ }
1351
+ });
1352
+
1353
+ if (declared_vars.empty()) return;
1354
+
1355
+ // snapshot edges & id data under lock
1356
+ vector<Edge> edges_snapshot;
1357
+ vector<string> id2_snapshot;
1358
+ vector<char> reach_var;
1359
+ {
1360
+ std::lock_guard<std::mutex> lock(mtx);
1361
+ edges_snapshot = edges;
1362
+ id2_snapshot = id2;
1363
+ reach_var = can_reach_var_decl;
1364
+ }
1365
+
1366
+ // find candidate schema edges: those whose A/B (or antecedents) are variable-like (can reach var decl)
1367
+ for (const Edge &sch : edges_snapshot) {
1368
+ // gather schema variable labels in appearance order (A then B)
1369
+ vector<string> schema_vars;
1370
+ // only consider atomic labels (we assume schema variables are standalone tokens)
1371
+ if (!sch.A.empty()) schema_vars.push_back(node_norm(sch.A));
1372
+ if (!sch.B.empty()) schema_vars.push_back(node_norm(sch.B));
1373
+ // filter those that are marked variable-like in current index
1374
+ vector<string> schema_vars_filtered;
1375
+ for (const string &sv : schema_vars) {
1376
+ auto it = id.find(sv);
1377
+ if (it != id.end()) {
1378
+ int nid = it->second;
1379
+ if (nid >= 0 && nid < (int)reach_var.size() && reach_var[nid]) {
1380
+ schema_vars_filtered.push_back(sv);
1381
+ }
1382
+ }
1383
+ }
1384
+ if (schema_vars_filtered.empty()) continue;
1385
+ // require same arity as declared_vars (simple position-based mapping)
1386
+ if ((int)schema_vars_filtered.size() != (int)declared_vars.size()) continue;
1387
+
1388
+ // build mapping pairs (schema var -> declared var)
1389
+ std::vector<std::pair<string,string>> mapping;
1390
+ for (size_t i = 0; i < schema_vars_filtered.size(); ++i) mapping.emplace_back(schema_vars_filtered[i], declared_vars[i]);
1391
+
1392
+ // instantiate
1393
+ instantiate_schema_edge(sch, mapping, string("auto-inst-from-text"));
1394
+ }
1395
+ }
1396
+
1397
+ // Remove edges satisfying predicate 'pred'. Rebuilds graph indices (safe, deterministic).
1398
+ // Thread-safe: acquires mtx.
1399
+ void remove_edges_if(const std::function<bool(const Edge&)> &pred, const string &reason = "") {
1400
+ std::lock_guard<std::mutex> lock(mtx);
1401
+ vector<Edge> kept;
1402
+ kept.reserve(edges.size());
1403
+ size_t removed = 0;
1404
+ for (const auto &e : edges) {
1405
+ if (pred(e)) {
1406
+ ++removed;
1407
+ string k = key_of_edge(e);
1408
+ correction_log.push_back(string("removed: ") + k + (reason.empty() ? "" : (" // " + reason)));
1409
+ edge_provenance.erase(k);
1410
+ } else kept.push_back(e);
1411
+ }
1412
+ edges.swap(kept);
1413
+
1414
+ // rebuild node/id caches from edges
1415
+ id.clear(); id2.clear(); adj.clear(); explicit_edges.clear(); form_by_idpair.clear();
1416
+ build_graph_from_edges(edges, id, id2, adj, explicit_edges, form_by_idpair);
1417
+ index_graph();
1418
+ }
1419
+
1420
+ // Correct a concrete explicit implication A->B by replacing it with newA->newB (records provenance).
1421
+ // Thread-safe.
1422
+ void correct_edge(const string &A, const string &B, const string &newA, const string &newB, const string &provenance_note = "") {
1423
+ auto match = [&](const Edge &e){ return node_norm(e.A) == node_norm(A) && node_norm(e.B) == node_norm(B); };
1424
+ remove_edges_if(match, "corrected to " + newA + " -> " + newB);
1425
+ // add corrected edge as explicit edge (we append to edges and rebuild indices)
1426
+ {
1427
+ std::lock_guard<std::mutex> lock(mtx);
1428
+ Edge e{ node_norm(newA), node_norm(newB), string("corrected (user)"), 0, string("correction: ") + newA + " -> " + newB };
1429
+ string k = key_of_edge(e);
1430
+ if (seen_keys.insert(k).second) {
1431
+ edges.push_back(e);
1432
+ edge_provenance[k] = provenance_note.empty() ? "correction" : provenance_note;
1433
+ }
1434
+ // rebuild caches
1435
+ id.clear(); id2.clear(); adj.clear(); explicit_edges.clear(); form_by_idpair.clear();
1436
+ build_graph_from_edges(edges, id, id2, adj, explicit_edges, form_by_idpair);
1437
+ index_graph();
1438
+ correction_log.push_back(string("added: ") + k + (provenance_note.empty() ? "" : string(" // ") + provenance_note));
1439
+ }
1440
+ }
1441
+
1442
+ // Find relevant nodes given seed tokens (fast approximate attention).
1443
+ // Returns nodes ordered by BFS distance (small first). Thread-safe snapshot.
1444
+ vector<int> find_relevant_nodes(const vector<string> &seed_tokens, int maxDepth = 3, int maxNodes = 200) {
1445
+ // take snapshot
1446
+ unordered_map<string,int> id_local;
1447
+ vector<string> id2_local;
1448
+ vector<vector<int>> adj_local;
1449
+ {
1450
+ std::lock_guard<std::mutex> lock(mtx);
1451
+ id_local = id; id2_local = id2; adj_local = adj;
1452
+ }
1453
+ unordered_set<int> seeds;
1454
+ for (const auto &t : seed_tokens) {
1455
+ string tt = lower_copy(t);
1456
+ auto it = token_index.find(tt);
1457
+ if (it != token_index.end()) {
1458
+ for (int nid : it->second) seeds.insert(nid);
1459
+ }
1460
+ }
1461
+ // BFS from seeds (single-threaded; adjacency traversal is typically cheap)
1462
+ queue<pair<int,int>> q;
1463
+ unordered_map<int,int> dist;
1464
+ for (int s : seeds) { q.push({s,0}); dist[s] = 0; }
1465
+ vector<int> result;
1466
+ while (!q.empty() && (int)result.size() < maxNodes) {
1467
+ auto [u,d] = q.front(); q.pop();
1468
+ result.push_back(u);
1469
+ if (d >= maxDepth) continue;
1470
+ if (u >= 0 && u < (int)adj_local.size()) {
1471
+ for (int w : adj_local[u]) {
1472
+ if (dist.find(w) == dist.end()) { dist[w] = d+1; q.push({w,d+1}); }
1473
+ }
1474
+ }
1475
+ }
1476
+ return result;
1477
+ }
1478
+
1479
+ // Retrieve explicit Edge objects relevant to a set of node ids (unique).
1480
+ vector<Edge> retrieve_relevant_edges(const vector<int> &node_ids) {
1481
+ std::lock_guard<std::mutex> lock(mtx);
1482
+ unordered_set<int> seen_ei;
1483
+ vector<Edge> out;
1484
+ for (int nid : node_ids) {
1485
+ if (nid < 0 || nid >= (int)edges_from_node.size()) continue;
1486
+ for (int ei : edges_from_node[nid]) {
1487
+ if (seen_ei.insert(ei).second) out.push_back(edges[ei]);
1488
+ }
1489
+ if (nid < 0 || nid >= (int)edges_to_node.size()) continue;
1490
+ for (int ei : edges_to_node[nid]) {
1491
+ if (seen_ei.insert(ei).second) out.push_back(edges[ei]);
1492
+ }
1493
+ }
1494
+ return out;
1495
+ }
1496
+ // --- End: graph backtracking / attention / retrieval indices ---
1497
+
1498
+ // Add text (such as input.txt, user input, or assistant text) into edges and rebuild graph caches.
1499
+ // The function processes sentences in parallel with OpenMP where available for speed.
1500
+ void ingest_text(const string &text) {
1501
+ DBG_LINE();
1502
+ auto sents = split_into_sentences(text);
1503
+ if (sents.empty()) { DBG("ingest_text: no sentences"); return; }
1504
+
1505
+ // thread-local collectors
1506
+ std::vector<vector<Edge>> local_edges;
1507
+ std::vector<unordered_set<string>> local_seen;
1508
+ std::vector<unordered_set<string>> local_forbidden;
1509
+
1510
+ int threads = 1;
1511
+ #ifdef _OPENMP
1512
+ if (GLOBAL_THREADS > 0) omp_set_num_threads(GLOBAL_THREADS);
1513
+ threads = omp_get_max_threads();
1514
+ #endif
1515
+ if (threads < 1) threads = 1;
1516
+ local_edges.resize(threads);
1517
+ local_seen.resize(threads);
1518
+ local_forbidden.resize(threads);
1519
+
1520
+ DBG("ingest_text: sentences=" << sents.size() << " threads=" << threads);
1521
+
1522
+ // parallel loop over sentences
1523
+ #ifdef _OPENMP
1524
+ #pragma omp parallel for schedule(dynamic)
1525
+ #endif
1526
+ for (int i = 0; i < (int)sents.size(); ++i) {
1527
+ #ifdef _OPENMP
1528
+ int tid = omp_get_thread_num();
1529
+ #else
1530
+ int tid = 0;
1531
+ #endif
1532
+ const auto &pr = sents[i];
1533
+ process_sentence(pr.first, pr.second, patterns, local_edges[tid], local_seen[tid], local_forbidden[tid]);
1534
+ if (GLOBAL_DEBUG && (i % 500) == 0) {
1535
+ DBG("ingest_text processed sentences=" << i << " on tid=" << tid);
1536
+ }
1537
+ }
1538
+
1539
+ // merge local collectors into global store guarded by mutex
1540
+ std::lock_guard<std::mutex> lock(mtx);
1541
+ DBG("ingest_text merging locals into global store");
1542
+ for (int t = 0; t < threads; ++t) {
1543
+ for (auto &e : local_edges[t]) {
1544
+ string k = key_of_edge(e);
1545
+ if (seen_keys.insert(k).second) {
1546
+ // record provenance roughly; you can make this more precise by passing a source label to ingest_text
1547
+ edge_provenance[k] = "ingest";
1548
+ edges.push_back(std::move(e));
1549
+ }
1550
+ }
1551
+ for (const auto &f : local_forbidden[t]) forbidden_inferred_rev.insert(f);
1552
+ }
1553
+
1554
+ // rebuild graph caches incrementally (simple approach: clear and rebuild from edges)
1555
+ id.clear(); id2.clear(); adj.clear(); explicit_edges.clear(); form_by_idpair.clear();
1556
+ build_graph_from_edges(edges, id, id2, adj, explicit_edges, form_by_idpair);
1557
+
1558
+ // NEW: build reverse adjacency, per-node edge indices and token index for fast retrieval & attention
1559
+ index_graph();
1560
+
1561
+ DBG("ingest_text complete: total edges=" << edges.size());
1562
+ }
1563
+
1564
+ // Save conversation history to file
1565
+ void save_history(const string &fname) {
1566
+ DBG_LINE();
1567
+ std::lock_guard<std::mutex> lock(mtx);
1568
+ std::ofstream out(fname);
1569
+ if (!out) { DBG("save_history: cannot open file"); return; }
1570
+ for (const auto &p : history) {
1571
+ out << "User: " << p.first << "\n";
1572
+ out << "Assistant: " << p.second << "\n\n";
1573
+ }
1574
+ DBG("save_history: saved to '" << fname << "'");
1575
+ }
1576
+
1577
+ // Expose a method to run conservative transitive inference and return inferred edges
1578
+ vector<Edge> infer_transitive_edges(int maxDepth = 3) {
1579
+ DBG_LINE();
1580
+ std::lock_guard<std::mutex> lock(mtx);
1581
+ return infer_transitives(id2, adj, explicit_edges, form_by_idpair, forbidden_inferred_rev, maxDepth);
1582
+ }
1583
+
1584
+ // Small synthesis engine: given user input, find nearby nodes and generate assembled text.
1585
+ // Corrected ChatMemory::synthesize_response — releases mutex before calling ingest_text(response)
1586
+ string synthesize_response(const string &user_input) {
1587
+ DBG("synthesize_response start user_input='" << user_input << "'");
1588
+ // 1) ingest user input as knowledge first (ingest_text acquires its own lock internally)
1589
+ ingest_text(user_input);
1590
+
1591
+ // After ingesting the user's text, attempt to auto-instantiate schemas based on any variable declarations
1592
+ perform_auto_instantiations(user_input);
1593
+
1594
+ // 2) tokenize user input (case-folded)
1595
+ string lc = lower_copy(user_input);
1596
+ std::istringstream iss(lc);
1597
+ vector<string> tokens;
1598
+ string tok;
1599
+ while (iss >> tok) tokens.push_back(tok);
1600
+ DBG("synthesize_response tokens=" << tokens.size());
1601
+
1602
+ // 3) take a consistent snapshot of the shared graph/state under lock and then release
1603
+ vector<string> id2_local;
1604
+ vector<vector<int>> adj_local;
1605
+ unordered_map<string,string> form_by_idpair_local;
1606
+ unordered_map<string,int> id_local;
1607
+ vector<Edge> edges_local;
1608
+ {
1609
+ std::lock_guard<std::mutex> lock(mtx);
1610
+ id2_local = id2;
1611
+ adj_local = adj;
1612
+ form_by_idpair_local = form_by_idpair;
1613
+ id_local = id;
1614
+ edges_local = edges;
1615
+ DBG("synthesize_response: snapshot copied: nodes=" << id2_local.size() << " edges=" << edges_local.size());
1616
+ }
1617
+
1618
+ if (id2_local.empty()) { DBG("synthesize_response: id2_local empty"); return "I have no knowledge yet."; }
1619
+
1620
+ // Additional step: run implication-application analysis on the raw user input
1621
+ // using a snapshot of explicit edges / node map taken above. This will
1622
+ // produce a concise aggregation/report describing recursive applications.
1623
+ string implication_report;
1624
+ try {
1625
+ implication_report = apply_implications_to_prompt_report(user_input, edges_local, id_local, id2_local);
1626
+ } catch (...) {
1627
+ implication_report = string(" (implication analysis failed due to internal error)\n");
1628
+ }
1629
+ // We'll append the implication report to the assistant response below (after composing outputs).
1630
+ // Store it in a temporary variable in this scope.
1631
+
1632
+ // 4) find seed nodes by token matching against node labels (use snapshot)
1633
+ unordered_set<int> seed_ids;
1634
+ for (int i = 0; i < (int)id2_local.size(); ++i) {
1635
+ string node_lc = lower_copy(id2_local[i]);
1636
+ for (const string &t : tokens) {
1637
+ if (t.size() >= 3 && node_lc.find(t) != string::npos) { seed_ids.insert(i); break; }
1638
+ }
1639
+ }
1640
+
1641
+ // 5) fallback heuristic if no seeds: choose top nodes by frequency in edges (use snapshot)
1642
+ if (seed_ids.empty()) {
1643
+ unordered_map<int,int> freq;
1644
+ for (const auto &e : edges_local) {
1645
+ auto itA = id_local.find(e.A), itB = id_local.find(e.B);
1646
+ if (itA != id_local.end()) ++freq[itA->second];
1647
+ if (itB != id_local.end()) ++freq[itB->second];
1648
+ }
1649
+ vector<pair<int,int>> freqv;
1650
+ freqv.reserve(freq.size());
1651
+ for (const auto &kv : freq) freqv.emplace_back(kv.first, kv.second);
1652
+ std::sort(freqv.begin(), freqv.end(), [](const pair<int,int> &a, const pair<int,int> &b){
1653
+ return a.second > b.second;
1654
+ });
1655
+ for (size_t i = 0; i < freqv.size() && i < 3; ++i) seed_ids.insert(freqv[i].first);
1656
+ DBG("synthesize_response seed heuristic used: " << seed_ids.size() << " seeds");
1657
+ } else {
1658
+ DBG("synthesize_response found " << seed_ids.size() << " seeds from tokens");
1659
+ }
1660
+
1661
+ // 6) BFS from seeds collecting short implication chains (avoid weak edges in chaining)
1662
+ vector<string> outputs;
1663
+ unordered_set<string> seen_stmt;
1664
+ for (int sid : seed_ids) {
1665
+ queue<tuple<int, vector<int>, bool>> q; // node, path, path_has_weak
1666
+ q.push({sid, vector<int>{sid}, false});
1667
+ int maxDepth = 3;
1668
+ while (!q.empty()) {
1669
+ auto [u, path, path_has_weak] = q.front(); q.pop();
1670
+ if ((int)path.size() > 1) {
1671
+ int a = path.front();
1672
+ int c = path.back();
1673
+ string Aname = (a >= 0 && a < (int)id2_local.size()) ? id2_local[a] : "<node>";
1674
+ string Cname = (c >= 0 && c < (int)id2_local.size()) ? id2_local[c] : "<node>";
1675
+ if (!path_has_weak) {
1676
+ std::ostringstream ss;
1677
+ ss << Aname << " -> " << Cname << " (chain length=" << (path.size() - 1) << ")";
1678
+ string line = ss.str();
1679
+ if (seen_stmt.insert(line).second) outputs.push_back(line);
1680
+ }
1681
+ }
1682
+ if ((int)path.size() <= maxDepth) {
1683
+ if (u >= 0 && u < (int)adj_local.size()) {
1684
+ for (int w : adj_local[u]) {
1685
+ // avoid cycles
1686
+ if (std::find(path.begin(), path.end(), w) != path.end()) continue;
1687
+ string edgekey = std::to_string(u) + "->" + std::to_string(w);
1688
+ bool weak = false;
1689
+ auto itfb = form_by_idpair_local.find(edgekey);
1690
+ if (itfb != form_by_idpair_local.end()) {
1691
+ string lf = lower_copy(itfb->second);
1692
+ if (lf.find("[weak]") != string::npos || lf.find("probab") != string::npos || lf.find("correlat") != string::npos) weak = true;
1693
+ }
1694
+ vector<int> newpath = path; newpath.push_back(w);
1695
+ q.push({w, newpath, path_has_weak || weak});
1696
+ }
1697
+ }
1698
+ }
1699
+ }
1700
+ }
1701
+
1702
+ // 7) Streamed / batched assistant output: print already-processed chunks before continuing.
1703
+ // Also accumulate the full response in `response` (keeps behavior of ingesting the assistant text).
1704
+ std::ostringstream response_acc;
1705
+ const int MAX_SHOW = 12;
1706
+ const int BATCH_SIZE = 4;
1707
+
1708
+ response_acc << "I processed your input and found the following relevant implication chains:\n";
1709
+ std::string header = response_acc.str();
1710
+ std::cout << "Assistant> " << header << std::flush;
1711
+ std::string response; // final accumulated response string
1712
+
1713
+ // stream in batches of lines (not strictly line-by-line single-char streaming)
1714
+ int shown = 0;
1715
+ int total = (int)outputs.size();
1716
+ if (total == 0) {
1717
+ std::string note = " (No strong implication chains found; try rephrasing or providing domain-specific statements.)\n";
1718
+ std::cout << note << std::flush;
1719
+ response += header + note;
1720
+ } else {
1721
+ while (shown < std::min(total, MAX_SHOW)) {
1722
+ int end = std::min(shown + BATCH_SIZE, std::min(total, MAX_SHOW));
1723
+ std::ostringstream batch;
1724
+ for (int i = shown; i < end; ++i) batch << " - " << outputs[i] << "\n";
1725
+ std::string batch_str = batch.str();
1726
+ // Print batch and flush so user sees progress before further processing
1727
+ std::cout << batch_str << std::flush;
1728
+ // Append to accumulated response
1729
+ response += (shown == 0 ? header : std::string()) + batch_str;
1730
+ // Move forward
1731
+ shown = end;
1732
+ }
1733
+ // If there were more than MAX_SHOW, indicate truncation
1734
+ if (total > MAX_SHOW) {
1735
+ std::string more_note = std::string("... (") + std::to_string(total - MAX_SHOW) + " more chains omitted)\n";
1736
+ std::cout << more_note << std::flush;
1737
+ response += more_note;
1738
+ }
1739
+ }
1740
+
1741
+ // append the implication report (if any) and print it in one chunk
1742
+ if (!implication_report.empty()) {
1743
+ std::string sep = "\n";
1744
+ std::cout << sep << implication_report << std::flush;
1745
+ response += sep + implication_report;
1746
+ }
1747
+
1748
+ // 8) Record assistant response into history (briefly lock) then ingest it as knowledge WITHOUT holding the lock
1749
+ {
1750
+ std::lock_guard<std::mutex> lock(mtx);
1751
+ history.emplace_back(user_input, response);
1752
+ DBG("synthesize_response: appended to history, history size=" << history.size());
1753
+ }
1754
+
1755
+ // IMPORTANT: ingest_text will acquire mtx internally when merging — do NOT hold the lock here
1756
+ ingest_text(response); // program's own outputs also become knowledge
1757
+
1758
+ DBG("synthesize_response complete, response length=" << response.size());
1759
+ return response;
1760
+ }
1761
+ };
1762
+
1763
+ /* ---------------------------------- main ---------------------------------- */
1764
+
1765
// Print the command-line synopsis to stdout.
// `prog` is the program name shown in the first line (normally argv[0]).
// Every option that main() parses is listed here, including --dict-depth and --help.
static void print_usage(const char *prog) {
    std::cout << "Usage: " << prog << " [--debug] [--threads N] [--dict-depth N] <input.txt>\n";
    std::cout << "  --debug         Enable debug tracing to stderr (very verbose)\n";
    std::cout << "  --threads N     Limit OpenMP threads (default: auto)\n";
    std::cout << "  --dict-depth N  Dictionary expansion depth (default: 2; negative values are treated as 0)\n";
    std::cout << "  -h, --help      Show this help and exit\n";
}
1770
+
1771
+ int main(int argc, char** argv) {
1772
+ // parse optional flags while preserving original behavior
1773
+ if (argc < 2) { print_usage(argv[0]); return 1; }
1774
+
1775
+ string input_file;
1776
+ int DICT_DEPTH = 2; // default: 2
1777
+ for (int i = 1; i < argc; ++i) {
1778
+ string a = argv[i];
1779
+ if (a == "--debug") { GLOBAL_DEBUG = true; DBG("--debug enabled"); }
1780
+ else if (a == "--threads" && i + 1 < argc) { GLOBAL_THREADS = std::stoi(argv[++i]); DBG("--threads set to " << GLOBAL_THREADS); }
1781
+ else if (a == "--help" || a == "-h") { print_usage(argv[0]); return 0; }
1782
+ else if (a == "--dict-depth" && i + 1 < argc) { DICT_DEPTH = std::max(0, std::stoi(argv[++i])); DBG("--dict-depth set to " << DICT_DEPTH); }
1783
+ else if (input_file.empty()) input_file = a;
1784
+ else { /* ignore extras */ }
1785
+ }
1786
+ if (input_file.empty()) { std::cerr << "Missing input file.\n"; print_usage(argv[0]); return 1; }
1787
+
1788
+ #ifdef _OPENMP
1789
+ if (GLOBAL_THREADS > 0) {
1790
+ omp_set_num_threads(GLOBAL_THREADS);
1791
+ DBG("OpenMP threads limited to " << GLOBAL_THREADS);
1792
+ }
1793
+ #endif
1794
+
1795
+ std::ifstream in(input_file, std::ios::in | std::ios::binary);
1796
+ if (!in) { std::cerr << "Cannot open file: " << input_file << "\n"; return 1; }
1797
+ std::ostringstream ss;
1798
+ ss << in.rdbuf();
1799
+ string text = ss.str();
1800
+ if (text.empty()) { std::cout << "Input empty.\n"; return 0; }
1801
+
1802
+ DBG("Loaded input file '" << input_file << "' size=" << text.size());
1803
+
1804
+ ChatMemory memory;
1805
+ // set dictionary expansion depth from CLI
1806
+ memory.set_dict_depth(DICT_DEPTH);
1807
+ // ingest the main input.txt initially
1808
+ memory.ingest_text(text);
1809
+
1810
+ // Build initial contrapositives and inferred edges for report generation if user wants
1811
+ auto initial_contrapositives = build_contrapositives(memory.edges, memory.seen_keys);
1812
+
1813
+ std::cout << "Knowledge base initialized from '" << input_file << "' (" << memory.edges.size() << " explicit edges).\n";
1814
+ std::cout << "Entering interactive chat mode. Type ':quit' to exit, ':save <file>' to save history, ':report' to print current report, ':history' to show conversation history.\n";
1815
+
1816
+ string line;
1817
+ while (true) {
1818
+ std::cout << "You> ";
1819
+ if (!std::getline(std::cin, line)) break;
1820
+ string input = trim(line);
1821
+ if (input.empty()) continue;
1822
+ if (input == ":quit" || input == ":exit") break;
1823
+ if (input.rfind(":save ",0) == 0) {
1824
+ string fname = trim(input.substr(6));
1825
+ if (fname.empty()) fname = "chat_history.txt";
1826
+ memory.save_history(fname);
1827
+ std::cout << "Saved history to '" << fname << "'\n";
1828
+ continue;
1829
+ }
1830
+ if (input == ":history") {
1831
+ std::lock_guard<std::mutex> lock(memory.mtx);
1832
+ if (memory.history.empty()) std::cout << "(no history yet)\n";
1833
+ for (size_t i = 0; i < memory.history.size(); ++i) {
1834
+ std::cout << "[" << (i+1) << "] User: " << memory.history[i].first << "\n";
1835
+ std::cout << " Assistant: " << memory.history[i].second << "\n\n";
1836
+ }
1837
+ continue;
1838
+ }
1839
+ if (input == ":report") {
1840
+ auto inferred = memory.infer_transitive_edges(3);
1841
+ // copy containers for reporting
1842
+ std::lock_guard<std::mutex> lock(memory.mtx);
1843
+ output_report(memory.edges, initial_contrapositives, inferred, memory.form_by_idpair, memory.id2, memory.explicit_edges, memory.forbidden_inferred_rev);
1844
+ continue;
1845
+ }
1846
+ if (input.rfind(":export-graph",0) == 0) {
1847
+ string fname = trim(input.substr(13)); if (fname.empty()) fname = "graph_edges.txt";
1848
+ std::lock_guard<std::mutex> lock(memory.mtx);
1849
+ std::ofstream out(fname);
1850
+ for (const auto &e : memory.edges) out << e.A << " -> " << e.B << " Form: " << e.form << "\n";
1851
+ std::cout << "Exported graph to '" << fname << "'\n";
1852
+ continue;
1853
+ }
1854
+
1855
+ // Normal chat input: generate response using memory's synthesis engine
1856
+ if (GLOBAL_DEBUG) std::cerr << "[DBG] main: calling synthesize_response for input='" << input << "'\n";
1857
+ string assistant_reply = memory.synthesize_response(input);
1858
+ std::cout << "Assistant> " << assistant_reply << std::endl;
1859
+ }
1860
+
1861
+ return 0;
1862
+ }
ChatIPC.depend ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # depslib dependency file v1.0
2
+ 1773273223 source:c:\users\caleb p. nwokocha\documents\research documents\chatipc\dictionary.cpp
3
+
ChatIPC.layout ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
2
+ <CodeBlocks_layout_file>
3
+ <FileVersion major="1" minor="0" />
4
+ <ActiveTarget name="Debug" />
5
+ <File name="ChatIPC.cpp" open="1" top="1" tabpos="1" split="0" active="1" splitpos="0" zoom_1="0" zoom_2="0">
6
+ <Cursor>
7
+ <Cursor1 position="44403" topLine="650" />
8
+ </Cursor>
9
+ </File>
10
+ </CodeBlocks_layout_file>
Implicational propositional calculus - Wikipedia.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3de2d4e646c3b455feae9953322cda4981cb611bd513a9802be8660ae45ba98
3
+ size 553077
Use only C.docx ADDED
Binary file (41.8 kB). View file
 
a.docx ADDED
Binary file (16 kB). View file
 
input.txt ADDED
The diff for this file is too large to render. See raw diff