Upload 2 files
Browse files- ChatIPC.cpp +198 -58
- ChatIPC.exe +2 -2
ChatIPC.cpp
CHANGED
|
@@ -1,7 +1,3 @@
|
|
| 1 |
-
// chatipc_modular.cpp
|
| 2 |
-
// Compile: g++ -std=c++17 -O2 -fopenmp -o chatipc_modular chatipc_modular.cpp
|
| 3 |
-
// Requires dictionary.cpp providing: extern unsigned char dictionary_json[]; extern unsigned int dictionary_json_len;
|
| 4 |
-
|
| 5 |
#include <algorithm>
|
| 6 |
#include <atomic>
|
| 7 |
#include <cctype>
|
|
@@ -74,8 +70,15 @@ struct StringInterner {
|
|
| 74 |
}
|
| 75 |
};
|
| 76 |
|
| 77 |
-
// ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
|
|
|
| 79 |
// Pointer to an immutable string. The two functors below look through the
// pointer, so hashing and equality are based on the characters, not the address.
using StrPtr = const std::string*;

// Hashes the contents of the pointed-to string.
struct PtrHash {
    size_t operator()(StrPtr p) const noexcept {
        return std::hash<std::string>{}(*p);
    }
};

// Two pointers are equal when the strings they point to have equal contents.
struct PtrEq {
    bool operator()(StrPtr a, StrPtr b) const noexcept {
        return a->compare(*b) == 0;
    }
};
|
|
@@ -86,17 +89,91 @@ struct KnowledgeBase {
|
|
| 86 |
StringInterner interner;
|
| 87 |
std::unordered_map<StrPtr, NextSet, PtrHash, PtrEq> next;
|
| 88 |
std::mutex m;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
// Appends v to the list stored under k in `next`, unless that list already
// holds a string with the same contents. The whole update runs while holding m.
void add_pair_interned(StrPtr k, StrPtr v){
    std::lock_guard<std::mutex> guard(m);
    auto &followers = next[k];
    const bool already_listed = std::any_of(
        followers.begin(), followers.end(),
        [v](const auto &existing){ return *existing == *v; });
    if (!already_listed) followers.push_back(v);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
void add_pair(const std::string &k, const std::string &v){
|
| 96 |
StrPtr kp = interner.intern(k);
|
| 97 |
StrPtr vp = interner.intern(v);
|
|
|
|
|
|
|
|
|
|
| 98 |
add_pair_interned(kp, vp);
|
| 99 |
}
|
|
|
|
| 100 |
std::optional<NextSet> lookup_by_string(const std::string &k) const {
|
| 101 |
for (auto &pr : next) if (*pr.first == k) return pr.second;
|
| 102 |
return std::nullopt;
|
|
@@ -108,6 +185,21 @@ struct KnowledgeBase {
|
|
| 108 |
}
|
| 109 |
};
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
// --------------------------- Small JSON parse helpers ----------------------
|
| 112 |
|
| 113 |
// True when i is a valid position in a sequence of length n.
static inline bool json_valid_index(size_t i, size_t n){
    return n > i;
}
|
|
@@ -168,51 +260,6 @@ static std::unordered_map<std::string,std::string> parse_dictionary_json(){
|
|
| 168 |
return dict;
|
| 169 |
}
|
| 170 |
|
| 171 |
-
// --------------------------- Build definition index (small funcs) ---------
|
| 172 |
-
|
| 173 |
-
static std::unordered_set<std::string> def_tokens_from_text(const std::string &s){
|
| 174 |
-
auto toks = tokenize_non_alnum(s);
|
| 175 |
-
return std::unordered_set<std::string>(toks.begin(), toks.end());
|
| 176 |
-
}
|
| 177 |
-
|
| 178 |
-
// For every word in `direct`, collects the tokens reachable by following
// definitions for up to `depth` levels and stores that set in `out`.
//
//   direct  word -> tokens appearing in the word's own definition
//   out     receives word -> accumulated token set; a word already present
//           in `out` keeps its existing entry (emplace does not overwrite)
//   depth   number of levels to follow; with depth <= 1 each word simply
//           gets a copy of its direct token set
static void expand_def_index(const std::unordered_map<std::string,std::unordered_set<std::string>> &direct,
                             std::unordered_map<std::string,std::unordered_set<std::string>> &out,
                             int depth)
{
    for (const auto &entry : direct){
        std::unordered_set<std::string> reachable = entry.second;

        // Tokens discovered in the previous level whose own definitions
        // still have to be visited.
        std::vector<std::string> wave;
        if (depth > 1) wave.assign(reachable.begin(), reachable.end());

        int level = 1;
        while (level < depth && !wave.empty()){
            std::vector<std::string> discovered;
            for (const auto &w : wave){
                const auto hit = direct.find(w);
                if (hit != direct.end()){
                    for (const auto &tok : hit->second){
                        // Only tokens seen for the first time are expanded further.
                        if (reachable.insert(tok).second) discovered.push_back(tok);
                    }
                }
            }
            wave = std::move(discovered);
            ++level;
        }

        out.emplace(entry.first, std::move(reachable));
    }
}
|
| 203 |
-
|
| 204 |
-
static std::unordered_map<std::string,std::unordered_set<std::string>>
|
| 205 |
-
build_definition_index(int depth)
|
| 206 |
-
{
|
| 207 |
-
std::unordered_map<std::string,std::unordered_set<std::string>> out;
|
| 208 |
-
if (depth <= 0) return out;
|
| 209 |
-
auto raw = parse_dictionary_json();
|
| 210 |
-
std::unordered_map<std::string,std::unordered_set<std::string>> direct;
|
| 211 |
-
for (auto &pr : raw) direct.emplace(pr.first, def_tokens_from_text(pr.second));
|
| 212 |
-
expand_def_index(direct, out, depth);
|
| 213 |
-
return out;
|
| 214 |
-
}
|
| 215 |
-
|
| 216 |
// --------------------------- Similarity helpers (very small) ----------------
|
| 217 |
|
| 218 |
static double jaccard_similarity(const std::unordered_set<std::string> &A,
|
|
@@ -379,13 +426,15 @@ static void learn_files_parallel(KnowledgeBase &kb, const std::vector<std::strin
|
|
| 379 |
|
| 380 |
// --------------------------- Serialization (short functions) ----------------
|
| 381 |
|
| 382 |
-
// File format documented in comments near functions
|
| 383 |
static void save_kb_binary(const KnowledgeBase &kb, const std::string &fname){
|
| 384 |
std::ofstream ofs(fname, std::ios::binary);
|
| 385 |
if (!ofs) throw std::runtime_error("cannot open save file");
|
|
|
|
|
|
|
| 386 |
std::vector<const std::string*> interned;
|
| 387 |
interned.reserve(kb.interner.pool.size());
|
| 388 |
for (auto &s : kb.interner.pool) interned.push_back(&s);
|
|
|
|
| 389 |
uint64_t N = interned.size();
|
| 390 |
ofs.write(reinterpret_cast<const char*>(&N), sizeof(N));
|
| 391 |
for (auto p : interned){
|
|
@@ -393,10 +442,11 @@ static void save_kb_binary(const KnowledgeBase &kb, const std::string &fname){
|
|
| 393 |
ofs.write(reinterpret_cast<const char*>(&L), sizeof(L));
|
| 394 |
ofs.write(p->data(), static_cast<std::streamsize>(L));
|
| 395 |
}
|
|
|
|
|
|
|
| 396 |
uint64_t E = kb.next.size();
|
| 397 |
ofs.write(reinterpret_cast<const char*>(&E), sizeof(E));
|
| 398 |
for (auto &pr : kb.next){
|
| 399 |
-
// find index of key
|
| 400 |
const std::string &key = *pr.first;
|
| 401 |
auto it = std::find_if(interned.begin(), interned.end(), [&](const std::string* s){ return *s == key; });
|
| 402 |
if (it == interned.end()) throw std::runtime_error("save index error");
|
|
@@ -411,12 +461,40 @@ static void save_kb_binary(const KnowledgeBase &kb, const std::string &fname){
|
|
| 411 |
ofs.write(reinterpret_cast<const char*>(&v_idx), sizeof(v_idx));
|
| 412 |
}
|
| 413 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
safe_flush(ofs);
|
| 415 |
}
|
| 416 |
|
| 417 |
-
static void load_kb_binary(KnowledgeBase &kb, const std::string &fname){
|
| 418 |
std::ifstream ifs(fname, std::ios::binary);
|
| 419 |
if (!ifs) throw std::runtime_error("cannot open load file");
|
|
|
|
| 420 |
uint64_t N;
|
| 421 |
ifs.read(reinterpret_cast<char*>(&N), sizeof(N));
|
| 422 |
std::vector<std::string> strings; strings.reserve((size_t)N);
|
|
@@ -428,6 +506,7 @@ static void load_kb_binary(KnowledgeBase &kb, const std::string &fname){
|
|
| 428 |
}
|
| 429 |
std::vector<StrPtr> ptrs; ptrs.reserve(strings.size());
|
| 430 |
for (auto &s : strings) ptrs.push_back(kb.interner.intern(s));
|
|
|
|
| 431 |
uint64_t E; ifs.read(reinterpret_cast<char*>(&E), sizeof(E));
|
| 432 |
for (uint64_t i=0;i<E;++i){
|
| 433 |
uint64_t key_idx; ifs.read(reinterpret_cast<char*>(&key_idx), sizeof(key_idx));
|
|
@@ -440,12 +519,69 @@ static void load_kb_binary(KnowledgeBase &kb, const std::string &fname){
|
|
| 440 |
}
|
| 441 |
kb.next.emplace(key_ptr, std::move(vec));
|
| 442 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
}
|
| 444 |
|
| 445 |
// --------------------------- CLI + Interactive loop (short functions) -----------
|
| 446 |
|
| 447 |
// Writes the one-line command-line synopsis to standard output.
// p is the program name to show after "Usage: ".
static void print_usage(const char *p){
    static const char *const kOptionSynopsis =
        " [--maxlen N] [--save FILE] [--load-kb FILE] [--dict-depth D] [--learn f1 f2 ...]\n";
    std::cout << "Usage: ";
    std::cout << p;
    std::cout << kOptionSynopsis;
}
|
| 450 |
|
| 451 |
int main(int argc, char **argv){
|
|
@@ -480,8 +616,14 @@ int main(int argc, char **argv){
|
|
| 480 |
|
| 481 |
KnowledgeBase kb;
|
| 482 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
if (!load_kb.empty()){
|
| 484 |
-
try { load_kb_binary(kb, load_kb); std::cerr << "Loaded KB: " << load_kb << "\n"; }
|
| 485 |
catch (const std::exception &e){ std::cerr << "Load KB error: " << e.what() << "\n"; }
|
| 486 |
}
|
| 487 |
|
|
@@ -490,15 +632,13 @@ int main(int argc, char **argv){
|
|
| 490 |
learn_files_parallel(kb, learn_files);
|
| 491 |
}
|
| 492 |
|
| 493 |
-
auto def_index = build_definition_index(dict_depth);
|
| 494 |
-
if (!def_index.empty()) std::cerr << "Dictionary depth " << dict_depth << " loaded (" << def_index.size() << " words)\n";
|
| 495 |
-
|
| 496 |
std::string line;
|
| 497 |
std::cout << "Ready. Enter prompts.\n";
|
| 498 |
while (std::cout << "> " , std::getline(std::cin, line)){
|
| 499 |
if (line.empty()){ std::cout << "\n"; continue; }
|
| 500 |
auto prompt_toks = tokenize_whitespace(line);
|
| 501 |
for (size_t i=1;i<prompt_toks.size();++i) kb.add_pair(prompt_toks[i-1], prompt_toks[i]);
|
|
|
|
| 502 |
auto resp = generate_response(kb, prompt_toks, maxlen, def_index, repeat_penalty);
|
| 503 |
for (size_t i=0;i<resp.size();++i){ std::cout << resp[i]; if (i+1<resp.size()) std::cout << ' '; }
|
| 504 |
std::cout << "\n";
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#include <algorithm>
|
| 2 |
#include <atomic>
|
| 3 |
#include <cctype>
|
|
|
|
| 70 |
}
|
| 71 |
};
|
| 72 |
|
| 73 |
+
// ---------- Global parsed dictionary (populated once in main) ----------
|
| 74 |
+
static std::unordered_map<std::string,std::string> g_raw_dict;
|
| 75 |
+
|
| 76 |
+
static std::unordered_set<std::string> def_tokens_from_text(const std::string &s){
|
| 77 |
+
auto toks = tokenize_non_alnum(s);
|
| 78 |
+
return std::unordered_set<std::string>(toks.begin(), toks.end());
|
| 79 |
+
}
|
| 80 |
|
| 81 |
+
// --------------------------- Knowledge base (short methods) --------------
|
| 82 |
// Pointer to an immutable string. The two functors below look through the
// pointer, so hashing and equality are based on the characters, not the address.
using StrPtr = const std::string*;

// Hashes the contents of the pointed-to string.
struct PtrHash {
    size_t operator()(StrPtr p) const noexcept {
        return std::hash<std::string>{}(*p);
    }
};

// Two pointers are equal when the strings they point to have equal contents.
struct PtrEq {
    bool operator()(StrPtr a, StrPtr b) const noexcept {
        return a->compare(*b) == 0;
    }
};
|
|
|
|
| 89 |
StringInterner interner;
|
| 90 |
std::unordered_map<StrPtr, NextSet, PtrHash, PtrEq> next;
|
| 91 |
std::mutex m;
|
| 92 |
+
|
| 93 |
+
// def-index: for each interned word pointer -> list of interned tokens (definition expansion)
|
| 94 |
+
std::unordered_map<StrPtr, std::vector<StrPtr>, PtrHash, PtrEq> def_index;
|
| 95 |
+
std::mutex def_m;
|
| 96 |
+
int def_depth = 0;
|
| 97 |
+
|
| 98 |
// Appends v to the list stored under k in `next`, unless that list already
// holds a string with the same contents. The whole update runs while holding m.
void add_pair_interned(StrPtr k, StrPtr v){
    std::lock_guard<std::mutex> guard(m);
    auto &followers = next[k];
    const bool already_listed = std::any_of(
        followers.begin(), followers.end(),
        [v](const auto &existing){ return *existing == *v; });
    if (!already_listed) followers.push_back(v);
}
|
| 104 |
+
|
| 105 |
+
// set def depth; if changed, drop previously computed def expansions
|
| 106 |
+
void set_def_depth(int D){
|
| 107 |
+
std::lock_guard<std::mutex> lk(def_m);
|
| 108 |
+
if (D != def_depth){
|
| 109 |
+
def_index.clear();
|
| 110 |
+
def_depth = D;
|
| 111 |
+
}
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
// compute definition expansion for a single interned word (if needed)
|
| 115 |
+
void ensure_def_for_interned(StrPtr wp){
|
| 116 |
+
// quick no-op checks
|
| 117 |
+
if (wp == nullptr) return;
|
| 118 |
+
if (def_depth <= 0) return;
|
| 119 |
+
|
| 120 |
+
// double-checked locking
|
| 121 |
+
{
|
| 122 |
+
std::lock_guard<std::mutex> lk(def_m);
|
| 123 |
+
if (def_index.find(wp) != def_index.end()) return;
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
// compute expansion using global parsed dictionary g_raw_dict
|
| 127 |
+
std::unordered_set<std::string> acc;
|
| 128 |
+
std::vector<std::string> frontier;
|
| 129 |
+
auto it_raw = g_raw_dict.find(*wp);
|
| 130 |
+
if (it_raw != g_raw_dict.end()){
|
| 131 |
+
auto toks = def_tokens_from_text(it_raw->second);
|
| 132 |
+
for (auto &t : toks){
|
| 133 |
+
if (acc.insert(t).second) frontier.push_back(t);
|
| 134 |
+
}
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
for (int depth = 1; depth < def_depth && !frontier.empty(); ++depth){
|
| 138 |
+
std::vector<std::string> nextf;
|
| 139 |
+
for (auto &w : frontier){
|
| 140 |
+
auto it2 = g_raw_dict.find(w);
|
| 141 |
+
if (it2 == g_raw_dict.end()) continue;
|
| 142 |
+
auto toks2 = def_tokens_from_text(it2->second);
|
| 143 |
+
for (auto &t : toks2){
|
| 144 |
+
if (acc.insert(t).second) nextf.push_back(t);
|
| 145 |
+
}
|
| 146 |
+
}
|
| 147 |
+
frontier.swap(nextf);
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
// intern all accumulated tokens and store pointers
|
| 151 |
+
std::vector<StrPtr> out;
|
| 152 |
+
out.reserve(acc.size());
|
| 153 |
+
for (auto &s : acc){
|
| 154 |
+
out.push_back(interner.intern(s));
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
// store atomically (prevent double insertion)
|
| 158 |
+
{
|
| 159 |
+
std::lock_guard<std::mutex> lk(def_m);
|
| 160 |
+
// another thread may have inserted meanwhile; do not overwrite
|
| 161 |
+
if (def_index.find(wp) == def_index.end()){
|
| 162 |
+
def_index.emplace(wp, std::move(out));
|
| 163 |
+
}
|
| 164 |
+
}
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
// existing public add_pair but now ensure def-expansion is built immediately
|
| 168 |
void add_pair(const std::string &k, const std::string &v){
|
| 169 |
StrPtr kp = interner.intern(k);
|
| 170 |
StrPtr vp = interner.intern(v);
|
| 171 |
+
// ensure definition expansion for both words as soon as they are seen
|
| 172 |
+
ensure_def_for_interned(kp);
|
| 173 |
+
ensure_def_for_interned(vp);
|
| 174 |
add_pair_interned(kp, vp);
|
| 175 |
}
|
| 176 |
+
|
| 177 |
std::optional<NextSet> lookup_by_string(const std::string &k) const {
|
| 178 |
for (auto &pr : next) if (*pr.first == k) return pr.second;
|
| 179 |
return std::nullopt;
|
|
|
|
| 185 |
}
|
| 186 |
};
|
| 187 |
|
| 188 |
+
// thread-safe snapshot of kb.def_index as string-based def-index
|
| 189 |
+
static std::unordered_map<std::string,std::unordered_set<std::string>>
|
| 190 |
+
snapshot_def_index(KnowledgeBase &kb){
|
| 191 |
+
std::unordered_map<std::string,std::unordered_set<std::string>> out;
|
| 192 |
+
std::lock_guard<std::mutex> lk(kb.def_m);
|
| 193 |
+
out.reserve(kb.def_index.size());
|
| 194 |
+
for (auto &pr : kb.def_index){
|
| 195 |
+
std::unordered_set<std::string> s;
|
| 196 |
+
s.reserve(pr.second.size());
|
| 197 |
+
for (auto p : pr.second) s.insert(*p);
|
| 198 |
+
out.emplace(*pr.first, std::move(s));
|
| 199 |
+
}
|
| 200 |
+
return out;
|
| 201 |
+
}
|
| 202 |
+
|
| 203 |
// --------------------------- Small JSON parse helpers ----------------------
|
| 204 |
|
| 205 |
// True when i is a valid position in a sequence of length n.
static inline bool json_valid_index(size_t i, size_t n){
    return n > i;
}
|
|
|
|
| 260 |
return dict;
|
| 261 |
}
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
// --------------------------- Similarity helpers (very small) ----------------
|
| 264 |
|
| 265 |
static double jaccard_similarity(const std::unordered_set<std::string> &A,
|
|
|
|
| 426 |
|
| 427 |
// --------------------------- Serialization (short functions) ----------------
|
| 428 |
|
|
|
|
| 429 |
static void save_kb_binary(const KnowledgeBase &kb, const std::string &fname){
|
| 430 |
std::ofstream ofs(fname, std::ios::binary);
|
| 431 |
if (!ofs) throw std::runtime_error("cannot open save file");
|
| 432 |
+
|
| 433 |
+
// interned strings snapshot (must include all tokens used by def_index)
|
| 434 |
std::vector<const std::string*> interned;
|
| 435 |
interned.reserve(kb.interner.pool.size());
|
| 436 |
for (auto &s : kb.interner.pool) interned.push_back(&s);
|
| 437 |
+
|
| 438 |
uint64_t N = interned.size();
|
| 439 |
ofs.write(reinterpret_cast<const char*>(&N), sizeof(N));
|
| 440 |
for (auto p : interned){
|
|
|
|
| 442 |
ofs.write(reinterpret_cast<const char*>(&L), sizeof(L));
|
| 443 |
ofs.write(p->data(), static_cast<std::streamsize>(L));
|
| 444 |
}
|
| 445 |
+
|
| 446 |
+
// edges
|
| 447 |
uint64_t E = kb.next.size();
|
| 448 |
ofs.write(reinterpret_cast<const char*>(&E), sizeof(E));
|
| 449 |
for (auto &pr : kb.next){
|
|
|
|
| 450 |
const std::string &key = *pr.first;
|
| 451 |
auto it = std::find_if(interned.begin(), interned.end(), [&](const std::string* s){ return *s == key; });
|
| 452 |
if (it == interned.end()) throw std::runtime_error("save index error");
|
|
|
|
| 461 |
ofs.write(reinterpret_cast<const char*>(&v_idx), sizeof(v_idx));
|
| 462 |
}
|
| 463 |
}
|
| 464 |
+
|
| 465 |
+
// --- write definition expansion section ---
|
| 466 |
+
uint64_t D = static_cast<uint64_t>(kb.def_depth);
|
| 467 |
+
ofs.write(reinterpret_cast<const char*>(&D), sizeof(D));
|
| 468 |
+
|
| 469 |
+
// def entries: number of keys with a stored expansion
|
| 470 |
+
uint64_t K = kb.def_index.size();
|
| 471 |
+
ofs.write(reinterpret_cast<const char*>(&K), sizeof(K));
|
| 472 |
+
for (auto &pr : kb.def_index){
|
| 473 |
+
// key index
|
| 474 |
+
const std::string &key = *pr.first;
|
| 475 |
+
auto it = std::find_if(interned.begin(), interned.end(), [&](const std::string* s){ return *s == key; });
|
| 476 |
+
if (it == interned.end()) throw std::runtime_error("save def index error");
|
| 477 |
+
uint64_t key_idx = static_cast<uint64_t>(std::distance(interned.begin(), it));
|
| 478 |
+
ofs.write(reinterpret_cast<const char*>(&key_idx), sizeof(key_idx));
|
| 479 |
+
|
| 480 |
+
// number of tokens
|
| 481 |
+
uint64_t M = pr.second.size();
|
| 482 |
+
ofs.write(reinterpret_cast<const char*>(&M), sizeof(M));
|
| 483 |
+
for (auto tokp : pr.second){
|
| 484 |
+
auto it2 = std::find_if(interned.begin(), interned.end(), [&](const std::string* s){ return *s == *tokp; });
|
| 485 |
+
if (it2 == interned.end()) throw std::runtime_error("save def token index error");
|
| 486 |
+
uint64_t v_idx = static_cast<uint64_t>(std::distance(interned.begin(), it2));
|
| 487 |
+
ofs.write(reinterpret_cast<const char*>(&v_idx), sizeof(v_idx));
|
| 488 |
+
}
|
| 489 |
+
}
|
| 490 |
+
|
| 491 |
safe_flush(ofs);
|
| 492 |
}
|
| 493 |
|
| 494 |
+
static void load_kb_binary(KnowledgeBase &kb, const std::string &fname, int cli_dict_depth){
|
| 495 |
std::ifstream ifs(fname, std::ios::binary);
|
| 496 |
if (!ifs) throw std::runtime_error("cannot open load file");
|
| 497 |
+
|
| 498 |
uint64_t N;
|
| 499 |
ifs.read(reinterpret_cast<char*>(&N), sizeof(N));
|
| 500 |
std::vector<std::string> strings; strings.reserve((size_t)N);
|
|
|
|
| 506 |
}
|
| 507 |
std::vector<StrPtr> ptrs; ptrs.reserve(strings.size());
|
| 508 |
for (auto &s : strings) ptrs.push_back(kb.interner.intern(s));
|
| 509 |
+
|
| 510 |
uint64_t E; ifs.read(reinterpret_cast<char*>(&E), sizeof(E));
|
| 511 |
for (uint64_t i=0;i<E;++i){
|
| 512 |
uint64_t key_idx; ifs.read(reinterpret_cast<char*>(&key_idx), sizeof(key_idx));
|
|
|
|
| 519 |
}
|
| 520 |
kb.next.emplace(key_ptr, std::move(vec));
|
| 521 |
}
|
| 522 |
+
|
| 523 |
+
// read def-expansion section (new-format)
|
| 524 |
+
uint64_t file_def_depth;
|
| 525 |
+
ifs.read(reinterpret_cast<char*>(&file_def_depth), sizeof(file_def_depth));
|
| 526 |
+
uint64_t K; ifs.read(reinterpret_cast<char*>(&K), sizeof(K));
|
| 527 |
+
// populate kb.def_index from file
|
| 528 |
+
{
|
| 529 |
+
std::lock_guard<std::mutex> lk(kb.def_m);
|
| 530 |
+
kb.def_index.clear();
|
| 531 |
+
kb.def_depth = static_cast<int>(file_def_depth);
|
| 532 |
+
}
|
| 533 |
+
for (uint64_t i=0;i<K;++i){
|
| 534 |
+
uint64_t key_idx; ifs.read(reinterpret_cast<char*>(&key_idx), sizeof(key_idx));
|
| 535 |
+
uint64_t M; ifs.read(reinterpret_cast<char*>(&M), sizeof(M));
|
| 536 |
+
std::vector<StrPtr> tokens; tokens.reserve((size_t)M);
|
| 537 |
+
for (uint64_t j=0;j<M;++j){
|
| 538 |
+
uint64_t v_idx; ifs.read(reinterpret_cast<char*>(&v_idx), sizeof(v_idx));
|
| 539 |
+
tokens.push_back(ptrs.at((size_t)v_idx));
|
| 540 |
+
}
|
| 541 |
+
kb.def_index.emplace(ptrs.at((size_t)key_idx), std::move(tokens));
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
+
// If CLI requested a different dict depth, clear and recompute expansion for loaded words only
|
| 545 |
+
if (cli_dict_depth != kb.def_depth){
|
| 546 |
+
kb.set_def_depth(cli_dict_depth);
|
| 547 |
+
// --- build deduplicated union of "words present" = saved strings (ptrs) ∪ KB words (keys and neighbors)
|
| 548 |
+
std::vector<StrPtr> targets;
|
| 549 |
+
targets.reserve(ptrs.size() + kb.next.size()*2);
|
| 550 |
+
|
| 551 |
+
{
|
| 552 |
+
std::unordered_set<StrPtr, PtrHash, PtrEq> seen;
|
| 553 |
+
// include all strings from the saved file
|
| 554 |
+
for (auto p : ptrs) {
|
| 555 |
+
if (seen.insert(p).second) targets.push_back(p);
|
| 556 |
+
}
|
| 557 |
+
// include all words present in KB edges (keys and their neighbors)
|
| 558 |
+
for (auto &pr : kb.next) {
|
| 559 |
+
if (seen.insert(pr.first).second) targets.push_back(pr.first);
|
| 560 |
+
for (auto v : pr.second) {
|
| 561 |
+
if (seen.insert(v).second) targets.push_back(v);
|
| 562 |
+
}
|
| 563 |
+
}
|
| 564 |
+
}
|
| 565 |
+
|
| 566 |
+
// --- recompute definition expansion for each target in parallel
|
| 567 |
+
#pragma omp parallel for schedule(dynamic)
|
| 568 |
+
for (ptrdiff_t i = 0; i < static_cast<ptrdiff_t>(targets.size()); ++i) {
|
| 569 |
+
kb.ensure_def_for_interned(targets[(size_t)i]);
|
| 570 |
+
}
|
| 571 |
+
}
|
| 572 |
}
|
| 573 |
|
| 574 |
// --------------------------- CLI + Interactive loop (short functions) -----------
|
| 575 |
|
| 576 |
// Writes the command-line synopsis followed by one help line per option to
// standard output. p is the program name to show after "Usage: ".
static void print_usage(const char *p){
    static const char *const kOptionHelp[] = {
        "  --maxlen N           Maximum number of tokens generated in a response.\n",
        "  --save FILE          Save the knowledge-base and dictionary expansions to a binary file.\n",
        "  --load-kb FILE       Load a previously saved knowledge-base (and dictionary expansions) from a binary file.\n",
        "  --dict-depth D       Depth of dictionary-definition expansion used during learning.\n",
        "  --learn f1 f2 ...    Learn from one or more text files to update the knowledge base.\n",
        "  --repeat-penalty P   Penalize repeated tokens during response generation (higher values discourage repetition).\n",
        "  --help               Show command-line interface options for ChatIPC usage.\n",
    };

    std::cout << "Usage: " << p
              << " [--maxlen N] [--save FILE] [--load-kb FILE] [--dict-depth D] [--learn f1 f2 ...] [--repeat-penalty P] [--help]\n";
    for (const char *help_line : kOptionHelp) std::cout << help_line;
}
|
| 586 |
|
| 587 |
int main(int argc, char **argv){
|
|
|
|
| 616 |
|
| 617 |
KnowledgeBase kb;
|
| 618 |
|
| 619 |
+
// parse the embedded dictionary once for use by per-word expansion
|
| 620 |
+
g_raw_dict = parse_dictionary_json();
|
| 621 |
+
// set KB def depth (clears any previous expansion)
|
| 622 |
+
kb.set_def_depth(dict_depth);
|
| 623 |
+
|
| 624 |
+
|
| 625 |
if (!load_kb.empty()){
|
| 626 |
+
try { load_kb_binary(kb, load_kb, dict_depth); std::cerr << "Loaded KB: " << load_kb << "\n"; }
|
| 627 |
catch (const std::exception &e){ std::cerr << "Load KB error: " << e.what() << "\n"; }
|
| 628 |
}
|
| 629 |
|
|
|
|
| 632 |
learn_files_parallel(kb, learn_files);
|
| 633 |
}
|
| 634 |
|
|
|
|
|
|
|
|
|
|
| 635 |
std::string line;
|
| 636 |
std::cout << "Ready. Enter prompts.\n";
|
| 637 |
while (std::cout << "> " , std::getline(std::cin, line)){
|
| 638 |
if (line.empty()){ std::cout << "\n"; continue; }
|
| 639 |
auto prompt_toks = tokenize_whitespace(line);
|
| 640 |
for (size_t i=1;i<prompt_toks.size();++i) kb.add_pair(prompt_toks[i-1], prompt_toks[i]);
|
| 641 |
+
auto def_index = snapshot_def_index(kb);
|
| 642 |
auto resp = generate_response(kb, prompt_toks, maxlen, def_index, repeat_penalty);
|
| 643 |
for (size_t i=0;i<resp.size();++i){ std::cout << resp[i]; if (i+1<resp.size()) std::cout << ' '; }
|
| 644 |
std::cout << "\n";
|
ChatIPC.exe
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cebe8014f2651597ff43d04dc0a27089ada505868be13eda19ce52089baafc8b
|
| 3 |
+
size 24027648
|