guru / feed.py

Upload folder using huggingface_hub

d38c1d3 verified 27 days ago

20.2 kB

	"""
	Fast feeder: build brain LMDB using scipy sparse matrices instead of Python dicts.

	Python dicts cost ~70 bytes per edge. scipy sparse uses ~12 bytes per edge.
	178M edges: 12GB in dicts vs 2.1GB in scipy. Processes ALL 1.29M records.

	Usage:
	python3 feed.py # all datasets
	python3 feed.py --limit 1000 # first 1000 records
	"""

	import sys, os, json, time, argparse, re, struct, gc
	from pathlib import Path
	from collections import defaultdict

	import numpy as np
	from scipy.sparse import lil_matrix, csr_matrix
	import lmdb

	# Input locations and the output LMDB directory, all resolved under the
	# current user's home directory.
	DATA_DIR = Path.home() / "webmind-research" / "data"
	SEED_PATH = Path.home() / "nexus-brain" / "seed.jsonl"
	DB_PATH = os.path.expanduser('~/nexus-brain/brain.lmdb')

	# Weight added to the co-occurrence matrix for every ordered word pair that
	# falls inside the window; repeated pairs are summed when the matrix is built.
	COOCCURRENCE_PULL = 0.3
	COOC_WINDOW = 10 # only pair words within this window (not all-pairs)
	MAX_CONTENT_TOKENS = 50 # cap content tokens per sentence to limit O(n²)

	# Tokens treated as structural: main() excludes them from the vocabulary and
	# keeps them verbatim when it turns a sentence into a slot template.
	FUNCTION_WORDS = frozenset({
	"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
	"have", "has", "had", "do", "does", "did", "will", "would", "could",
	"should", "may", "might", "shall", "can", "must", "of", "in", "to",
	"for", "with", "on", "at", "by", "from", "as", "into", "through",
	"during", "before", "after", "above", "below", "between", "out",
	"off", "over", "under", "and", "but", "or", "nor", "not", "so",
	"yet", "both", "either", "neither", "each", "every", "all", "any",
	"few", "more", "most", "other", "some", "such", "no", "only", "own",
	"same", "than", "too", "very", "just", "about", "up", "what", "which",
	"who", "whom", "this", "that", "these", "those", "am", "if", "then",
	"because", "while", "although", "though", "even", "also", "it", "its",
	"how", "when", "where", "why", "there", "here",
	})

	# Pre-compiled little-endian binary layouts used for LMDB keys and values.
	# '<i'    : one int32 (ids, counts, the vector dimension).
	# '<if'   : int32 id + float32 weight (successor and co-occurrence entries).
	# '<ii'   : int32 word id + int32 position (sentence entries).
	# '<fq?b' : float32, int64, bool, int8; main() writes (0.5, 0, False, 1) for
	#           every neuron. NOTE(review): the meaning of the four fields is not
	#           visible in this file; confirm against the reader of this DB.
	_ID_FMT = struct.Struct('<i')
	_ID_CONF_FMT = struct.Struct('<if')
	_SENT_ENTRY_FMT = struct.Struct('<ii')
	_NEURON_FMT = struct.Struct('<fq?b')

	# Number of float32 components in each word vector; stored in the DB as 'dim'.
	VECTOR_DIM = 512


	def tokenize(text):
	    """Lower-case *text* and return its maximal runs of ``a-z``/``0-9`` in order.

	    Every other character (punctuation, whitespace, non-ASCII letters) acts
	    as a separator and is dropped.
	    """
	    lowered = text.lower()
	    return [match.group(0) for match in re.finditer(r'[a-z0-9]+', lowered)]


	def avail_mb(meminfo_path='/proc/meminfo'):
	    """Return the available system memory in MiB.

	    Reads the ``MemAvailable:`` line (a value in kB) from *meminfo_path*.

	    Args:
	        meminfo_path: File to parse; defaults to the Linux ``/proc/meminfo``.

	    Returns:
	        The value converted to MiB, or the sentinel ``9999`` when the file
	        cannot be read, has no ``MemAvailable:`` line, or the line is
	        malformed. The sentinel is always returned in those cases (never
	        ``None``) so callers can format and compare the result safely.
	    """
	    try:
	        with open(meminfo_path) as f:
	            for line in f:
	                if line.startswith('MemAvailable:'):
	                    return int(line.split()[1]) / 1024
	    except (OSError, ValueError, IndexError):
	        # Best effort: unreadable file or unparsable line falls through to
	        # the sentinel below.
	        pass
	    return 9999


	def rss_mb(status_path='/proc/self/status'):
	    """Return this process's resident set size in MiB.

	    Reads the ``VmRSS:`` line (a value in kB) from *status_path*.

	    Args:
	        status_path: File to parse; defaults to the Linux
	            ``/proc/self/status``.

	    Returns:
	        The value converted to MiB, or ``0`` when the file cannot be read,
	        has no ``VmRSS:`` line, or the line is malformed. ``0`` is always
	        returned in those cases (never ``None``) so callers can format the
	        result safely.
	    """
	    try:
	        with open(status_path) as f:
	            for line in f:
	                if line.startswith('VmRSS:'):
	                    return int(line.split()[1]) / 1024
	    except (OSError, ValueError, IndexError):
	        # Best effort: unreadable file or unparsable line falls through to
	        # the fallback below.
	        pass
	    return 0


	def disk_free_gb(path='/'):
	    """Return the free space, in GiB, available to unprivileged users.

	    Args:
	        path: Any path on the filesystem to inspect. Defaults to ``'/'``,
	            so a bare call behaves as before; pass the database directory to
	            measure the volume that will actually receive the data.

	    Returns:
	        ``f_bavail * f_frsize`` converted to GiB, or the sentinel ``999``
	        when the filesystem cannot be queried (missing path, permission
	        error, or a platform without ``os.statvfs``).
	    """
	    try:
	        st = os.statvfs(path)
	    except (OSError, AttributeError):
	        # AttributeError covers platforms where os.statvfs does not exist.
	        return 999
	    return st.f_bavail * st.f_frsize / (1024 ** 3)


	def main():
	    """Build the brain LMDB from JSONL datasets.

	    Phase 1 streams records and accumulates windowed co-occurrence pairs as
	    COO triplets, plus successor counts, sentences and sentence templates.
	    Phase 1.5 collapses the triplets into a CSR matrix (duplicates summed)
	    and derives one L2-normalised hashed-projection vector per word.
	    Phase 2 deletes any existing database at ``DB_PATH`` and writes all
	    tables into a fresh LMDB environment, committing in batches.

	    Command-line flags: ``--dataset``, ``--limit``, ``--mem-floor``,
	    ``--max-vocab``, ``--map-size`` and ``--seed`` (see their help strings).

	    Ingestion stops early, keeping what was collected so far, on Ctrl-C or
	    when available RAM drops below ``--mem-floor``. If nothing at all was
	    ingested the function returns before touching the existing database.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--dataset", default="all")
	    parser.add_argument("--limit", type=int, default=0)
	    parser.add_argument("--mem-floor", type=int, default=2048,
	                        help="Stop ingestion when available RAM drops below this (MB)")
	    parser.add_argument("--max-vocab", type=int, default=600_000,
	                        help="Max vocabulary size (pre-allocate sparse matrix)")
	    parser.add_argument("--map-size", type=int, default=4,
	                        help="LMDB map size in GB")
	    parser.add_argument("--seed", action="store_true",
	                        help="Use curated seed.jsonl instead of raw data dir")
	    args = parser.parse_args()

	    if args.seed and SEED_PATH.exists():
	        datasets = [SEED_PATH]
	        print(f"Using curated seed: {SEED_PATH}")
	    else:
	        EXCLUDE = {'coco_captions', 'audiocaps', 'vggsound'}
	        datasets = sorted(DATA_DIR.glob("*.jsonl"))
	        if args.dataset != "all":
	            datasets = [DATA_DIR / f"{args.dataset}.jsonl"]
	        else:
	            datasets = [d for d in datasets if d.stem not in EXCLUDE]

	    print(f"Datasets: {len(datasets)} files")
	    print(f"RAM: {avail_mb():.0f}MB avail | Disk: {disk_free_gb():.1f}GB free")
	    print(f"Max vocab: {args.max_vocab:,} | Mem floor: {args.mem_floor}MB")
	    print(f"LMDB path: {DB_PATH}")

	    # ============ Phase 1: Build sparse co-occurrence matrix ============
	    V = args.max_vocab  # vocabulary cap
	    print(f"\n=== Phase 1: Building sparse co-occurrence ({V:,} x {V:,} pre-alloc) ===")

	    # COO accumulation (fastest for construction)
	    cooc_rows = []
	    cooc_cols = []
	    cooc_vals = []

	    words = []
	    word_idx = {}
	    successors = defaultdict(lambda: defaultdict(float))
	    sentences = []
	    sentence_texts = []  # original text for each sentence (for full-text retrieval)
	    template_counts = defaultdict(int)  # pattern -> count

	    def text_field(record, key):
	        # JSON values may be null, numbers or nested objects; only real
	        # strings are usable as text, anything else counts as missing.
	        value = record.get(key, "")
	        return value.strip() if isinstance(value, str) else ""

	    total_fed = 0
	    t0 = time.time()

	    try:
	        for ds_path in datasets:
	            if not ds_path.exists() or ds_path.stat().st_size == 0:
	                continue
	            ds_name = ds_path.stem
	            ds_fed = 0

	            # Explicit encoding: the platform default is not UTF-8 everywhere.
	            with open(ds_path, encoding="utf-8") as f:
	                for line in f:
	                    if args.limit and total_fed >= args.limit:
	                        break
	                    try:
	                        record = json.loads(line)
	                    except json.JSONDecodeError:
	                        continue
	                    if not isinstance(record, dict):
	                        # Valid JSON but not an object (list, number, ...):
	                        # skip it like any other unusable line.
	                        continue

	                    texts = []
	                    text = text_field(record, "text")
	                    question = text_field(record, "question")
	                    answer = text_field(record, "answer")
	                    if text and len(text) >= 10:
	                        texts.append(text)
	                    if question and answer:
	                        if len(answer) < 50 and not answer.endswith('.'):
	                            texts.append(f"{question.rstrip('?')} is {answer}")
	                        else:
	                            texts.append(answer)

	                    for sent in texts:
	                        tokens = tokenize(sent)
	                        content = [t for t in tokens if t not in FUNCTION_WORDS]
	                        if not content:
	                            continue

	                        for w in content:
	                            if w not in word_idx:
	                                if len(words) >= V:
	                                    continue  # vocabulary full
	                                idx = len(words)
	                                words.append(w)
	                                word_idx[w] = idx

	                        indices = [word_idx[w] for w in content if w in word_idx]
	                        if len(indices) < 2:
	                            continue
	                        # Cap long documents to prevent O(n²) explosion
	                        if len(indices) > MAX_CONTENT_TOKENS:
	                            indices = indices[:MAX_CONTENT_TOKENS]

	                        # Window-based co-occurrence (O(n*W) not O(n²));
	                        # each pair is stored in both directions.
	                        n_idx = len(indices)
	                        for i in range(n_idx):
	                            for j in range(i + 1, min(i + COOC_WINDOW + 1, n_idx)):
	                                a, b = indices[i], indices[j]
	                                cooc_rows.append(a)
	                                cooc_cols.append(b)
	                                cooc_vals.append(COOCCURRENCE_PULL)
	                                cooc_rows.append(b)
	                                cooc_cols.append(a)
	                                cooc_vals.append(COOCCURRENCE_PULL)

	                        for i in range(len(indices) - 1):
	                            successors[indices[i]][indices[i + 1]] += 1.0

	                        sentences.append(tuple(indices))
	                        sentence_texts.append(sent)

	                        # Extract template: structural words stay, content -> slots
	                        if 3 <= len(tokens) <= 20:
	                            structural_count = sum(1 for t in tokens if t in FUNCTION_WORDS)
	                            if structural_count >= 1 and structural_count < len(tokens):
	                                parts = []
	                                slot_idx = 0
	                                for t in tokens:
	                                    if t in FUNCTION_WORDS:
	                                        parts.append(t)
	                                    else:
	                                        parts.append(f"[S{slot_idx}]")
	                                        slot_idx += 1
	                                pattern = " ".join(parts)
	                                template_counts[pattern] += 1

	                    # NOTE(review): counted once per JSON record (the output
	                    # labels these "records"); the original indentation was
	                    # lost, so confirm this was not meant to be per sentence.
	                    total_fed += 1
	                    ds_fed += 1

	                    if total_fed % 10000 == 0:
	                        elapsed = time.time() - t0
	                        rate = total_fed / elapsed
	                        mem = avail_mb()
	                        n_edges = len(cooc_rows)
	                        edge_mb = n_edges * 12 / (1024 * 1024)  # 4+4+4 bytes per COO entry
	                        print(
	                            f" [{ds_name}] {total_fed:,} fed | "
	                            f"{len(words):,} words | {n_edges:,} edges ({edge_mb:.0f}MB) | "
	                            f"{rate:,.0f}/sec | RSS: {rss_mb():.0f}MB | "
	                            f"Avail: {mem:.0f}MB",
	                            flush=True
	                        )
	                        if mem < args.mem_floor:
	                            raise MemoryError("RAM floor hit")

	            if args.limit and total_fed >= args.limit:
	                break
	            print(f" {ds_name}: {ds_fed:,} records", flush=True)

	    except (KeyboardInterrupt, MemoryError) as e:
	        # Deliberate: keep whatever was collected and carry on to the write phase.
	        print(f"\nPhase 1 stopped: {e}")

	    elapsed1 = time.time() - t0
	    n_edges = len(cooc_rows)
	    print(f"\nPhase 1: {total_fed:,} records | {len(words):,} words | "
	          f"{n_edges:,} COO entries | {len(sentences):,} sentences | {elapsed1:.1f}s")
	    print(f" COO memory: {n_edges * 12 / (1024*1024):.0f} MB "
	          f"(vs ~{n_edges * 70 / (1024*1024):.0f} MB in Python dicts)")

	    V_actual = len(words)
	    if V_actual == 0:
	        # Phase 2 deletes the existing database before writing. With no
	        # input (e.g. a mistyped --dataset) that would replace a good DB
	        # with an empty one, so stop here instead.
	        print("No vocabulary ingested - leaving any existing LMDB untouched.")
	        return

	    # ============ Phase 1.5: COO -> CSR + hashed projection vectors ============
	    print(f"\n=== Phase 1.5: COO → CSR matrix + {VECTOR_DIM}-dim vectors ===")
	    t15 = time.time()

	    # Build CSR from COO (scipy handles duplicate summing)
	    from scipy.sparse import coo_matrix
	    cooc_mat = coo_matrix(
	        (np.array(cooc_vals, dtype=np.float32),
	         (np.array(cooc_rows, dtype=np.int32),
	          np.array(cooc_cols, dtype=np.int32))),
	        shape=(V_actual, V_actual),
	    ).tocsr()

	    # Free COO arrays
	    del cooc_rows, cooc_cols, cooc_vals
	    gc.collect()

	    print(f" CSR: {V_actual:,} x {V_actual:,}, {cooc_mat.nnz:,} non-zeros, "
	          f"{cooc_mat.data.nbytes / (1024*1024):.0f} MB")

	    # Hashed projection vectors from CSR: each neighbour id is folded into
	    # bucket (id % VECTOR_DIM) and its weight accumulated there.
	    word_vectors = np.zeros((V_actual, VECTOR_DIM), dtype=np.float32)
	    indptr = cooc_mat.indptr
	    indices = cooc_mat.indices
	    data = cooc_mat.data

	    for widx in range(V_actual):
	        start, end = indptr[widx], indptr[widx + 1]
	        if start == end:
	            continue
	        # One bincount per row replaces a Python-level loop over every
	        # non-zero; it only allocates a VECTOR_DIM-sized temporary.
	        word_vectors[widx] = np.bincount(
	            indices[start:end] % VECTOR_DIM,
	            weights=data[start:end],
	            minlength=VECTOR_DIM,
	        )

	    norms = np.linalg.norm(word_vectors, axis=1, keepdims=True)
	    norms = np.maximum(norms, 1e-8)  # avoid dividing all-zero rows by 0
	    word_vectors = word_vectors / norms

	    elapsed15 = time.time() - t15
	    print(f" Vectors: {V_actual:,} × {VECTOR_DIM} = {word_vectors.nbytes/(1024*1024):.0f} MB | {elapsed15:.1f}s")
	    print(f" RSS: {rss_mb():.0f} MB | Avail: {avail_mb():.0f} MB")

	    # ============ Phase 2: Write to LMDB (batched to avoid I/O spikes) ============
	    print("\n=== Phase 2: Writing to LMDB (batched) ===")
	    t2 = time.time()

	    # Drop the extra references to the CSR arrays before opening the mmap
	    del indptr, indices, data
	    gc.collect()
	    print(f" Pre-LMDB: RSS {rss_mb():.0f} MB | Avail {avail_mb():.0f} MB")

	    if os.path.exists(DB_PATH):
	        import shutil
	        shutil.rmtree(DB_PATH)

	    map_size = args.map_size * 1024 * 1024 * 1024
	    env = lmdb.open(DB_PATH, max_dbs=16, map_size=map_size)

	    neurons_db = env.open_db(b'neurons')
	    vectors_db = env.open_db(b'vectors')
	    successors_db = env.open_db(b'successors')
	    predecessors_db = env.open_db(b'predecessors')
	    words_db = env.open_db(b'words')
	    sentences_db = env.open_db(b'sentences')
	    sent_index_db = env.open_db(b'sent_index')
	    cooc_db = env.open_db(b'cooccurrence')
	    templates_db = env.open_db(b'templates')
	    sent_text_db = env.open_db(b'sentence_text')
	    meta_db = env.open_db(b'meta')

	    BATCH = 10_000  # commit every 10K entries to limit dirty pages

	    def batched_write(label, items, db, transform=None):
	        """Write items in batches of BATCH, committing + syncing between.

	        *items* yields ``(key, value)`` pairs, or arbitrary objects that
	        *transform* turns into such pairs. A ``None`` key means "skip this
	        item". Returns the number of entries written.
	        """
	        count = 0
	        txn = env.begin(write=True)
	        for item in items:
	            if transform:
	                k, v = transform(item)
	            else:
	                k, v = item
	            if k is not None:
	                txn.put(k, v, db=db)
	                count += 1
	                if count % BATCH == 0:
	                    txn.commit()
	                    env.sync(True)
	                    txn = env.begin(write=True)
	                if count % 50_000 == 0:
	                    print(f" {label}: {count:,} written | RSS {rss_mb():.0f} MB", flush=True)
	        txn.commit()
	        env.sync(True)
	        print(f" {label}: {count:,} done", flush=True)
	        return count

	    # Meta (tiny, one txn)
	    with env.begin(write=True) as txn:
	        txn.put(b'count', _ID_FMT.pack(V_actual), db=meta_db)
	        txn.put(b'next_id', _ID_FMT.pack(V_actual), db=meta_db)
	        txn.put(b'dim', _ID_FMT.pack(VECTOR_DIM), db=meta_db)
	        txn.put(b'next_sentence_id', _ID_FMT.pack(len(sentences)), db=meta_db)

	    # Neurons (batched)
	    print(f" Writing {V_actual:,} neurons...")
	    batched_write("neurons", range(V_actual), neurons_db,
	                  transform=lambda i: (_ID_FMT.pack(i), _NEURON_FMT.pack(0.5, 0, False, 1)))

	    # Vectors (batched - each is 2KB, so 10K batch = 20MB per commit)
	    print(f" Writing {V_actual:,} vectors...")
	    batched_write("vectors", range(V_actual), vectors_db,
	                  transform=lambda i: (_ID_FMT.pack(i), word_vectors[i].tobytes()))

	    # Word mappings (batched)
	    print(" Writing word mappings...")
	    skipped = 0

	    def word_transform(item):
	        nonlocal skipped
	        w, idx = item
	        encoded = w.encode('utf-8')
	        if len(encoded) > 500:
	            # Too long to be used as a key here; skip and count it.
	            skipped += 1
	            return (None, None)
	        return (encoded, _ID_FMT.pack(idx))

	    batched_write("words", word_idx.items(), words_db, transform=word_transform)
	    if skipped:
	        print(f" (skipped {skipped} oversized word keys)")

	    # Successors + predecessors (batched, needs read-back for predecessors)
	    print(" Writing successors...")
	    succ_count = 0
	    txn = env.begin(write=True)
	    for src, targets in successors.items():
	        # Keep the 10 most frequent followers, weights scaled to the top one.
	        top = sorted(targets.items(), key=lambda x: -x[1])[:10]
	        max_c = top[0][1] if top else 1.0
	        succ_bytes = b''.join(
	            _ID_CONF_FMT.pack(tid, min(c / max_c, 1.0))
	            for tid, c in top
	        )
	        txn.put(_ID_FMT.pack(src), succ_bytes, db=successors_db)
	        for tid, c in top[:3]:
	            key = _ID_FMT.pack(tid)
	            existing = txn.get(key, db=predecessors_db) or b''
	            # At most three 4-byte predecessor ids per word.
	            if len(existing) < 3 * 4:
	                existing += _ID_FMT.pack(src)
	                txn.put(key, existing, db=predecessors_db)
	        succ_count += 1
	        if succ_count % BATCH == 0:
	            txn.commit()
	            env.sync(True)
	            txn = env.begin(write=True)
	        if succ_count % 50_000 == 0:
	            print(f" successors: {succ_count:,} | RSS {rss_mb():.0f} MB", flush=True)
	    txn.commit()
	    env.sync(True)
	    print(f" successors: {succ_count:,} done", flush=True)

	    # Co-occurrence from CSR (batched): one value per row holding all of
	    # that word's (neighbour id, weight) pairs.
	    csr_indptr = cooc_mat.indptr
	    csr_indices = cooc_mat.indices
	    csr_data = cooc_mat.data
	    print(f" Writing {cooc_mat.nnz:,} co-occurrence edges...")
	    cooc_count = 0
	    txn = env.begin(write=True)
	    for a in range(V_actual):
	        start, end = csr_indptr[a], csr_indptr[a + 1]
	        if start == end:
	            continue
	        parts = []
	        for k in range(start, end):
	            b = int(csr_indices[k])
	            w = float(csr_data[k])
	            if a != b and w > 0:  # drop self-loops and non-positive weights
	                parts.append(_ID_CONF_FMT.pack(b, w))
	        if parts:
	            txn.put(_ID_FMT.pack(a), b''.join(parts), db=cooc_db)
	            cooc_count += 1
	            if cooc_count % BATCH == 0:
	                txn.commit()
	                env.sync(True)
	                txn = env.begin(write=True)
	            if cooc_count % 50_000 == 0:
	                print(f" cooc rows: {cooc_count:,} | RSS {rss_mb():.0f} MB", flush=True)
	    txn.commit()
	    env.sync(True)
	    print(f" cooc rows: {cooc_count:,} done", flush=True)

	    # Sentences (batched); also builds the word -> sentence-ids reverse index
	    print(f" Writing {len(sentences):,} sentences...")
	    sent_reverse = defaultdict(list)
	    sent_count = 0
	    txn = env.begin(write=True)
	    for sid, word_indices_tuple in enumerate(sentences):
	        sent_bytes = b''.join(
	            _SENT_ENTRY_FMT.pack(wid, pos)
	            for pos, wid in enumerate(word_indices_tuple)
	        )
	        txn.put(_ID_FMT.pack(sid), sent_bytes, db=sentences_db)
	        for wid in word_indices_tuple:
	            sent_reverse[wid].append(sid)
	        sent_count += 1
	        if sent_count % BATCH == 0:
	            txn.commit()
	            env.sync(True)
	            txn = env.begin(write=True)
	    txn.commit()
	    env.sync(True)
	    print(f" sentences: {sent_count:,} done", flush=True)

	    # Sentence full text (batched)
	    def sent_text_transform(item):
	        sid, sent_text = item
	        # 'replace' keeps a lone surrogate (possible via JSON \uXXXX escapes)
	        # from aborting the whole write phase.
	        raw = sent_text.encode('utf-8', 'replace')
	        if len(raw) > 500:
	            # Cap at 500 bytes. A byte-level cut can split a multi-byte
	            # character, so drop the partial tail to keep valid UTF-8.
	            raw = raw[:500].decode('utf-8', 'ignore').encode('utf-8')
	        return (_ID_FMT.pack(sid), raw)

	    print(f" Writing {len(sentence_texts):,} sentence texts...")
	    batched_write("sent_text", enumerate(sentence_texts), sent_text_db,
	                  transform=sent_text_transform)

	    # Sentence index (batched): at most 50 sentence ids per word
	    print(f" Writing sentence index ({len(sent_reverse):,} entries)...")
	    batched_write("sent_index", sent_reverse.items(), sent_index_db,
	                  transform=lambda item: (
	                      _ID_FMT.pack(item[0]),
	                      b''.join(_ID_FMT.pack(s) for s in item[1][:50])
	                  ))

	    # Templates (top-K most frequent patterns)
	    TOP_TEMPLATES = 5000
	    top_templates = sorted(template_counts.items(), key=lambda x: -x[1])[:TOP_TEMPLATES]
	    # Filter: only templates seen at least 3 times and with 1-6 slots
	    top_templates = [(p, c) for p, c in top_templates
	                     if c >= 3 and 1 <= p.count('[S') <= 6]
	    print(f" Writing {len(top_templates):,} templates (from {len(template_counts):,} unique)...")
	    with env.begin(write=True) as txn:
	        for tid, (pattern, count) in enumerate(top_templates):
	            key = _ID_FMT.pack(tid)
	            val = json.dumps({"pattern": pattern, "count": count, "slots": pattern.count('[S')}).encode()
	            txn.put(key, val, db=templates_db)
	        # b'count' is 5 bytes, so it cannot collide with the 4-byte id keys.
	        txn.put(b'count', _ID_FMT.pack(len(top_templates)), db=templates_db)
	    env.sync(True)
	    print(f" templates: {len(top_templates):,} done", flush=True)

	    env.close()

	    elapsed2 = time.time() - t2
	    total_time = time.time() - t0
	    lmdb_mb = sum(f.stat().st_size for f in Path(DB_PATH).iterdir()) / (1024 * 1024)

	    print(f"\nPhase 2: {elapsed2:.1f}s")
	    print(f"\n{'='*60}")
	    print(f"COMPLETE: {total_fed:,} records → {V_actual:,} words")
	    print(f" Edges: {cooc_mat.nnz:,} (CSR)")
	    print(f" Sentences: {len(sentences):,}")
	    print(f" Templates: {len(top_templates):,}")
	    print(f" Vectors: {V_actual:,} × {VECTOR_DIM}")
	    print(f" LMDB: {lmdb_mb:.1f} MB | Disk: {disk_free_gb():.1f} GB")
	    print(f" Time: {total_time:.1f}s ({total_fed/max(total_time,1):,.0f} rec/sec)")
	    print(f" RSS: {rss_mb():.0f} MB | Avail: {avail_mb():.0f} MB")
	    print(f" Path: {DB_PATH}")
	    print(f"{'='*60}")


	# Script entry point: run the feeder only when executed directly, not on import.
	if __name__ == "__main__":
	    main()