engram / scripts /index_knowledge.py

feat: upload scripts

954cf8a verified 8 days ago

11.7 kB

	#!/usr/bin/env python3
	"""
	scripts/index_knowledge.py — Batch index markdown files into .eng binaries.

	Processes markdown files from a directory (or single file), chunks them,
	fingerprints each chunk, and writes .eng files to the knowledge index.

	Usage:
	# Index a single file
	python scripts/index_knowledge.py --source path/to/file.md --project engram

	# Index a directory recursively
	python scripts/index_knowledge.py --source path/to/docs/ --project engram

	# Re-index changed files only (incremental)
	python scripts/index_knowledge.py --source path/to/docs/ --project engram --incremental

	# Dry run — show what would be indexed
	python scripts/index_knowledge.py --source path/to/docs/ --project engram --dry-run

	# Force re-index everything
	python scripts/index_knowledge.py --source path/to/docs/ --project engram --force

	Environment:
	ENGRAM_SESSIONS_DIR Base sessions dir (default: ~/.engram/sessions)
	ENGRAM_KNOWLEDGE_DIR Knowledge index dir (default: ~/.engram/knowledge)
	ENGRAM_MODEL_PATH Path to GGUF model for real fingerprints (optional)
	PYTHONPATH=. Must include project root for kvcos imports
	"""

	from __future__ import annotations

	import argparse
	import hashlib
	import json
	import os
	import sys
	import time
	from datetime import datetime, timezone
	from pathlib import Path

	# Ensure project root is importable
	sys.path.insert(0, str(Path(__file__).parent.parent))

	import torch

	from kvcos.engram.chunker import Chunk, chunk_markdown, eng_filename, slug_from_path
	from kvcos.engram.format import EigramEncoder
	from kvcos.engram.manifest import ChunkRecord, Manifest, _content_hash, _file_hash


	# ── Configuration ────────────────────────────────────────────────────

	KNOWLEDGE_DIR = Path(
	os.environ.get("ENGRAM_KNOWLEDGE_DIR", "~/.engram/knowledge")
	).expanduser()

	SKIP_PATTERNS = {
	"node_modules",
	".venv",
	"__pycache__",
	".git",
	".eng",
	"site-packages",
	}

	SKIP_FILES = {
	"LICENSE.md",
	"CHANGELOG.md",
	"SECURITY.md",
	}


	# ── Fingerprinting ──────────────────────────────────────────────────

	from kvcos.engram.embedder import get_fingerprint as _get_fingerprint


	# ── .eng Writer ──────────────────────────────────────────────────────

	_encoder = EigramEncoder()


	def _write_knowledge_eng(
	fp_tensor: torch.Tensor,
	chunk: Chunk,
	eng_path: Path,
	session_id: str,
	fp_source: str,
	source_path: str,
	project: str,
	chunk_index: int,
	chunk_total: int,
	) -> Path:
	"""Write a .eng binary for a knowledge chunk."""
	dim = fp_tensor.shape[0]
	basis_rank = 116
	vec_perdoc = torch.zeros(basis_rank)
	vec_fcdb = torch.zeros(basis_rank)
	joint_center = torch.zeros(128)

	# Truncate description to 256 chars for binary
	description = chunk.text[:256]

	blob = _encoder.encode(
	vec_perdoc=vec_perdoc,
	vec_fcdb=vec_fcdb,
	joint_center=joint_center,
	corpus_hash=hashlib.sha256(source_path.encode()).hexdigest()[:32],
	model_id=fp_source[:16],
	basis_rank=basis_rank,
	n_corpus=0,
	layer_range=(0, 0),
	context_len=len(chunk.text),
	l2_norm=float(torch.norm(fp_tensor).item()),
	scs=0.0,
	margin_proof=0.0,
	task_description=description,
	cache_id=session_id,
	vec_fourier=fp_tensor if dim == 2048 else None,
	vec_fourier_v2=fp_tensor,
	confusion_flag=False,
	)

	eng_path.parent.mkdir(parents=True, exist_ok=True)
	with open(eng_path, "wb") as f:
	f.write(blob)

	# Write extended sidecar with full metadata
	meta = {
	"cache_id": session_id,
	"task_description": chunk.text[:500],
	"source_path": source_path,
	"project": project,
	"fp_source": fp_source,
	"chunk_index": chunk_index,
	"chunk_total": chunk_total,
	"char_start": chunk.char_start,
	"char_end": chunk.char_end,
	"headers": list(chunk.headers),
	"ts": time.time(),
	"type": "knowledge",
	}
	meta_path = Path(str(eng_path) + ".meta.json")
	with open(meta_path, "w") as f:
	json.dump(meta, f, indent=2)

	return eng_path


	# ── Discovery ────────────────────────────────────────────────────────

	def discover_markdown_files(source: Path) -> list[Path]:
	"""Find all indexable .md files under source path."""
	if source.is_file():
	return [source] if source.suffix == ".md" else []

	files: list[Path] = []
	for p in sorted(source.rglob("*.md")):
	# Skip files in excluded directories
	if any(skip in p.parts for skip in SKIP_PATTERNS):
	continue
	# Skip excluded filenames
	if p.name in SKIP_FILES:
	continue
	# Skip empty files
	if p.stat().st_size == 0:
	continue
	files.append(p)

	return files


	# ── Main Pipeline ────────────────────────────────────────────────────

	def index_file(
	source_path: Path,
	project: str,
	manifest: Manifest,
	date_str: str,
	dry_run: bool = False,
	force: bool = False,
	) -> tuple[Manifest, int]:
	"""
	Index a single markdown file into .eng chunks.

	Returns:
	(updated_manifest, chunks_written)
	"""
	content = source_path.read_text(encoding="utf-8", errors="replace")
	content_hash = _content_hash(content)

	# Incremental: skip if unchanged
	if not force and not manifest.needs_reindex(str(source_path), content_hash):
	return manifest, 0

	slug = slug_from_path(str(source_path))
	context = f"Source: {source_path.name} \| Project: {project}"

	# Chunk the content
	chunks = chunk_markdown(
	content,
	max_chars=2000,
	min_chars=100,
	context_prefix=context,
	)

	if dry_run:
	print(f" [DRY RUN] {source_path.name}: {len(chunks)} chunks, "
	f"{len(content)} chars")
	return manifest, len(chunks)

	# Write .eng for each chunk
	chunk_records: list[ChunkRecord] = []
	project_dir = KNOWLEDGE_DIR / project
	project_dir.mkdir(parents=True, exist_ok=True)

	for chunk in chunks:
	filename = eng_filename(
	project=project,
	slug=slug,
	date=date_str,
	chunk_index=chunk.index,
	chunk_total=len(chunks),
	)
	eng_path = project_dir / filename

	# Fingerprint the chunk text (with context)
	fp_tensor, fp_source = _get_fingerprint(chunk.text)

	session_id = f"{project}/{slug}"
	if len(chunks) > 1:
	session_id += f"_c{chunk.index + 1:03d}"

	_write_knowledge_eng(
	fp_tensor=fp_tensor,
	chunk=chunk,
	eng_path=eng_path,
	session_id=session_id,
	fp_source=fp_source,
	source_path=str(source_path),
	project=project,
	chunk_index=chunk.index,
	chunk_total=len(chunks),
	)

	chunk_records.append(ChunkRecord(
	eng_path=str(eng_path),
	chunk_index=chunk.index,
	chunk_total=len(chunks),
	char_start=chunk.char_start,
	char_end=chunk.char_end,
	indexed_at=time.time(),
	))

	# Register in manifest
	manifest = manifest.register(
	source_path=str(source_path),
	content_hash=content_hash,
	project=project,
	file_size=len(content.encode("utf-8")),
	chunks=chunk_records,
	)

	return manifest, len(chunks)


	def index_batch(
	source: Path,
	project: str,
	incremental: bool = True,
	dry_run: bool = False,
	force: bool = False,
	) -> dict:
	"""
	Index all markdown files under source path.

	Returns summary dict with stats.
	"""
	manifest = Manifest.load()
	date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

	files = discover_markdown_files(source)
	if not files:
	return {"error": f"No .md files found under {source}"}

	stats = {
	"source": str(source),
	"project": project,
	"files_found": len(files),
	"files_indexed": 0,
	"files_skipped": 0,
	"chunks_written": 0,
	"dry_run": dry_run,
	"incremental": incremental,
	"date": date_str,
	}

	print(f"\nENGRAM Knowledge Indexer")
	print(f"{'=' * 50}")
	print(f"Source: {source}")
	print(f"Project: {project}")
	print(f"Files found: {len(files)}")
	print(f"Mode: {'DRY RUN' if dry_run else 'LIVE'}")
	print(f"{'=' * 50}\n")

	for i, fpath in enumerate(files, 1):
	prev_chunks = manifest.total_chunks
	manifest, n_chunks = index_file(
	source_path=fpath,
	project=project,
	manifest=manifest,
	date_str=date_str,
	dry_run=dry_run,
	force=force,
	)

	if n_chunks > 0:
	stats["files_indexed"] += 1
	stats["chunks_written"] += n_chunks
	status = "INDEXED" if not dry_run else "DRY RUN"
	print(f" [{i}/{len(files)}] {status}: {fpath.name} "
	f"→ {n_chunks} chunks")
	else:
	stats["files_skipped"] += 1
	print(f" [{i}/{len(files)}] SKIP (unchanged): {fpath.name}")

	print(f"\n{'=' * 50}")
	print(f"Done. {stats['files_indexed']} files → "
	f"{stats['chunks_written']} chunks")
	if stats["files_skipped"]:
	print(f"Skipped {stats['files_skipped']} unchanged files")
	print(f"Manifest: {manifest.summary()}")
	print(f"{'=' * 50}\n")

	return stats


	# ── CLI ──────────────────────────────────────────────────────────────

	def main() -> None:
	parser = argparse.ArgumentParser(
	description="Index markdown files into ENGRAM .eng knowledge files"
	)
	parser.add_argument(
	"--source", "-s",
	required=True,
	help="Path to file or directory to index",
	)
	parser.add_argument(
	"--project", "-p",
	default="engram",
	help="Project namespace (default: engram)",
	)
	parser.add_argument(
	"--dry-run", "-n",
	action="store_true",
	help="Show what would be indexed without writing",
	)
	parser.add_argument(
	"--force", "-f",
	action="store_true",
	help="Re-index all files regardless of content hash",
	)
	parser.add_argument(
	"--incremental", "-i",
	action="store_true",
	default=True,
	help="Skip unchanged files (default: true)",
	)

	args = parser.parse_args()
	source = Path(args.source).resolve()

	if not source.exists():
	print(f"Error: {source} does not exist", file=sys.stderr)
	sys.exit(1)

	stats = index_batch(
	source=source,
	project=args.project,
	incremental=args.incremental,
	dry_run=args.dry_run,
	force=args.force,
	)

	if "error" in stats:
	print(f"Error: {stats['error']}", file=sys.stderr)
	sys.exit(1)


	if __name__ == "__main__":
	main()