File size: 11,741 Bytes

954cf8a

#!/usr/bin/env python3
"""
scripts/index_knowledge.py — Batch index markdown files into .eng binaries.

Processes markdown files from a directory (or single file), chunks them,
fingerprints each chunk, and writes .eng files to the knowledge index.

Usage:
    # Index a single file
    python scripts/index_knowledge.py --source path/to/file.md --project engram

    # Index a directory recursively
    python scripts/index_knowledge.py --source path/to/docs/ --project engram

    # Re-index changed files only (incremental — this is the default mode)
    python scripts/index_knowledge.py --source path/to/docs/ --project engram --incremental

    # Dry run — show what would be indexed
    python scripts/index_knowledge.py --source path/to/docs/ --project engram --dry-run

    # Force re-index everything
    python scripts/index_knowledge.py --source path/to/docs/ --project engram --force

Environment:
    ENGRAM_SESSIONS_DIR   Base sessions dir (default: ~/.engram/sessions)
    ENGRAM_KNOWLEDGE_DIR  Knowledge index dir (default: ~/.engram/knowledge)
    ENGRAM_MODEL_PATH     Path to GGUF model for real fingerprints (optional)
    PYTHONPATH=.          Project root for kvcos imports (the script also adds it to sys.path)
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

# Ensure project root is importable
sys.path.insert(0, str(Path(__file__).parent.parent))

import torch

from kvcos.engram.chunker import Chunk, chunk_markdown, eng_filename, slug_from_path
from kvcos.engram.format import EigramEncoder
from kvcos.engram.manifest import ChunkRecord, Manifest, _content_hash, _file_hash


# ── Configuration ────────────────────────────────────────────────────

# Root directory of the knowledge index; ENGRAM_KNOWLEDGE_DIR overrides the
# default location, and a leading "~" is expanded to the user's home.
KNOWLEDGE_DIR = Path(os.getenv("ENGRAM_KNOWLEDGE_DIR", "~/.engram/knowledge")).expanduser()

# Names that exclude a markdown file from discovery when they appear as a
# component of its path.
SKIP_PATTERNS = {
    ".eng",
    ".git",
    ".venv",
    "__pycache__",
    "node_modules",
    "site-packages",
}

# File names that are never indexed during directory discovery.
SKIP_FILES = {"CHANGELOG.md", "LICENSE.md", "SECURITY.md"}


# ── Fingerprinting ──────────────────────────────────────────────────

from kvcos.engram.embedder import get_fingerprint as _get_fingerprint


# ── .eng Writer ──────────────────────────────────────────────────────

# Module-level encoder instance; _write_knowledge_eng calls its encode()
# method to build the binary blob for every chunk.
_encoder = EigramEncoder()


def _write_knowledge_eng(
    fp_tensor: torch.Tensor,
    chunk: Chunk,
    eng_path: Path,
    session_id: str,
    fp_source: str,
    source_path: str,
    project: str,
    chunk_index: int,
    chunk_total: int,
) -> Path:
    """Serialize one knowledge chunk to a .eng file plus a JSON sidecar.

    The fingerprint tensor is encoded together with zero-filled placeholder
    vectors, and the result is written to ``eng_path`` (parent directories
    are created as needed). A ``<eng_path>.meta.json`` sidecar carrying the
    chunk's provenance is written next to it.

    Args:
        fp_tensor: Fingerprint of the chunk; its first dimension decides
            whether the ``vec_fourier`` slot is filled.
        chunk: Chunk being written (text, offsets and headers are read).
        eng_path: Destination of the binary file.
        session_id: Stored as ``cache_id`` in both the binary and sidecar.
        fp_source: Fingerprint origin; its first 16 chars become ``model_id``.
        source_path: Path of the originating document.
        project: Project namespace recorded in the sidecar.
        chunk_index: Index of this chunk within its document.
        chunk_total: Number of chunks in the document.

    Returns:
        ``eng_path``, unchanged.
    """
    basis_rank = 116
    text = chunk.text

    # vec_fourier is only populated for 2048-dim fingerprints, whereas
    # vec_fourier_v2 always receives the tensor.
    fourier_v1 = fp_tensor if fp_tensor.shape[0] == 2048 else None
    source_digest = hashlib.sha256(source_path.encode()).hexdigest()

    encode_args = {
        "vec_perdoc": torch.zeros(basis_rank),
        "vec_fcdb": torch.zeros(basis_rank),
        "joint_center": torch.zeros(128),
        "corpus_hash": source_digest[:32],
        "model_id": fp_source[:16],
        "basis_rank": basis_rank,
        "n_corpus": 0,
        "layer_range": (0, 0),
        "context_len": len(text),
        "l2_norm": float(torch.norm(fp_tensor).item()),
        "scs": 0.0,
        "margin_proof": 0.0,
        # The binary keeps at most 256 chars of the chunk text.
        "task_description": text[:256],
        "cache_id": session_id,
        "vec_fourier": fourier_v1,
        "vec_fourier_v2": fp_tensor,
        "confusion_flag": False,
    }
    payload = _encoder.encode(**encode_args)

    eng_path.parent.mkdir(parents=True, exist_ok=True)
    eng_path.write_bytes(payload)

    # The sidecar holds a longer excerpt (500 chars) and the full provenance.
    sidecar = {
        "cache_id": session_id,
        "task_description": text[:500],
        "source_path": source_path,
        "project": project,
        "fp_source": fp_source,
        "chunk_index": chunk_index,
        "chunk_total": chunk_total,
        "char_start": chunk.char_start,
        "char_end": chunk.char_end,
        "headers": list(chunk.headers),
        "ts": time.time(),
        "type": "knowledge",
    }
    sidecar_path = Path(f"{eng_path}.meta.json")
    with sidecar_path.open("w") as handle:
        json.dump(sidecar, handle, indent=2)

    return eng_path


# ── Discovery ────────────────────────────────────────────────────────

def discover_markdown_files(source: Path) -> list[Path]:
    """Find all indexable .md files under source path."""
    if source.is_file():
        return [source] if source.suffix == ".md" else []

    files: list[Path] = []
    for p in sorted(source.rglob("*.md")):
        # Skip files in excluded directories
        if any(skip in p.parts for skip in SKIP_PATTERNS):
            continue
        # Skip excluded filenames
        if p.name in SKIP_FILES:
            continue
        # Skip empty files
        if p.stat().st_size == 0:
            continue
        files.append(p)

    return files


# ── Main Pipeline ────────────────────────────────────────────────────

def index_file(
    source_path: Path,
    project: str,
    manifest: Manifest,
    date_str: str,
    dry_run: bool = False,
    force: bool = False,
) -> tuple[Manifest, int]:
    """
    Chunk one markdown file and write a .eng file for every chunk.

    The file is read as UTF-8 (undecodable bytes are replaced) and hashed.
    Unless ``force`` is set, a file the manifest reports as not needing a
    re-index is left alone. In dry-run mode the chunk count is printed and
    nothing is written or registered.

    Args:
        source_path: Markdown file to index.
        project: Project namespace; also the output sub-directory name.
        manifest: Current manifest, consulted and then extended.
        date_str: Date string embedded in the generated file names.
        dry_run: Report only, write nothing.
        force: Index even when the manifest says the file is unchanged.

    Returns:
        (updated_manifest, chunks_written) — the manifest is returned
        untouched, with a count of 0, when the file was skipped.
    """
    text = source_path.read_text(encoding="utf-8", errors="replace")
    digest = _content_hash(text)
    source_key = str(source_path)

    # Skip unchanged content unless the caller forces a re-index.
    if not (force or manifest.needs_reindex(source_key, digest)):
        return manifest, 0

    slug = slug_from_path(source_key)

    # Every chunk is prefixed with a short provenance line.
    pieces = chunk_markdown(
        text,
        max_chars=2000,
        min_chars=100,
        context_prefix=f"Source: {source_path.name} | Project: {project}",
    )
    total = len(pieces)

    if dry_run:
        print(f"  [DRY RUN] {source_path.name}: {total} chunks, "
              f"{len(text)} chars")
        return manifest, total

    out_dir = KNOWLEDGE_DIR / project
    out_dir.mkdir(parents=True, exist_ok=True)
    base_id = f"{project}/{slug}"

    records: list[ChunkRecord] = []
    for piece in pieces:
        target = out_dir / eng_filename(
            project=project,
            slug=slug,
            date=date_str,
            chunk_index=piece.index,
            chunk_total=total,
        )

        # The fingerprint covers the chunk text, context prefix included.
        fingerprint, origin = _get_fingerprint(piece.text)

        # Multi-chunk documents get a 1-based, zero-padded chunk suffix.
        session_id = f"{base_id}_c{piece.index + 1:03d}" if total > 1 else base_id

        _write_knowledge_eng(
            fp_tensor=fingerprint,
            chunk=piece,
            eng_path=target,
            session_id=session_id,
            fp_source=origin,
            source_path=source_key,
            project=project,
            chunk_index=piece.index,
            chunk_total=total,
        )

        records.append(
            ChunkRecord(
                eng_path=str(target),
                chunk_index=piece.index,
                chunk_total=total,
                char_start=piece.char_start,
                char_end=piece.char_end,
                indexed_at=time.time(),
            )
        )

    # register() hands back the manifest to carry forward.
    updated = manifest.register(
        source_path=source_key,
        content_hash=digest,
        project=project,
        file_size=len(text.encode("utf-8")),
        chunks=records,
    )
    return updated, total


def index_batch(
    source: Path,
    project: str,
    incremental: bool = True,
    dry_run: bool = False,
    force: bool = False,
) -> dict:
    """
    Index all markdown files under source path.

    Args:
        source: File or directory to index.
        project: Project namespace passed on to index_file.
        incremental: When True, unchanged files are skipped. When False,
            every discovered file is re-indexed (equivalent to ``force``).
        dry_run: Report what would be indexed without writing anything.
        force: Re-index every file regardless of its content hash.

    Returns:
        Summary dict with stats, or ``{"error": ...}`` when no markdown
        file was found under ``source``.
    """
    manifest = Manifest.load()
    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    files = discover_markdown_files(source)
    if not files:
        return {"error": f"No .md files found under {source}"}

    # A non-incremental run must not skip unchanged files, which is exactly
    # what the force flag of index_file does.
    reindex_all = force or not incremental

    stats = {
        "source": str(source),
        "project": project,
        "files_found": len(files),
        "files_indexed": 0,
        "files_skipped": 0,
        "chunks_written": 0,
        "dry_run": dry_run,
        "incremental": incremental,
        "date": date_str,
    }

    rule = "=" * 50
    print("\nENGRAM Knowledge Indexer")
    print(rule)
    print(f"Source:      {source}")
    print(f"Project:     {project}")
    print(f"Files found: {len(files)}")
    print(f"Mode:        {'DRY RUN' if dry_run else 'LIVE'}")
    print(f"{rule}\n")

    for i, fpath in enumerate(files, 1):
        manifest, n_chunks = index_file(
            source_path=fpath,
            project=project,
            manifest=manifest,
            date_str=date_str,
            dry_run=dry_run,
            force=reindex_all,
        )

        # A count of zero is reported as a skip; this also covers files
        # that were processed but produced no chunks.
        if n_chunks > 0:
            stats["files_indexed"] += 1
            stats["chunks_written"] += n_chunks
            status = "INDEXED" if not dry_run else "DRY RUN"
            print(f"  [{i}/{len(files)}] {status}: {fpath.name} "
                  f"→ {n_chunks} chunks")
        else:
            stats["files_skipped"] += 1
            print(f"  [{i}/{len(files)}] SKIP (unchanged): {fpath.name}")

    print(f"\n{rule}")
    print(f"Done. {stats['files_indexed']} files → "
          f"{stats['chunks_written']} chunks")
    if stats["files_skipped"]:
        print(f"Skipped {stats['files_skipped']} unchanged files")
    print(f"Manifest: {manifest.summary()}")
    print(f"{rule}\n")

    return stats


# ── CLI ──────────────────────────────────────────────────────────────

def main() -> None:
    """Parse command-line arguments and run the batch indexer.

    Exits with status 1 when the source path does not exist or when
    index_batch reports an error.
    """
    parser = argparse.ArgumentParser(
        description="Index markdown files into ENGRAM .eng knowledge files"
    )
    parser.add_argument(
        "--source", "-s",
        required=True,
        help="Path to file or directory to index",
    )
    parser.add_argument(
        "--project", "-p",
        default="engram",
        help="Project namespace (default: engram)",
    )
    parser.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="Show what would be indexed without writing",
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Re-index all files regardless of content hash",
    )
    parser.add_argument(
        "--incremental", "-i",
        dest="incremental",
        action="store_true",
        default=True,
        help="Skip unchanged files (default: true)",
    )
    # store_true combined with default=True can never be switched off, so an
    # explicit negative form is needed to make the option meaningful.
    parser.add_argument(
        "--no-incremental",
        dest="incremental",
        action="store_false",
        help="Re-index every file, even if unchanged (same effect as --force)",
    )

    args = parser.parse_args()
    source = Path(args.source).resolve()

    if not source.exists():
        print(f"Error: {source} does not exist", file=sys.stderr)
        sys.exit(1)

    stats = index_batch(
        source=source,
        project=args.project,
        incremental=args.incremental,
        dry_run=args.dry_run,
        # Turning incremental mode off means nothing may be skipped.
        force=args.force or not args.incremental,
    )

    if "error" in stats:
        print(f"Error: {stats['error']}", file=sys.stderr)
        sys.exit(1)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()