File size: 11,741 Bytes
954cf8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
#!/usr/bin/env python3
"""
scripts/index_knowledge.py β€” Batch index markdown files into .eng binaries.

Processes markdown files from a directory (or single file), chunks them,
fingerprints each chunk, and writes .eng files to the knowledge index.

Usage:
    # Index a single file
    python scripts/index_knowledge.py --source path/to/file.md --project engram

    # Index a directory recursively
    python scripts/index_knowledge.py --source path/to/docs/ --project engram

    # Re-index changed files only (incremental)
    python scripts/index_knowledge.py --source path/to/docs/ --project engram --incremental

    # Dry run β€” show what would be indexed
    python scripts/index_knowledge.py --source path/to/docs/ --project engram --dry-run

    # Force re-index everything
    python scripts/index_knowledge.py --source path/to/docs/ --project engram --force

Environment:
    ENGRAM_SESSIONS_DIR   Base sessions dir (default: ~/.engram/sessions)
    ENGRAM_KNOWLEDGE_DIR  Knowledge index dir (default: ~/.engram/knowledge)
    ENGRAM_MODEL_PATH     Path to GGUF model for real fingerprints (optional)
    PYTHONPATH=.          Must include project root for kvcos imports
"""

from __future__ import annotations

import argparse
import hashlib
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

# Ensure project root is importable
sys.path.insert(0, str(Path(__file__).parent.parent))

import torch

from kvcos.engram.chunker import Chunk, chunk_markdown, eng_filename, slug_from_path
from kvcos.engram.format import EigramEncoder
from kvcos.engram.manifest import ChunkRecord, Manifest, _content_hash, _file_hash


# ── Configuration ────────────────────────────────────────────────────

KNOWLEDGE_DIR = Path(
    os.environ.get("ENGRAM_KNOWLEDGE_DIR", "~/.engram/knowledge")
).expanduser()

SKIP_PATTERNS = {
    "node_modules",
    ".venv",
    "__pycache__",
    ".git",
    ".eng",
    "site-packages",
}

SKIP_FILES = {
    "LICENSE.md",
    "CHANGELOG.md",
    "SECURITY.md",
}


# ── Fingerprinting ──────────────────────────────────────────────────

from kvcos.engram.embedder import get_fingerprint as _get_fingerprint


# ── .eng Writer ──────────────────────────────────────────────────────

_encoder = EigramEncoder()


def _write_knowledge_eng(
    fp_tensor: torch.Tensor,
    chunk: Chunk,
    eng_path: Path,
    session_id: str,
    fp_source: str,
    source_path: str,
    project: str,
    chunk_index: int,
    chunk_total: int,
) -> Path:
    """Write a .eng binary for a knowledge chunk."""
    dim = fp_tensor.shape[0]
    basis_rank = 116
    vec_perdoc = torch.zeros(basis_rank)
    vec_fcdb = torch.zeros(basis_rank)
    joint_center = torch.zeros(128)

    # Truncate description to 256 chars for binary
    description = chunk.text[:256]

    blob = _encoder.encode(
        vec_perdoc=vec_perdoc,
        vec_fcdb=vec_fcdb,
        joint_center=joint_center,
        corpus_hash=hashlib.sha256(source_path.encode()).hexdigest()[:32],
        model_id=fp_source[:16],
        basis_rank=basis_rank,
        n_corpus=0,
        layer_range=(0, 0),
        context_len=len(chunk.text),
        l2_norm=float(torch.norm(fp_tensor).item()),
        scs=0.0,
        margin_proof=0.0,
        task_description=description,
        cache_id=session_id,
        vec_fourier=fp_tensor if dim == 2048 else None,
        vec_fourier_v2=fp_tensor,
        confusion_flag=False,
    )

    eng_path.parent.mkdir(parents=True, exist_ok=True)
    with open(eng_path, "wb") as f:
        f.write(blob)

    # Write extended sidecar with full metadata
    meta = {
        "cache_id": session_id,
        "task_description": chunk.text[:500],
        "source_path": source_path,
        "project": project,
        "fp_source": fp_source,
        "chunk_index": chunk_index,
        "chunk_total": chunk_total,
        "char_start": chunk.char_start,
        "char_end": chunk.char_end,
        "headers": list(chunk.headers),
        "ts": time.time(),
        "type": "knowledge",
    }
    meta_path = Path(str(eng_path) + ".meta.json")
    with open(meta_path, "w") as f:
        json.dump(meta, f, indent=2)

    return eng_path


# ── Discovery ────────────────────────────────────────────────────────

def discover_markdown_files(source: Path) -> list[Path]:
    """Find all indexable .md files under source path."""
    if source.is_file():
        return [source] if source.suffix == ".md" else []

    files: list[Path] = []
    for p in sorted(source.rglob("*.md")):
        # Skip files in excluded directories
        if any(skip in p.parts for skip in SKIP_PATTERNS):
            continue
        # Skip excluded filenames
        if p.name in SKIP_FILES:
            continue
        # Skip empty files
        if p.stat().st_size == 0:
            continue
        files.append(p)

    return files


# ── Main Pipeline ────────────────────────────────────────────────────

def index_file(
    source_path: Path,
    project: str,
    manifest: Manifest,
    date_str: str,
    dry_run: bool = False,
    force: bool = False,
) -> tuple[Manifest, int]:
    """
    Index a single markdown file into .eng chunks.

    Returns:
        (updated_manifest, chunks_written)
    """
    content = source_path.read_text(encoding="utf-8", errors="replace")
    content_hash = _content_hash(content)

    # Incremental: skip if unchanged
    if not force and not manifest.needs_reindex(str(source_path), content_hash):
        return manifest, 0

    slug = slug_from_path(str(source_path))
    context = f"Source: {source_path.name} | Project: {project}"

    # Chunk the content
    chunks = chunk_markdown(
        content,
        max_chars=2000,
        min_chars=100,
        context_prefix=context,
    )

    if dry_run:
        print(f"  [DRY RUN] {source_path.name}: {len(chunks)} chunks, "
              f"{len(content)} chars")
        return manifest, len(chunks)

    # Write .eng for each chunk
    chunk_records: list[ChunkRecord] = []
    project_dir = KNOWLEDGE_DIR / project
    project_dir.mkdir(parents=True, exist_ok=True)

    for chunk in chunks:
        filename = eng_filename(
            project=project,
            slug=slug,
            date=date_str,
            chunk_index=chunk.index,
            chunk_total=len(chunks),
        )
        eng_path = project_dir / filename

        # Fingerprint the chunk text (with context)
        fp_tensor, fp_source = _get_fingerprint(chunk.text)

        session_id = f"{project}/{slug}"
        if len(chunks) > 1:
            session_id += f"_c{chunk.index + 1:03d}"

        _write_knowledge_eng(
            fp_tensor=fp_tensor,
            chunk=chunk,
            eng_path=eng_path,
            session_id=session_id,
            fp_source=fp_source,
            source_path=str(source_path),
            project=project,
            chunk_index=chunk.index,
            chunk_total=len(chunks),
        )

        chunk_records.append(ChunkRecord(
            eng_path=str(eng_path),
            chunk_index=chunk.index,
            chunk_total=len(chunks),
            char_start=chunk.char_start,
            char_end=chunk.char_end,
            indexed_at=time.time(),
        ))

    # Register in manifest
    manifest = manifest.register(
        source_path=str(source_path),
        content_hash=content_hash,
        project=project,
        file_size=len(content.encode("utf-8")),
        chunks=chunk_records,
    )

    return manifest, len(chunks)


def index_batch(
    source: Path,
    project: str,
    incremental: bool = True,
    dry_run: bool = False,
    force: bool = False,
) -> dict:
    """
    Index all markdown files under source path.

    Returns summary dict with stats.
    """
    manifest = Manifest.load()
    date_str = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    files = discover_markdown_files(source)
    if not files:
        return {"error": f"No .md files found under {source}"}

    stats = {
        "source": str(source),
        "project": project,
        "files_found": len(files),
        "files_indexed": 0,
        "files_skipped": 0,
        "chunks_written": 0,
        "dry_run": dry_run,
        "incremental": incremental,
        "date": date_str,
    }

    print(f"\nENGRAM Knowledge Indexer")
    print(f"{'=' * 50}")
    print(f"Source:      {source}")
    print(f"Project:     {project}")
    print(f"Files found: {len(files)}")
    print(f"Mode:        {'DRY RUN' if dry_run else 'LIVE'}")
    print(f"{'=' * 50}\n")

    for i, fpath in enumerate(files, 1):
        prev_chunks = manifest.total_chunks
        manifest, n_chunks = index_file(
            source_path=fpath,
            project=project,
            manifest=manifest,
            date_str=date_str,
            dry_run=dry_run,
            force=force,
        )

        if n_chunks > 0:
            stats["files_indexed"] += 1
            stats["chunks_written"] += n_chunks
            status = "INDEXED" if not dry_run else "DRY RUN"
            print(f"  [{i}/{len(files)}] {status}: {fpath.name} "
                  f"β†’ {n_chunks} chunks")
        else:
            stats["files_skipped"] += 1
            print(f"  [{i}/{len(files)}] SKIP (unchanged): {fpath.name}")

    print(f"\n{'=' * 50}")
    print(f"Done. {stats['files_indexed']} files β†’ "
          f"{stats['chunks_written']} chunks")
    if stats["files_skipped"]:
        print(f"Skipped {stats['files_skipped']} unchanged files")
    print(f"Manifest: {manifest.summary()}")
    print(f"{'=' * 50}\n")

    return stats


# ── CLI ──────────────────────────────────────────────────────────────

def main() -> None:
    parser = argparse.ArgumentParser(
        description="Index markdown files into ENGRAM .eng knowledge files"
    )
    parser.add_argument(
        "--source", "-s",
        required=True,
        help="Path to file or directory to index",
    )
    parser.add_argument(
        "--project", "-p",
        default="engram",
        help="Project namespace (default: engram)",
    )
    parser.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="Show what would be indexed without writing",
    )
    parser.add_argument(
        "--force", "-f",
        action="store_true",
        help="Re-index all files regardless of content hash",
    )
    parser.add_argument(
        "--incremental", "-i",
        action="store_true",
        default=True,
        help="Skip unchanged files (default: true)",
    )

    args = parser.parse_args()
    source = Path(args.source).resolve()

    if not source.exists():
        print(f"Error: {source} does not exist", file=sys.stderr)
        sys.exit(1)

    stats = index_batch(
        source=source,
        project=args.project,
        incremental=args.incremental,
        dry_run=args.dry_run,
        force=args.force,
    )

    if "error" in stats:
        print(f"Error: {stats['error']}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()