File size: 3,986 Bytes
a152b95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68d96ea
9dfd3f0
 
 
68d96ea
a152b95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
"""Ingest documents into the hybrid vector store.

Usage:
    python scripts/ingest.py --config configs/tasks/tech_docs.yaml
    python scripts/ingest.py --doc-dir data/tech_docs/ --store-path .cache/store
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Ensure the package is importable when running as a script
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from agent_bench.rag.chunker import chunk_text
from agent_bench.rag.embedder import Embedder
from agent_bench.rag.store import HybridStore


def ingest(
    doc_dir: str,
    store_path: str,
    chunk_strategy: str = "recursive",
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    model_name: str = "all-MiniLM-L6-v2",
    cache_dir: str = ".cache/embeddings",
) -> None:
    """Ingest all markdown files from doc_dir into a HybridStore.

    Top-level ``*.md`` files in ``doc_dir`` (the search is not recursive) are
    chunked, embedded, added to a new ``HybridStore`` and saved to
    ``store_path``. Progress is printed to stdout; error messages go to
    stderr.

    Args:
        doc_dir: Directory containing the markdown corpus.
        store_path: Location passed to ``HybridStore.save``.
        chunk_strategy: Strategy name forwarded to ``chunk_text``.
        chunk_size: Chunk size forwarded to ``chunk_text``.
        chunk_overlap: Chunk overlap forwarded to ``chunk_text``.
        model_name: Embedding model name passed to ``Embedder``.
        cache_dir: Cache directory passed to ``Embedder``.

    Raises:
        SystemExit: With exit code 1 when ``doc_dir`` does not exist, is not
            a directory, contains no ingestible markdown files, or the
            files yield no chunks.
    """
    doc_path = Path(doc_dir)
    if not doc_path.exists():
        print(f"Error: document directory {doc_dir} does not exist", file=sys.stderr)
        sys.exit(1)
    if not doc_path.is_dir():
        # A regular file would otherwise be misreported as "no markdown files".
        print(f"Error: {doc_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # Exclude files that live alongside corpus content but are not part of
    # it. SOURCES.md and QUESTION_PLAN.md are version-controlled curation
    # artifacts; README.md is skipped as well.
    excluded_names = {"SOURCES.md", "QUESTION_PLAN.md", "README.md"}
    md_files = sorted(f for f in doc_path.glob("*.md") if f.name not in excluded_names)
    if not md_files:
        print(f"Error: no markdown files found in {doc_dir}", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(md_files)} markdown files in {doc_dir}")

    # Chunk all documents
    all_chunks = []
    for md_file in md_files:
        text = md_file.read_text(encoding="utf-8")
        source = md_file.name  # bare filename
        chunks = chunk_text(
            text, source, strategy=chunk_strategy, chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        print(f"  {source}: {len(chunks)} chunks")
        all_chunks.extend(chunks)

    print(f"Total chunks: {len(all_chunks)}")
    if not all_chunks:
        # Nothing to embed: stop here instead of failing later when the
        # store dimension is read from embeddings.shape[1].
        print(f"Error: no chunks produced from documents in {doc_dir}", file=sys.stderr)
        sys.exit(1)

    # Embed
    print(f"Embedding with {model_name}...")
    embedder = Embedder(model_name=model_name, cache_dir=cache_dir)
    texts = [c.content for c in all_chunks]
    embeddings = embedder.embed_batch(texts)
    print(f"Embeddings shape: {embeddings.shape}")

    # Store: the index dimension is taken from the embedding width.
    store = HybridStore(dimension=embeddings.shape[1])
    store.add(all_chunks, embeddings)
    store.save(store_path)

    stats = store.stats()
    print(f"Store saved to {store_path}")
    print(f"  Chunks: {stats.total_chunks}")
    print(f"  FAISS index size: {stats.faiss_index_size}")
    print(f"  Unique sources: {stats.unique_sources}")


def main() -> None:
    """Parse command-line options and run the ingestion pipeline.

    When ``--config`` is supplied, the document directory is read from the
    loaded task config's ``document_dir`` and ``--doc-dir`` is ignored.
    Every other option is always taken from the command line.
    """
    # Options are registered in this order, which is also the order shown
    # in the generated help text.
    option_specs = (
        ("--doc-dir", {"default": "data/tech_docs/", "help": "Document directory"}),
        ("--store-path", {"default": ".cache/store", "help": "Store output path"}),
        ("--chunk-strategy", {"default": "recursive", "choices": ["recursive", "fixed"]}),
        ("--chunk-size", {"type": int, "default": 512}),
        ("--chunk-overlap", {"type": int, "default": 64}),
        ("--model", {"default": "all-MiniLM-L6-v2", "help": "Embedding model name"}),
        ("--cache-dir", {"default": ".cache/embeddings", "help": "Embedding cache dir"}),
        (
            "--config",
            {"default": None, "help": "Task config YAML (overrides other args for doc-dir)"},
        ),
    )
    cli = argparse.ArgumentParser(description="Ingest documents into vector store")
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    opts = cli.parse_args()

    if opts.config:
        # The config loader is imported only on this path.
        from agent_bench.core.config import load_task_config

        config_file = Path(opts.config)
        # The config file's stem is passed as the task name.
        documents = load_task_config(config_file.stem, path=config_file).document_dir
    else:
        documents = opts.doc_dir

    ingest(
        doc_dir=documents,
        store_path=opts.store_path,
        chunk_strategy=opts.chunk_strategy,
        chunk_size=opts.chunk_size,
        chunk_overlap=opts.chunk_overlap,
        model_name=opts.model,
        cache_dir=opts.cache_dir,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()