Spaces:
Running
Running
| """Ingest documents into the hybrid vector store. | |
| Usage: | |
| python scripts/ingest.py --config configs/tasks/tech_docs.yaml | |
| python scripts/ingest.py --doc-dir data/tech_docs/ --store-path .cache/store | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from pathlib import Path | |
| # Ensure the package is importable when running as a script | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| from agent_bench.rag.chunker import chunk_text | |
| from agent_bench.rag.embedder import Embedder | |
| from agent_bench.rag.store import HybridStore | |
def ingest(
    doc_dir: str,
    store_path: str,
    chunk_strategy: str = "recursive",
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    model_name: str = "all-MiniLM-L6-v2",
    cache_dir: str = ".cache/embeddings",
) -> None:
    """Ingest all markdown files from doc_dir into a HybridStore.

    Every top-level ``*.md`` file in ``doc_dir`` (minus the excluded curation
    files) is read as UTF-8, chunked with ``chunk_text``, embedded in a single
    batch, added to a new ``HybridStore`` and saved to ``store_path``.
    Progress is printed to stdout; error messages are printed to stderr.

    Args:
        doc_dir: Directory that holds the markdown corpus.
        store_path: Destination handed to ``HybridStore.save``.
        chunk_strategy: Forwarded to ``chunk_text`` as ``strategy``.
        chunk_size: Forwarded to ``chunk_text`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``chunk_text`` as ``chunk_overlap``.
        model_name: Embedding model name given to ``Embedder``.
        cache_dir: Embedding cache directory given to ``Embedder``.

    Raises:
        SystemExit: With status 1 when ``doc_dir`` does not exist, contains no
            eligible markdown files, or the documents produce no chunks.
    """
    doc_path = Path(doc_dir)
    if not doc_path.exists():
        print(f"Error: document directory {doc_dir} does not exist", file=sys.stderr)
        sys.exit(1)

    # Exclude curation metadata files that live alongside corpus content.
    # SOURCES.md, QUESTION_PLAN.md and README.md are version-controlled
    # curation artifacts, not corpus content.
    excluded = frozenset({"SOURCES.md", "QUESTION_PLAN.md", "README.md"})
    # Sorted so that chunk order (and therefore the store) is reproducible.
    md_files = sorted(f for f in doc_path.glob("*.md") if f.name not in excluded)
    if not md_files:
        print(f"Error: no markdown files found in {doc_dir}", file=sys.stderr)
        sys.exit(1)
    print(f"Found {len(md_files)} markdown files in {doc_dir}")

    # Chunk all documents
    all_chunks = []
    for md_file in md_files:
        text = md_file.read_text(encoding="utf-8")
        source = md_file.name  # bare filename
        chunks = chunk_text(
            text,
            source,
            strategy=chunk_strategy,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        print(f"  {source}: {len(chunks)} chunks")
        all_chunks.extend(chunks)
    print(f"Total chunks: {len(all_chunks)}")

    # Without chunks there is nothing to embed, and the store dimension below
    # is read from the embedding matrix, so stop with a clear message instead.
    if not all_chunks:
        print(f"Error: no chunks produced from documents in {doc_dir}", file=sys.stderr)
        sys.exit(1)

    # Embed
    print(f"Embedding with {model_name}...")
    embedder = Embedder(model_name=model_name, cache_dir=cache_dir)
    texts = [c.content for c in all_chunks]
    embeddings = embedder.embed_batch(texts)
    print(f"Embeddings shape: {embeddings.shape}")

    # Store: the index dimension is taken from the second axis of the embeddings.
    store = HybridStore(dimension=embeddings.shape[1])
    store.add(all_chunks, embeddings)
    store.save(store_path)
    stats = store.stats()
    print(f"Store saved to {store_path}")
    print(f"  Chunks: {stats.total_chunks}")
    print(f"  FAISS index size: {stats.faiss_index_size}")
    print(f"  Unique sources: {stats.unique_sources}")
def main() -> None:
    """Parse command-line options and run a single ingestion.

    When ``--config`` is given, the document directory is taken from the
    loaded task config instead of ``--doc-dir``; every other option is passed
    to ``ingest`` unchanged.
    """
    cli = argparse.ArgumentParser(description="Ingest documents into vector store")

    # (flag, keyword arguments) pairs, registered in the order they appear in --help.
    option_table = (
        ("--doc-dir", {"default": "data/tech_docs/", "help": "Document directory"}),
        ("--store-path", {"default": ".cache/store", "help": "Store output path"}),
        ("--chunk-strategy", {"default": "recursive", "choices": ["recursive", "fixed"]}),
        ("--chunk-size", {"type": int, "default": 512}),
        ("--chunk-overlap", {"type": int, "default": 64}),
        ("--model", {"default": "all-MiniLM-L6-v2", "help": "Embedding model name"}),
        ("--cache-dir", {"default": ".cache/embeddings", "help": "Embedding cache dir"}),
        (
            "--config",
            {
                "default": None,
                "help": "Task config YAML (overrides other args for doc-dir)",
            },
        ),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()

    if not opts.config:
        source_dir = opts.doc_dir
    else:
        # Imported lazily: the config loader is only needed on this path.
        from agent_bench.core.config import load_task_config

        task_cfg = load_task_config(Path(opts.config).stem, path=Path(opts.config))
        source_dir = task_cfg.document_dir

    ingest(
        doc_dir=source_dir,
        store_path=opts.store_path,
        chunk_strategy=opts.chunk_strategy,
        chunk_size=opts.chunk_size,
        chunk_overlap=opts.chunk_overlap,
        model_name=opts.model,
        cache_dir=opts.cache_dir,
    )
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()