from __future__ import annotations import argparse from pathlib import Path import faiss import numpy as np from legalrag.config import AppConfig from legalrag.retrieval.builders.bm25_builder import build_bm25_index from legalrag.retrieval.builders.colbert_builder import build_colbert_index from legalrag.retrieval.builders.faiss_builder import build_faiss_index from legalrag.retrieval.corpus_loader import load_chunks_from_dir from legalrag.utils.logger import get_logger logger = get_logger(__name__) def parse_args() -> argparse.Namespace: p = argparse.ArgumentParser( description="Build indices for LegalRAG (FAISS dense + BM25 sparse + optional ColBERT)." ) g = p.add_mutually_exclusive_group() p.add_argument( "--hnsw-m", type=int, default=32, help="HNSW parameter M (higher → better recall, more memory).", ) p.add_argument( "--hnsw-ef-construction", type=int, default=200, help="HNSW efConstruction (higher → better graph quality, slower build).", ) p.add_argument( "--hnsw-ef-search", type=int, default=128, help="HNSW efSearch at query time (higher → higher recall, slower search).", ) g.add_argument("--only-colbert", action="store_true", help="Only build ColBERT index (skip FAISS/BM25).") g.add_argument("--only-faiss", action="store_true", help="Only build FAISS (skip BM25/ColBERT).") g.add_argument("--only-bm25", action="store_true", help="Only build BM25 (skip FAISS/ColBERT).") p.add_argument("--no-faiss", action="store_true", help="Skip FAISS in default build.") p.add_argument("--no-bm25", action="store_true", help="Skip BM25 in default build.") p.add_argument("--no-colbert", action="store_true", help="Skip ColBERT.") p.add_argument( "--index-version", type=str, default="", help="Write indexes into data/index/versions/ and activate if requested.", ) p.add_argument( "--activate", action="store_true", help="Activate the provided --index-version after building.", ) return p.parse_args() def main() -> None: args = parse_args() idx_version = (args.index_version or "").strip() or None cfg = AppConfig.load(index_version=idx_version) rcfg = cfg.retrieval rcfg.hnsw_m = args.hnsw_m rcfg.hnsw_ef_construction = args.hnsw_ef_construction rcfg.hnsw_ef_search = args.hnsw_ef_search chunks = load_chunks_from_dir(rcfg.processed_dir, rcfg.processed_glob) logger.info("Loaded %d law chunks from %s/%s", len(chunks), rcfg.processed_dir, rcfg.processed_glob) by_lang = {} for c in chunks: lang = (getattr(c, "lang", None) or "zh").strip().lower() by_lang.setdefault(lang, []).append(c) if not by_lang: logger.error("No chunks found to index.") return for lang, lang_chunks in sorted(by_lang.items()): lang_cfg = cfg.with_lang(lang) logger.info("Building indexes for lang=%s (chunks=%d)", lang, len(lang_chunks)) if args.only_colbert: build_colbert_index(lang_cfg, lang_chunks) continue if args.only_faiss: build_faiss_index(lang_cfg, lang_chunks) continue if args.only_bm25: build_bm25_index(lang_cfg, lang_chunks) continue if not args.no_faiss: build_faiss_index(lang_cfg, lang_chunks) if not args.no_bm25: build_bm25_index(lang_cfg, lang_chunks) if not args.no_colbert: try: build_colbert_index(lang_cfg, lang_chunks) except Exception as e: print(f"⚠️ Warning: ColBERT index build failed for lang={lang}, continuing without it.\nReason: {e}") if idx_version and args.activate: from legalrag.index.registry import IndexRegistry for lang in sorted(by_lang.keys()): lang_cfg = cfg.with_lang(lang) registry = IndexRegistry(Path(lang_cfg.paths.index_dir)) registry.activate(idx_version) if __name__ == "__main__": main()