# Legal-RAG/scripts/build_index.py
# flora-l: recreate the space for demo (ddbc0c8)
from __future__ import annotations
import argparse
from pathlib import Path
import faiss
import numpy as np
from legalrag.config import AppConfig
from legalrag.retrieval.builders.bm25_builder import build_bm25_index
from legalrag.retrieval.builders.colbert_builder import build_colbert_index
from legalrag.retrieval.builders.faiss_builder import build_faiss_index
from legalrag.retrieval.corpus_loader import load_chunks_from_dir
from legalrag.utils.logger import get_logger
logger = get_logger(__name__)
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(
description="Build indices for LegalRAG (FAISS dense + BM25 sparse + optional ColBERT)."
)
g = p.add_mutually_exclusive_group()
p.add_argument(
"--hnsw-m",
type=int,
default=32,
help="HNSW parameter M (higher → better recall, more memory).",
)
p.add_argument(
"--hnsw-ef-construction",
type=int,
default=200,
help="HNSW efConstruction (higher → better graph quality, slower build).",
)
p.add_argument(
"--hnsw-ef-search",
type=int,
default=128,
help="HNSW efSearch at query time (higher → higher recall, slower search).",
)
g.add_argument("--only-colbert", action="store_true", help="Only build ColBERT index (skip FAISS/BM25).")
g.add_argument("--only-faiss", action="store_true", help="Only build FAISS (skip BM25/ColBERT).")
g.add_argument("--only-bm25", action="store_true", help="Only build BM25 (skip FAISS/ColBERT).")
p.add_argument("--no-faiss", action="store_true", help="Skip FAISS in default build.")
p.add_argument("--no-bm25", action="store_true", help="Skip BM25 in default build.")
p.add_argument("--no-colbert", action="store_true", help="Skip ColBERT.")
p.add_argument(
"--index-version",
type=str,
default="",
help="Write indexes into data/index/versions/<version> and activate if requested.",
)
p.add_argument(
"--activate",
action="store_true",
help="Activate the provided --index-version after building.",
)
return p.parse_args()
def _group_chunks_by_lang(chunks) -> dict[str, list]:
    """Bucket chunks by their normalized language code, defaulting to "zh"."""
    by_lang: dict[str, list] = {}
    for chunk in chunks:
        # A missing, empty or whitespace-only ``lang`` falls back to "zh" so a
        # blank value never yields an empty-string language key.
        lang = (getattr(chunk, "lang", None) or "").strip().lower() or "zh"
        by_lang.setdefault(lang, []).append(chunk)
    return by_lang


def _build_indexes_for_lang(args: argparse.Namespace, lang_cfg, lang: str, lang_chunks: list) -> None:
    """Build the index types selected by ``args`` for one language."""
    # The --only-* modes build exactly one index; errors propagate because
    # that index is the whole point of the run.
    if args.only_colbert:
        build_colbert_index(lang_cfg, lang_chunks)
        return
    if args.only_faiss:
        build_faiss_index(lang_cfg, lang_chunks)
        return
    if args.only_bm25:
        build_bm25_index(lang_cfg, lang_chunks)
        return

    # Default mode: build everything that was not explicitly skipped.
    if not args.no_faiss:
        build_faiss_index(lang_cfg, lang_chunks)
    if not args.no_bm25:
        build_bm25_index(lang_cfg, lang_chunks)
    if not args.no_colbert:
        try:
            build_colbert_index(lang_cfg, lang_chunks)
        except Exception:
            # ColBERT is optional in the default build, so a failure here is
            # deliberately non-fatal. Report it through the logger (with the
            # traceback) instead of print so it lands with the other output.
            logger.warning(
                "ColBERT index build failed for lang=%s, continuing without it.",
                lang,
                exc_info=True,
            )


def main() -> None:
    """Build the retrieval indexes for every language found in the corpus.

    Parses the CLI options, loads the application config (optionally bound to
    an index version), loads the processed chunks, groups them by language and
    builds the selected indexes for each language. When both
    ``--index-version`` and ``--activate`` are given, the version is activated
    in each language's index registry after all builds have finished.

    Returns early, after logging an error, when no chunks were loaded.
    """
    args = parse_args()
    idx_version = (args.index_version or "").strip() or None
    if args.activate and idx_version is None:
        # Previously this combination was silently ignored.
        logger.warning("--activate was given without --index-version; nothing will be activated.")

    cfg = AppConfig.load(index_version=idx_version)
    rcfg = cfg.retrieval
    # NOTE(review): the CLI defaults always overwrite whatever HNSW values the
    # loaded config carried - confirm that this precedence is intended.
    rcfg.hnsw_m = args.hnsw_m
    rcfg.hnsw_ef_construction = args.hnsw_ef_construction
    rcfg.hnsw_ef_search = args.hnsw_ef_search

    chunks = load_chunks_from_dir(rcfg.processed_dir, rcfg.processed_glob)
    logger.info("Loaded %d law chunks from %s/%s", len(chunks), rcfg.processed_dir, rcfg.processed_glob)

    by_lang = _group_chunks_by_lang(chunks)
    if not by_lang:
        logger.error("No chunks found to index.")
        return

    # Sorted so languages are processed in a stable order.
    for lang, lang_chunks in sorted(by_lang.items()):
        lang_cfg = cfg.with_lang(lang)
        logger.info("Building indexes for lang=%s (chunks=%d)", lang, len(lang_chunks))
        _build_indexes_for_lang(args, lang_cfg, lang, lang_chunks)

    if idx_version and args.activate:
        # Imported here so the registry module is only loaded when an
        # activation is actually requested.
        from legalrag.index.registry import IndexRegistry

        for lang in sorted(by_lang):
            lang_cfg = cfg.with_lang(lang)
            registry = IndexRegistry(Path(lang_cfg.paths.index_dir))
            registry.activate(idx_version)
# Run the index build only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()