| |
| |
| |
| |
|
|
| from typing import List |
| import faiss |
| import logging |
|
|
# Module-level logger; merge_ondisk reports its progress through it.
LOG = logging.getLogger(__name__)
|
|
|
|
def merge_ondisk(
    trained_index: faiss.Index,
    shard_fnames: List[str],
    ivfdata_fname: str,
    shift_ids: bool = False,
) -> None:
    """Merge the inverted lists of several index shards into one on-disk index.

    Every file in ``shard_fnames`` is opened with ``faiss.IO_FLAG_MMAP``, its
    inverted lists are collected, and all of them are merged into a new
    ``faiss.OnDiskInvertedLists`` backed by ``ivfdata_fname``. ``trained_index``
    is then modified in place so that it uses the merged lists.

    Args:
        trained_index: Index that receives the merged data. It must be empty
            (``ntotal == 0``), must not be a ``faiss.IndexIVFPQR``, and must
            be accepted by ``faiss.extract_index_ivf``.
        shard_fnames: File names of the index shards to merge.
        ivfdata_fname: File name handed to ``faiss.OnDiskInvertedLists`` as
            the storage for the merged inverted lists.
        shift_ids: Passed through unchanged to ``merge_from_multiple``
            (presumably offsets the ids of each shard by the size of the
            preceding ones -- confirm against the Faiss documentation).

    Raises:
        AssertionError: If ``trained_index`` is a ``faiss.IndexIVFPQR`` or is
            not empty.
    """
    # Validate with explicit raises instead of ``assert`` so the checks are not
    # stripped under ``python -O``. AssertionError is kept as the exception
    # type so existing callers that catch it keep working.
    if isinstance(trained_index, faiss.IndexIVFPQR):
        raise AssertionError("IndexIVFPQR is not supported as an on disk index.")
    if trained_index.ntotal != 0:
        raise AssertionError("works only on empty index")

    # Resolve the output IVF index before touching any shard, so an unsuitable
    # target fails before shard ownership flags have been modified.
    out_ivf = faiss.extract_index_ivf(trained_index)

    # Load the inverted lists of every shard.
    shard_invlists = []
    for fname in shard_fnames:
        LOG.info("read %s", fname)
        # IO_FLAG_MMAP asks faiss to map the shard instead of loading it, so
        # the combined size of the shards may exceed the available RAM.
        shard_index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        shard_ivf = faiss.extract_index_ivf(shard_index)
        shard_invlists.append(shard_ivf.invlists)

        # Release ownership so the inverted lists are not deallocated together
        # with the shard index once its Python wrapper is dropped.
        shard_ivf.own_invlists = False

    # Output inverted lists; their data is written to ivfdata_fname.
    merged_invlists = faiss.OnDiskInvertedLists(
        out_ivf.nlist, out_ivf.code_size, ivfdata_fname
    )

    # merge_from_multiple takes a raw pointer array plus a count, so the
    # collected lists are copied into a SWIG pointer vector first.
    ivf_vector = faiss.InvertedListsPtrVector()
    for invlists in shard_invlists:
        ivf_vector.push_back(invlists)

    LOG.info("merge %d inverted lists ", ivf_vector.size())
    ntotal = merged_invlists.merge_from_multiple(
        ivf_vector.data(), ivf_vector.size(), shift_ids
    )

    # Install the merged lists in the output index. replace_invlists(..., True)
    # makes the index the owner, so the Python wrapper must give up ownership
    # afterwards to avoid a double free.
    trained_index.ntotal = out_ivf.ntotal = ntotal
    out_ivf.replace_invlists(merged_invlists, True)
    merged_invlists.this.disown()
|
|