beta-NORM / scripts /embd_index.py
GitHub Actions
Sync from GitHub master
92145af
raw
history blame contribute delete
974 Bytes
from utils import base_utils as bu
from utils import md_to_faiss
from utils import retrieval_utils as ru
def main():
    """Read the project config, load the retrieval model and build the index.

    Settings come from ``configs/config.json`` through ``bu.load_config``.
    The model named in ``embeddings.model_name`` is loaded with
    ``ru.load_model``, and everything is handed to
    ``md_to_faiss.build_faiss_from_md`` as keyword arguments.

    Config keys read:
        paths.input_path, paths.index_dir,
        embeddings.model_name, embeddings.max_files_for_debug,
        splitter.type, splitter.chunk_size, splitter.overlap

    NOTE(review): the config path is a relative string; presumably it is
    resolved against the current working directory -- confirm in
    ``bu.load_config``.
    """
    config = bu.load_config("configs/config.json")

    paths_cfg = config["paths"]
    embeddings_cfg = config["embeddings"]
    splitter_cfg = config["splitter"]

    # Pull every setting out up front so that a missing key surfaces before
    # the model is loaded.
    model_name = embeddings_cfg["model_name"]
    input_path = paths_cfg["input_path"]
    index_dir = paths_cfg["index_dir"]
    splitter = splitter_cfg["type"]
    chunk_size = splitter_cfg["chunk_size"]
    chunk_overlap = splitter_cfg["overlap"]
    max_docs = embeddings_cfg["max_files_for_debug"]

    retrieval_model = ru.load_model(model_name)

    md_to_faiss.build_faiss_from_md(
        input_path=input_path,
        index_dir=index_dir,
        model_name=model_name,
        splitter=splitter,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        retrieval_model=retrieval_model,
        max_documents=max_docs,
    )
# Script entry point: run main() only when this file is executed directly,
# so importing the module does not trigger it.
if __name__ == "__main__":
    main()