"""Entry-point script that drives embedding generation from a JSON config.

Reads ``configs/config.json`` through ``base_utils.load_config``, makes sure
the embeddings output directory exists, loads the retrieval model named in
the config, and hands everything to ``retrieval_utils.generate_embeddings``
with NumPy export switched on.
"""

import os
import sys

# Two directory levels above this file; put on sys.path so that the
# project-local ``utils`` package can be imported when run as a script.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

from utils import base_utils as bu  # noqa: E402  (needs ROOT on sys.path)
from utils import retrieval_utils as ru  # noqa: E402


def main():
    """Load the config, prepare the output directory, and generate embeddings.

    Required config keys (a missing one raises ``KeyError``):
    ``embeddings.output_dir``, ``embeddings.model_name``,
    ``paths.input_path``, ``splitter.type``, ``splitter.chunk_size`` and
    ``splitter.overlap``.

    Optional config keys:
    ``paths.embeddings_dir`` (defaults to ``"data/embeddings"``) and
    ``embeddings.max_files_for_debug`` (any falsy value becomes ``None``).
    """
    config = bu.load_config("configs/config.json")

    # The output directory is created before anything else is read, so it
    # exists even if a later config lookup fails.
    embed_cfg = config["embeddings"]
    output_dir = embed_cfg["output_dir"]
    os.makedirs(output_dir, exist_ok=True)

    model_name = embed_cfg["model_name"]

    paths_cfg = config["paths"]
    input_path = paths_cfg["input_path"]

    split_cfg = config["splitter"]
    splitter = split_cfg["type"]
    chunk_size = split_cfg["chunk_size"]
    chunk_overlap = split_cfg["overlap"]

    retrieval_model = ru.load_model(model_name)

    numpy_output_dir = paths_cfg.get("embeddings_dir", "data/embeddings")

    # Falsy values (missing key, 0, "", null) all collapse to None.
    max_files = embed_cfg.get("max_files_for_debug") or None

    ru.generate_embeddings(
        input_path=input_path,
        output_folder=output_dir,
        model_name=model_name,
        retrieval_model=retrieval_model,
        splitter=splitter,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        export_numpy=True,
        numpy_output_dir=numpy_output_dir,
        max_files=max_files,
    )


if __name__ == "__main__":
    main()