import os
import sys

# ROOT is the parent of the directory that contains this file.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# ROOT must be on sys.path *before* the ``utils`` imports below run, which is
# why those imports cannot be hoisted to the top of the file.
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)

from utils import base_utils as bu  # noqa: E402  (needs ROOT on sys.path)
from utils import retrieval_utils as ru  # noqa: E402  (needs ROOT on sys.path)
def main():
    """Generate embeddings using the settings in ``configs/config.json``.

    Steps, in order:

    1. Load the config through ``bu.load_config``.
    2. Create the ``embeddings.output_dir`` directory if it is missing.
    3. Load the retrieval model named by ``embeddings.model_name``.
    4. Call ``ru.generate_embeddings`` with ``export_numpy=True``.

    Required config keys (accessed by subscript, so a missing one raises,
    presumably ``KeyError`` if the config is a plain dict):
    ``embeddings.output_dir``, ``embeddings.model_name``,
    ``paths.input_path``, ``splitter.type``, ``splitter.chunk_size``,
    ``splitter.overlap``.

    Optional config keys:
    ``paths.embeddings_dir`` (defaults to ``"data/embeddings"``) and
    ``embeddings.max_files_for_debug`` (any falsy value becomes ``None``).

    NOTE(review): the config path is given relative; whether it resolves
    against the current working directory depends on ``bu.load_config`` —
    confirm before running from outside the project root.
    """
    config = bu.load_config("configs/config.json")

    # Embedding settings; the output directory is created up front.
    embed_cfg = config["embeddings"]
    output_dir = embed_cfg["output_dir"]
    os.makedirs(output_dir, exist_ok=True)
    model_name = embed_cfg["model_name"]

    paths_cfg = config["paths"]
    input_path = paths_cfg["input_path"]

    # Text-splitting settings.
    split_cfg = config["splitter"]
    splitter = split_cfg["type"]
    chunk_size = split_cfg["chunk_size"]
    chunk_overlap = split_cfg["overlap"]

    retrieval_model = ru.load_model(model_name)

    # Optional settings with fallbacks.
    numpy_output_dir = paths_cfg.get("embeddings_dir", "data/embeddings")
    # ``or None`` collapses 0 / "" / missing into None.
    max_files = embed_cfg.get("max_files_for_debug") or None

    ru.generate_embeddings(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        model_name=model_name,
        input_path=input_path,
        output_folder=output_dir,
        splitter=splitter,
        retrieval_model=retrieval_model,
        export_numpy=True,
        numpy_output_dir=numpy_output_dir,
        max_files=max_files,
    )
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()