beta-NORM / scripts / generate_embeddings.py
GitHub Actions
Snapshot from GitHub master for HF Space
6f54a86
import os
import sys
# Directory that contains this script, as an absolute path.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# ROOT is the parent of the script's directory. It is put at the front of
# sys.path so the `utils` package imported right below resolves even when the
# script is launched from another working directory.
ROOT = os.path.dirname(_SCRIPT_DIR)
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)
from utils import base_utils as bu
from utils import retrieval_utils as ru
def main():
    """Run the embedding-generation pipeline described by the project config.

    Loads ``configs/config.json`` through ``bu.load_config``, makes sure the
    embeddings output directory exists, loads the retrieval model named in
    the config, and hands everything to ``ru.generate_embeddings`` with
    NumPy export switched on.

    Config keys read:
        embeddings.output_dir, embeddings.model_name          (required)
        paths.input_path                                      (required)
        splitter.type, splitter.chunk_size, splitter.overlap  (required)
        paths.embeddings_dir   (optional, defaults to "data/embeddings")
        embeddings.max_files_for_debug  (optional; falsy values become None)

    Raises:
        KeyError: if a required key is missing (assuming the loaded config
            behaves like a plain dict -- TODO confirm against load_config).
    """
    cfg = bu.load_config("configs/config.json")

    # The output directory is created up front, before any model loading.
    embed_cfg = cfg["embeddings"]
    target_dir = embed_cfg["output_dir"]
    os.makedirs(target_dir, exist_ok=True)

    encoder_name = embed_cfg["model_name"]
    paths_cfg = cfg["paths"]
    source_path = paths_cfg["input_path"]

    split_cfg = cfg["splitter"]
    split_kind = split_cfg["type"]
    piece_size = split_cfg["chunk_size"]
    piece_overlap = split_cfg["overlap"]

    encoder = ru.load_model(encoder_name)

    # Optional settings: the NumPy export location has a default, and the
    # debug file cap is normalised so that 0 / "" / missing all become None
    # (presumably "no limit" for generate_embeddings -- verify there).
    npy_dir = paths_cfg.get("embeddings_dir", "data/embeddings")
    debug_cap = embed_cfg.get("max_files_for_debug")
    if not debug_cap:
        debug_cap = None

    call_kwargs = {
        "chunk_size": piece_size,
        "chunk_overlap": piece_overlap,
        "model_name": encoder_name,
        "input_path": source_path,
        "output_folder": target_dir,
        "splitter": split_kind,
        "retrieval_model": encoder,
        "export_numpy": True,
        "numpy_output_dir": npy_dir,
        "max_files": debug_cap,
    }
    ru.generate_embeddings(**call_kwargs)
# Only run the pipeline when this file is executed as a script; importing it
# (e.g. from another tool or a test) must not trigger embedding generation.
if __name__ == "__main__":
    main()