File size: 1,302 Bytes
6f54a86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import os
import sys
# Directory two levels above this file (the parent of the folder that holds
# this script) -- presumably the project root; confirm against repo layout.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Prepend ROOT to the import search path (once) so that the `utils` package
# imported right below can be found when this file is executed directly.
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)
from utils import base_utils as bu
from utils import retrieval_utils as ru

def main():
    """Generate embeddings for the configured input using the configured model.

    Reads ``configs/config.json`` via ``bu.load_config`` (the path is relative,
    so presumably it is resolved against the current working directory),
    makes sure the embeddings output directory exists, loads the retrieval
    model, and hands everything to ``ru.generate_embeddings`` with NumPy
    export enabled.

    Config keys read:
        embeddings.output_dir, embeddings.model_name      (required)
        paths.input_path                                  (required)
        splitter.type, splitter.chunk_size, splitter.overlap (required)
        paths.embeddings_dir       (optional, default "data/embeddings")
        embeddings.max_files_for_debug (optional; any falsy value -> None)

    Raises:
        KeyError: if one of the required config keys is missing.
    """
    cfg = bu.load_config("configs/config.json")

    # Create the output directory up front, before any other key is read.
    embed_cfg = cfg["embeddings"]
    target_dir = embed_cfg["output_dir"]
    os.makedirs(target_dir, exist_ok=True)

    encoder_name = embed_cfg["model_name"]
    paths_cfg = cfg["paths"]
    source_path = paths_cfg["input_path"]

    split_cfg = cfg["splitter"]
    splitter_kind = split_cfg["type"]
    size = split_cfg["chunk_size"]
    overlap = split_cfg["overlap"]

    encoder = ru.load_model(encoder_name)

    # Optional settings: fall back to defaults when the key is absent.
    npy_dir = paths_cfg.get("embeddings_dir", "data/embeddings")
    file_cap = embed_cfg.get("max_files_for_debug")
    if not file_cap:
        # Falsy values (missing, 0, "", ...) are all normalised to None.
        file_cap = None

    ru.generate_embeddings(
        chunk_size=size,
        chunk_overlap=overlap,
        model_name=encoder_name,
        input_path=source_path,
        output_folder=target_dir,
        splitter=splitter_kind,
        retrieval_model=encoder,
        export_numpy=True,
        numpy_output_dir=npy_dir,
        max_files=file_cap,
    )

# Run the embedding pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()