File size: 4,319 Bytes
237451b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import faiss
import numpy as np
import json
from tqdm import tqdm
import os
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
from torch.utils.data import DataLoader, TensorDataset
import torch
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool
import time


# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

start_time = time.time()

# Load the pre-processed corpus: a JSON array of records, each carrying the
# passage text under the 'contents' key (schema shown by the list comprehension below).
with open("/mnt/ceph_rbd/hf_models/feedbackcorpus/merged_triple_processed_new_withID.json", "r") as fi:
    data = json.load(fi)

# Keep only the raw text passages; these are what get embedded and indexed.
sentences = [_['contents'] for _ in data]
print("Chunks nums: ", len(sentences))


# Encoder checkpoint. Earlier experiments (gtr-t5-xl, bge-large) left commented.
# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = '/mnt/ceph_rbd/hf_models/contriever'

### Using SentenceTransformer
# def encode_sentences_on_gpu(params):   
#     print("Encode start...") 
#     sentences_chunk, device_id = params
#     device = torch.device(f'cuda:{device_id}')
#     model = SentenceTransformer(model_path, device=device)
#     embeddings = model.encode(
#         sentences_chunk,
#         batch_size=2048,
#         show_progress_bar=True,
#         convert_to_numpy=True,
#         normalize_embeddings=True
#     )
#     print("Encode Success...")
#     return embeddings


# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs: {num_gpus}")

# sentence_embeddings = encode_sentences_on_gpu((sentences,3))

# # sentences_chunks = np.array_split(sentences, num_gpus)
# # params = [(sentences_chunks[i], i) for i in range(num_gpus)]
# # with Pool(processes=num_gpus) as pool:
# #     embeddings_list = pool.map(encode_sentences_on_gpu, params)
# print("Encode Finished...")


# sentence_embeddings = np.concatenate(embeddings_list, axis=0)


### Using Transformers

# Load tokenizer and encoder from the local checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Enable multi-GPU support if multiple GPUs are available
# (DataParallel splits each batch across the visible devices).
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model = model.to('cuda')
print("Model load success...")

# Batch size
# NOTE(review): 8192 sentences per forward pass — assumes ample GPU memory
# and short passages after truncation; reduce if OOM.
batch_size = 2048*4
print("len(sentences)//b_z: ", len(sentences)//batch_size)

# Mean pooling function
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the valid (mask == 1) positions.

    Args:
        token_embeddings: float tensor of shape (batch, seq_len, dim) — the
            encoder's last hidden state.
        mask: attention mask of shape (batch, seq_len); 1 = real token,
            0 = padding.

    Returns:
        Tensor of shape (batch, dim): per-sentence mean of unmasked tokens.
    """
    # Zero out padded positions so they do not contribute to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    # Clamp the valid-token count to >= 1: the original divided by
    # mask.sum(dim=1) directly, producing NaN for an all-padding row.
    denom = mask.sum(dim=1)[..., None].clamp(min=1)
    sentence_embeddings = token_embeddings.sum(dim=1) / denom
    return sentence_embeddings

# Generator to yield tokenized batches on the fly
def sentence_generator(sentences, tokenizer, batch_size):
    """Lazily tokenize *sentences* in slices of *batch_size*.

    Yields (input_ids, attention_mask) tensor pairs, one per batch, so the
    full corpus never needs to be tokenized (or held) at once.
    """
    total = len(sentences)
    start = 0
    while start < total:
        encoded = tokenizer(
            sentences[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        yield encoded['input_ids'], encoded['attention_mask']
        start += batch_size


# Encode the whole corpus batch by batch; inference only, so no gradients.
all_embeddings = []
with torch.no_grad():
    for input_ids, attention_mask in tqdm(sentence_generator(sentences, tokenizer, batch_size), 
                                          total=(len(sentences) + batch_size - 1) // batch_size, 
                                          desc="Processing batches"):
        input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Apply mean pooling
        embeddings = mean_pooling(outputs[0], attention_mask)
        # Move each batch to CPU immediately to keep GPU memory bounded.
        all_embeddings.append(embeddings.cpu())

# Single (num_sentences, dim) float array for FAISS.
sentence_embeddings = torch.cat(all_embeddings).numpy()



# Create a FAISS index over the corpus embeddings and persist it to disk.
print("Starting create FAISS...")
dim = sentence_embeddings.shape[1]
# NOTE(review): IndexFlatIP is plain inner product; it equals cosine
# similarity only when embeddings are L2-normalized, which the mean-pooled
# embeddings above are not — confirm raw inner product is intended.
faiss_index = faiss.IndexFlatIP(dim)

# Add in chunks to keep peak memory of the copy-into-index step bounded.
batch_size = 100000
for i in tqdm(range(0, len(sentence_embeddings), batch_size), desc="Adding embeddings to FAISS index"):
    faiss_index.add(sentence_embeddings[i:i + batch_size])

# faiss_index.add(sentence_embeddings)

faiss_index_file = 'results/faiss_index.bin'
# faiss.write_index does not create missing directories — ensure it exists
# so a long encoding run doesn't die at the very last step.
os.makedirs(os.path.dirname(faiss_index_file), exist_ok=True)
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")

# embeddings_file = 'document_embeddings.npy'
# np.save(embeddings_file, sentence_embeddings)
# print(f"Document embeddings saved to {embeddings_file}")

end_time = time.time()
execution_time_hours = (end_time - start_time) / 3600
print(f"Total execution time: {execution_time_hours:.2f} hours")