File size: 4,730 Bytes
7164129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import faiss
import numpy as np
import json
from tqdm import tqdm
import os
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
import torch
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool
import time

start_time = time.time()

# Load the corpus; each record is expected to expose its text under 'contents'.
# Encoding is pinned so the JSON decodes identically regardless of platform locale.
with open("merged_triple_processed_new_withID.json", "r", encoding="utf-8") as fi:
    data = json.load(fi)

# `record` instead of `_`: the value is used, so the throwaway name was misleading.
sentences = [record['contents'] for record in data]
print("Chunks nums: ", len(sentences))

# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = 'facebook/contriever'

### Using SentenceTransformer
# def encode_sentences_on_gpu(params):
#     sentences_chunk, device_id = params
#     device = torch.device(f'cuda:{device_id}')
#     model = SentenceTransformer(model_path, device=device)
#     embeddings = model.encode(
#         sentences_chunk,
#         batch_size=1024,
#         show_progress_bar=False,
#         convert_to_numpy=True,
#         normalize_embeddings=True
#     )
#     return embeddings


# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs: {num_gpus}")

# sentences_chunks = np.array_split(sentences, num_gpus)
# params = [(sentences_chunks[i], i) for i in range(num_gpus)]

# with Pool(processes=num_gpus) as pool:
#     embeddings_list = pool.map(encode_sentences_on_gpu, params)


# sentence_embeddings = np.concatenate(embeddings_list, axis=0)


### Using Transformers

# Load the tokenizer and encoder weights for the configured retrieval model.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
# model = T5EncoderModel.from_pretrained(model_path)
model = DataParallel(model)  # Wrap the model for multi-GPU support
model.eval()  # inference only: disable dropout / training-mode layers


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Sentences per forward pass. NOTE(review): 1024 assumes ample GPU memory — confirm
# against the deployment hardware before reuse.
batch_size = 1024

def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the positions marked valid in `mask`.

    Masked-out positions contribute zero to the sum, and each row is divided
    by its own count of valid tokens.
    """
    valid = mask[..., None].bool()
    # Zero out padded positions, then sum across the sequence dimension.
    summed = torch.where(valid, token_embeddings, torch.zeros_like(token_embeddings)).sum(dim=1)
    counts = mask.sum(dim=1)[..., None]
    return summed / counts

def process_in_batches(sentences, batch_size):
    """Embed `sentences` in fixed-size batches and return one stacked tensor.

    Each batch is tokenized with padding/truncation, run through the encoder
    without gradients, and mean-pooled over valid tokens. Batch results are
    moved to CPU immediately so GPU memory stays bounded across batches.
    """
    pooled_batches = []
    total = len(sentences)
    for start in tqdm(range(0, total, batch_size)):
        batch = sentences[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model(**encoded)
            pooled = mean_pooling(outputs[0], encoded['attention_mask'])
            pooled_batches.append(pooled.cpu())  # keep GPU footprint per-batch only

    return torch.cat(pooled_batches, dim=0)


# Encode the whole corpus; result is a (num_sentences, hidden_dim) float tensor.
sentence_embeddings = process_in_batches(sentences, batch_size)

sentence_embeddings = sentence_embeddings.cpu().numpy()


# Create a FAISS index
dim = sentence_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(dim)  # Inner product for cosine similarity
# NOTE(review): inner product equals cosine similarity only for L2-normalized
# vectors, and the normalize step above is commented out — confirm this is intended.

faiss_index.add(sentence_embeddings)

# Persist both the index and the raw embeddings so retrieval can run later
# without re-encoding the corpus.
faiss_index_file = 'faiss_index.bin'
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")

embeddings_file = 'document_embeddings.npy'
np.save(embeddings_file, sentence_embeddings)
print(f"Document embeddings saved to {embeddings_file}")

end_time = time.time()
execution_time_hours = (end_time - start_time) / 3600
print(f"Total execution time: {execution_time_hours:.2f} hours")


# instruction = "Represent this sentence for searching relevant passages: "
# queries = ["Who is the president of U.S.A.?"]

# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# # Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input)
#     # Perform pooling. In this case, cls pooling.
#     sentence_embeddings = model_output[0][:, 0]
# # normalize embeddings
# query_vector = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)


# k = 5  # Number of nearest neighbors to retrieve
# distances, indices = faiss_index.search(np.array([query_vector.cpu()], dtype=np.float32), k)

# # Print the most similar documents
# for i, index in enumerate(indices[0]):
#     distance = distances[0][i]
#     print(f"Nearest neighbor {i+1}: {documents[index]}, Distance {distance}")