File size: 4,319 Bytes
237451b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import faiss
import numpy as np
import json
from tqdm import tqdm
import os
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
from torch.utils.data import DataLoader, TensorDataset
import torch
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool
import time


# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

start_time = time.time()

# Load the pre-processed corpus: a JSON array of records, each carrying the
# passage text under the 'contents' key (schema shown by the list comprehension below).
with open("/mnt/ceph_rbd/hf_models/feedbackcorpus/merged_triple_processed_new_withID.json", "r") as fi:
    data = json.load(fi)

# Keep only the raw text passages; these are what get embedded and indexed.
sentences = [_['contents'] for _ in data]
print("Chunks nums: ", len(sentences))


# Encoder checkpoint. Earlier experiments (gtr-t5-xl, bge-large) left commented.
# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = '/mnt/ceph_rbd/hf_models/contriever'

### Using SentenceTransformer
# def encode_sentences_on_gpu(params):   
#     print("Encode start...") 
#     sentences_chunk, device_id = params
#     device = torch.device(f'cuda:{device_id}')
#     model = SentenceTransformer(model_path, device=device)
#     embeddings = model.encode(
#         sentences_chunk,
#         batch_size=2048,
#         show_progress_bar=True,
#         convert_to_numpy=True,
#         normalize_embeddings=True
#     )
#     print("Encode Success...")
#     return embeddings


# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs: {num_gpus}")

# sentence_embeddings = encode_sentences_on_gpu((sentences,3))

# # sentences_chunks = np.array_split(sentences, num_gpus)
# # params = [(sentences_chunks[i], i) for i in range(num_gpus)]
# # with Pool(processes=num_gpus) as pool:
# #     embeddings_list = pool.map(encode_sentences_on_gpu, params)
# print("Encode Finished...")


# sentence_embeddings = np.concatenate(embeddings_list, axis=0)


### Using Transformers

# Load tokenizer and encoder from the local checkpoint selected above.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Enable multi-GPU support if multiple GPUs are available
# (DataParallel splits each batch across the visible devices).
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model = model.to('cuda')
print("Model load success...")

# Batch size
# NOTE(review): 8192 sentences per forward pass — assumes ample GPU memory
# and short passages after truncation; reduce if OOM.
batch_size = 2048*4
print("len(sentences)//b_z: ", len(sentences)//batch_size)

# Mean pooling function
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the valid (mask == 1) positions.

    Args:
        token_embeddings: float tensor of shape (batch, seq_len, dim) — the
            encoder's last hidden state.
        mask: attention mask of shape (batch, seq_len); 1 = real token,
            0 = padding.

    Returns:
        Tensor of shape (batch, dim): per-sentence mean of unmasked tokens.
    """
    # Zero out padded positions so they do not contribute to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    # Clamp the valid-token count to >= 1: the original divided by
    # mask.sum(dim=1) directly, producing NaN for an all-padding row.
    denom = mask.sum(dim=1)[..., None].clamp(min=1)
    sentence_embeddings = token_embeddings.sum(dim=1) / denom
    return sentence_embeddings

# Generator to yield tokenized batches on the fly
def sentence_generator(sentences, tokenizer, batch_size):
    """Lazily tokenize *sentences* in slices of *batch_size*.

    Yields (input_ids, attention_mask) tensor pairs, one per batch, so the
    full corpus never needs to be tokenized (or held) at once.
    """
    total = len(sentences)
    start = 0
    while start < total:
        encoded = tokenizer(
            sentences[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors='pt',
        )
        yield encoded['input_ids'], encoded['attention_mask']
        start += batch_size


# Encode the whole corpus batch by batch; inference only, so no gradients.
all_embeddings = []
with torch.no_grad():
    for input_ids, attention_mask in tqdm(sentence_generator(sentences, tokenizer, batch_size), 
                                          total=(len(sentences) + batch_size - 1) // batch_size, 
                                          desc="Processing batches"):
        input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Apply mean pooling
        embeddings = mean_pooling(outputs[0], attention_mask)
        # Move each batch to CPU immediately to keep GPU memory bounded.
        all_embeddings.append(embeddings.cpu())

# Single (num_sentences, dim) float array for FAISS.
sentence_embeddings = torch.cat(all_embeddings).numpy()



# Create a FAISS index over the corpus embeddings and persist it to disk.
print("Starting create FAISS...")
dim = sentence_embeddings.shape[1]
# NOTE(review): IndexFlatIP is plain inner product; it equals cosine
# similarity only when embeddings are L2-normalized, which the mean-pooled
# embeddings above are not — confirm raw inner product is intended.
faiss_index = faiss.IndexFlatIP(dim)

# Add in chunks to keep peak memory of the copy-into-index step bounded.
batch_size = 100000
for i in tqdm(range(0, len(sentence_embeddings), batch_size), desc="Adding embeddings to FAISS index"):
    faiss_index.add(sentence_embeddings[i:i + batch_size])

# faiss_index.add(sentence_embeddings)

faiss_index_file = 'results/faiss_index.bin'
# faiss.write_index does not create missing directories — ensure it exists
# so a long encoding run doesn't die at the very last step.
os.makedirs(os.path.dirname(faiss_index_file), exist_ok=True)
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")

# embeddings_file = 'document_embeddings.npy'
# np.save(embeddings_file, sentence_embeddings)
# print(f"Document embeddings saved to {embeddings_file}")

end_time = time.time()
execution_time_hours = (end_time - start_time) / 3600
print(f"Total execution time: {execution_time_hours:.2f} hours")