File size: 4,730 Bytes
7164129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import faiss
import numpy as np
import json
from tqdm import tqdm
import os
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
import torch
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool
import time

start_time = time.time()

# Load the corpus; each record is expected to expose its text under 'contents'.
# Encoding is pinned so the JSON decodes identically regardless of platform locale.
with open("merged_triple_processed_new_withID.json", "r", encoding="utf-8") as fi:
    data = json.load(fi)

# `record` instead of `_`: the value is used, so the throwaway name was misleading.
sentences = [record['contents'] for record in data]
print("Chunks nums: ", len(sentences))

# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = 'facebook/contriever'

### Using SentenceTransformer
# def encode_sentences_on_gpu(params):
#     sentences_chunk, device_id = params
#     device = torch.device(f'cuda:{device_id}')
#     model = SentenceTransformer(model_path, device=device)
#     embeddings = model.encode(
#         sentences_chunk,
#         batch_size=1024,
#         show_progress_bar=False,
#         convert_to_numpy=True,
#         normalize_embeddings=True
#     )
#     return embeddings


# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs: {num_gpus}")

# sentences_chunks = np.array_split(sentences, num_gpus)
# params = [(sentences_chunks[i], i) for i in range(num_gpus)]

# with Pool(processes=num_gpus) as pool:
#     embeddings_list = pool.map(encode_sentences_on_gpu, params)


# sentence_embeddings = np.concatenate(embeddings_list, axis=0)


### Using Transformers

# Load the tokenizer and encoder weights for the configured retrieval model.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
# model = T5EncoderModel.from_pretrained(model_path)
model = DataParallel(model)  # Wrap the model for multi-GPU support
model.eval()  # inference only: disable dropout / training-mode layers


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Sentences per forward pass. NOTE(review): 1024 assumes ample GPU memory — confirm
# against the deployment hardware before reuse.
batch_size = 1024

def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the positions marked valid in `mask`.

    Masked-out positions contribute zero to the sum, and each row is divided
    by its own count of valid tokens.
    """
    valid = mask[..., None].bool()
    # Zero out padded positions, then sum across the sequence dimension.
    summed = torch.where(valid, token_embeddings, torch.zeros_like(token_embeddings)).sum(dim=1)
    counts = mask.sum(dim=1)[..., None]
    return summed / counts

def process_in_batches(sentences, batch_size):
    """Embed `sentences` in fixed-size batches and return one stacked tensor.

    Each batch is tokenized with padding/truncation, run through the encoder
    without gradients, and mean-pooled over valid tokens. Batch results are
    moved to CPU immediately so GPU memory stays bounded across batches.
    """
    pooled_batches = []
    total = len(sentences)
    for start in tqdm(range(0, total, batch_size)):
        batch = sentences[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model(**encoded)
            pooled = mean_pooling(outputs[0], encoded['attention_mask'])
            pooled_batches.append(pooled.cpu())  # keep GPU footprint per-batch only

    return torch.cat(pooled_batches, dim=0)


# Encode the whole corpus; result is a (num_sentences, hidden_dim) float tensor.
sentence_embeddings = process_in_batches(sentences, batch_size)

sentence_embeddings = sentence_embeddings.cpu().numpy()


# Create a FAISS index
dim = sentence_embeddings.shape[1]
faiss_index = faiss.IndexFlatIP(dim)  # Inner product for cosine similarity
# NOTE(review): inner product equals cosine similarity only for L2-normalized
# vectors, and the normalize step above is commented out — confirm this is intended.

faiss_index.add(sentence_embeddings)

# Persist both the index and the raw embeddings so retrieval can run later
# without re-encoding the corpus.
faiss_index_file = 'faiss_index.bin'
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")

embeddings_file = 'document_embeddings.npy'
np.save(embeddings_file, sentence_embeddings)
print(f"Document embeddings saved to {embeddings_file}")

end_time = time.time()
execution_time_hours = (end_time - start_time) / 3600
print(f"Total execution time: {execution_time_hours:.2f} hours")


# instruction = "Represent this sentence for searching relevant passages: "
# queries = ["Who is the president of U.S.A.?"]

# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# # Compute token embeddings
# with torch.no_grad():
#     model_output = model(**encoded_input)
#     # Perform pooling. In this case, cls pooling.
#     sentence_embeddings = model_output[0][:, 0]
# # normalize embeddings
# query_vector = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)


# k = 5  # Number of nearest neighbors to retrieve
# distances, indices = faiss_index.search(np.array([query_vector.cpu()], dtype=np.float32), k)

# # Print the most similar documents
# for i, index in enumerate(indices[0]):
#     distance = distances[0][i]
#     print(f"Nearest neighbor {i+1}: {documents[index]}, Distance {distance}")