"""Encode a document corpus with a HuggingFace encoder and build a FAISS index.

Loads a pre-processed JSON corpus, embeds every chunk with a dense encoder
(contriever by default), and writes a flat inner-product FAISS index.
"""

import json
import os
import time
from multiprocessing import Pool

import faiss
import numpy as np
import torch
from torch.nn import DataParallel
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
from sentence_transformers import SentenceTransformer

# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

start_time = time.time()

# Load the pre-processed corpus; each record carries its text under 'contents'.
with open("/mnt/ceph_rbd/hf_models/feedbackcorpus/merged_triple_processed_new_withID.json", "r") as fi:
    data = json.load(fi)

sentences = [_['contents'] for _ in data]
print("Chunks nums: ", len(sentences))

# Alternative encoders that were tried:
# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = '/mnt/ceph_rbd/hf_models/contriever'

# NOTE(review): a commented-out SentenceTransformer-based multi-GPU encoding
# path (one model per GPU via multiprocessing.Pool) was removed from here;
# recover it from version control if that approach is revisited.

### Using Transformers
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Enable multi-GPU support if multiple GPUs are available.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model = model.to('cuda')
# Fix: switch to inference mode — without eval(), dropout stays active and
# the produced embeddings are nondeterministic.
model.eval()
print("Model load success...")
# Batch size for tokenization / encoding.
batch_size = 2048 * 4
print("len(sentences)//b_z: ", len(sentences)//batch_size)


def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the non-padding positions of each sequence.

    Args:
        token_embeddings: float tensor, (batch, seq_len, dim) model output.
        mask: attention mask, (batch, seq_len); 1 = real token, 0 = padding.

    Returns:
        (batch, dim) tensor of mean-pooled sentence embeddings.
    """
    # Zero out padding positions so they don't contribute to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    # Fix: clamp the token count to >= 1 so an all-padding row yields a zero
    # vector instead of NaN (division by zero).
    denom = mask.sum(dim=1)[..., None].clamp(min=1)
    return token_embeddings.sum(dim=1) / denom


def sentence_generator(sentences, tokenizer, batch_size):
    """Yield (input_ids, attention_mask) tensors one tokenized batch at a time.

    Tokenizes lazily so the whole corpus is never tokenized into memory at once.
    """
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors='pt')
        yield inputs['input_ids'], inputs['attention_mask']


all_embeddings = []
with torch.no_grad():
    for input_ids, attention_mask in tqdm(
            sentence_generator(sentences, tokenizer, batch_size),
            total=(len(sentences) + batch_size - 1) // batch_size,
            desc="Processing batches"):
        input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Mean-pool the last hidden state into one vector per sentence.
        embeddings = mean_pooling(outputs[0], attention_mask)
        all_embeddings.append(embeddings.cpu())

# Fix: FAISS requires contiguous float32 input.
sentence_embeddings = np.ascontiguousarray(
    torch.cat(all_embeddings).numpy().astype('float32'))

# Create a FAISS index.
print("Starting create FAISS...")
dim = sentence_embeddings.shape[1]
# Inner-product (dot-product) index. NOTE(review): this is cosine similarity
# only if embeddings are L2-normalized, which they are not here — contriever
# is scored by raw dot product, so IP is the intended metric; confirm if a
# different encoder (e.g. bge) is swapped in.
faiss_index = faiss.IndexFlatIP(dim)

# Add embeddings in chunks to bound peak memory inside faiss.add.
batch_size = 100000
for i in tqdm(range(0, len(sentence_embeddings), batch_size),
              desc="Adding embeddings to FAISS index"):
    faiss_index.add(sentence_embeddings[i:i + batch_size])

faiss_index_file = 'results/faiss_index.bin'
# Fix: write_index fails if the target directory does not exist.
os.makedirs(os.path.dirname(faiss_index_file), exist_ok=True)
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")

# embeddings_file = 'document_embeddings.npy'
# np.save(embeddings_file, sentence_embeddings)
# print(f"Document embeddings saved to {embeddings_file}")

end_time = time.time()
execution_time_hours = (end_time - start_time) / 3600
print(f"Total execution time: {execution_time_hours:.2f} hours")