import faiss
import numpy as np
import json
from tqdm import tqdm
import os
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
from torch.utils.data import DataLoader, TensorDataset
import torch
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool
import time
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"
start_time = time.time()
# Load the pre-processed corpus; each record exposes its text under 'contents'.
with open("/mnt/ceph_rbd/hf_models/feedbackcorpus/merged_triple_processed_new_withID.json", "r") as fi:
    data = json.load(fi)
sentences = [record['contents'] for record in data]
print("Chunks nums: ", len(sentences))
# Encoder checkpoints tried previously (kept for reference):
# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = '/mnt/ceph_rbd/hf_models/contriever'
### Using SentenceTransformer
# def encode_sentences_on_gpu(params):
# print("Encode start...")
# sentences_chunk, device_id = params
# device = torch.device(f'cuda:{device_id}')
# model = SentenceTransformer(model_path, device=device)
# embeddings = model.encode(
# sentences_chunk,
# batch_size=2048,
# show_progress_bar=True,
# convert_to_numpy=True,
# normalize_embeddings=True
# )
# print("Encode Success...")
# return embeddings
# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs: {num_gpus}")
# sentence_embeddings = encode_sentences_on_gpu((sentences,3))
# # sentences_chunks = np.array_split(sentences, num_gpus)
# # params = [(sentences_chunks[i], i) for i in range(num_gpus)]
# # with Pool(processes=num_gpus) as pool:
# # embeddings_list = pool.map(encode_sentences_on_gpu, params)
# print("Encode Finished...")
# sentence_embeddings = np.concatenate(embeddings_list, axis=0)
### Using Transformers
# Load the tokenizer/encoder pair from the local contriever checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
# Wrap in DataParallel so each forward pass is sharded across all visible GPUs.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    model = torch.nn.DataParallel(model)
model = model.to('cuda')
print("Model load success...")
# Number of sentences encoded per forward pass (DataParallel splits it per GPU).
batch_size = 2048 * 4
print("len(sentences)//b_z: ", len(sentences)//batch_size)
# Mean pooling function
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the valid (nonzero-mask) positions.

    Args:
        token_embeddings: float tensor of shape (batch, seq_len, hidden).
        mask: attention mask of shape (batch, seq_len); nonzero marks real tokens.

    Returns:
        Tensor of shape (batch, hidden) holding the masked mean per sequence.
        A row whose mask is all zeros yields a zero vector instead of NaN.
    """
    # Zero out padding positions so they contribute nothing to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    # Clamp the token count to at least 1: for an all-padding row the numerator
    # is already zero, so clamping avoids a 0/0 -> NaN without changing the
    # result for any row with a real token (0/1 masks sum to integers >= 1).
    denom = mask.sum(dim=1)[..., None].clamp(min=1)
    return token_embeddings.sum(dim=1) / denom
# Lazily tokenize the corpus one batch at a time so the full token tensor
# never has to fit in memory at once.
def sentence_generator(sentences, tokenizer, batch_size):
    """Yield (input_ids, attention_mask) pairs for successive sentence batches."""
    total = len(sentences)
    start = 0
    while start < total:
        encoded = tokenizer(sentences[start:start + batch_size],
                            padding=True, truncation=True, return_tensors='pt')
        yield encoded['input_ids'], encoded['attention_mask']
        start += batch_size
# Encode the entire corpus batch-by-batch without tracking gradients; each
# pooled batch is moved back to CPU immediately so GPU memory stays bounded
# by a single batch.
all_embeddings = []
with torch.no_grad():
    for input_ids, attention_mask in tqdm(sentence_generator(sentences, tokenizer, batch_size),
                                          total=(len(sentences) + batch_size - 1) // batch_size,
                                          desc="Processing batches"):
        input_ids, attention_mask = input_ids.to('cuda'), attention_mask.to('cuda')
        # outputs[0] is the last hidden state: (batch, seq_len, hidden).
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Apply mean pooling
        # NOTE(review): embeddings are NOT L2-normalized here, so the
        # IndexFlatIP built below scores raw dot products — confirm that is
        # the intended metric for this model.
        embeddings = mean_pooling(outputs[0], attention_mask)
        all_embeddings.append(embeddings.cpu())
# Concatenate all per-batch results into one (num_sentences, hidden) array.
sentence_embeddings = torch.cat(all_embeddings).numpy()
# Create a flat inner-product FAISS index over the corpus embeddings.
print("Starting create FAISS...")
dim = sentence_embeddings.shape[1]
# NOTE(review): the embeddings above are not L2-normalized, so IndexFlatIP
# returns raw dot products, not cosine similarity — normalize the embeddings
# first if cosine ranking is intended.
faiss_index = faiss.IndexFlatIP(dim)
# Add in chunks so FAISS never has to copy the whole matrix at once.
batch_size = 100000
for i in tqdm(range(0, len(sentence_embeddings), batch_size), desc="Adding embeddings to FAISS index"):
    faiss_index.add(sentence_embeddings[i:i + batch_size])
faiss_index_file = 'results/faiss_index.bin'
# Fix: faiss.write_index raises if the output directory does not exist yet,
# which would lose the (expensive) encoding work above.
os.makedirs(os.path.dirname(faiss_index_file), exist_ok=True)
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")
# embeddings_file = 'document_embeddings.npy'
# np.save(embeddings_file, sentence_embeddings)
# print(f"Document embeddings saved to {embeddings_file}")
# Report the total wall-clock runtime of the whole pipeline in hours.
elapsed_hours = (time.time() - start_time) / 3600
print(f"Total execution time: {elapsed_hours:.2f} hours")