# Encode a pre-processed document corpus with a Transformer encoder and
# build a FAISS inner-product index over the resulting embeddings.
import faiss
import numpy as np
import json
from tqdm import tqdm
import os
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
import torch
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool
import time
start_time = time.time()  # wall-clock start; total runtime is reported at the end
# Load the pre-processed corpus. Assumes a JSON list of dicts with a
# 'contents' field holding each chunk's text — TODO confirm against the
# upstream preprocessing script.
with open("merged_triple_processed_new_withID.json", "r") as fi:
    data = json.load(fi)
sentences = [_['contents'] for _ in data]
print("Chunks nums: ", len(sentences))
# Alternative encoder checkpoints tried previously:
# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = 'facebook/contriever'  # current encoder; mean pooling, no normalization (see below)
### Using SentenceTransformer
# def encode_sentences_on_gpu(params):
# sentences_chunk, device_id = params
# device = torch.device(f'cuda:{device_id}')
# model = SentenceTransformer(model_path, device=device)
# embeddings = model.encode(
# sentences_chunk,
# batch_size=1024,
# show_progress_bar=False,
# convert_to_numpy=True,
# normalize_embeddings=True
# )
# return embeddings
# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs: {num_gpus}")
# sentences_chunks = np.array_split(sentences, num_gpus)
# params = [(sentences_chunks[i], i) for i in range(num_gpus)]
# with Pool(processes=num_gpus) as pool:
# embeddings_list = pool.map(encode_sentences_on_gpu, params)
# sentence_embeddings = np.concatenate(embeddings_list, axis=0)
### Using Transformers
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
# model = T5EncoderModel.from_pretrained(model_path)  # needed for GTR-style T5 checkpoints
model = DataParallel(model) # Wrap the model for multi-GPU support (falls back to 1 device transparently)
model.eval()  # inference only — disables dropout etc.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
batch_size = 1024  # sentences per forward pass; tune to fit GPU memory
def mean_pooling(token_embeddings, mask):
    """Mean-pool token embeddings over the sequence dimension, ignoring padding.

    Args:
        token_embeddings: float tensor of shape (batch, seq_len, hidden).
        mask: attention mask of shape (batch, seq_len); nonzero marks real tokens.

    Returns:
        Tensor of shape (batch, hidden): per-sequence mean over unmasked tokens.
    """
    # Zero out padding positions so they don't contribute to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    # clamp(min=1) guards against division by zero (NaN output) when a row's
    # mask is all zeros; for any normal row (>=1 real token) it is a no-op.
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None].clamp(min=1)
    return sentence_embeddings
def process_in_batches(sentences, batch_size):
    """Encode all sentences in fixed-size batches; return a (N, hidden) CPU tensor.

    Relies on the module-level `tokenizer`, `model`, `device`, and
    `mean_pooling` defined above.
    """
    collected = []
    for start in tqdm(range(0, len(sentences), batch_size)):
        batch = sentences[start:start + batch_size]
        encoded = tokenizer(batch, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            outputs = model(**encoded)
        pooled = mean_pooling(outputs[0], encoded['attention_mask'])
        # Alternative poolings (model-dependent):
        #   BGE: CLS token          -> outputs[0][:, 0]
        #   GTR: mean of last state -> outputs.last_hidden_state.mean(dim=1), then L2-normalize
        collected.append(pooled.cpu())  # move each batch to CPU to keep GPU memory flat
    return torch.cat(collected, dim=0)
sentence_embeddings = process_in_batches(sentences, batch_size)
sentence_embeddings = sentence_embeddings.cpu().numpy()  # faiss expects float32 numpy arrays
# Create a FAISS index
dim = sentence_embeddings.shape[1]
# IndexFlatIP scores by raw inner product. NOTE(review): the embeddings are
# NOT L2-normalized above (the normalize line is commented out), so this is
# dot-product similarity (Contriever's convention), not cosine — normalize
# the embeddings first if cosine similarity is intended.
faiss_index = faiss.IndexFlatIP(dim) # Inner product; see note above — not cosine without normalization
faiss_index.add(sentence_embeddings)
faiss_index_file = 'faiss_index.bin'
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")
# Persist raw embeddings alongside the index for later re-indexing/debugging.
embeddings_file = 'document_embeddings.npy'
np.save(embeddings_file, sentence_embeddings)
print(f"Document embeddings saved to {embeddings_file}")
end_time = time.time()
execution_time_hours = (end_time - start_time) / 3600
print(f"Total execution time: {execution_time_hours:.2f} hours")
# instruction = "Represent this sentence for searching relevant passages: "
# queries = ["Who is the president of U.S.A.?"]
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
# # Compute token embeddings
# with torch.no_grad():
# model_output = model(**encoded_input)
# # Perform pooling. In this case, cls pooling.
# sentence_embeddings = model_output[0][:, 0]
# # normalize embeddings
# query_vector = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
# k = 5 # Number of nearest neighbors to retrieve
# distances, indices = faiss_index.search(np.array([query_vector.cpu()], dtype=np.float32), k)
# # Print the most similar documents
# for i, index in enumerate(indices[0]):
# distance = distances[0][i]
# print(f"Nearest neighbor {i+1}: {documents[index]}, Distance {distance}")