# probejie's picture
# Upload folder using huggingface_hub
# 237451b verified
import faiss
import numpy as np
import json
from tqdm import tqdm
import os
from torch.nn import DataParallel
from transformers import AutoTokenizer, AutoModel, T5EncoderModel
from torch.utils.data import DataLoader, TensorDataset
import torch
from sentence_transformers import SentenceTransformer
from multiprocessing import Pool
import time
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"
start_time = time.time()

# Load the preprocessed corpus; each record stores its chunk text under "contents".
with open("/mnt/ceph_rbd/hf_models/feedbackcorpus/merged_triple_processed_new_withID.json", "r") as corpus_file:
    data = json.load(corpus_file)
sentences = [record['contents'] for record in data]
print("Chunks nums: ", len(sentences))

# Alternative encoder checkpoints kept for reference:
# model_path = '/mnt/ceph_rbd/hf_models/gtr-t5-xl'
# model_path = '/mnt/ceph_rbd/hf_models/bge-large-en-v1.5'
model_path = '/mnt/ceph_rbd/hf_models/contriever'
### Using SentenceTransformer
# def encode_sentences_on_gpu(params):
# print("Encode start...")
# sentences_chunk, device_id = params
# device = torch.device(f'cuda:{device_id}')
# model = SentenceTransformer(model_path, device=device)
# embeddings = model.encode(
# sentences_chunk,
# batch_size=2048,
# show_progress_bar=True,
# convert_to_numpy=True,
# normalize_embeddings=True
# )
# print("Encode Success...")
# return embeddings
# num_gpus = torch.cuda.device_count()
# print(f"Number of GPUs: {num_gpus}")
# sentence_embeddings = encode_sentences_on_gpu((sentences,3))
# # sentences_chunks = np.array_split(sentences, num_gpus)
# # params = [(sentences_chunks[i], i) for i in range(num_gpus)]
# # with Pool(processes=num_gpus) as pool:
# # embeddings_list = pool.map(encode_sentences_on_gpu, params)
# print("Encode Finished...")
# sentence_embeddings = np.concatenate(embeddings_list, axis=0)
### Using Transformers
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Wrap in DataParallel when several GPUs are visible so each forward pass
# splits the batch across devices.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
model = model.to('cuda')
print("Model load success...")

# Number of sentences tokenized and encoded per forward pass.
batch_size = 8192
print("len(sentences)//b_z: ", len(sentences)//batch_size)
# Mean pooling function
def mean_pooling(token_embeddings, mask):
    """Average token embeddings over the sequence dimension, ignoring padding.

    Args:
        token_embeddings: float tensor of per-token embeddings; assumed
            (batch, seq_len, dim) — the last hidden state of the encoder.
        mask: (batch, seq_len) attention mask where nonzero marks real tokens.

    Returns:
        (batch, dim) tensor of mean-pooled sentence embeddings.
    """
    # Zero out padded positions so they do not contribute to the sum.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.0)
    # Clamp the token count so an all-padding row yields zeros instead of NaN
    # (division by zero); this matches the standard sentence-embedding pooling.
    token_counts = mask.sum(dim=1)[..., None].float().clamp(min=1e-9)
    sentence_embeddings = token_embeddings.sum(dim=1) / token_counts
    return sentence_embeddings
# Generator to yield tokenized batches on the fly
def sentence_generator(sentences, tokenizer, batch_size):
    """Lazily tokenize ``sentences`` in chunks of ``batch_size``.

    Yields one ``(input_ids, attention_mask)`` pair per batch so the whole
    corpus never has to be tokenized (and held in memory) at once.
    """
    for start in range(0, len(sentences), batch_size):
        chunk = sentences[start:start + batch_size]
        encoded = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
        yield encoded['input_ids'], encoded['attention_mask']
all_embeddings = []
num_batches = (len(sentences) + batch_size - 1) // batch_size
with torch.no_grad():
    batches = sentence_generator(sentences, tokenizer, batch_size)
    for ids, attn in tqdm(batches, total=num_batches, desc="Processing batches"):
        ids = ids.to('cuda')
        attn = attn.to('cuda')
        hidden = model(input_ids=ids, attention_mask=attn)
        # outputs[0] is the last hidden state; pool it into one vector per sentence.
        pooled = mean_pooling(hidden[0], attn)
        # Move each batch to CPU right away so GPU memory stays bounded.
        all_embeddings.append(pooled.cpu())
sentence_embeddings = torch.cat(all_embeddings).numpy()
# Create a FAISS index
print("Starting create FAISS...")
dim = sentence_embeddings.shape[1]
# NOTE(review): IndexFlatIP scores by raw inner product; the embeddings above
# are NOT L2-normalized, so this is dot-product retrieval (contriever-style),
# not true cosine similarity — confirm that is intended.
faiss_index = faiss.IndexFlatIP(dim)  # Inner product for cosine similarity
# Add in chunks so progress is visible for very large corpora.
batch_size = 100000
for i in tqdm(range(0, len(sentence_embeddings), batch_size), desc="Adding embeddings to FAISS index"):
    faiss_index.add(sentence_embeddings[i:i + batch_size])
# faiss_index.add(sentence_embeddings)
faiss_index_file = 'results/faiss_index.bin'
# Ensure the output directory exists; faiss.write_index fails on a missing path.
os.makedirs(os.path.dirname(faiss_index_file), exist_ok=True)
faiss.write_index(faiss_index, faiss_index_file)
print(f"FAISS index saved to {faiss_index_file}")
# embeddings_file = 'document_embeddings.npy'
# np.save(embeddings_file, sentence_embeddings)
# print(f"Document embeddings saved to {embeddings_file}")
end_time = time.time()
execution_time_hours = (end_time - start_time) / 3600
print(f"Total execution time: {execution_time_hours:.2f} hours")