f541119578
/

codescripts

Model card Files Files and versions

codescripts / bertencode.py

f541119578's picture

Upload folder using huggingface_hub

fdf190d verified about 1 year ago

history blame contribute delete

2.23 kB

	"""
	This example starts multiple processes (10 per GPU across 8 GPUs), which
	encode sentences in parallel. This gives a near linear speed-up
	when encoding large text collections.
	"""
	from tqdm import tqdm
	import logging
	import json
	import torch
	from sentence_transformers import LoggingHandler, SentenceTransformer

	# Emit INFO-and-above records with a timestamp prefix, routed through
	# the LoggingHandler imported from sentence_transformers.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s - %(message)s",
	    datefmt="%Y-%m-%d %H:%M:%S",
	    handlers=[LoggingHandler()],
	)

	# JSON-lines input: every non-blank line is an object with an "instruction" key.
	INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_selected.json"
	# Local directory holding the SentenceTransformer weights.
	MODEL_PATH = "/home/aiscuser/fhw/model_weights/all-roberta-large-v1"
	# Destination of the saved embeddings.
	OUTPUT_PATH = "/home/aiscuser/fhw/embeddings/qwq_ins_embeddings.pt"
	# Worker layout: WORKERS_PER_GPU encoder processes on each of cuda:0..cuda:(NUM_GPUS-1).
	NUM_GPUS = 8
	WORKERS_PER_GPU = 10


	def build_device_list(num_gpus, workers_per_gpu):
	    """Return the target device name for every worker process.

	    Each GPU index in ``range(num_gpus)`` is repeated ``workers_per_gpu``
	    times, grouped by GPU, e.g. ``(2, 2)`` gives
	    ``["cuda:0", "cuda:0", "cuda:1", "cuda:1"]``.
	    """
	    return [f"cuda:{gpu}" for gpu in range(num_gpus) for _ in range(workers_per_gpu)]


	def load_instructions(lines):
	    """Extract the ``"instruction"`` field from each JSON-encoded line.

	    Args:
	        lines: iterable of strings, one JSON object per line.

	    Returns:
	        A list with the ``"instruction"`` value of every non-blank line,
	        in input order.

	    Raises:
	        json.JSONDecodeError: if a non-blank line is not valid JSON.
	        KeyError: if a record has no ``"instruction"`` key.
	    """
	    instructions = []
	    for line in lines:
	        # Tolerate blank lines (e.g. a trailing newline at end of file)
	        # instead of crashing in json.loads.
	        if not line.strip():
	            continue
	        instructions.append(json.loads(line)["instruction"])
	    return instructions


	# Important, you need to shield your code with if __name__. Otherwise, CUDA runs into issues when spawning new processes.
	if __name__ == "__main__":
	    # Read-only access is sufficient; the context manager closes the file.
	    with open(INPUT_PATH, "r", encoding="utf-8") as f:
	        lines = f.readlines()
	    # tqdm wraps the already-read list so the progress bar knows the total.
	    sentences = load_instructions(tqdm(lines))

	    # Define the model
	    model = SentenceTransformer(MODEL_PATH)

	    # Start the multi-process pool with several workers per CUDA device
	    pool = model.start_multi_process_pool(build_device_list(NUM_GPUS, WORKERS_PER_GPU))
	    try:
	        # Compute the embeddings using the multi-process pool
	        emb = model.encode_multi_process(sentences, pool)
	        print("Embeddings computed. Shape:", emb.shape)
	    finally:
	        # Always stop the worker processes, even if encoding fails.
	        model.stop_multi_process_pool(pool)

	    torch.save(emb, OUTPUT_PATH, pickle_protocol=4)