# alverciito — zero-shot benchmark (commit edbcb21)
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# #
# This file was created by: Alberto Palomo Alonso #
# Universidad de Alcalá - Escuela Politécnica Superior #
# #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# Import statements:
import torch
import tqdm
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from scipy.stats import spearmanr
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring padding.

    :param model_output: model forward output; element 0 holds the token
        embeddings with shape (batch, seq_len, hidden).
    :param attention_mask: (batch, seq_len) mask, 1 for real tokens.
    :return: (batch, hidden) mask-weighted mean of the token embeddings.
    """
    embeddings = model_output[0]
    # Broadcast the mask over the hidden dimension so padded positions
    # contribute nothing to the sum.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = torch.sum(embeddings * mask, 1)
    # Clamp avoids division by zero for fully-masked (empty) sequences.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts
def zero_shot_proposed(
        model_repo: str = None,
        data_repo: str = None,
        batch_size: int = 32,
        device: torch.device = torch.device('cpu')
):
    """
    Run a zero-shot sentence-similarity benchmark on an STS-style dataset.

    Each (sentence1, sentence2) pair of the dataset's 'test' split is embedded
    with the given model and scored: plain encoders are mean-pooled and scored
    with cosine similarity; models exposing ``get_sentence_embedding`` use
    their own embedding and ``similarity`` methods. Predictions are compared
    against the gold 'label' column via Spearman correlation.

    :param model_repo: HF model repository path/identifier; prompted
        interactively when None.
    :param data_repo: HF dataset repository path/identifier (needs a 'test'
        split with 'sentence1', 'sentence2', 'label'); prompted when None.
    :param batch_size: number of sentence pairs processed per step.
    :param device: torch device the model and inputs are moved to.
    :return: Spearman correlation between gold labels and predicted scores.
    """
    # Pathing: ask interactively for anything not supplied.
    if model_repo is None:
        model_repo = input("Enter the model repository path or identifier: ")
    if data_repo is None:
        data_repo = input("Enter the dataset repository path or identifier: ")
    # Loading:
    tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_repo, trust_remote_code=True)
    dataset = load_dataset(data_repo)
    model.eval()
    model.to(device)
    # The embedding API is a property of the model, not of the batch, so
    # resolve it once instead of re-running hasattr() on every iteration.
    use_sentence_api = hasattr(model, 'get_sentence_embedding')
    y = []
    y_hat = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
            if not use_sentence_api:
                # Generic encoder path: mean-pool the last hidden states and
                # score with the cosine similarity of L2-normalized vectors.
                # NOTE(review): max_length=382 is unusual — 384 intended? Confirm.
                inputs_1 = tokenizer(batch['sentence1'], return_tensors="pt", padding=True, truncation=True, max_length=382)
                inputs_2 = tokenizer(batch['sentence2'], return_tensors="pt", padding=True, truncation=True, max_length=382)
                inputs_1 = {k: v.to(device) for k, v in inputs_1.items()}
                inputs_2 = {k: v.to(device) for k, v in inputs_2.items()}
                embeddings_1 = model(**inputs_1)
                embeddings_2 = model(**inputs_2)
                embeddings_1 = mean_pooling(embeddings_1, inputs_1['attention_mask'])
                embeddings_2 = mean_pooling(embeddings_2, inputs_2['attention_mask'])
                embeddings_1 = torch.nn.functional.normalize(embeddings_1, p=2, dim=-1)
                embeddings_2 = torch.nn.functional.normalize(embeddings_2, p=2, dim=-1)
                sim = (embeddings_1 * embeddings_2).sum(dim=-1)
            else:
                # Custom-model path: delegate embedding and scoring to the
                # model's own API (fixed-length padding as it may require it).
                inputs_1 = tokenizer(batch['sentence1'], return_tensors="pt", padding='max_length', truncation=True, max_length=382)
                inputs_2 = tokenizer(batch['sentence2'], return_tensors="pt", padding='max_length', truncation=True, max_length=382)
                inputs_1 = {k: v.to(device) for k, v in inputs_1.items()}
                inputs_2 = {k: v.to(device) for k, v in inputs_2.items()}
                embeddings_1 = model.get_sentence_embedding(
                    input_ids=inputs_1["input_ids"],
                    attention_mask=inputs_1["attention_mask"]
                )
                embeddings_2 = model.get_sentence_embedding(
                    input_ids=inputs_2["input_ids"],
                    attention_mask=inputs_2["attention_mask"]
                )
                # NOTE(review): assumes model.similarity returns one score per
                # pair, not a pairwise matrix — verify for the model in use.
                sim = model.similarity(embeddings_1, embeddings_2)
            y.extend(batch['label'])
            y_hat.extend(sim.cpu().numpy().tolist())
    # Benchmarking: print per-pair results, then the aggregate correlation.
    for _y, _yh in zip(y, y_hat):
        print(f"Gold: {_y:.4f} - Predicted: {_yh:.4f}")
    rho, _ = spearmanr(y, y_hat)
    print(f"Average Spearman correlation: {rho:.4f}")
    return rho
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# END OF FILE #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #