# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                                                           #
#   This file was created by: Alberto Palomo Alonso         #
#   Universidad de Alcalá - Escuela Politécnica Superior    #
#                                                           #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
# Import statements:
import torch
import tqdm
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from scipy.stats import spearmanr


# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Mean-pool token embeddings into one sentence embedding per row.

    Padded positions are excluded from the average by weighting every token
    embedding with its (broadcast) attention-mask value.

    :param model_output: model forward output; element 0 holds the token
        embeddings with shape (batch, seq_len, hidden).
    :param attention_mask: attention mask with shape (batch, seq_len).
    :return: tensor of sentence embeddings with shape (batch, hidden).
    """
    # First element of model_output contains all token embeddings.
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Clamp guards against division by zero for an all-padding row.
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


def zero_shot_proposed(
        model_repo: str = None,
        data_repo: str = None,
        batch_size: int = 32,
        device: torch.device = torch.device('cpu')
):
    """Run a zero-shot sentence-similarity benchmark on a dataset's test split.

    Loads a model and a dataset from the given repositories, scores every
    (sentence1, sentence2) pair in ``dataset['test']`` with a cosine-style
    similarity, and reports the Spearman correlation between the predicted
    similarities and the gold ``label`` column.

    Two scoring paths are supported:

    * Models exposing ``get_sentence_embedding`` / ``similarity`` (custom
      remote-code models): inputs are padded to ``max_length`` and the
      model's own similarity function is used.
    * Plain ``AutoModel`` backbones: token embeddings are mean-pooled
      (see :func:`mean_pooling`), L2-normalized, and compared with a dot
      product (i.e. cosine similarity).

    :param model_repo: model repository path or HF identifier; prompted
        interactively when ``None``.
    :param data_repo: dataset repository path or HF identifier; prompted
        interactively when ``None``.
    :param batch_size: number of sentence pairs scored per batch.
    :param device: torch device the model and inputs are moved to.
    :return: the Spearman rank correlation (float) over the test split.
    """
    # Pathing:
    if model_repo is None:
        model_repo = input("Enter the model repository path or identifier: ")
    if data_repo is None:
        data_repo = input("Enter the dataset repository path or identifier: ")

    # Loading (trust_remote_code allows custom model classes shipped in the repo):
    tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_repo, trust_remote_code=True)
    dataset = load_dataset(data_repo)
    model.eval()
    model.to(device)

    y = []
    y_hat = []
    with torch.no_grad():
        for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
            if not hasattr(model, 'get_sentence_embedding'):
                # Generic backbone: dynamic padding, mean pooling + cosine.
                inputs_1 = tokenizer(batch['sentence1'], return_tensors="pt",
                                     padding=True, truncation=True, max_length=382)
                inputs_2 = tokenizer(batch['sentence2'], return_tensors="pt",
                                     padding=True, truncation=True, max_length=382)
                inputs_1 = {k: v.to(device) for k, v in inputs_1.items()}
                inputs_2 = {k: v.to(device) for k, v in inputs_2.items()}
                embeddings_1 = model(**inputs_1)
                embeddings_2 = model(**inputs_2)
                embeddings_1 = mean_pooling(embeddings_1, inputs_1['attention_mask'])
                embeddings_2 = mean_pooling(embeddings_2, inputs_2['attention_mask'])
                embeddings_1 = torch.nn.functional.normalize(embeddings_1, p=2, dim=-1)
                embeddings_2 = torch.nn.functional.normalize(embeddings_2, p=2, dim=-1)
                # Dot product of unit vectors == cosine similarity, one value per pair.
                sim = (embeddings_1 * embeddings_2).sum(dim=-1)
            else:
                # Custom model path: fixed-length padding, model-provided embedding API.
                inputs_1 = tokenizer(batch['sentence1'], return_tensors="pt",
                                     padding='max_length', truncation=True, max_length=382)
                inputs_2 = tokenizer(batch['sentence2'], return_tensors="pt",
                                     padding='max_length', truncation=True, max_length=382)
                inputs_1 = {k: v.to(device) for k, v in inputs_1.items()}
                inputs_2 = {k: v.to(device) for k, v in inputs_2.items()}
                embeddings_1 = model.get_sentence_embedding(
                    input_ids=inputs_1["input_ids"],
                    attention_mask=inputs_1["attention_mask"]
                )
                embeddings_2 = model.get_sentence_embedding(
                    input_ids=inputs_2["input_ids"],
                    attention_mask=inputs_2["attention_mask"]
                )
                # NOTE(review): assumes model.similarity returns one score per
                # row pair (shape (batch,)). Some similarity APIs return an
                # all-pairs (batch, batch) matrix — confirm against the model's
                # remote code, otherwise y_hat would collect lists, not floats.
                sim = model.similarity(embeddings_1, embeddings_2)

            y.extend(batch['label'])
            y_hat.extend(sim.cpu().numpy().tolist())

    # Benchmarking:
    for _y, _yh in zip(y, y_hat):
        print(f"Gold: {_y:.4f} - Predicted: {_yh:.4f}")
    rho, _ = spearmanr(y, y_hat)
    print(f"Average Spearman correlation: {rho:.4f}")
    return rho
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
#                        END OF FILE                        #
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #