| | from multiprocessing import Pool |
| | from typing import List |
| |
|
| | import numpy as np |
| | import torch |
| | from pyscripts.utils.dialog_eval.vert import ( |
| | get_auto_bleu2_geometric, |
| | get_self_bleu2_geometric, |
| | run_f, |
| | ) |
| | from scipy.stats import gmean |
| | from sklearn.metrics.pairwise import cosine_similarity |
| | from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer |
| |
|
| |
|
def perplexity(LLM_Output: str, model_id: str = "gpt2") -> str:
    """
    Compute the perplexity of the given text using a specified model from the
    `evaluate` library (default: GPT-2).

    Args:
        LLM_Output (str):
            The text (string) for which perplexity is to be computed.
        model_id (str, optional):
            The identifier of the model to use for computing
            perplexity. Defaults to "gpt2".

    Returns:
        str:
            A formatted string showing the perplexity of the
            provided text(s), for example:
            "Perplexity: 45.23\n"

    Raises:
        Exception:
            Re-raised if the `evaluate` library is not installed or cannot
            be imported (an explanatory message is printed first).

    Example:
        >>> text = "Hello world, this is a test."
        >>> result = perplexity(text, model_id="gpt2")
        >>> print(result)
        "Perplexity: 27.34\n"
    """
    try:
        import evaluate
    except Exception as e:
        print("Error: evaluate is not properly installed.")
        raise e
    # Use a distinct local name so we do not shadow this function itself.
    perplexity_metric = evaluate.load("perplexity", module_type="metric")
    results = perplexity_metric.compute(model_id=model_id, predictions=[LLM_Output])
    return f"Perplexity: {results['mean_perplexity']:.2f}\n"
| |
|
| |
|
def vert(LLM_response_arr: List[str]) -> str:
    """
    Calculate and return Self BLEU-2, Auto BLEU-2 and VERT-2
    metrics for a list of LLM responses.

    Args:
        LLM_response_arr (List[str]):
            A list of responses (strings) generated by the language
            model acting as text dialog response generator.

    Returns:
        str:
            A formatted string that includes each computed metric and the final
            VERT value, for example:

            "Self-BLEU2-geometric: 42.13
            Auto-BLEU2-geometric: 38.94
            VERT: 40.5
            "

    Example:
        >>> # Suppose we have the following LLM responses:
        >>> responses = ["Hello world", "Foo bar", "Lorem ipsum dolor sit amet"]
        >>> result = vert(responses)
        >>> print(result)
        "Self-BLEU2-geometric: 42.13
        Auto-BLEU2-geometric: 38.94
        VERT: 40.5
        "
    """
    # Whitespace-tokenize each response once, up front.
    tokenized = [response.strip().split() for response in LLM_response_arr]

    metric_fns = [
        ("Self-BLEU2-geometric", get_self_bleu2_geometric),
        ("Auto-BLEU2-geometric", get_auto_bleu2_geometric),
    ]
    # One worker per metric, capped at 16 processes.
    worker_count = min(16, len(metric_fns))
    with Pool(worker_count) as pool:
        raw_scores = pool.map(run_f, [(fn, tokenized) for _, fn in metric_fns])

    report_lines = []
    rounded_means = []
    for (metric_name, _), scores in zip(metric_fns, raw_scores):
        # Report each metric as a percentage rounded to 2 decimals.
        mean_pct = round(100 * np.mean(scores), 2)
        rounded_means.append(mean_pct)
        report_lines.append(f"{metric_name}: {mean_pct}\n")

    # VERT is the geometric mean of the individual (rounded) metrics.
    report_lines.append(f"VERT: {round(gmean(rounded_means), 2)}\n")
    return "".join(report_lines)
| |
|
| |
|
def bert_score(
    total_response_arr: List[str], bert_model_name: str = "bert-base-uncased"
) -> str:
    """
    Compute a cosine similarity score between the concatenated
    context (all but the last element)
    and the final response (last element) using a BERT-based model.
    This serves as a simplified
    measure of how closely the response aligns with the preceding context semantically.

    Args:
        total_response_arr (List[str]):
            A list of strings. The last element represents the response,
            while all other elements
            are treated as the context.
        bert_model_name (str, optional):
            The name or path of the BERT model to use (from the Hugging Face Model Hub).
            Defaults to "bert-base-uncased".

    Returns:
        str:
            A string containing the cosine similarity
            (as a percentage) followed by a newline.
            For example:
            "Cosine Similarity: 85.67\n"

    Example:
        >>> total_responses = [
        ...     "User: Hi, how are you?",
        ...     "Assistant: I'm good! How can I help you today?",
        ...     "User: Can you tell me a joke?",
        ...     "Assistant: Sure! Here's one: Why did the chicken join a band?"
        ... ]
        >>> result = bert_score(total_responses, bert_model_name="bert-base-uncased")
        >>> print(result)
        "Cosine Similarity: 75.89\n"
    """

    def cosine_similarity_context_response(context, response, model, tokenizer, device):
        # Tokenize context and response separately; truncate to the model's limit.
        context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
        response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
        # Move all input tensors to the same device as the model.
        context_inputs = {k: v.to(device) for k, v in context_inputs.items()}
        response_inputs = {k: v.to(device) for k, v in response_inputs.items()}

        with torch.no_grad():
            # Mean-pool the last hidden states into one embedding per text.
            context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
            response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)

        similarity = cosine_similarity(
            context_embedding.cpu().numpy(), response_embedding.cpu().numpy()
        )
        return similarity[0][0]

    # Select GPU when available; fall back to CPU instead of crashing on .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bert_model = AutoModel.from_pretrained(bert_model_name).to(device)
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    similarity = cosine_similarity_context_response(
        " ".join(total_response_arr[:-1]),
        total_response_arr[-1],
        bert_model,
        bert_tokenizer,
        device,
    )
    return f"Cosine Similarity: {similarity*100:.2f}" + "\n"
| |
|
| |
|
def DialoGPT_perplexity(
    user_utterance: str,
    response: str,
    dialog_model_name: str = "microsoft/DialoGPT-medium",
) -> str:
    """
    Compute the perplexity of a response given a user utterance using a pre-trained
    DialoGPT model. The function loads DialoGPT (medium by default)
    from the Hugging Face Model Hub, then calculates the perplexity
    for the
    (context + response) sequence.

    Args:
        user_utterance (str):
            The user utterance preceding the model's response.
        response (str):
            The generated response whose perplexity needs to be evaluated.
        dialog_model_name (str, optional):
            The name or path of the dialog model to use (from the Hugging Face
            Model Hub). Defaults to "microsoft/DialoGPT-medium".

    Returns:
        str:
            A formatted string containing the DialoGPT perplexity score. For example:
            "DialoGPT Perplexity: 25.67\n"

    Example:
        >>> user_text = "Hi, how are you today?"
        >>> system_response = "I'm good, thank you! How can I help you?"
        >>> result = DialoGPT_perplexity(user_text, system_response)
        >>> print(result)
        "DialoGPT Perplexity: 31.45\n"
    """

    def evaluate_response_with_dialoGPT(context, response, model, tokenizer, device):
        """
        Evaluate the appropriateness of a response based on the
        given context using DialoGPT.

        Args:
            context (str): The dialogue context (previous conversation).
            response (str): The generated response to evaluate.
            model: Pre-trained DialoGPT model.
            tokenizer: Corresponding tokenizer for the DialoGPT model.
            device: Torch device the model lives on.

        Returns:
            float: Perplexity score of the response given the context.
        """
        model.eval()

        # DialoGPT separates turns with the EOS token.
        input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            # Labels equal the inputs: standard LM loss over the full sequence.
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss

        # Perplexity is exp of the mean cross-entropy loss.
        perplexity = torch.exp(loss)
        return perplexity.cpu().item()

    # Select GPU when available; fall back to CPU instead of crashing on .cuda().
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForCausalLM.from_pretrained(dialog_model_name).to(device)
    tokenizer = AutoTokenizer.from_pretrained(dialog_model_name)
    perplexity = evaluate_response_with_dialoGPT(
        user_utterance, response, model, tokenizer, device
    )
    return f"DialoGPT Perplexity: {perplexity:.2f}" + "\n"
| |
|