|
|
import argparse |
|
|
from typing import Any |
|
|
import mteb |
|
|
import json |
|
|
import torch |
|
|
|
|
|
import numpy as np |
|
|
from mteb.models.instructions import task_to_instruction |
|
|
from mteb.models.text_formatting_utils import corpus_to_texts |
|
|
|
|
|
from llm2vec import LLM2Vec |
|
|
|
|
|
def llm2vec_instruction(instruction):
    """Normalize an mteb task instruction for LLM2Vec.

    LLM2Vec expects instructions to end with a colon rather than a period:
    any trailing/leading periods are stripped and a ":" is appended. Empty
    instructions and instructions already ending in ":" pass through as-is.
    """
    if instruction and not instruction.endswith(":"):
        return instruction.strip(".") + ":"
    return instruction
|
|
|
|
|
|
|
|
class LLM2VecWrapper: |
|
|
def __init__(self, model=None, task_to_instructions=None): |
|
|
|
|
|
self.task_to_instructions = task_to_instructions |
|
|
self.model = model |
|
|
|
|
|
def encode( |
|
|
self, |
|
|
sentences: list[str], |
|
|
*, |
|
|
prompt_name: str = None, |
|
|
**kwargs: Any, |
|
|
) -> np.ndarray: |
|
|
if prompt_name is not None: |
|
|
instruction = ( |
|
|
self.task_to_instructions[prompt_name] |
|
|
if self.task_to_instructions |
|
|
and prompt_name in self.task_to_instructions |
|
|
else llm2vec_instruction(task_to_instruction(prompt_name)) |
|
|
) |
|
|
else: |
|
|
instruction = "" |
|
|
|
|
|
sentences = [[instruction, sentence] for sentence in sentences] |
|
|
return self.model.encode(sentences, **kwargs) |
|
|
|
|
|
def encode_corpus( |
|
|
self, |
|
|
corpus: list[dict[str, str]] | dict[str, list[str]] | list[str], |
|
|
prompt_name: str = None, |
|
|
**kwargs: Any, |
|
|
) -> np.ndarray: |
|
|
sentences = corpus_to_texts(corpus, sep=" ") |
|
|
sentences = [["", sentence] for sentence in sentences] |
|
|
if "request_qid" in kwargs: |
|
|
kwargs.pop("request_qid") |
|
|
return self.model.encode(sentences, **kwargs) |
|
|
|
|
|
def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray: |
|
|
return self.encode(queries, **kwargs) |
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base_model_name_or_path",
        type=str,
        default="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
    )
    parser.add_argument(
        "--peft_model_name_or_path",
        type=str,
        default="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
    )
    parser.add_argument("--task_name", type=str, default="STS16")
    parser.add_argument(
        "--task_to_instructions_fp",
        type=str,
        default="test_configs/mteb/task_to_instructions.json",
    )
    parser.add_argument("--output_dir", type=str, default="results")

    args = parser.parse_args()

    # Optional task -> instruction overrides; JSON is UTF-8 by specification,
    # so decode explicitly instead of relying on the platform default.
    task_to_instructions = None
    if args.task_to_instructions_fp is not None:
        with open(args.task_to_instructions_fp, "r", encoding="utf-8") as f:
            task_to_instructions = json.load(f)

    # Load the base model with the (supervised) PEFT adapter merged on top.
    l2v_model = LLM2Vec.from_pretrained(
        args.base_model_name_or_path,
        peft_model_name_or_path=args.peft_model_name_or_path,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
        torch_dtype=torch.bfloat16,
    )

    model = LLM2VecWrapper(model=l2v_model, task_to_instructions=task_to_instructions)
    tasks = mteb.get_tasks(tasks=[args.task_name])
    evaluation = mteb.MTEB(tasks=tasks)
    results = evaluation.run(model, output_folder=args.output_dir)
|
|
|