vlm_clone_2 / llm2vec /experiments /mteb_eval_custom.py
tuandunghcmut's picture
Add files using upload-large-folder tool
e9cd0c7 verified
import argparse
from typing import Any
import mteb
import json
import torch
import numpy as np
from mteb.models.instructions import task_to_instruction
from mteb.models.text_formatting_utils import corpus_to_texts
from llm2vec import LLM2Vec
def llm2vec_instruction(instruction: str) -> str:
    """Normalize a task instruction so that it ends with a colon.

    A non-empty instruction that does not already end with ``":"`` has its
    trailing periods removed and ``":"`` appended. An empty instruction, or
    one that already ends with ``":"``, is returned unchanged.

    Args:
        instruction: The instruction text to normalize.

    Returns:
        The normalized instruction.
    """
    if instruction and not instruction.endswith(":"):
        # Only trailing periods are dropped: ``str.strip(".")`` would also
        # remove leading ones (".NET questions." -> "NET questions:").
        instruction = instruction.rstrip(".") + ":"
    return instruction
class LLM2VecWrapper:
def __init__(self, model=None, task_to_instructions=None):
self.task_to_instructions = task_to_instructions
self.model = model
def encode(
self,
sentences: list[str],
*,
prompt_name: str = None,
**kwargs: Any, # noqa
) -> np.ndarray:
if prompt_name is not None:
instruction = (
self.task_to_instructions[prompt_name]
if self.task_to_instructions
and prompt_name in self.task_to_instructions
else llm2vec_instruction(task_to_instruction(prompt_name))
)
else:
instruction = ""
sentences = [[instruction, sentence] for sentence in sentences]
return self.model.encode(sentences, **kwargs)
def encode_corpus(
self,
corpus: list[dict[str, str]] | dict[str, list[str]] | list[str],
prompt_name: str = None,
**kwargs: Any,
) -> np.ndarray:
sentences = corpus_to_texts(corpus, sep=" ")
sentences = [["", sentence] for sentence in sentences]
if "request_qid" in kwargs:
kwargs.pop("request_qid")
return self.model.encode(sentences, **kwargs)
def encode_queries(self, queries: list[str], **kwargs: Any) -> np.ndarray:
return self.encode(queries, **kwargs)
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options for the evaluation script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base_model_name_or_path",
        type=str,
        default="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp",
    )
    parser.add_argument(
        "--peft_model_name_or_path",
        type=str,
        default="McGill-NLP/LLM2Vec-Meta-Llama-3-8B-Instruct-mntp-supervised",
    )
    parser.add_argument("--task_name", type=str, default="STS16")
    parser.add_argument(
        "--task_to_instructions_fp",
        type=str,
        default="test_configs/mteb/task_to_instructions.json",
    )
    parser.add_argument("--output_dir", type=str, default="results")
    return parser.parse_args()


def main() -> None:
    """Load the model, wrap it, and run the selected MTEB task."""
    args = _parse_args()

    task_to_instructions = None
    if args.task_to_instructions_fp is not None:
        # Explicit encoding so the JSON is decoded the same way regardless
        # of the platform's locale default.
        with open(args.task_to_instructions_fp, "r", encoding="utf-8") as f:
            task_to_instructions = json.load(f)

    l2v_model = LLM2Vec.from_pretrained(
        args.base_model_name_or_path,
        peft_model_name_or_path=args.peft_model_name_or_path,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
        torch_dtype=torch.bfloat16,
    )

    model = LLM2VecWrapper(model=l2v_model, task_to_instructions=task_to_instructions)
    tasks = mteb.get_tasks(tasks=[args.task_name])
    evaluation = mteb.MTEB(tasks=tasks)
    # Results are written under ``output_dir``; the return value is unused.
    evaluation.run(model, output_folder=args.output_dir)


if __name__ == "__main__":
    main()