# Custom handler for Hugging Face Inference Endpoints: reranks a list of
# documents against a query by asking a causal LM for a "yes"/"no" relevance
# judgment and returning the "yes" probability for each document.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import Dict, List, Any


class EndpointHandler:
    def __init__(self, path=""):
        # Left padding so that every sequence ends at the same position;
        # the relevance score is read from the logits of the last token.
        self.tokenizer = AutoTokenizer.from_pretrained(path, padding_side='left')
        self.model = AutoModelForCausalLM.from_pretrained(path, device_map="cpu", torch_dtype="auto").eval()

        # Token ids of the two admissible answers.
        self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
        self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")

        # Fixed chat-template prefix (system turn + start of user turn) and
        # suffix (end of user turn + empty assistant "think" block) that wrap
        # every query/document pair.
        prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
        suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
        self.prefix_tokens = self.tokenizer.encode(prefix, add_special_tokens=False)
        self.suffix_tokens = self.tokenizer.encode(suffix, add_special_tokens=False)

    def format_instruction(self, instruction, query, doc):
        # Compose the body of the user turn; fall back to the default
        # retrieval task description when no instruction is supplied.
        if instruction is None:
            instruction = 'Given a web search query, retrieve relevant passages that answer the query'
        output = "<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}".format(instruction=instruction, query=query, doc=doc)
        return output

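    # Illustration only (not executed): together with the prefix and suffix
    # encoded in __init__, the text the model ultimately sees for one pair has
    # this shape, and the score is taken from its next ("yes"/"no") token:
    #
    #   <|im_start|>system
    #   Judge whether the Document meets the requirements based on the Query and
    #   the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
    #   <|im_start|>user
    #   <Instruct>: Given a web search query, retrieve relevant passages that answer the query
    #   <Query>: ...
    #   <Document>: ...<|im_end|>
    #   <|im_start|>assistant
    #   <think>
    #
    #   </think>
    #
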
    def process_inputs(self, pairs):
        # Tokenize the pair texts without padding, leaving room for the fixed
        # prefix/suffix tokens within the overall length budget.
        max_length = 8192
        inputs = self.tokenizer(
            pairs, padding=False, truncation='longest_first',
            return_attention_mask=False, max_length=max_length - len(self.prefix_tokens) - len(self.suffix_tokens)
        )
        # Wrap every sequence with the chat-template prefix and suffix.
        for i, ele in enumerate(inputs['input_ids']):
            inputs['input_ids'][i] = self.prefix_tokens + ele + self.suffix_tokens
        # Pad to a rectangular batch and move the tensors onto the model device.
        inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=max_length)
        for key in inputs:
            inputs[key] = inputs[key].to(self.model.device)
        return inputs

    # Inference only: no gradients are needed for scoring.
    @torch.no_grad()
    def compute_logits(self, inputs, **kwargs):
        # Logits of the last position for every sequence in the batch.
        batch_scores = self.model(**inputs, cache_implementation="static").logits[:, -1, :]
        # Keep only the "yes" and "no" answer tokens and softmax over the two,
        # so each score is the probability of "yes" given a yes/no choice.
        true_vector = batch_scores[:, self.token_true_id]
        false_vector = batch_scores[:, self.token_false_id]
        batch_scores = torch.stack([false_vector, true_vector], dim=1)
        batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
        scores = batch_scores[:, 1].exp().tolist()
        return scores

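    # Equivalently, for each pair i:
    #     score_i = exp(l_yes) / (exp(l_yes) + exp(l_no))
    # where l_yes and l_no are the last-position logits of the "yes" and "no"
    # tokens, i.e. a two-way softmax restricted to the answer tokens.
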
    def __call__(self, data: Dict[str, Any]) -> Dict[str, List[float]]:
        # Default task description applied to every request.
        task = 'Given a web search query, retrieve relevant passages that answer the query'

        # Inference Endpoints deliver the payload under "inputs"; fall back to
        # the raw payload if that key is absent.
        inputs = data.pop("inputs", data)

        if 'query' not in inputs or 'documents' not in inputs:
            raise ValueError("query and documents are required.")

        # One (instruction, query, document) prompt per document, scored as a batch.
        pairs = [self.format_instruction(task, inputs['query'], doc) for doc in inputs['documents']]

        inputs = self.process_inputs(pairs)
        scores = self.compute_logits(inputs)

        return dict(scores=scores)
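

# Minimal local smoke test: a sketch only, not part of the Inference Endpoints
# request/response contract. The checkpoint path "./model" and the example
# payload below are assumptions for illustration.
if __name__ == "__main__":
    handler = EndpointHandler(path="./model")
    payload = {
        "inputs": {
            "query": "What is the capital of China?",
            "documents": [
                "The capital of China is Beijing.",
                "Gravity causes objects to fall toward the ground.",
            ],
        }
    }
    print(handler(payload))  # -> {"scores": [<relevance in [0, 1] per document>]}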