|
|
--- |
|
|
library_name: transformers |
|
|
license: mit |
|
|
datasets: |
|
|
- Zhengping/UNLI |
|
|
- Zhengping/UNLI-style-synthetic |
|
|
language: |
|
|
- en |
|
|
metrics: |
|
|
- pearsonr |
|
|
- spearmanr |
|
|
- accuracy |
|
|
base_model: |
|
|
- Qwen/Qwen2.5-14B-Instruct |
|
|
--- |
|
|
|
|
|
# Model Card for Zhengping/conditional-probability-regression
|
|
|
|
|
A decoding-based regression model for fine-grained conditional probability estimation: given a premise and a hypothesis, it predicts the probability that the hypothesis is true given the premise. Fine-tuned from Qwen/Qwen2.5-14B-Instruct on UNLI and UNLI-style synthetic data.
|
|
|
|
|
|
|
|
|
|
|
## Model Details |
|
|
|
|
|
### Model Description |
|
|
|
|
|
This model estimates fine-grained conditional probabilities for natural-language premise and hypothesis pairs. Probability estimation is framed as decoding over discrete label-level tokens (` <|label_level_0|>` through ` <|label_level_K|>`) added to the vocabulary; taking the expectation over these levels under the model's next-token distribution yields a scalar probability score.
|
|
|
|
|
- **Developed by:** Liaoyaqi Wang, Zhengping Jiang, Anqi Liu, Benjamin Van Durme |
|
|
- **Model type:** Decoding-based Regression Model (Classification) |
|
|
- **Language(s) (NLP):** `en` |
|
|
- **License:** MIT
|
|
- **Finetuned from model:** Qwen/Qwen2.5-14B-Instruct
|
|
|
|
|
### Model Sources
|
|
|
|
|
|
|
|
|
|
- **Repository:** [Decoding-based Regression](https://github.com/zipJiang/decoding-based-regression.git) |
|
|
- **Paper:** [Always Tell Me The Odds: Fine-grained Conditional Probability Estimation](https://arxiv.org/pdf/2505.01595)
|
|
|
|
|
## Uses |
|
|
|
|
|
|
|
|
|
|
### Direct Use |
|
|
|
|
|
The snippet below registers a custom `level-to-score` pipeline that converts the model's logits over the label-level tokens into a scalar probability score.
|
|
|
|
|
```python |
|
|
import re

import torch
import transformers
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import (
    pipeline,
    TextGenerationPipeline,
    AutoModelForCausalLM,
    PreTrainedTokenizer,
)
from transformers.pipelines.text_generation import Chat, ReturnType
from typing import (
    Any,
    Callable,
    Dict,
    List,
    Text,
    Tuple,
)
|
|
|
|
|
|
|
|
class LevelToScorePipeline(TextGenerationPipeline): |
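    """A `TextGenerationPipeline` that additionally converts the logits over the
    label-level tokens into a scalar score via a user-supplied `level_to_score_func`."""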
|
|
|
|
|
def __init__( |
|
|
self, |
|
|
level_to_score_func: Callable[[Tuple[torch.FloatTensor], PreTrainedTokenizer], Tuple[List[float], List[List[float]]]], |
|
|
*args, |
|
|
**kwargs |
|
|
): |
|
|
super().__init__(*args, **kwargs) |
|
|
self._level_to_score_func = level_to_score_func |
|
|
|
|
|
def preprocess( |
|
|
self, |
|
|
prompt_text, |
|
|
prefix="", |
|
|
handle_long_generation=None, |
|
|
add_special_tokens=None, |
|
|
truncation=None, |
|
|
padding=None, |
|
|
max_length=None, |
|
|
continue_final_message=None, |
|
|
**generate_kwargs, |
|
|
): |
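        # Largely mirrors TextGenerationPipeline.preprocess from transformers.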
|
|
# Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults |
|
|
tokenizer_kwargs = { |
|
|
"add_special_tokens": add_special_tokens, |
|
|
"truncation": truncation, |
|
|
"padding": padding, |
|
|
"max_length": max_length, |
|
|
} |
|
|
tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None} |
|
|
|
|
|
if isinstance(prompt_text, Chat): |
|
|
tokenizer_kwargs.pop("add_special_tokens", None) # ignore add_special_tokens on chats |
|
|
# If the user passes a chat that ends in an assistant message, we treat it as a prefill by default |
|
|
# because very few models support multiple separate, consecutive assistant messages |
|
|
if continue_final_message is None: |
|
|
continue_final_message = prompt_text.messages[-1]["role"] == "assistant" |
|
|
inputs = self.tokenizer.apply_chat_template( |
|
|
prompt_text.messages, |
|
|
add_generation_prompt=not continue_final_message, |
|
|
continue_final_message=continue_final_message, |
|
|
return_dict=True, |
|
|
return_tensors=self.framework, |
|
|
**tokenizer_kwargs, |
|
|
) |
|
|
else: |
|
|
inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs) |
|
|
|
|
|
inputs["prompt_text"] = prompt_text |
|
|
|
|
|
if handle_long_generation == "hole": |
|
|
cur_len = inputs["input_ids"].shape[-1] |
|
|
if "max_new_tokens" in generate_kwargs: |
|
|
new_tokens = generate_kwargs["max_new_tokens"] |
|
|
else: |
|
|
new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len |
|
|
if new_tokens < 0: |
|
|
raise ValueError("We cannot infer how many new tokens are expected") |
|
|
if cur_len + new_tokens > self.tokenizer.model_max_length: |
|
|
keep_length = self.tokenizer.model_max_length - new_tokens |
|
|
if keep_length <= 0: |
|
|
raise ValueError( |
|
|
"We cannot use `hole` to handle this generation the number of desired tokens exceeds the" |
|
|
" models max length" |
|
|
) |
|
|
|
|
|
inputs["input_ids"] = inputs["input_ids"][:, -keep_length:] |
|
|
if "attention_mask" in inputs: |
|
|
inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:] |
|
|
|
|
|
return inputs |
|
|
|
|
|
def _forward(self, model_inputs, **generate_kwargs): |
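        # Mirrors TextGenerationPipeline._forward, but additionally keeps the
        # per-step generation scores (logits) so they can be converted to a score.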
|
|
input_ids = model_inputs["input_ids"] |
|
|
attention_mask = model_inputs.get("attention_mask", None) |
|
|
# Allow empty prompts |
|
|
if input_ids.shape[1] == 0: |
|
|
input_ids = None |
|
|
attention_mask = None |
|
|
in_b = 1 |
|
|
else: |
|
|
in_b = input_ids.shape[0] |
|
|
prompt_text = model_inputs.pop("prompt_text") |
|
|
|
|
|
# If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying |
|
|
# generate_kwargs, as some of the parameterization may come from the initialization of the pipeline. |
|
|
prefix_length = generate_kwargs.pop("prefix_length", 0) |
|
|
if prefix_length > 0: |
|
|
has_max_new_tokens = "max_new_tokens" in generate_kwargs or ( |
|
|
"generation_config" in generate_kwargs |
|
|
and generate_kwargs["generation_config"].max_new_tokens is not None |
|
|
) |
|
|
if not has_max_new_tokens: |
|
|
generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length |
|
|
generate_kwargs["max_length"] += prefix_length |
|
|
has_min_new_tokens = "min_new_tokens" in generate_kwargs or ( |
|
|
"generation_config" in generate_kwargs |
|
|
and generate_kwargs["generation_config"].min_new_tokens is not None |
|
|
) |
|
|
if not has_min_new_tokens and "min_length" in generate_kwargs: |
|
|
generate_kwargs["min_length"] += prefix_length |
|
|
|
|
|
        # A user-defined `generation_config` passed to the pipeline call takes precedence
|
|
if "generation_config" not in generate_kwargs: |
|
|
generate_kwargs["generation_config"] = self.generation_config |
|
|
|
|
|
generate_kwargs["output_scores"] = not generate_kwargs.get("do_sample", False) |
|
|
generate_kwargs["return_dict_in_generate"] = True |
|
|
|
|
|
generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) |
|
|
|
|
|
logits = None |
|
|
|
|
|
# TODO: check good default |
|
|
if generate_kwargs.get("return_scores", True): |
|
|
            assert not generate_kwargs.get("do_sample", False), "return_scores=True is only supported for do_sample=False"
|
|
|
|
|
            # Keep the per-step scores so postprocess can convert them to a score average.
            # `generated_sequence.scores` is a tuple with one [batch_size, vocab_size]
            # tensor of logits per generated token.
            logits = generated_sequence.scores
|
|
|
|
|
out_b = generated_sequence.sequences.shape[0] |
|
|
if self.framework == "pt": |
|
|
generated_sequence = generated_sequence.sequences.reshape(in_b, out_b // in_b, *generated_sequence.sequences.shape[1:]) |
|
|
# elif self.framework == "tf": |
|
|
# generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])) |
|
|
return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text, "logits": logits} |
|
|
|
|
|
def postprocess( |
|
|
self, |
|
|
model_outputs, |
|
|
return_type=ReturnType.FULL_TEXT, |
|
|
clean_up_tokenization_spaces=True, |
|
|
continue_final_message=None, |
|
|
): |
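        # Decode the generated sequences and attach the level-to-score outputs.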
|
|
generated_sequence = model_outputs["generated_sequence"][0] |
|
|
input_ids = model_outputs["input_ids"] |
|
|
prompt_text = model_outputs["prompt_text"] |
|
|
logits = model_outputs["logits"] |
|
|
|
|
|
        # TODO: this makes assumptions about how the logits are ordered;
        # those assumptions should be made explicit.
|
|
scores, selective_logits = self._level_to_score_func(logits, self.tokenizer) |
|
|
|
|
|
generated_sequence = generated_sequence.numpy().tolist() |
|
|
records = [] |
|
|
for sequence in generated_sequence: |
|
|
if return_type == ReturnType.TENSORS: |
|
|
record = {"generated_token_ids": sequence} |
|
|
elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}: |
|
|
# Decode text |
|
|
text = self.tokenizer.decode( |
|
|
sequence, |
|
|
skip_special_tokens=True, |
|
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces, |
|
|
) |
|
|
|
|
|
# Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used |
|
|
if input_ids is None: |
|
|
prompt_length = 0 |
|
|
else: |
|
|
prompt_length = len( |
|
|
self.tokenizer.decode( |
|
|
input_ids[0], |
|
|
skip_special_tokens=True, |
|
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces, |
|
|
) |
|
|
) |
|
|
|
|
|
all_text = text[prompt_length:] |
|
|
if return_type == ReturnType.FULL_TEXT: |
|
|
if isinstance(prompt_text, str): |
|
|
all_text = prompt_text + all_text |
|
|
elif isinstance(prompt_text, Chat): |
|
|
if continue_final_message is None: |
|
|
# If the user passes a chat ending in an assistant message, we treat it as a prefill by |
|
|
# default because very few models support multiple separate, consecutive assistant messages |
|
|
continue_final_message = prompt_text.messages[-1]["role"] == "assistant" |
|
|
if continue_final_message: |
|
|
# With assistant prefill, concat onto the end of the last message |
|
|
all_text = list(prompt_text.messages)[:-1] + [ |
|
|
{ |
|
|
"role": prompt_text.messages[-1]["role"], |
|
|
"content": prompt_text.messages[-1]["content"] + all_text, |
|
|
} |
|
|
] |
|
|
else: |
|
|
# When we're not starting from a prefill, the output is a new assistant message |
|
|
all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}] |
|
|
record = { |
|
|
"generated_text": all_text, |
|
|
"score": scores[0], |
|
|
"selective_logits": selective_logits[0] |
|
|
} |
|
|
records.append(record) |
|
|
|
|
|
return records |
|
|
|
|
|
|
|
|
class SingleLabelRankDict: |
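    """Maps the ` <|label_level_i|>` tokens in a tokenizer's vocabulary to scalar values."""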
|
|
def __init__( |
|
|
self, |
|
|
rank_dict: Dict[Text, Any] |
|
|
): |
|
|
self._rank_dict = rank_dict |
|
|
|
|
|
def __len__(self) -> int: |
|
|
return len(self._rank_dict) |
|
|
|
|
|
def get_rank_dict(self, tokenizer: PreTrainedTokenizer) -> Dict[int, Any]: |
|
|
return {tokenizer.convert_tokens_to_ids([token])[0]: value for token, value in self._rank_dict.items()} |
|
|
|
|
|
def to_tokenizer(self, tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer: |
|
|
"""Augment tokenizer vocab with `rank_dict` IN-PLACE. |
|
|
""" |
|
|
vocabs: List[Text] = self._rank_dict.keys() |
|
|
new_vocab = [vocab for vocab in vocabs if vocab not in tokenizer.get_vocab()] |
|
|
tokenizer.add_tokens(new_vocab) |
|
|
return tokenizer |
|
|
    @classmethod
    def from_tokenizer(cls, tokenizer: PreTrainedTokenizer) -> "SingleLabelRankDict":
|
|
vocab = tokenizer.get_vocab() |
|
|
rank_dict = {} |
|
|
pattern = re.compile(r" <\|label_level_(\d+)\|>") |
|
|
|
|
|
for token in vocab.keys(): |
|
|
match = pattern.match(token) |
|
|
if match: |
|
|
value = int(match.group(1)) |
|
|
|
|
rank_dict[token] = value |
|
|
|
|
|
        # Normalize: map level i to the midpoint of its probability bin, (i + 0.5) / num_levels.
|
|
num_levels = max(rank_dict.values()) + 1 |
|
|
for token in rank_dict.keys(): |
|
|
rank_dict[token] = 1. / num_levels * (rank_dict[token] + 0.5) |
|
|
|
|
|
return cls(rank_dict=rank_dict) |
|
|
|
|
|
|
|
|
model = transformers.AutoModelForCausalLM.from_pretrained( |
|
|
"Zhengping/conditional-probability-regression", |
|
|
torch_dtype="auto", |
|
|
    # Requires the `flash-attn` package; remove this argument to use the default attention.
    attn_implementation="flash_attention_2",
|
|
) |
|
|
tokenizer = transformers.AutoTokenizer.from_pretrained( |
|
|
"Zhengping/conditional-probability-regression", |
|
|
) |
|
|
|
|
|
rank_dict = SingleLabelRankDict.from_tokenizer(tokenizer) |
|
|
|
|
|
PIPELINE_REGISTRY.register_pipeline( |
|
|
"level-to-score", |
|
|
pipeline_class=LevelToScorePipeline, |
|
|
pt_model=AutoModelForCausalLM |
|
|
) |
|
|
|
|
|
# Greedy decoding alone yields a coarse (argmax-level) score; the default function
# below instead returns the expectation over all label levels for a fine-grained
# score. You can also attach your own level-to-score function to the pipeline,
# e.g. a UNLI-style label transformation that pushes scores toward 0/1 (an
# illustrative sketch follows the default implementation).
|
|
def _level_to_score_func( |
|
|
logits: Tuple[torch.FloatTensor], |
|
|
tokenizer: PreTrainedTokenizer |
|
|
) -> Tuple[List[float], List[List[float]]]:
    """Convert first-step logits over the label-level tokens into expected-value scores."""
    logits = logits[0]  # logits for the first generated token: [batch_size, vocab_size]
|
|
num_labels = len(rank_dict) |
|
|
considering_ids = tokenizer.convert_tokens_to_ids([f" <|label_level_{i}|>" for i in range(num_labels)]) |
|
|
selective_logits = torch.index_select(logits, 1, torch.tensor(considering_ids, device=logits.device)) |
|
|
step_size = 1 / num_labels |
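    # Each level i is mapped to the midpoint of its probability bin, (i + 0.5) / num_labels.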
|
|
expectation = torch.tensor([[i * step_size + 1 / 2 * step_size for i in range(num_labels)]], device=selective_logits.device) |
|
|
scores = torch.softmax(selective_logits, dim=-1) @ expectation.T |
|
|
scores = scores.squeeze(-1).tolist() |
|
|
return scores, selective_logits.tolist() |
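
# A hedged illustration of a custom level-to-score function (not the calibrated
# transformation from the paper): push the expected score toward 0/1 with a
# logistic transform; the steepness k=8.0 is an arbitrary example value.
def _binarized_level_to_score_func(
    logits: Tuple[torch.FloatTensor],
    tokenizer: PreTrainedTokenizer,
    k: float = 8.0,
) -> Tuple[List[float], List[List[float]]]:
    scores, selective_logits = _level_to_score_func(logits, tokenizer)
    scores = torch.sigmoid(k * (torch.tensor(scores) - 0.5)).tolist()
    return scores, selective_logits
# Pass `level_to_score_func=_binarized_level_to_score_func` to `pipeline(...)` to use it.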
|
|
|
|
|
pipe = pipeline( |
|
|
"level-to-score", |
|
|
model=model, |
|
|
max_new_tokens=2, |
|
|
tokenizer=tokenizer, |
|
|
device=0, |
|
|
level_to_score_func=_level_to_score_func, |
|
|
torch_dtype=torch.bfloat16, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
premise = "Sam is sleeping." |
|
|
hypothesis = "Sam is awake." |
|
|
|
|
|
inputs = [ |
|
|
{ |
|
|
"role": "user", |
|
|
"content": "### Question: Given the premise \"{premise}\", how likely is it that the hypothesis \"{hypothesis}\" is true?\n\n".format( |
|
|
premise=premise, |
|
|
hypothesis=hypothesis |
|
|
) |
|
|
}, |
|
|
{ |
|
|
"role": "assitant", |
|
|
"content": "### Answer:" |
|
|
} |
|
|
] |
|
|
|
|
|
result = pipe(inputs) |
|
|
print(result) |
|
|
``` |
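Each returned record contains the decoded `generated_text`, a scalar `score` in [0, 1] (the expectation over label levels, i.e. the estimated conditional probability that the hypothesis holds given the premise), and the `selective_logits` over the label-level tokens.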
|
|
|
|
|
|
|
|
## Use with vLLM |
|
|
|
|
|
`TODO` |
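
Until official instructions are added, below is an untested sketch of one way to score with vLLM, assuming the offline `LLM`/`SamplingParams` API with `max_logprobs`, a recent `transformers` with `continue_final_message`, and the same ` <|label_level_i|>` tokens as above: apply the chat template offline, generate a single token with `logprobs`, and compute the same expectation over label levels. Label tokens outside the returned top logprobs are skipped.

```python
# Untested sketch, not an official recipe: single-token scoring with vLLM.
import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

model_id = "Zhengping/conditional-probability-regression"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Recover the label-level tokens from the vocabulary, as in the pipeline example.
num_labels = sum(1 for t in tokenizer.get_vocab() if t.startswith(" <|label_level_"))
label_ids = tokenizer.convert_tokens_to_ids([f" <|label_level_{i}|>" for i in range(num_labels)])

llm = LLM(model=model_id, max_logprobs=2 * num_labels)

messages = [
    {
        "role": "user",
        "content": "### Question: Given the premise \"Sam is sleeping.\", how likely is it that the hypothesis \"Sam is awake.\" is true?\n\n",
    },
    {"role": "assistant", "content": "### Answer:"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, continue_final_message=True)

params = SamplingParams(max_tokens=1, temperature=0.0, logprobs=2 * num_labels)
step = llm.generate([prompt], params)[0].outputs[0].logprobs[0]  # token_id -> Logprob

# Label tokens missing from the returned top logprobs get -inf (i.e. are skipped).
logits = torch.tensor([step[i].logprob if i in step else float("-inf") for i in label_ids])
levels = torch.tensor([(i + 0.5) / num_labels for i in range(num_labels)])
score = (torch.softmax(logits, dim=-1) * levels).sum().item()
print(score)
```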
|
|
|
|
|
|
|
|
## Summary
|
|
|
|
|
LLM-based fine-grained conditional probability estimation: given a premise and a hypothesis, the model outputs a probability in [0, 1] rather than a discrete NLI label.
|
|
|
|
|
## Citation
|
|
|
|
|
```bibtex |
|
|
@article{wang2025always, |
|
|
title={Always Tell Me The Odds: Fine-grained Conditional Probability Estimation}, |
|
|
author={Wang, Liaoyaqi and Jiang, Zhengping and Liu, Anqi and Van Durme, Benjamin}, |
|
|
journal={arXiv preprint arXiv:2505.01595}, |
|
|
year={2025} |
|
|
} |
|
|
``` |
|
|
|
|
|