RexReranker-0.6B-FP8

Model Summary

RexReranker-0.6B-FP8 is a state-of-the-art decoder-based generative reranker for e-commerce product discovery. Given a user query and a candidate product (title plus optional description/attributes), it outputs a relevance score derived from the model’s token-level probability of a binary judgment (“yes” vs “no”).
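
Concretely, the score is the probability of the “yes” token renormalized over the two judgment tokens. A minimal sketch of that conversion (the helper and variable names here are illustrative, not part of the model’s API):

import math

# Illustrative only: combine the final-token log-probabilities of "yes" and
# "no" into a single relevance score in [0, 1].
def relevance_score(yes_logprob, no_logprob):
    p_yes, p_no = math.exp(yes_logprob), math.exp(no_logprob)
    return p_yes / (p_yes + p_no)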

Intended Use

Primary use cases

  • Second-stage reranking for product search (high-recall retrieval → top-k rerank; see the sketch after this list).
  • Shopping/commerce assistants: selecting and ordering candidate products given natural-language constraints (size, compatibility, color, etc.).
  • Offline evaluation / benchmarking of reranking approaches for product discovery.
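
A rough sketch of the two-stage setup; `retriever` and `rerank_fn` below are hypothetical stand-ins for a first-stage retrieval system and a scoring wrapper around this model:

# Hypothetical two-stage pipeline: `retriever` and `rerank_fn` are placeholders.
def search(query, retriever, rerank_fn, k=10):
    candidates = retriever.retrieve(query, top_k=100)    # high-recall first stage
    scored = [(doc, rerank_fn(query, doc)) for doc in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)  # highest score first
    return scored[:k]                                    # keep the top-k products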

Model Details

Model type

  • Text Ranking / Reranking model
  • Decoder LM architecture (Qwen3ForCausalLM)
  • Parameters: ~0.6B

Training Data

This model is trained for e-commerce relevance reranking using the project’s released relevance dataset:

  • Amazebay-Relevance: 6.33M rows of query–product pairs (train split ~6.29M, validation ~38k)
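
If the dataset is published on the Hugging Face Hub, it can presumably be loaded with the `datasets` library; the repo id below is an assumption, substitute the actual one:

from datasets import load_dataset

# Assumed Hub repo id for Amazebay-Relevance; replace with the actual id.
ds = load_dataset("thebajajra/Amazebay-Relevance")
print(ds)              # expected splits: train (~6.29M rows), validation (~38k rows)
print(ds["train"][0])  # a single query-product relevance pair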

Evaluation

Pareto comparison: [figure]

Performance on various query types: [radar chart]

Self-reported results on ERESS (E-commerce Relevance Evaluation Scoring Suite):

  • nDCG@5: 0.925
  • nDCG@10: 0.887

How to Use

Using vLLM
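
The snippet below scores each (query, document) pair by generating a single token constrained to the “yes”/“no” judgment tokens, then converting their log-probabilities into a normalized relevance score.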

# Requires vllm>=0.8.5
import math

import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import destroy_model_parallel
from vllm.inputs.data import TokensPrompt

def format_instruction(instruction, query, doc):
    # Build the chat messages (system judgment prompt + user turn) for one pair.
    text = [
        {"role": "system", "content": "Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\"."},
        {"role": "user", "content": f"<Instruct>: {instruction}\n\n<Query>: {query}\n\n<Document>: {doc}"}
    ]
    return text

def process_inputs(pairs, instruction, max_length, suffix_tokens):
    # Apply the chat template, truncate each prompt to max_length, then append
    # the fixed assistant suffix and wrap the token ids for vLLM.
    messages = [format_instruction(instruction, query, doc) for query, doc in pairs]
    messages = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=False, enable_thinking=False
    )
    messages = [ele[:max_length] + suffix_tokens for ele in messages]
    return [TokensPrompt(prompt_token_ids=ele) for ele in messages]

def compute_logits(model, messages, sampling_params, true_token, false_token):
    # Generate one constrained token per pair and read the "yes"/"no"
    # log-probabilities from the final position.
    outputs = model.generate(messages, sampling_params, use_tqdm=False)
    scores = []
    for output in outputs:
        final_logits = output.outputs[0].logprobs[-1]
        # Fall back to a large negative logprob if a token missed the top-k.
        true_logit = final_logits[true_token].logprob if true_token in final_logits else -10
        false_logit = final_logits[false_token].logprob if false_token in final_logits else -10
        true_score = math.exp(true_logit)
        false_score = math.exp(false_logit)
        scores.append(true_score / (true_score + false_score))
    return scores

number_of_gpu = torch.cuda.device_count()
tokenizer = AutoTokenizer.from_pretrained('thebajajra/RexReranker-0.6B-FP8')
model = LLM(model='thebajajra/RexReranker-0.6B-FP8', tensor_parallel_size=number_of_gpu,
            max_model_len=10000, enable_prefix_caching=True, gpu_memory_utilization=0.8)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
max_length = 8192
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)
true_token = tokenizer("yes", add_special_tokens=False).input_ids[0]
false_token = tokenizer("no", add_special_tokens=False).input_ids[0]
# Deterministic single-token generation restricted to the two judgment tokens.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=1,
    logprobs=20,
    allowed_token_ids=[true_token, false_token],
)

task = 'Given a web search query, retrieve relevant passages that answer the query'
queries = ["visual fractions workbooks for children",
    "replacement motor mount for 2008 focus",
]
documents = [
    "Fractions and Decimals Workbook for Grades 4 to 5",
    "3pcs Set - Motor Mounts Kit Compatible with 08-11 Ford Focus 2.0L Auto Automatic and Manual Trans Transmission AT MT - Engine Mounts",
]

pairs = list(zip(queries, documents))
inputs = process_inputs(pairs, task, max_length-len(suffix_tokens), suffix_tokens)
scores = compute_logits(model, inputs, sampling_params, true_token, false_token)
print('scores', scores)

# Tear down vLLM's parallel state when finished.
destroy_model_parallel()

Using HF Transformers
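
The Transformers variant computes the same score directly from the final-position logits: it keeps only the “no” and “yes” logits, applies a two-way softmax, and reports the probability of “yes”.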

# Requires transformers>=4.51.0
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def format_instruction(instruction, query, doc):
    # Compose the instruction/query/document block; fall back to a default instruction.
    if instruction is None:
        instruction = 'Given a web search query, retrieve relevant passages that answer the query'
    return f"<Instruct>: {instruction}\n<Query>: {query}\n<Document>: {doc}"

def process_inputs(pairs):
    # Tokenize without padding, leaving room for the fixed system prefix and
    # assistant suffix, then splice those in and pad the batch to tensors.
    inputs = tokenizer(
        pairs, padding=False, truncation='longest_first',
        return_attention_mask=False, max_length=max_length - len(prefix_tokens) - len(suffix_tokens)
    )
    for i, ele in enumerate(inputs['input_ids']):
        inputs['input_ids'][i] = prefix_tokens + ele + suffix_tokens
    inputs = tokenizer.pad(inputs, padding=True, return_tensors="pt", max_length=max_length)
    for key in inputs:
        inputs[key] = inputs[key].to(model.device)
    return inputs

@torch.no_grad()
def compute_logits(inputs, **kwargs):
    # Take the logits at the final position, keep the "no"/"yes" entries,
    # softmax over the pair, and report the probability of "yes".
    batch_scores = model(**inputs).logits[:, -1, :]
    true_vector = batch_scores[:, token_true_id]
    false_vector = batch_scores[:, token_false_id]
    batch_scores = torch.stack([false_vector, true_vector], dim=1)
    batch_scores = torch.nn.functional.log_softmax(batch_scores, dim=1)
    return batch_scores[:, 1].exp().tolist()

tokenizer = AutoTokenizer.from_pretrained("thebajajra/RexReranker-0.6B-FP8", padding_side='left')
model = AutoModelForCausalLM.from_pretrained("thebajajra/RexReranker-0.6B-FP8").eval()
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# model = AutoModelForCausalLM.from_pretrained("thebajajra/RexReranker-0.6B-FP8", torch_dtype=torch.float16, attn_implementation="flash_attention_2").cuda().eval()
token_false_id = tokenizer.convert_tokens_to_ids("no")
token_true_id = tokenizer.convert_tokens_to_ids("yes")
max_length = 8192

prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
prefix_tokens = tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = tokenizer.encode(suffix, add_special_tokens=False)

task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = ["visual fractions workbooks for children",
    "replacement motor mount for 2008 focus",
]
documents = [
    "Fractions and Decimals Workbook for Grades 4 to 5",
    "3pcs Set - Motor Mounts Kit Compatible with 08-11 Ford Focus 2.0L Auto Automatic and Manual Trans Transmission AT MT - Engine Mounts",
]

pairs = [format_instruction(task, query, doc) for query, doc in zip(queries, documents)]

# Tokenize the input texts
inputs = process_inputs(pairs)
scores = compute_logits(inputs)

print("scores: ", scores)
