# YAML Metadata Warning: empty or missing yaml metadata in repo card
# Check out the documentation for more information.
import json
import matplotlib.pyplot as plt
import numpy as np
import torch
from ranx import evaluate
from tqdm.auto import tqdm
from rm_model import humanPreferenceModel
# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create a list of model configurations.
# Each entry: "name" (label for printouts/plot filenames), "config" (HF base
# model config passed to humanPreferenceModel), "path" (checkpoint to load).
model_configs = [
    {
        "name": 'rm_byt5_base',
        "config": "google/byt5-base",
        "path": 'voidful/rm_byt5_base',
    }
]

# JSON-Lines evaluation file; one example per line.
eval_dataset = "test_rm.jsonl"
# one data example: {"question": "Screenshot Software recommendation - free, Windows XP/7", "answers": ["My favourite: FSCapture 5.3 (last free version)\nPortable, lightweight, free.\n\n", "Use Irfan View, is is faster than XnView and allows to set up a capture hotkey, or start capturing with a delay (possible via hotkey too).\n", "I know you are looking for a free solution; this is more of an FYI, in case you have Microsoft OneNote...\nYou can press Win - S to take a screenshot that is pasted inside the OneNote program...Then right-click the image (while it is selected), and click \"Save As\". You can then save the image anywhere you like...\n"], "accepted_answer": ["Windows 7 comes with the snipping tool, which can be activated via hotkey with a little tweaking.\nSome nifty third party tools include Cropper:\n\nGreenshot:\n\nand of course, Gadwin.\n"]}

# Tokenizer truncation length for "question: ... answer: ..." prompts.
maxlen = 512
# NOTE(review): batch_size is defined but not used anywhere in this chunk.
batch_size = 3
def rank_answers(model, question, answers):
    """Score every candidate answer for *question* with the reward model.

    Each answer is formatted as a "question: ... answer: ..." prompt,
    batch-tokenized, and scored in a single forward pass.

    Returns:
        A list of ``(answer, score)`` pairs in the original answer order,
        with scores moved to the CPU.
    """
    model.eval()
    prompts = [f"question: {question} answer: {answer}" for answer in answers]
    with torch.inference_mode():
        encoded = model.tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=maxlen,
        ).to(device)
        # T5-family models need right-shifted decoder inputs for a forward pass.
        shifted_ids = model.transformer_model._shift_right(encoded["input_ids"])
        scores = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            decoder_input_ids=shifted_ids,
        ).cpu()
    return list(zip(answers, scores))
def create_test_data(path=None):
    """Load evaluation examples from a JSON-Lines file.

    Args:
        path: Optional file to read. Defaults to the module-level
            ``eval_dataset`` so existing callers are unaffected.

    Returns:
        A list of dicts, one parsed JSON object per line of the file.
    """
    if path is None:
        path = eval_dataset
    with open(path, "r", encoding="utf8") as f:
        return [json.loads(line) for line in f]
def create_qrels_and_run(test_data, model):
    """Build relevance judgments (qrels) and model rankings (run) for ranx.

    For each example the candidate pool is the distractor answers plus the
    accepted answer; the accepted answer gets relevance 1, all others 0.
    Also collects the raw model scores split by group for later plotting.

    Returns:
        ``(qrels, run, selected_scores, nonselected_scores)``.
    """
    qrels, run = {}, {}
    selected_scores, nonselected_scores = [], []
    for query_id, example in enumerate(tqdm(test_data)):
        question = example["question"]
        correct_answer = example["accepted_answer"][0]
        candidates = example["answers"] + example["accepted_answer"]
        ranked = rank_answers(model, question, candidates)
        qrels[query_id] = {
            idx: int(ans == correct_answer) for idx, ans in enumerate(candidates)
        }
        run[query_id] = {idx: score for idx, (_, score) in enumerate(ranked)}
        for ans, score in ranked:
            bucket = selected_scores if ans == correct_answer else nonselected_scores
            bucket.append(score.cpu().detach().numpy())
    return qrels, run, selected_scores, nonselected_scores
# Wrap your current code inside a function
def evaluate_model(model_config, model_name, model_path):
    """Evaluate one reward model on the test set and plot its score histogram.

    Args:
        model_config: Base-model config string passed to humanPreferenceModel.
        model_name: Label used in printouts and the saved plot filename.
        model_path: Optional checkpoint path/ID; skipped when falsy.

    Returns:
        ``(results, selected_scores, nonselected_scores)`` where ``results``
        is the ranx metrics dict and the score lists hold raw per-answer
        model scores.
    """
    model = humanPreferenceModel(model_config)
    if model_path:
        # Fix: map to the evaluation device rather than hard-coding 'cuda:0',
        # so loading also works on CPU-only hosts.
        model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    test_data = create_test_data()
    qrels, run, selected_scores, nonselected_scores = create_qrels_and_run(test_data, model)

    # Mean score per group; guard BOTH groups against emptiness (the original
    # only guarded the selected group) to avoid a NaN / RuntimeWarning.
    mean_selected_score = np.mean(selected_scores) if len(selected_scores) > 0 else 0
    mean_nonselected_score = np.mean(nonselected_scores) if len(nonselected_scores) > 0 else 0
    print(f"Mean score for selected answers: {mean_selected_score:.4f}")
    print(f"Mean score for non-selected answers: {mean_nonselected_score:.4f}")
    print("Selected scores:", len(selected_scores), selected_scores[:5])
    print("Non-selected scores:", len(nonselected_scores), nonselected_scores[:5])

    # Ranking-quality metrics computed by ranx from the qrels/run dicts.
    metrics_to_compute = ["hits@5", "hit_rate@5", "precision@5", "recall@5", "f1@5",
                          "r-precision", "bpref", "rbp.95",
                          "mrr@5", "map@5", "ndcg@5", "ndcg_burges@5"]
    results = evaluate(qrels, run, metrics_to_compute)
    print(results)
    results_perc = {metric: result * 100 for metric, result in results.items()}

    # Flatten numpy scalars to plain floats for matplotlib.
    selected_flat = [score.item() for score in selected_scores]
    nonselected_flat = [score.item() for score in nonselected_scores]
    _plot_score_distribution(model_name, selected_flat, nonselected_flat, results_perc)

    return results, selected_scores, nonselected_scores


def _plot_score_distribution(model_name, selected_flat, nonselected_flat, results_perc):
    """Histogram both score groups, annotate group means and metric values, and save the figure."""
    plt.hist(nonselected_flat, bins=100, alpha=0.3, label='Non-selected answers')
    plt.hist(selected_flat, bins=100, alpha=0.3, label='Selected answers')
    colors = {'selected': 'peru', 'non-selected': 'steelblue'}
    linestyles = ['dashed', 'dashed', 'dotted', 'dotted', 'dotted']
    # Extensible map of statistics to draw as vertical reference lines.
    statistics = {'mean': np.mean}
    for idx, (stat_name, stat_func) in enumerate(statistics.items()):
        for group_idx, group in enumerate(['non-selected', 'selected']):
            scores = selected_flat if group == 'selected' else nonselected_flat
            stat_value = stat_func(scores)
            plt.axvline(stat_value, color=colors[group], linestyle=linestyles[idx], linewidth=1)
            # Stagger the labels vertically so they do not overlap.
            y_pos = plt.ylim()[1] * (0.9 - (idx * 2 + group_idx) * 0.05)
            x_offset = plt.xlim()[1] * 0.01
            plt.text(stat_value + x_offset, y_pos, f"{stat_name}: {stat_value:.2f}",
                     color=colors[group], ha='left', fontsize=8)
    plt.legend(loc='best', bbox_to_anchor=(1, 1))
    legend = plt.gca().get_legend()
    # Print the metric table to the right of the plot area.
    result_str = '\n'.join(f"{metric}: {result:.2f}%" for metric, result in results_perc.items())
    plt.text(plt.xlim()[1] * 1.05, plt.ylim()[0] + (plt.ylim()[1] - plt.ylim()[0]) * 0.05,
             result_str, fontsize=8)
    plt.subplots_adjust(right=0.8)
    legend.set_bbox_to_anchor((1, 1))
    plt.title('Score distribution for selected and non-selected answers')
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.savefig(f'score_distribution_answers_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()
    plt.close()
# Run the evaluation for every configured model and report its metrics.
for model_cfg in model_configs:
    results, selected_scores, nonselected_scores = evaluate_model(
        model_cfg['config'], model_cfg['name'], model_cfg['path']
    )
    print(f"Results for {model_cfg['name']}: {results}")
# Inference Providers NEW
# This model isn't deployed by any Inference Provider. 🙋 Ask for provider support