File size: 3,160 Bytes
ee657a1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | from .pairmatch import pairmatch_baseline
from typing import List, Tuple, Dict, Any, Optional
from tqdm import tqdm
import numpy as np
# Reference:
# Judge result -- {0: 'Tie', 1: 'A is better than B', 2: 'A is no better than B'}
# Insight:
# Deterministic sorting fails due to the stochasticity of the LLM and the inconsistency of its evaluation results
# Naive Sort O(N^2): Obtains the top ranked conversation index
# Note that our pairwise comparison is permutation invariant, so the naive sort essentially ensembles over the stochasticity of the LLM
def naive_sort_sthocastic(conversations: List[str],
                          sub_objectives: List[str]) -> List[float]:
    """
    Score every conversation by exhaustive O(N^2) pairwise comparison.

    Every ordered pair (i, j) with i != j is passed to ``pairmatch_baseline``,
    so each unordered pair is judged twice, once in each order. A verdict of
    1 credits conversation i, a verdict of 2 credits conversation j, and any
    other verdict (e.g. 0) credits neither side.

    Per the original author's note, the judge is meant to be permutation
    invariant, so the two orderings of a pair should differ only through the
    randomness of the LLM -- this is not verifiable from this file.

    Args:
        conversations: Conversation histories to rank.
        sub_objectives: Objectives forwarded unchanged to the judge.

    Returns:
        One score per conversation, in input order. Scores are win counts
        divided by the total number of wins, so they sum to 1. If nobody
        scores a win (all ties, or fewer than two conversations) the mass is
        spread uniformly; an empty input gives an empty list.
    """
    n = len(conversations)
    wins = [0] * n
    for i, conversation_A in enumerate(conversations):
        for j, conversation_B in enumerate(conversations):
            if i == j:
                # Judging a conversation against itself cannot change the
                # ranking and would only cost an extra judge call.
                continue
            conversation_history_pair = (conversation_A, conversation_B)
            # NOTE(review): this unpacks a (judge, info) pair, while
            # stochastic_bubble_sort in this file treats the return value of
            # pairmatch_baseline as a single dict -- confirm which is current.
            judge, _info = pairmatch_baseline(conversation_history_pair, sub_objectives)
            if judge == 1:
                wins[i] += 1
            elif judge == 2:
                wins[j] += 1

    total = sum(wins)
    if total == 0:
        # No wins recorded: avoid dividing by zero and keep the "sums to 1"
        # contract by falling back to a uniform distribution.
        return [1.0 / n] * n if n else []
    # A plain list cannot be divided by a number, so normalise element-wise.
    return [count / total for count in wins]
# Stochastic Bubble Sort: O(N^2) adjacent-pair comparisons in total (O(N) per pass) -- no permuted comparison
def stochastic_bubble_sort(conversations: List[str],
                           sub_objectives: List[str],
                           store_path: Optional[str] = None,
                           name: str = 'scores') -> Dict[str, np.ndarray]:
    """
    Score conversations per sub-objective using bubble-sort style passes.

    Pass ``i`` compares the adjacent pairs (j, j+1) for ``j`` in
    ``range(N - i - 1)``, which is N(N-1)/2 judge calls in total. Only the
    (j, j+1) order is judged; the swapped order is never evaluated. No
    elements are swapped -- the passes only accumulate scores.

    For each comparison the judge result is read as
    ``info[sub_objective]['relative_score']``, whose element 0 is added to
    conversation j and element 1 to conversation j+1.

    Args:
        conversations: Conversation histories to score.
        sub_objectives: Objectives to score against; forwarded to the judge
            and used as keys of the result.
        store_path: Optional directory. When given, the judge result of every
            comparison is appended as one JSON object per line to
            ``{store_path}/{name}.json`` (assumes the result is JSON
            serialisable -- TODO confirm).
        name: Base name of the output file.

    Returns:
        Dict mapping each sub-objective to an array with one normalised score
        per conversation (summing to 1). If the accumulated total of a
        sub-objective is zero, a uniform distribution is returned instead of
        NaNs.
    """
    import json

    n = len(conversations)
    scores = {sub_objective: [0] * n for sub_objective in sub_objectives}
    for i in tqdm(range(n), desc='Stochastic Bubble Sort w. POE'):
        for j in range(n - i - 1):
            print('Begin Pairmatch for ', j, '&', j+1, '...')
            conversation_history_pair = (conversations[j], conversations[j + 1])
            info = pairmatch_baseline(conversation_history_pair, sub_objectives)
            for sub_objective in sub_objectives:
                judge = info[sub_objective]['relative_score']
                scores[sub_objective][j] += judge[0]
                scores[sub_objective][j + 1] += judge[1]

            # Append rather than overwrite: opening with 'w' here would keep
            # only the very last comparison on disk.
            if store_path is not None:
                with open(f'{store_path}/{name}.json', 'a') as f:
                    json.dump(info, f)
                    f.write('\n')
            print('Pairmatch completed for ', j, '&', j+1)

    for sub_objective in sub_objectives:
        raw = np.array(scores[sub_objective], dtype=float)
        total = raw.sum()
        if total == 0:
            # Nothing accumulated (e.g. fewer than two conversations):
            # dividing would produce NaNs, so fall back to a uniform spread.
            scores[sub_objective] = np.full(n, 1.0 / n) if n else raw
        else:
            scores[sub_objective] = raw / total  # scores sum to 1
    return scores
|