anno / simSearch /src /sort.py
Ksgk-fy's picture
Upload 67 files
ee657a1 verified
from .pairmatch import pairmatch_baseline
from typing import List, Tuple, Dict, Any, Optional
from tqdm import tqdm
import numpy as np
# Reference:
# Judge result -- {0: 'Tie', 1: 'A is better than B', 2: 'A is no better than B'}
# Insight:
# Deterministic sorting fails due to the stochasticity of the LLM & the inconsistency of evaluation results
# Naive Sort O(N^2): Obtains the top ranked conversation index
# Note that our pairwise comparison is permutation invariant, so the naive sort essentially ensembles over the stochasticity of LLMs
def naive_sort_sthocastic(conversations: List[str],
                          sub_objectives: List[str]) -> List[float]:
    """
    Score conversations by exhaustive O(N^2) pairwise judging.

    Every ordered pair (i, j) with i != j is passed to ``pairmatch_baseline``
    once, so each unordered pair is judged in both presentation orders.
    Per the original author's note, the judge is meant to be permutation
    invariant, i.e. (i, j) and (j, i) should differ only through LLM
    stochasticity; judging both orders therefore averages that noise out.

    Verdict handling:
      -- 1: conversation i (shown first) earns one point
      -- 2: conversation j (shown second) earns one point
      -- anything else (e.g. 0, a tie): nobody earns a point

    Args:
        conversations: conversation histories to rank.
        sub_objectives: objectives forwarded unchanged to the judge.

    Returns:
        One score per conversation, in input order. Scores are win shares
        that sum to 1 whenever at least one comparison produced a winner.
        If no comparison produced a winner (all ties, or fewer than two
        conversations) every score is 0.0; an empty input yields ``[]``.
    """
    n = len(conversations)
    wins = [0] * n

    for i, conversation_a in enumerate(conversations):
        for j, conversation_b in enumerate(conversations):
            # A conversation judged against itself carries no ranking signal
            # and would only inject LLM noise (and cost one judge call).
            if i == j:
                continue
            # NOTE(review): this unpacks a (judge, info) pair, whereas
            # stochastic_bubble_sort treats the return value as a single
            # dict -- confirm which shape pairmatch_baseline really returns.
            judge, _info = pairmatch_baseline((conversation_a, conversation_b),
                                              sub_objectives)
            if judge == 1:
                wins[i] += 1
            elif judge == 2:
                wins[j] += 1

    total = sum(wins)
    if total == 0:
        # Nothing to normalise by; avoid ZeroDivisionError.
        return [0.0] * n
    # Element-wise division (a plain ``list / int`` raises TypeError).
    return [count / total for count in wins]
# Stochastic Bubble Sort -- adjacent pairs only, no permuted comparison (N-1 distinct pairs, N(N-1)/2 pairmatch calls in total)
def stochastic_bubble_sort(conversations: List[str],
                           sub_objectives: List[str],
                           store_path: Optional[str] = None,
                           name: str = 'scores') -> Dict[str, np.ndarray]:
    """
    Accumulate per-objective scores from bubble-sort-style adjacent comparisons.

    The loops follow the bubble-sort sweep pattern but never swap anything:
    sweep ``i`` judges the adjacent pairs (j, j+1) for j in 0 .. N-i-2.
    Only the N-1 adjacent pairs are ever judged (the permuted order (j+1, j)
    is discarded), but earlier pairs are re-judged on later sweeps, for
    N(N-1)/2 ``pairmatch_baseline`` calls in total.

    For each call and each sub-objective, ``info[sub_objective]['relative_score']``
    is read as a two-element sequence: element 0 is added to conversation j
    and element 1 to conversation j+1.

    Args:
        conversations: conversation histories to score.
        sub_objectives: objectives forwarded to the judge; also the keys of
            the returned dict.
        store_path: optional directory. When given, the ``info`` dict of every
            comparison is appended as one JSON document per line to
            ``{store_path}/{name}.json`` (existing content is kept).
        name: base name of the log file, without extension.

    Returns:
        Dict mapping each sub-objective to a float array with one entry per
        conversation. Each array is divided by its own total so it sums to 1;
        if the total is zero the (all-zero) array is returned unnormalised
        instead of producing NaNs.
    """
    # Function-scope import: the module does not import json at top level.
    import json

    n = len(conversations)
    scores = {sub_objective: [0] * n for sub_objective in sub_objectives}
    log_file = None if store_path is None else f'{store_path}/{name}.json'

    for i in tqdm(range(n), desc='Stochastic Bubble Sort w. POE'):
        for j in range(n - i - 1):
            print('Begin Pairmatch for ', j,'&',j+1, '...')
            conversation_history_pair = (conversations[j], conversations[j + 1])
            info = pairmatch_baseline(conversation_history_pair, sub_objectives)

            for sub_objective in sub_objectives:
                judge = info[sub_objective]['relative_score']
                scores[sub_objective][j] += judge[0]
                scores[sub_objective][j + 1] += judge[1]

            # Append one record per comparison. Opening with 'w' here would
            # truncate the file each time and keep only the last judgement.
            if log_file is not None:
                with open(log_file, 'a', encoding='utf-8') as f:
                    json.dump(info, f)
                    f.write('\n')
            print('Pairmatch completed for ', j,'&',j+1)

    normalized = {}
    for sub_objective in sub_objectives:
        raw = np.asarray(scores[sub_objective], dtype=float)
        total = sum(scores[sub_objective])
        # Guard the division: a zero total would otherwise yield NaNs.
        normalized[sub_objective] = raw / total if total else raw
    return normalized