import json
from typing import List, Tuple, Dict, Any, Optional

import numpy as np
from tqdm import tqdm

from .pairmatch import pairmatch_baseline

# Reference:
#   Judge result -- {0: 'Tie', 1: 'A is better than B', 2: 'A is no better than B'}
# Insight:
#   Deterministic sorting fails due to the stochasticity of the LLM and the
#   inconsistency of its evaluation results.


def _normalize_scores(raw_scores) -> np.ndarray:
    """Scale raw scores so they sum to 1.

    An empty input yields an empty array. When the scores sum to zero (for
    example, every comparison was a tie or no comparison was made) a uniform
    distribution is returned instead of dividing by zero.
    """
    values = np.asarray(raw_scores, dtype=float)
    if values.size == 0:
        return values
    total = sum(raw_scores)
    if total == 0:
        return np.full(values.shape, 1.0 / values.size)
    return values / total


# Naive Sort O(N^2): obtains the top ranked conversation index.
# Note that our pairwise comparison is permutation invariant, so the naive sort
# essentially ensembles the stochasticity of the LLM.
def naive_sort_sthocastic(conversations: List[str], sub_objectives: List[str]) -> List[float]:
    """Score every conversation by comparing all ordered pairs (O(N^2) calls).

    Each ordered pair (i, j) with i != j is judged once, so every unordered
    pair is judged twice (once per ordering). Per the original author's note,
    the pairmatch is meant to be permutation invariant: pairmatch(i, j) and
    pairmatch(j, i) differ only in the stochasticity of the LLM, so with a
    fixed seed both results should agree.

    A judge value of 1 credits conversation i, 2 credits conversation j, and
    any other value (a tie) credits nobody.

    Args:
        conversations: Conversations to rank.
        sub_objectives: Sub-objectives forwarded to ``pairmatch_baseline``.

    Returns:
        One score per conversation, normalized to sum to 1. If no comparison
        produced a winner the scores are uniform; an empty input returns an
        empty list.
    """
    scores = [0] * len(conversations)
    for i, conversation_a in enumerate(conversations):
        for j, conversation_b in enumerate(conversations):
            if i == j:
                # Judging a conversation against itself carries no ranking
                # information and only costs an extra judge call.
                continue
            # NOTE(review): this call site unpacks a (judge, info) pair while
            # stochastic_bubble_sort indexes the return value as a dict --
            # confirm the actual return shape of pairmatch_baseline.
            judge, _info = pairmatch_baseline((conversation_a, conversation_b), sub_objectives)
            if judge == 1:
                scores[i] += 1
            elif judge == 2:
                scores[j] += 1
    # The original divided a plain list by an int, which raises TypeError.
    return _normalize_scores(scores).tolist()


# Stochastic Bubble Sort -- adjacent pairs only, no permuted comparison.
def stochastic_bubble_sort(
    conversations: List[str],
    sub_objectives: List[str],
    store_path: Optional[str] = None,
    name: str = 'scores',
) -> Dict[str, np.ndarray]:
    """Accumulate per-sub-objective scores from adjacent-pair comparisons.

    Follows the pass structure of bubble sort: pass ``i`` compares the
    adjacent pairs (j, j+1) for j in [0, N-i-1). No elements are swapped, so
    earlier pairs are simply re-judged on every pass (pair (0, 1) is judged
    N-1 times, the last pair once), for N(N-1)/2 judge calls in total. The
    permuted comparison (j+1, j) is never evaluated.

    Args:
        conversations: Conversations to rank.
        sub_objectives: Keys expected in the judgement returned by
            ``pairmatch_baseline``; each entry must expose a two-element
            ``'relative_score'`` (score for A, score for B).
        store_path: Optional directory. When given, every judgement is
            appended as one JSON document per line to
            ``{store_path}/{name}.json``.
        name: Base name of the judgement file.

    Returns:
        Mapping from sub-objective to an array with one score per
        conversation, normalized to sum to 1 (uniform when the raw scores
        sum to zero, empty when there are no conversations).
    """
    n_conversations = len(conversations)
    scores = {sub_objective: [0] * n_conversations for sub_objective in sub_objectives}

    for i in tqdm(range(n_conversations), desc='Stochastic Bubble Sort w. POE'):
        for j in range(n_conversations - i - 1):
            print('Begin Pairmatch for ', j,'&',j+1, '...')
            conversation_history_pair = (conversations[j], conversations[j + 1])
            info = pairmatch_baseline(conversation_history_pair, sub_objectives)

            for sub_objective in sub_objectives:
                judge = info[sub_objective]['relative_score']
                scores[sub_objective][j] += judge[0]
                scores[sub_objective][j + 1] += judge[1]

            if store_path is not None:
                # Append one line per judgement; opening with 'w' here would
                # discard everything except the final comparison.
                with open(f'{store_path}/{name}.json', 'a', encoding='utf-8') as f:
                    f.write(json.dumps(info) + '\n')
            print('Pairmatch completed for ', j,'&',j+1)

    for sub_objective in sub_objectives:
        scores[sub_objective] = _normalize_scores(scores[sub_objective])
    return scores