| from src import independent_eval, independent_score, pairmatch_baseline | |
| from data.rude_prompt import RUDE_TIPS | |
| import glob, json | |
| from typing import List | |
def present_biases(biases: List[str]):
    """Print each recorded bias description, each followed by a divider line."""
    divider = '------' * 10
    for description in biases:
        print(description)
        print(divider)
| express_judge = lambda x: {0: 'Tie', 1: 'A', 2: 'B'}[x] | |
# Compare Individually vs. Pairwise
def experiment_pairwise_baseline():
    """Compare naive independent scoring against pairwise comparison on the emoji toy data.

    Each conversation pair (A, B) has known ground-truth emoji counts (ca, cb).
    A judgement is counted as biased when its score ordering disagrees with the
    ground-truth count ordering.

    Returns:
        tuple: (individual_eval_bias_score, individual_eval_biases,
                pairwise_bias_score, pairwise_biases) -- normalized bias rates
        plus human-readable descriptions of each biased judgement.
    """
    more_emoji_tip = "Use π emoji as much as possible"  # the emoji existence tip suffices to tell the story
    conversation_files = glob.glob('./data/confirmation_bias/conversation_*.json')
    # (file A, file B, description, ground-truth count A, ground-truth count B)
    name_pairs = [('1AT.json', '8BT.json', 'A use emoji once, B use emoji 8 times', 1, 8),
                  ('1AT.json', '3BT.json', 'A use emoji once, B use emoji 3 times', 1, 3),
                  ('3AT.json', '8BT.json', 'A use emoji 3 times, B use emoji 8 times', 3, 8)]
    individual_eval_bias_score, individual_eval_biases, pairwise_bias_score, pairwise_biases = 0, [], 0, []
    for (nameA, nameB, test_desc, ca, cb) in name_pairs:
        path_A = [c for c in conversation_files if nameA in c][0]
        path_B = [c for c in conversation_files if nameB in c][0]
        # Context managers close the JSON files promptly (the original leaked the handles).
        with open(path_A, 'r') as f_a:
            conversation_A = json.load(f_a)
        with open(path_B, 'r') as f_b:
            conversation_B = json.load(f_b)
        conversation_history_pair = (conversation_A, conversation_B)
        # Naive Independent Scoring (scores tuple itself is unused here)
        _, naive_info = independent_score(conversation_history_pair, [more_emoji_tip])
        naive_info = naive_info[more_emoji_tip]
        print('------'*10)
        print('Pair description: ', test_desc)
        print('Naive Independent Score - A: ', naive_info['score_A'], ' | Score - B: ', naive_info['score_B'], ' | GT-Score A: ', ca, ' | GT-Score B: ', cb)
        independent_correct = ((naive_info['score_A'] >= naive_info['score_B']) == (ca >= cb))
        if not independent_correct:
            individual_eval_bias_score += 1
            individual_eval_biases.append(
                f'|| {test_desc} || \nScore A: {naive_info["score_A"]}'
                f'\n -- Argument: {naive_info["argument_A"]}'
                f'\nScore B: {naive_info["score_B"]}'
                f'\n -- Argument: {naive_info["argument_B"]}')
        # Pairwise Comparison should improve the result here -- nothing fancy, just a simple comparison
        print('------'*10)
        info = pairmatch_baseline(conversation_history_pair, [more_emoji_tip])
        info = info[more_emoji_tip]
        relative_score = info['relative_score']
        print('Pairwise Comparison Relative Score - A: ', relative_score[0], '| Score - B: ', relative_score[1], ' | GT-Score A: ', ca, ' | GT-Score B: ', cb)
        pairwise_correct = ((relative_score[0] >= relative_score[1]) == (ca >= cb))
        if not pairwise_correct:
            pairwise_bias_score += 1
            # f-strings stringify every field; the original concatenated the
            # conf_* values with '+' and would raise TypeError if they are numeric.
            pairwise_biases.append(
                f'|| {test_desc} || \nRelative Score A: {relative_score[0]}'
                f'\n -- Argument A|(A, B): {info["argument_A_from_AB"]}'
                f'\n -- Confidence A|(A, B): {info["conf_A_from_AB"]}'
                f'\n -- Argument A|(B,A): {info["argument_A_from_BA"]}'
                f'\n -- Confidence A|(B,A): {info["conf_A_from_BA"]}'
                f'\nRelative Score B: {relative_score[1]}'
                f'\n -- Argument B | (A, B): {info["argument_B_from_AB"]}'
                f'\n -- Confidence B | (A, B): {info["conf_B_from_AB"]}'
                f'\n -- Argument B | (B, A): {info["argument_B_from_BA"]}'
                f'\n -- Confidence B | (B, A): {info["conf_B_from_BA"]}')
    individual_eval_bias_score /= len(name_pairs)
    # NOTE(review): normalized by 2 * len(name_pairs) although the counter is
    # incremented at most once per pair -- confirm the factor of 2 is intentional.
    pairwise_bias_score /= (2 * len(name_pairs))
    return individual_eval_bias_score, individual_eval_biases, pairwise_bias_score, pairwise_biases
# Anchoring bias, for the most part
def experiment_biases():
    """Measure anchoring bias in pairwise judging on the emoji toy data.

    Every pair is constructed so that count(A) < count(B); a judgement that
    'A follows the tip but B does not' therefore contradicts the data and is
    counted as an anchoring bias.

    Returns:
        tuple: (anchoring_bias_score, anchoring_biases) -- the normalized bias
        rate and a description of each biased judgement.
    """
    more_emoji_tip = "Use π emoji as much as possible"  # the emoji existence tip suffices to tell the story
    conversation_files = glob.glob('./data/confirmation_bias/conversation_*.json')
    # (file A, file B, description, ground-truth count A, ground-truth count B)
    name_pairs = [('1AT.json', '8BT.json', 'A use emoji once, B use emoji 8 times', 1, 8),
                  ('1AT.json', '3BT.json', 'A use emoji once, B use emoji 3 times', 1, 3),
                  ('3AT.json', '8BT.json', 'A use emoji 3 times, B use emoji 8 times', 3, 8)]
    anchoring_bias_score, anchoring_biases = 0, []
    for (nameA, nameB, test_desc, ca, cb) in name_pairs:
        path_A = [c for c in conversation_files if nameA in c][0]
        path_B = [c for c in conversation_files if nameB in c][0]
        # Context managers close the JSON files promptly (the original leaked the handles).
        with open(path_A, 'r') as f_a:
            conversation_A = json.load(f_a)
        with open(path_B, 'r') as f_b:
            conversation_B = json.load(f_b)
        conversation_history_pair = (conversation_A, conversation_B)
        # Pairwise Comparison should improve the result here -- nothing fancy, just a simple comparison
        print('------'*10)
        info = pairmatch_baseline(conversation_history_pair, [more_emoji_tip])
        info = info[more_emoji_tip]
        relative_score = info['relative_score']
        # in the toy examples, we are only doing counting comparison where cA<cB,
        # so the decision that 'B is not following the tip' is a bias
        if info['eval_A_from_AB'] and not info['eval_B_from_AB']:
            anchoring_bias_score += 1
            # Fixed: the original passed two arguments to list.append (TypeError)
            # and looked up 'confidence_B_from_AB' while the rest of the file
            # uses the key 'conf_B_from_AB'.
            anchoring_biases.append(
                f'|| {test_desc} || \nEvaluation B: {info["eval_B_from_AB"]}'
                f'\n -- Argument: {info["argument_B_from_AB"]}'
                f'\n -- Confidence: {info["conf_B_from_AB"]}')
        print('Pairwise Comparison Relative Score - A: ', relative_score[0], '| Score - B: ', relative_score[1], ' | GT-Score A: ', ca, ' | GT-Score B: ', cb)
    # Fixed: the original divided and returned variables that were never defined
    # in this function (NameError); normalize and return the anchoring results.
    anchoring_bias_score /= len(name_pairs)
    return anchoring_bias_score, anchoring_biases
| # # Anchoring Bias && Confirmation Bias | |
| # def experiment_biases(name=['anchor', 'confirmation'], models=['GPT3.5', 'GPT4', 'Gemini']): | |
| # # Toy environment setup: | |
| # # - Emoji Existence Tip Evaluation | |
| # emoji_existence_tip = "Use π emoji at least once" # the emoji existence tip suffices to tell the story | |
| # # - Conversation A & B contains same amount of emoji π | |
| # # ---- case1. they all have only 1 π | |
# # ---- case2. they all have 8 π (harder to ignore if you aren't blind)
| # # ---- case3. no emoji is used for both conversations | |
| # # - Conversation A & B contains same amount of emoji π | |
| # conversation_files = glob.glob('./data/confirmation_bias/conversation_*.json') | |
| # name_pairs = [('1AT.json', '1BT.json', 'A&B use emoji once'), ('3AT.json', '3BT.json', 'A&B use emoji 3 times'), ('8AT.json', '8BT.json', 'A&B use emoji 8 times'), ('AF.json', 'BF.json', 'A&B use no emoji')] | |
| # naive_bias_score, naive_biases, anchor_bias_score, anchor_biases, confirmation_bias_score, confirmation_biases = 0, [], 0, [], 0, [] | |
| # for experiment_config in name_pairs: | |
| # (nameA, nameB, test_desc) = experiment_config | |
| # names = [nameA, nameB] | |
| # gts = ['T' in name for name in names] | |
| # conversation_A = [c for c in conversation_files if names[0] in c][0] | |
| # conversation_B = [c for c in conversation_files if names[1] in c][0] | |
| # conversation_A = json.load(open(conversation_A, 'r')) | |
| # conversation_B = json.load(open(conversation_B, 'r')) | |
| # conversation_history_pair = (conversation_A, conversation_B) | |
| # # Anchor Bias happens when compare (A, B) in order, and judgement of A affect directly on the judgement of B, when A follows the tip, confirmation bias more likely causes negative evaluation on B, and LLM 'ignores the fact' | |
# # Confirmation Bias happens when evaluating A with a reference argument. LLM will be inclined to follow the rhetoric in the reference argument, and ignore the 'fact'
| # judge, info = pairmatch_baseline(conversation_history_pair, [emoji_existence_tip]) | |
| # info = info[emoji_existence_tip] | |
| # # Naive Bias Check -- can not see emoji π, or hallucinate emoji π, basically error in LLM, lack of capacity | |
| # _, naive_info = independent_eval(conversation_history_pair, [emoji_existence_tip]) | |
| # naive_info = naive_info[emoji_existence_tip] | |
| # express_eval = lambda x: {True: 'Follows Tip', False: 'Not follow tip'}[x] | |
| # express_reflect = lambda x: {True: 'Agree', False: 'Disagree'}[x] | |
| # if naive_info['eval_A'] != gts[0]: | |
| # naive_bias_score += 1 | |
| # naive_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(naive_info["eval_A"]) + '\n -- Argument: ' + naive_info["argument_A"]) | |
| # if naive_info['eval_B'] != gts[1]: | |
| # naive_bias_score += 1 | |
| # naive_biases.append(test_desc + ' || \n' + 'Evaluation B: ' + express_eval(naive_info["eval_B"]) + '\n -- Argument: ' + naive_info["argument_B"]) | |
# # Use info dict to analyze the two biases -- also would help validate the reflection method's performance here
| # # - Anchor Bias | We know both A & B contains exact same number of π, we add 1 to anchor bias score if this fact is ignored, and A is judged differently from B | |
| # if info['eval_A_from_AB'] != info['eval_B_from_AB']: | |
| # anchor_bias_score += 1 | |
| # anchor_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_AB"]) + '\n -- Argument: ' + info['argument_A_from_AB'] + '\nEvaluation B: ' + express_eval(info["eval_B_from_AB"]) + '\n -- Argument: ' + info['argument_B_from_AB']) | |
| # if info['eval_A_from_BA'] != info['eval_B_from_BA']: | |
| # anchor_bias_score += 1 | |
| # anchor_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_BA"]) + '\n -- Argument: ' + info['argument_A_from_BA'] + '\nEvaluation B: ' + express_eval(info["eval_B_from_BA"]) + '\n -- Argument: ' + info['argument_B_from_BA']) | |
| # # print('Check keys in info: ', info.keys()) | |
| # # - Confirmation Bias | When the argument is wrong, and the reflection of LLM continues the error, confirmation bias is in-play || BTW, independent evaluation works fine here | |
| # if (info['reflect_A_from_AB'] == info['eval_A_from_BA']) and (info['eval_A_from_AB'] != gts[0]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_AB"]) + '\n -- Argument: '+info['argument_A_from_AB'] + '\nReflection A: ' + express_reflect(info["reflect_A_from_AB"]) + '\n -- Argument: ' + info['reflect_argument_A_from_AB']) | |
| # if (info['reflect_B_from_AB'] == info['eval_B_from_AB']) and (info['eval_B_from_AB'] != gts[1]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation B: ' + express_eval(info["eval_B_from_AB"]) + '\n -- Argument: '+info['argument_B_from_AB'] + '\nReflection B: ' + express_reflect(info["reflect_B_from_AB"]) + '\n -- Argument: ' + info['reflect_argument_B_from_AB']) | |
| # if (info['reflect_A_from_BA'] == info['eval_A_from_AB']) and (info['eval_A_from_BA'] != gts[0]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_BA"]) + '\n -- Argument: '+info['argument_A_from_BA'] + '\nReflection A: ' + express_reflect(info["reflect_A_from_BA"]) + '\n -- Argument: ' + info['reflect_argument_A_from_BA']) | |
| # if (info['reflect_B_from_BA'] == info['eval_B_from_BA']) and (info['eval_B_from_BA'] != gts[1]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation B: ' + express_eval(info["eval_B_from_BA"]) + '\n -- Argument: '+info['argument_B_from_BA'] + '\nReflection B: ' + express_reflect(info["reflect_B_from_BA"]) + '\n -- Argument: ' + info['reflect_argument_B_from_BA']) | |
| # naive_bias_score /= (2 * len(name_pairs)) | |
| # anchor_bias_score /= (2 * len(name_pairs)) | |
| # confirmation_bias_score /= (4 * len(name_pairs)) | |
| # return naive_bias_score, naive_biases, anchor_bias_score, anchor_biases, confirmation_bias_score, confirmation_biases | |
# Bias Check for LLMs
# Model under test; used to name the CSV written by the to_csv call below.
model_name = 'GPT-4'
| # naive_bias_score, naive_biases, anchor_bias_score, anchor_biases, confirmation_bias_score, confirmation_biases = experiment_biases() # experiment with GPT4 here | |
| # print('------'*10) | |
| # print('Naive Bias Score: ', naive_bias_score) | |
| # present_biases(naive_biases) | |
| # print('Anchor Bias Score: ', anchor_bias_score) | |
| # present_biases(anchor_biases) | |
| # print('Confirmation Bias Score: ', confirmation_bias_score) | |
| # present_biases(confirmation_biases) | |
| # # Store bias score into dict, and into csv file | |
| # import pandas as pd | |
| # bias_score = {'Naive': naive_bias_score, 'Anchor': anchor_bias_score, 'Confirmation': confirmation_bias_score} | |
| # bias_score_df = pd.DataFrame(bias_score.items(), columns=['Bias', 'Score']) | |
| # bias_score_df.to_csv(f'./runs/bias_experiment/{model_name}_bias_score.csv', index=False) | |
| # Pairwise Comparison Justification | |
| # independent_eval_bias_score, independent_eval_biases, compare_bias_socre, compare_biases = experiment_pairwise_comparison() | |
| # print('------'*10) | |
| # print('Individual Evaluation Bias Score: ', independent_eval_bias_score) | |
| # present_biases(independent_eval_biases) | |
| # print('Pairwise Comparison Bias Score: ', compare_bias_socre) | |
| # present_biases(compare_biases) | |
| # # Store bias score into dict, and into csv file | |
| # import pandas as pd | |
| # bias_score = {'Independent': independent_eval_bias_score, 'Pairwise': compare_bias_socre} | |
| # bias_score_df = pd.DataFrame(bias_score.items(), columns=['Bias', 'Score']) | |
| # bias_score_df.to_csv(f'./runs/bias_experiment/{model_name}_pairwise_bias_score.csv', index=False) | |
| # parsing test example | |
| # from src.pairmatch import parse_BA_compare_respond | |
| # # Example usage | |
| # response = """ | |
| # The customer in conversation B IS following the tip because they use the π emoji multiple times throughout the conversation to highlight their points of interest or queries.\n Confidence level: 10\n | |
| # The customer in conversation A IS NOT following the tip because they do not use the π emoji at all in the conversation. Confidence level: 10 | |
| # In comparison, the customer in conversation B IS NOT worse at following the tip because they use the π emoji as advised, while the customer in conversation A doesn't use it at all. | |
| # Confidence Level: 10.\n In comparison, the customer in conversation B IS NOT worse at following the tip because they use the π emoji as advised, while the customer in conversation A doesn't use it at all. Confidence Level: 10. | |
| # """ | |
| # print('Original Response: \n', response) | |
| # parsed_responses = parse_BA_compare_respond(response) | |
| # print('Parsed Response: \n') | |
| # for key, item in parsed_responses.items(): | |
| # print(key, ': ', item) | |
| # print('------'*10) | |
# Run the pairwise-baseline experiment, persist the bias rates, then show details.
individual_eval_bias_score, individual_eval_biases, pairwise_bias_score, pairwise_biases = experiment_pairwise_baseline()
# Store bias score into dict, and into csv file
import os
import pandas as pd
output_dir = './runs/bias_experiment'
# Create the output folder up front; to_csv raises OSError when it is missing.
os.makedirs(output_dir, exist_ok=True)
bias_score = {'Individual': individual_eval_bias_score, 'Pairwise': pairwise_bias_score}
bias_score_df = pd.DataFrame(bias_score.items(), columns=['Bias', 'Score'])
bias_score_df.to_csv(f'{output_dir}/{model_name}_pairwise_bias_score.csv', index=False)
print('------'*10)
print('Individual Evaluation Bias Score: ', individual_eval_bias_score)
present_biases(individual_eval_biases)
print('Pairwise Comparison Bias Score: ', pairwise_bias_score)
present_biases(pairwise_biases)