| from src import independent_eval, independent_score, pairmatch_baseline | |
| from data.rude_prompt import RUDE_TIPS | |
| import glob, json | |
| from typing import List | |
def present_biases(biases: List[str]):
    """Print each recorded bias description, each followed by a divider line."""
    divider = '------' * 10
    for description in biases:
        print(description)
        print(divider)
| express_judge = lambda x: {0: 'Tie', 1: 'A', 2: 'B'}[x] | |
# Compare Individually vs. Pairwise
def experiment_pairwise_baseline():
    """Compare naive independent scoring against pairwise comparison on the emoji toy data.

    Each conversation pair (A, B) has known ground-truth emoji counts (ca, cb).
    A judgement is counted as biased when its score ordering disagrees with the
    ground-truth count ordering.

    Returns:
        tuple: (individual_eval_bias_score, individual_eval_biases,
                pairwise_bias_score, pairwise_biases) -- normalized bias rates
        plus human-readable descriptions of each biased judgement.
    """
    more_emoji_tip = "Use π emoji as much as possible"  # the emoji existence tip suffices to tell the story
    conversation_files = glob.glob('./data/confirmation_bias/conversation_*.json')
    # (file A, file B, description, ground-truth count A, ground-truth count B)
    name_pairs = [('1AT.json', '8BT.json', 'A use emoji once, B use emoji 8 times', 1, 8),
                  ('1AT.json', '3BT.json', 'A use emoji once, B use emoji 3 times', 1, 3),
                  ('3AT.json', '8BT.json', 'A use emoji 3 times, B use emoji 8 times', 3, 8)]
    individual_eval_bias_score, individual_eval_biases, pairwise_bias_score, pairwise_biases = 0, [], 0, []
    for (nameA, nameB, test_desc, ca, cb) in name_pairs:
        path_A = [c for c in conversation_files if nameA in c][0]
        path_B = [c for c in conversation_files if nameB in c][0]
        # Context managers close the JSON files promptly (the original leaked the handles).
        with open(path_A, 'r') as f_a:
            conversation_A = json.load(f_a)
        with open(path_B, 'r') as f_b:
            conversation_B = json.load(f_b)
        conversation_history_pair = (conversation_A, conversation_B)
        # Naive Independent Scoring (scores tuple itself is unused here)
        _, naive_info = independent_score(conversation_history_pair, [more_emoji_tip])
        naive_info = naive_info[more_emoji_tip]
        print('------'*10)
        print('Pair description: ', test_desc)
        print('Naive Independent Score - A: ', naive_info['score_A'], ' | Score - B: ', naive_info['score_B'], ' | GT-Score A: ', ca, ' | GT-Score B: ', cb)
        independent_correct = ((naive_info['score_A'] >= naive_info['score_B']) == (ca >= cb))
        if not independent_correct:
            individual_eval_bias_score += 1
            individual_eval_biases.append(
                f'|| {test_desc} || \nScore A: {naive_info["score_A"]}'
                f'\n -- Argument: {naive_info["argument_A"]}'
                f'\nScore B: {naive_info["score_B"]}'
                f'\n -- Argument: {naive_info["argument_B"]}')
        # Pairwise Comparison should improve the result here -- nothing fancy, just a simple comparison
        print('------'*10)
        info = pairmatch_baseline(conversation_history_pair, [more_emoji_tip])
        info = info[more_emoji_tip]
        relative_score = info['relative_score']
        print('Pairwise Comparison Relative Score - A: ', relative_score[0], '| Score - B: ', relative_score[1], ' | GT-Score A: ', ca, ' | GT-Score B: ', cb)
        pairwise_correct = ((relative_score[0] >= relative_score[1]) == (ca >= cb))
        if not pairwise_correct:
            pairwise_bias_score += 1
            # f-strings stringify every field; the original concatenated the
            # conf_* values with '+' and would raise TypeError if they are numeric.
            pairwise_biases.append(
                f'|| {test_desc} || \nRelative Score A: {relative_score[0]}'
                f'\n -- Argument A|(A, B): {info["argument_A_from_AB"]}'
                f'\n -- Confidence A|(A, B): {info["conf_A_from_AB"]}'
                f'\n -- Argument A|(B,A): {info["argument_A_from_BA"]}'
                f'\n -- Confidence A|(B,A): {info["conf_A_from_BA"]}'
                f'\nRelative Score B: {relative_score[1]}'
                f'\n -- Argument B | (A, B): {info["argument_B_from_AB"]}'
                f'\n -- Confidence B | (A, B): {info["conf_B_from_AB"]}'
                f'\n -- Argument B | (B, A): {info["argument_B_from_BA"]}'
                f'\n -- Confidence B | (B, A): {info["conf_B_from_BA"]}')
    individual_eval_bias_score /= len(name_pairs)
    # NOTE(review): normalized by 2 * len(name_pairs) although the counter is
    # incremented at most once per pair -- confirm the factor of 2 is intentional.
    pairwise_bias_score /= (2 * len(name_pairs))
    return individual_eval_bias_score, individual_eval_biases, pairwise_bias_score, pairwise_biases
# Anchoring bias, for the most part
def experiment_biases():
    """Measure anchoring bias in pairwise judging on the emoji toy data.

    Every pair is constructed so that count(A) < count(B); a judgement that
    'A follows the tip but B does not' therefore contradicts the data and is
    counted as an anchoring bias.

    Returns:
        tuple: (anchoring_bias_score, anchoring_biases) -- the normalized bias
        rate and a description of each biased judgement.
    """
    more_emoji_tip = "Use π emoji as much as possible"  # the emoji existence tip suffices to tell the story
    conversation_files = glob.glob('./data/confirmation_bias/conversation_*.json')
    # (file A, file B, description, ground-truth count A, ground-truth count B)
    name_pairs = [('1AT.json', '8BT.json', 'A use emoji once, B use emoji 8 times', 1, 8),
                  ('1AT.json', '3BT.json', 'A use emoji once, B use emoji 3 times', 1, 3),
                  ('3AT.json', '8BT.json', 'A use emoji 3 times, B use emoji 8 times', 3, 8)]
    anchoring_bias_score, anchoring_biases = 0, []
    for (nameA, nameB, test_desc, ca, cb) in name_pairs:
        path_A = [c for c in conversation_files if nameA in c][0]
        path_B = [c for c in conversation_files if nameB in c][0]
        # Context managers close the JSON files promptly (the original leaked the handles).
        with open(path_A, 'r') as f_a:
            conversation_A = json.load(f_a)
        with open(path_B, 'r') as f_b:
            conversation_B = json.load(f_b)
        conversation_history_pair = (conversation_A, conversation_B)
        # Pairwise Comparison should improve the result here -- nothing fancy, just a simple comparison
        print('------'*10)
        info = pairmatch_baseline(conversation_history_pair, [more_emoji_tip])
        info = info[more_emoji_tip]
        relative_score = info['relative_score']
        # in the toy examples, we are only doing counting comparison where cA<cB,
        # so the decision that 'B is not following the tip' is a bias
        if info['eval_A_from_AB'] and not info['eval_B_from_AB']:
            anchoring_bias_score += 1
            # Fixed: the original passed two arguments to list.append (TypeError)
            # and looked up 'confidence_B_from_AB' while the rest of the file
            # uses the key 'conf_B_from_AB'.
            anchoring_biases.append(
                f'|| {test_desc} || \nEvaluation B: {info["eval_B_from_AB"]}'
                f'\n -- Argument: {info["argument_B_from_AB"]}'
                f'\n -- Confidence: {info["conf_B_from_AB"]}')
        print('Pairwise Comparison Relative Score - A: ', relative_score[0], '| Score - B: ', relative_score[1], ' | GT-Score A: ', ca, ' | GT-Score B: ', cb)
    # Fixed: the original divided and returned variables that were never defined
    # in this function (NameError); normalize and return the anchoring results.
    anchoring_bias_score /= len(name_pairs)
    return anchoring_bias_score, anchoring_biases
| # # Anchoring Bias && Confirmation Bias | |
| # def experiment_biases(name=['anchor', 'confirmation'], models=['GPT3.5', 'GPT4', 'Gemini']): | |
| # # Toy environment setup: | |
| # # - Emoji Existence Tip Evaluation | |
| # emoji_existence_tip = "Use π emoji at least once" # the emoji existence tip suffices to tell the story | |
| # # - Conversation A & B contains same amount of emoji π | |
| # # ---- case1. they all have only 1 π | |
# # ---- case2. they all have 8 π (harder to ignore if you aren't blind)
| # # ---- case3. no emoji is used for both conversations | |
| # # - Conversation A & B contains same amount of emoji π | |
| # conversation_files = glob.glob('./data/confirmation_bias/conversation_*.json') | |
| # name_pairs = [('1AT.json', '1BT.json', 'A&B use emoji once'), ('3AT.json', '3BT.json', 'A&B use emoji 3 times'), ('8AT.json', '8BT.json', 'A&B use emoji 8 times'), ('AF.json', 'BF.json', 'A&B use no emoji')] | |
| # naive_bias_score, naive_biases, anchor_bias_score, anchor_biases, confirmation_bias_score, confirmation_biases = 0, [], 0, [], 0, [] | |
| # for experiment_config in name_pairs: | |
| # (nameA, nameB, test_desc) = experiment_config | |
| # names = [nameA, nameB] | |
| # gts = ['T' in name for name in names] | |
| # conversation_A = [c for c in conversation_files if names[0] in c][0] | |
| # conversation_B = [c for c in conversation_files if names[1] in c][0] | |
| # conversation_A = json.load(open(conversation_A, 'r')) | |
| # conversation_B = json.load(open(conversation_B, 'r')) | |
| # conversation_history_pair = (conversation_A, conversation_B) | |
| # # Anchor Bias happens when compare (A, B) in order, and judgement of A affect directly on the judgement of B, when A follows the tip, confirmation bias more likely causes negative evaluation on B, and LLM 'ignores the fact' | |
# # Confirmation Bias happens when evaluating A with a reference argument. LLM will be inclined to follow the rhetoric in the reference argument, and ignore the 'fact'
| # judge, info = pairmatch_baseline(conversation_history_pair, [emoji_existence_tip]) | |
| # info = info[emoji_existence_tip] | |
| # # Naive Bias Check -- can not see emoji π, or hallucinate emoji π, basically error in LLM, lack of capacity | |
| # _, naive_info = independent_eval(conversation_history_pair, [emoji_existence_tip]) | |
| # naive_info = naive_info[emoji_existence_tip] | |
| # express_eval = lambda x: {True: 'Follows Tip', False: 'Not follow tip'}[x] | |
| # express_reflect = lambda x: {True: 'Agree', False: 'Disagree'}[x] | |
| # if naive_info['eval_A'] != gts[0]: | |
| # naive_bias_score += 1 | |
| # naive_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(naive_info["eval_A"]) + '\n -- Argument: ' + naive_info["argument_A"]) | |
| # if naive_info['eval_B'] != gts[1]: | |
| # naive_bias_score += 1 | |
| # naive_biases.append(test_desc + ' || \n' + 'Evaluation B: ' + express_eval(naive_info["eval_B"]) + '\n -- Argument: ' + naive_info["argument_B"]) | |
# # Use info dict to analyze the two biases -- also would help validate the reflection method's performance here
| # # - Anchor Bias | We know both A & B contains exact same number of π, we add 1 to anchor bias score if this fact is ignored, and A is judged differently from B | |
| # if info['eval_A_from_AB'] != info['eval_B_from_AB']: | |
| # anchor_bias_score += 1 | |
| # anchor_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_AB"]) + '\n -- Argument: ' + info['argument_A_from_AB'] + '\nEvaluation B: ' + express_eval(info["eval_B_from_AB"]) + '\n -- Argument: ' + info['argument_B_from_AB']) | |
| # if info['eval_A_from_BA'] != info['eval_B_from_BA']: | |
| # anchor_bias_score += 1 | |
| # anchor_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_BA"]) + '\n -- Argument: ' + info['argument_A_from_BA'] + '\nEvaluation B: ' + express_eval(info["eval_B_from_BA"]) + '\n -- Argument: ' + info['argument_B_from_BA']) | |
| # # print('Check keys in info: ', info.keys()) | |
| # # - Confirmation Bias | When the argument is wrong, and the reflection of LLM continues the error, confirmation bias is in-play || BTW, independent evaluation works fine here | |
| # if (info['reflect_A_from_AB'] == info['eval_A_from_BA']) and (info['eval_A_from_AB'] != gts[0]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_AB"]) + '\n -- Argument: '+info['argument_A_from_AB'] + '\nReflection A: ' + express_reflect(info["reflect_A_from_AB"]) + '\n -- Argument: ' + info['reflect_argument_A_from_AB']) | |
| # if (info['reflect_B_from_AB'] == info['eval_B_from_AB']) and (info['eval_B_from_AB'] != gts[1]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation B: ' + express_eval(info["eval_B_from_AB"]) + '\n -- Argument: '+info['argument_B_from_AB'] + '\nReflection B: ' + express_reflect(info["reflect_B_from_AB"]) + '\n -- Argument: ' + info['reflect_argument_B_from_AB']) | |
| # if (info['reflect_A_from_BA'] == info['eval_A_from_AB']) and (info['eval_A_from_BA'] != gts[0]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation A: ' + express_eval(info["eval_A_from_BA"]) + '\n -- Argument: '+info['argument_A_from_BA'] + '\nReflection A: ' + express_reflect(info["reflect_A_from_BA"]) + '\n -- Argument: ' + info['reflect_argument_A_from_BA']) | |
| # if (info['reflect_B_from_BA'] == info['eval_B_from_BA']) and (info['eval_B_from_BA'] != gts[1]): | |
| # confirmation_bias_score += 1 | |
| # confirmation_biases.append(test_desc + ' || \n' + 'Evaluation B: ' + express_eval(info["eval_B_from_BA"]) + '\n -- Argument: '+info['argument_B_from_BA'] + '\nReflection B: ' + express_reflect(info["reflect_B_from_BA"]) + '\n -- Argument: ' + info['reflect_argument_B_from_BA']) | |
| # naive_bias_score /= (2 * len(name_pairs)) | |
| # anchor_bias_score /= (2 * len(name_pairs)) | |
| # confirmation_bias_score /= (4 * len(name_pairs)) | |
| # return naive_bias_score, naive_biases, anchor_bias_score, anchor_biases, confirmation_bias_score, confirmation_biases | |
# Bias Check for LLMs
# Model under test; used to name the CSV written by the to_csv call below.
model_name = 'GPT-4'
| # naive_bias_score, naive_biases, anchor_bias_score, anchor_biases, confirmation_bias_score, confirmation_biases = experiment_biases() # experiment with GPT4 here | |
| # print('------'*10) | |
| # print('Naive Bias Score: ', naive_bias_score) | |
| # present_biases(naive_biases) | |
| # print('Anchor Bias Score: ', anchor_bias_score) | |
| # present_biases(anchor_biases) | |
| # print('Confirmation Bias Score: ', confirmation_bias_score) | |
| # present_biases(confirmation_biases) | |
| # # Store bias score into dict, and into csv file | |
| # import pandas as pd | |
| # bias_score = {'Naive': naive_bias_score, 'Anchor': anchor_bias_score, 'Confirmation': confirmation_bias_score} | |
| # bias_score_df = pd.DataFrame(bias_score.items(), columns=['Bias', 'Score']) | |
| # bias_score_df.to_csv(f'./runs/bias_experiment/{model_name}_bias_score.csv', index=False) | |
| # Pairwise Comparison Justification | |
| # independent_eval_bias_score, independent_eval_biases, compare_bias_socre, compare_biases = experiment_pairwise_comparison() | |
| # print('------'*10) | |
| # print('Individual Evaluation Bias Score: ', independent_eval_bias_score) | |
| # present_biases(independent_eval_biases) | |
| # print('Pairwise Comparison Bias Score: ', compare_bias_socre) | |
| # present_biases(compare_biases) | |
| # # Store bias score into dict, and into csv file | |
| # import pandas as pd | |
| # bias_score = {'Independent': independent_eval_bias_score, 'Pairwise': compare_bias_socre} | |
| # bias_score_df = pd.DataFrame(bias_score.items(), columns=['Bias', 'Score']) | |
| # bias_score_df.to_csv(f'./runs/bias_experiment/{model_name}_pairwise_bias_score.csv', index=False) | |
| # parsing test example | |
| # from src.pairmatch import parse_BA_compare_respond | |
| # # Example usage | |
| # response = """ | |
| # The customer in conversation B IS following the tip because they use the π emoji multiple times throughout the conversation to highlight their points of interest or queries.\n Confidence level: 10\n | |
| # The customer in conversation A IS NOT following the tip because they do not use the π emoji at all in the conversation. Confidence level: 10 | |
| # In comparison, the customer in conversation B IS NOT worse at following the tip because they use the π emoji as advised, while the customer in conversation A doesn't use it at all. | |
| # Confidence Level: 10.\n In comparison, the customer in conversation B IS NOT worse at following the tip because they use the π emoji as advised, while the customer in conversation A doesn't use it at all. Confidence Level: 10. | |
| # """ | |
| # print('Original Response: \n', response) | |
| # parsed_responses = parse_BA_compare_respond(response) | |
| # print('Parsed Response: \n') | |
| # for key, item in parsed_responses.items(): | |
| # print(key, ': ', item) | |
| # print('------'*10) | |
# Run the pairwise-baseline experiment, persist the bias rates, then show details.
individual_eval_bias_score, individual_eval_biases, pairwise_bias_score, pairwise_biases = experiment_pairwise_baseline()
# Store bias score into dict, and into csv file
import os
import pandas as pd
output_dir = './runs/bias_experiment'
# Create the output folder up front; to_csv raises OSError when it is missing.
os.makedirs(output_dir, exist_ok=True)
bias_score = {'Individual': individual_eval_bias_score, 'Pairwise': pairwise_bias_score}
bias_score_df = pd.DataFrame(bias_score.items(), columns=['Bias', 'Score'])
bias_score_df.to_csv(f'{output_dir}/{model_name}_pairwise_bias_score.csv', index=False)
print('------'*10)
print('Individual Evaluation Bias Score: ', individual_eval_bias_score)
present_biases(individual_eval_biases)
print('Pairwise Comparison Bias Score: ', pairwise_bias_score)
present_biases(pairwise_biases)