# Source: kirudang's original watermark leaderboard (files copied at commit 40b3335).
# --- Environment setup (must run before transformers/torch read HF_* vars) ---
import os

# SECURITY NOTE(review): never commit a real token here; prefer exporting
# HF_TOKEN in the shell environment instead of hardcoding it in the script.
os.environ["HF_TOKEN"] = "Your_HuggingFace_Token_Here"

# Point the Hugging Face cache (HF_HOME) at the desired directory so model
# downloads land there instead of the default ~/.cache/huggingface.
cache_dir = 'Your_Desired_Cache_Directory_Here'
os.environ['HF_HOME'] = cache_dir

import argparse
import json
import time

import nltk
import torch
import tqdm
from nltk.tokenize import sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration

# sent_tokenize requires the NLTK "punkt" tokenizer data; uncomment on first run.
#nltk.download("punkt")
def main(args):
    """Paraphrase-attack (DIPPER) watermarked generations loaded from a JSON file.

    Reads ``args.data``, paraphrases each item's ``args.column_in_data`` text
    sentence-window by sentence-window with the DIPPER T5 model, and
    periodically dumps accumulated results to
    ``{args.output_name}_{count}.json``, deleting the previous checkpoint.

    Raises:
        ValueError: if ``args.lex`` or ``args.order`` is not a multiple of 20
            in [0, 100].
    """
    # Free any cached GPU memory from a previous run before loading the model.
    torch.cuda.empty_cache()

    # Validate the diversity knobs once, up front: they are loop-invariant,
    # and `raise` (unlike `assert`) survives `python -O`.
    if args.lex not in (0, 20, 40, 60, 80, 100):
        raise ValueError("Lexical diversity must be one of 0, 20, 40, 60, 80, 100.")
    if args.order not in (0, 20, 40, 60, 80, 100):
        raise ValueError("Order diversity must be one of 0, 20, 40, 60, 80, 100.")
    # DIPPER control codes are inverted: higher knob => lower code => more diverse.
    lex_code = int(100 - args.lex)
    order_code = int(100 - args.order)

    # Load data from the specified JSON file.
    with open(args.data, 'r') as f:
        data = json.load(f)
    # NOTE(review): the 4960 offset appears to resume a previously interrupted
    # run (cf. the "_4960_" default output name) — confirm before reusing on
    # fresh data.
    data = [{"query": item["input"], "output_with_watermark": item[args.column_in_data]}
            for item in data[4960:args.Ninputs]]

    # Load the paraphraser model and tokenizer.
    time1 = time.time()
    tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl")
    model = T5ForConditionalGeneration.from_pretrained(args.model_name)
    print("Model loaded in ", time.time() - time1)
    model.cuda()
    model.eval()

    attack_results = []  # accumulated result dicts, checkpointed to JSON below
    input_counter = 0    # number of inputs processed so far

    for idx, dd in tqdm.tqdm(enumerate(data), total=len(data)):
        print(f"Processing input {idx + 1} / {len(data)}")
        # The watermarked text may be stored as a plain string or a 1-element list.
        input_gen = (dd["output_with_watermark"].strip()
                     if isinstance(dd["output_with_watermark"], str)
                     else dd["output_with_watermark"][0].strip())

        dipper_inputs = []
        w_wm_output_attacked = []

        # Collapse all whitespace runs (removes spurious newlines).
        input_gen = " ".join(input_gen.split())
        sentences = sent_tokenize(input_gen)
        # The whitespace-normalized original query serves as the context prefix.
        prefix = " ".join(dd["query"].replace("\n", " ").split())
        output_text = ""
        final_input_text = ""

        # Paraphrase args.sent_interval sentences at a time.
        for sent_idx in range(0, len(sentences), args.sent_interval):
            curr_sent_window = " ".join(sentences[sent_idx: sent_idx + args.sent_interval])
            if args.no_ctx:
                final_input_text = f"lexical = {lex_code}, order = {order_code} <sent> {curr_sent_window} </sent>"
            else:
                final_input_text = f"lexical = {lex_code}, order = {order_code} {prefix} <sent> {curr_sent_window} </sent>"
            final_input = tokenizer([final_input_text], return_tensors="pt")
            final_input = {k: v.cuda() for k, v in final_input.items()}

            # Sample a paraphrase for this window without tracking gradients.
            with torch.inference_mode():
                outputs = model.generate(
                    **final_input,
                    do_sample=True,
                    top_p=0.75,
                    top_k=None,
                    max_length=400
                )
            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            # Grow the context prefix with the newly generated text so later
            # windows are paraphrased in-context.
            prefix += " " + outputs[0]
            output_text += " " + outputs[0]

        # Store the attacked output and the last input fed to the paraphraser.
        w_wm_output_attacked.append(output_text.strip())
        dipper_inputs.append(final_input_text)

        result = {
            "original_query": dd["query"],
            "watermarked_response": dd["output_with_watermark"],
            #"final_input_text": dipper_inputs,
            "paraphrased_response": w_wm_output_attacked[0]
        }
        attack_results.append(result)
        input_counter += 1

        # Checkpoint every saving_freq inputs; remove the previous checkpoint
        # file so only the most recent one is kept on disk.
        if input_counter % args.saving_freq == 0:
            stale = f"{args.output_name}_{input_counter - args.saving_freq}.json"
            if os.path.isfile(stale):
                os.remove(stale)
            with open(f"{args.output_name}_{input_counter}.json", "w") as json_file:
                json.dump(attack_results, json_file, indent=4)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Attack by Dipper Paraphrasing")
    parser.add_argument("--data", type=str, default="Llama3_SIR_test_13860.json",
                        help="The data to be attacked / paraphrased.")
    parser.add_argument("--column_in_data", type=str, default="output_only",
                        help="Column in the data to be attacked / paraphrased.")
    parser.add_argument("--output_name", type=str, default="Dipper_Llama3_SIR_13860_4960_",
                        help="The output directory to save the attacked / paraphrased data.")
    parser.add_argument("--Ninputs", type=int, default=13860,
                        help="Number of inputs to be attacked / paraphrased.")
    parser.add_argument("--saving_freq", type=int, default=10,
                        help="The frequency of saving the output.")
    parser.add_argument("--model_name", type=str, default="kalpeshk2011/dipper-paraphraser-xxl",
                        help="The model name to use.")
    # BUG FIX: argparse's `type=bool` converts ANY non-empty string — including
    # "False" — to True. Parse the string explicitly so "--no_ctx False" works.
    parser.add_argument("--no_ctx",
                        type=lambda s: s.strip().lower() in ("1", "true", "yes", "y"),
                        default=True,
                        help="Whether to use context or not.")
    parser.add_argument("--sent_interval", type=int, default=3,
                        help="The sentence interval.")
    parser.add_argument("--lex", type=int, default=60,
                        help="Lexical diversity knob for the paraphrase attack.")
    parser.add_argument("--order", type=int, default=60,
                        help="Order diversity knob for the paraphrase attack.")
    args = parser.parse_args()
    main(args)