# Source: kirudang's original watermark leaderboard (files copied at commit 40b3335).
# --- Environment setup (must run before transformers/torch read HF_* vars) ---
import os

# SECURITY NOTE(review): never commit a real token here; prefer exporting
# HF_TOKEN in the shell environment instead of hardcoding it in the script.
os.environ["HF_TOKEN"] = "Your_HuggingFace_Token_Here"

# Point the Hugging Face cache (HF_HOME) at the desired directory so model
# downloads land there instead of the default ~/.cache/huggingface.
cache_dir = 'Your_Desired_Cache_Directory_Here'
os.environ['HF_HOME'] = cache_dir

import argparse
import json
import time

import nltk
import torch
import tqdm
from nltk.tokenize import sent_tokenize
from transformers import T5Tokenizer, T5ForConditionalGeneration

# sent_tokenize requires the NLTK "punkt" tokenizer data; uncomment on first run.
#nltk.download("punkt")
def main(args):
    """Paraphrase-attack (DIPPER) watermarked generations loaded from a JSON file.

    Reads ``args.data``, paraphrases each item's ``args.column_in_data`` text
    sentence-window by sentence-window with the DIPPER T5 model, and
    periodically dumps accumulated results to
    ``{args.output_name}_{count}.json``, deleting the previous checkpoint.

    Raises:
        ValueError: if ``args.lex`` or ``args.order`` is not a multiple of 20
            in [0, 100].
    """
    # Free any cached GPU memory from a previous run before loading the model.
    torch.cuda.empty_cache()

    # Validate the diversity knobs once, up front: they are loop-invariant,
    # and `raise` (unlike `assert`) survives `python -O`.
    if args.lex not in (0, 20, 40, 60, 80, 100):
        raise ValueError("Lexical diversity must be one of 0, 20, 40, 60, 80, 100.")
    if args.order not in (0, 20, 40, 60, 80, 100):
        raise ValueError("Order diversity must be one of 0, 20, 40, 60, 80, 100.")
    # DIPPER control codes are inverted: higher knob => lower code => more diverse.
    lex_code = int(100 - args.lex)
    order_code = int(100 - args.order)

    # Load data from the specified JSON file.
    with open(args.data, 'r') as f:
        data = json.load(f)
    # NOTE(review): the 4960 offset appears to resume a previously interrupted
    # run (cf. the "_4960_" default output name) — confirm before reusing on
    # fresh data.
    data = [{"query": item["input"], "output_with_watermark": item[args.column_in_data]}
            for item in data[4960:args.Ninputs]]

    # Load the paraphraser model and tokenizer.
    time1 = time.time()
    tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-xxl")
    model = T5ForConditionalGeneration.from_pretrained(args.model_name)
    print("Model loaded in ", time.time() - time1)
    model.cuda()
    model.eval()

    attack_results = []  # accumulated result dicts, checkpointed to JSON below
    input_counter = 0    # number of inputs processed so far

    for idx, dd in tqdm.tqdm(enumerate(data), total=len(data)):
        print(f"Processing input {idx + 1} / {len(data)}")
        # The watermarked text may be stored as a plain string or a 1-element list.
        input_gen = (dd["output_with_watermark"].strip()
                     if isinstance(dd["output_with_watermark"], str)
                     else dd["output_with_watermark"][0].strip())

        dipper_inputs = []
        w_wm_output_attacked = []

        # Collapse all whitespace runs (removes spurious newlines).
        input_gen = " ".join(input_gen.split())
        sentences = sent_tokenize(input_gen)
        # The whitespace-normalized original query serves as the context prefix.
        prefix = " ".join(dd["query"].replace("\n", " ").split())
        output_text = ""
        final_input_text = ""

        # Paraphrase args.sent_interval sentences at a time.
        for sent_idx in range(0, len(sentences), args.sent_interval):
            curr_sent_window = " ".join(sentences[sent_idx: sent_idx + args.sent_interval])
            if args.no_ctx:
                final_input_text = f"lexical = {lex_code}, order = {order_code} <sent> {curr_sent_window} </sent>"
            else:
                final_input_text = f"lexical = {lex_code}, order = {order_code} {prefix} <sent> {curr_sent_window} </sent>"
            final_input = tokenizer([final_input_text], return_tensors="pt")
            final_input = {k: v.cuda() for k, v in final_input.items()}

            # Sample a paraphrase for this window without tracking gradients.
            with torch.inference_mode():
                outputs = model.generate(
                    **final_input,
                    do_sample=True,
                    top_p=0.75,
                    top_k=None,
                    max_length=400
                )
            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            # Grow the context prefix with the newly generated text so later
            # windows are paraphrased in-context.
            prefix += " " + outputs[0]
            output_text += " " + outputs[0]

        # Store the attacked output and the last input fed to the paraphraser.
        w_wm_output_attacked.append(output_text.strip())
        dipper_inputs.append(final_input_text)

        result = {
            "original_query": dd["query"],
            "watermarked_response": dd["output_with_watermark"],
            #"final_input_text": dipper_inputs,
            "paraphrased_response": w_wm_output_attacked[0]
        }
        attack_results.append(result)
        input_counter += 1

        # Checkpoint every saving_freq inputs; remove the previous checkpoint
        # file so only the most recent one is kept on disk.
        if input_counter % args.saving_freq == 0:
            stale = f"{args.output_name}_{input_counter - args.saving_freq}.json"
            if os.path.isfile(stale):
                os.remove(stale)
            with open(f"{args.output_name}_{input_counter}.json", "w") as json_file:
                json.dump(attack_results, json_file, indent=4)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Attack by Dipper Paraphrasing")
    parser.add_argument("--data", type=str, default="Llama3_SIR_test_13860.json",
                        help="The data to be attacked / paraphrased.")
    parser.add_argument("--column_in_data", type=str, default="output_only",
                        help="Column in the data to be attacked / paraphrased.")
    parser.add_argument("--output_name", type=str, default="Dipper_Llama3_SIR_13860_4960_",
                        help="The output directory to save the attacked / paraphrased data.")
    parser.add_argument("--Ninputs", type=int, default=13860,
                        help="Number of inputs to be attacked / paraphrased.")
    parser.add_argument("--saving_freq", type=int, default=10,
                        help="The frequency of saving the output.")
    parser.add_argument("--model_name", type=str, default="kalpeshk2011/dipper-paraphraser-xxl",
                        help="The model name to use.")
    # BUG FIX: argparse's `type=bool` converts ANY non-empty string — including
    # "False" — to True. Parse the string explicitly so "--no_ctx False" works.
    parser.add_argument("--no_ctx",
                        type=lambda s: s.strip().lower() in ("1", "true", "yes", "y"),
                        default=True,
                        help="Whether to use context or not.")
    parser.add_argument("--sent_interval", type=int, default=3,
                        help="The sentence interval.")
    parser.add_argument("--lex", type=int, default=60,
                        help="Lexical diversity knob for the paraphrase attack.")
    parser.add_argument("--order", type=int, default=60,
                        help="Order diversity knob for the paraphrase attack.")
    args = parser.parse_args()
    main(args)