import os
# Ensure the HF_HOME environment variable points to your desired cache location
# os.environ["HF_TOKEN"] = "your_hugging_face_token_here" # Replace with your Hugging Face token
cache_dir = '/network/rit/lab/Lai_ReSecureAI/kiel/wmm'
os.environ['HF_HOME'] = cache_dir
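# NOTE: HF_HOME must be set before importing transformers/huggingface_hub,
# since the cache location is resolved when those libraries are first loaded.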
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
# Clear cache
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize parameters
model_name = "DeepSeek" #"Llama3" #'DeepSeek' #
WM = "SafeSeal"
num_data = 20000
num_epochs = 5
learning_rate_ = 1e-5
N = 1000
# Print parameters
print(f'Device: {device}')
print(f'Model: {model_name}')
print(f'WM: {WM}')
print(f'Number of data: {num_data}')
print(f'Number of epochs: {num_epochs}')
print(f'Learning rate: {learning_rate_}')
print(f'Number of generated data: {N}')
# Base model
if model_name == 'Llama3':
    LLM_name = "meta-llama/Meta-Llama-3-8B"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
elif model_name == 'DeepSeek':
    LLM_name = "deepseek-ai/deepseek-llm-7b-base"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
else:
    # Fail fast rather than hitting a NameError on LLM_name/base_model later
    raise ValueError(f"Unknown model_name: {model_name}")
# Adapter path
def get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data):
    return f"./adversary_models/{model_name}_{WM}_epoch{num_epochs}_lr{learning_rate_}_data{num_data}_"
    # Previous path scheme, kept for reference:
    # return f"/network/rit/lab/Lai_ReSecureAI/phung/adversary_models/{model_name}_epoch{num_epochs}_lr{learning_rate_}_K{K}_Threshold{Threshold}_data{num_data}_testing_batch{batch_no}_"
adapter = get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data)
# Check if the adapter path exists
print(f'Path to adapter: {adapter}')
if os.path.exists(adapter):
    print("Path exists.")
else:
    print("Path does not exist.")
# Merge the base model and adapter
model = PeftModel.from_pretrained(base_model, adapter)
print("Model loaded successfully.")
model = model.merge_and_unload()
print("Model merged successfully.")
model.to(device)
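# Inference only: eval mode disables dropout and other train-time behavior.
model.eval()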
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLM_name, cache_dir=cache_dir, use_fast=False)
# # Add special tokens
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.pad_token = tokenizer.eos_token
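# Fallback (a common workaround, not part of the original script): if the tokenizer
# ships without a pad token, reuse EOS so padded/batched inputs would not error.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token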
# Generation length limits and data/output settings
max_output_tokens = 90
min_output_tokens = 10
data_link = "/network/rit/lab/Lai_ReSecureAI/kiel/New_WM/Summarization/cnn.json"
output_results = []
input_counter = 0
saving_freq = 10
data = "test"
output_name = f"Adversary_{model_name}_{WM}_Summarization_{data}_{num_data}_{num_epochs}_{learning_rate_}_"
def text_summarize(input_text, model, tokenizer, max_output_tokens, min_output_tokens):
    # Simple alternative: prompt = f"{input_text}\nThe summary is:"
    # One-shot prompt: the example summary steers the base model toward the task format.
    prompt = f"""
Input: The CNN/Daily Mail dataset is one of the most widely used datasets for text summarization.
It contains news articles and their corresponding highlights, which act as summaries.
State-of-the-art models often use this dataset to fine-tune their summarization capabilities.
Example Summary: The CNN/Daily Mail dataset is commonly used for training summarization models with news articles and highlights.
Now summarize the following text with maximum 60 words: {input_text}
The summary is:"""
    # Zero-shot alternative, kept for reference:
    # prompt = f"""
    # Now summarize the following text with maximum 60 words: {input_text}
    # The summary is:"""
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    inputs_tokens = inputs['input_ids'].to(model.device)  # works on CPU as well as CUDA
    output = model.generate(
        inputs_tokens,
        max_new_tokens=max_output_tokens,
        min_new_tokens=min_output_tokens,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # avoids the missing-pad-token warning
        repetition_penalty=1.2                # discourages degenerate repetition
    )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded output includes the prompt; keep only what follows the final marker.
    return summary.split("The summary is:")[-1].strip()
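# Quick smoke test (hypothetical input text); uncomment to sanity-check generation:
# print(text_summarize("The quick brown fox jumps over the lazy dog. " * 5,
#                      model, tokenizer, max_output_tokens, min_output_tokens))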
with open(data_link, "r", encoding="utf-8") as f:
    data_subset = json.load(f)

# Filter test data
test_data = [sample for sample in data_subset if sample["type"] == data]
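# Sanity check (added): report how many samples matched the selected split
print(f"Loaded {len(test_data)} '{data}' samples from {data_link}")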
# Testing loop
for i, sample in enumerate(test_data[:N]):
    print(f"Processed {i+1}/{len(test_data[:N])}")
    text = sample["article"]
    summary = text_summarize(text, model, tokenizer, max_output_tokens, min_output_tokens)
    # Store the input and output in a dictionary
    data_dict = {
        "id": sample["id"],
        "article": sample["article"],
        "highlights": sample["highlights"],
        "summary": summary,
        "type": data
    }
    output_results.append(data_dict)
    input_counter += 1
    # Save the results frequently
    if input_counter % saving_freq == 0:
        # Remove the previous checkpoint file if it exists
        if os.path.isfile(output_name + "_" + str(input_counter - saving_freq) + ".json"):
            os.remove(output_name + "_" + str(input_counter - saving_freq) + ".json")
        with open(output_name + "_" + str(input_counter) + ".json", "w", encoding="utf-8") as json_file:
            json.dump(output_results, json_file, indent=4)

# Final save: also captures any results accumulated since the last checkpoint
final_path = output_name + "_" + str(input_counter) + ".json"
with open(final_path, "w", encoding="utf-8") as json_file:
    json.dump(output_results, json_file, indent=4)
print(f"Summarization complete. Results saved to {final_path}.")