import os
# Set HF_HOME before importing transformers so the Hugging Face cache lands in the desired location
# os.environ["HF_TOKEN"] = "your_hugging_face_token_here"  # Uncomment and set your Hugging Face token if the model is gated
cache_dir = '/network/rit/lab/Lai_ReSecureAI/kiel/wmm'
os.environ['HF_HOME'] = cache_dir
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
# Clear cache
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize parameters
model_name = "DeepSeek" #"Llama3" #'DeepSeek' #
WM = "SafeSeal"
num_data = 20000
num_epochs = 5
learning_rate_ = 1e-5
N = 1000
# Print parameters
print(f'Device: {device}')
print(f'Model: {model_name}')
print(f'WM: {WM}')
print(f'Number of data: {num_data}')
print(f'Number of epochs: {num_epochs}')
print(f'Learning rate: {learning_rate_}')
print(f'Number of generated data: {N}')
# Base model
if model_name == 'Llama3':
    LLM_name = "meta-llama/Meta-Llama-3-8B"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
elif model_name == 'DeepSeek':
    LLM_name = "deepseek-ai/deepseek-llm-7b-base"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
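else:
    # Added guard (assumption: only these two backbones are supported): fail fast
    # instead of hitting a NameError on LLM_name/base_model further down.
    raise ValueError(f"Unsupported model_name: {model_name!r}; expected 'Llama3' or 'DeepSeek'.")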
# Adapter path
def get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data):
    return f"./adversary_models/{model_name}_{WM}_epoch{num_epochs}_lr{learning_rate_}_data{num_data}_"
    # Earlier variant kept for reference:
    # return f"/network/rit/lab/Lai_ReSecureAI/phung/adversary_models/{model_name}_epoch{num_epochs}_lr{learning_rate_}_K{K}_Threshold{Threshold}_data{num_data}_testing_batch{batch_no}_"
adapter = get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data)
# Check if the adapter path exists
print(f'Path to adapter: {adapter}')
if os.path.exists(adapter):
    print("Path exists.")
else:
    print("Path does not exist.")
# Load the adapter and merge it into the base model
model = PeftModel.from_pretrained(base_model, adapter)
print("Model loaded successfully.")
model = model.merge_and_unload()
print("Model merged successfully.")
model.to(device)
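# Hedged addition: put the merged model in eval mode; dropout is off by default
# for most causal LMs, but this makes the inference-only intent explicit.
model.eval()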
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLM_name, cache_dir=cache_dir, use_fast=False)
# # Add special tokens
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.pad_token = tokenizer.eos_token
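# Hedged addition: make sure a pad token is defined. Llama-style tokenizers often
# ship without one; reusing EOS as PAD is a common convention for generation-only use.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token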
# Generation length limits and input data location
max_output_tokens = 90
min_output_tokens = 10
data_link = "/network/rit/lab/Lai_ReSecureAI/kiel/New_WM/Summarization/cnn.json"
output_results = []
input_counter = 0
saving_freq = 10
data = "test"
output_name = f"Adversary_{model_name}_{WM}_Summarization_{data}_{num_data}_{num_epochs}_{learning_rate_}_"
def text_summarize(input_text, model, tokenizer, max_output_tokens, min_output_tokens):
    # One-shot prompt: a short worked example steers the base model toward summarization.
    prompt = f"""
Input: The CNN/Daily Mail dataset is one of the most widely used datasets for text summarization.
It contains news articles and their corresponding highlights, which act as summaries.
State-of-the-art models often use this dataset to fine-tune their summarization capabilities.
Example Summary: The CNN/Daily Mail dataset is commonly used for training summarization models with news articles and highlights.
Now summarize the following text with maximum 60 words: {input_text}
The summary is:"""
    # Earlier prompt variants kept for reference:
    # prompt = f"{input_text}\nThe summary is:"
    # prompt = f"""
    # Now summarize the following text with maximum 60 words: {input_text}
    # The summary is:"""
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    inputs_tokens = inputs['input_ids'].to(device)  # .to(device) instead of .cuda() so the script also runs on CPU
    with torch.no_grad():  # no gradients needed for generation
        output = model.generate(
            inputs_tokens,
            max_new_tokens=max_output_tokens,
            min_new_tokens=min_output_tokens,
            do_sample=True,
            temperature=0.9,
            top_k=50,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
            repetition_penalty=1.2,  # discourages degenerate repetition
        )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    # Everything after the last "The summary is:" marker is the generated summary.
    return summary.split("The summary is:")[-1].strip()
with open(data_link, "r", encoding="utf-8") as f:
    data_subset = json.load(f)
# Filter test data
test_data = [sample for sample in data_subset if sample["type"] == data]
# Testing loop
for i, sample in enumerate(test_data[:N]):
    print(f"Processing {i+1}/{len(test_data[:N])}")
    text = sample["article"]
    summary = text_summarize(text, model, tokenizer, max_output_tokens, min_output_tokens)
    # Store the input and output in a dictionary
    data_dict = {
        "id": sample["id"],
        "article": sample["article"],
        "highlights": sample["highlights"],
        "summary": summary,
        "type": data,
    }
    output_results.append(data_dict)
    input_counter += 1
    # Save the results frequently, removing the previous checkpoint file
    if input_counter % saving_freq == 0:
        prev_file = f"{output_name}_{input_counter - saving_freq}.json"
        if os.path.isfile(prev_file):
            os.remove(prev_file)
        with open(f"{output_name}_{input_counter}.json", "w", encoding="utf-8") as json_file:
            json.dump(output_results, json_file, indent=4)
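# Added safeguard (not in the original flow): write a final checkpoint so the tail
# of the run is kept even when N is not a multiple of saving_freq.
if input_counter % saving_freq != 0:
    with open(f"{output_name}_{input_counter}.json", "w", encoding="utf-8") as json_file:
        json.dump(output_results, json_file, indent=4)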
print(f"Summarization complete. Results saved to {output_name}.")