|
|
import os

# Root of the Hugging Face cache/config on the lab's network share.
cache_dir = '/network/rit/lab/Lai_ReSecureAI/kiel/wmm'

# Must be set BEFORE importing transformers so downloaded weights and cached
# files land in the shared directory instead of ~/.cache/huggingface.
os.environ['HF_HOME'] = cache_dir

import time

import torch

from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import PeftModel

import json

# Release any GPU memory still held from a previous run in this process.
torch.cuda.empty_cache()

# Prefer GPU; fall back to CPU if CUDA is unavailable.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
# ---- Experiment configuration -------------------------------------------
model_name = "DeepSeek"   # base LLM to use ('Llama3' or 'DeepSeek'; see selection below)
WM = "SafeSeal"           # watermarking scheme tag baked into the adapter path
num_data = 20000          # number of samples the adversary adapter was trained on
num_epochs = 5            # adapter fine-tuning epochs (selects the adapter checkpoint)
learning_rate_ = 1e-5     # adapter fine-tuning learning rate (selects the adapter checkpoint)
N = 1000                  # number of test articles to summarize in this run

# Echo the run configuration so logs are self-describing.
print(f'Device: {device}')

print(f'Model: {model_name}')

print(f'WM: {WM}')

print(f'Number of data: {num_data}')

print(f'Number of epochs: {num_epochs}')

print(f'Learning rate: {learning_rate_}')

print(f'Number of generated data: {N}')
|
|
|
|
|
|
|
|
# ---- Base model selection ------------------------------------------------
# Map the configured model_name to its Hugging Face repo id, then load the
# fp16 base model onto GPU 0. Failing fast on an unknown name avoids the
# confusing NameError that previously occurred later, at the first use of
# LLM_name / base_model.
if model_name == 'Llama3':
    LLM_name = "meta-llama/Meta-Llama-3-8B"
elif model_name == 'DeepSeek':
    LLM_name = "deepseek-ai/deepseek-llm-7b-base"
else:
    raise ValueError(f"Unsupported model_name: {model_name!r} (expected 'Llama3' or 'DeepSeek')")

# Single load call driven by LLM_name instead of duplicating the repo id
# in each branch.
base_model = AutoModelForCausalLM.from_pretrained(
    LLM_name,
    low_cpu_mem_usage=True,      # stream weights instead of materializing a full copy in RAM
    return_dict=True,
    torch_dtype=torch.float16,   # half precision to fit a 7B/8B model on one GPU
    device_map={"": 0},          # place the entire model on GPU 0
)
|
|
|
|
|
|
|
|
def get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data):
    """Return the directory path of the fine-tuned adversary adapter.

    The directory name encodes the full training configuration (model,
    watermark scheme, epochs, learning rate, data size) so checkpoints
    from different runs never collide.
    """
    config_tag = "{}_{}_epoch{}_lr{}_data{}_".format(
        model_name, WM, num_epochs, learning_rate_, num_data
    )
    return "./adversary_models/" + config_tag
|
|
|
|
|
|
|
|
# Resolve the adapter checkpoint directory for the current configuration.
adapter = get_adapter_path(model_name,WM, num_epochs, learning_rate_, num_data)

print(f'Path to adapter: {adapter}')

# Purely diagnostic: PeftModel.from_pretrained below is attempted either way
# (and will raise itself if the directory is actually missing).
if os.path.exists(adapter):

    print("Path exists.")

else:

    print("Path does not exist.")

# Attach the LoRA adapter to the base model, then fold the adapter weights
# into the base weights so inference runs as a plain causal LM.
model = PeftModel.from_pretrained(base_model, adapter)

print("Model loaded successfully.")

model = model.merge_and_unload()

print("Model merged successfully.")

model.to(device)
|
|
|
|
|
|
|
|
# Slow (non-Rust) tokenizer matching the selected base model.
tokenizer = AutoTokenizer.from_pretrained(LLM_name, cache_dir=cache_dir,use_fast=False)

# ---- Generation / run parameters ----------------------------------------
max_output_tokens=90   # upper bound on newly generated summary tokens
min_output_tokens=10   # force at least a short, non-empty summary
# JSON dataset of CNN/DailyMail-style records (see the field access below).
data_link = "/network/rit/lab/Lai_ReSecureAI/kiel/New_WM/Summarization/cnn.json"
output_results = []    # accumulated result records, checkpointed periodically
input_counter = 0      # number of samples processed so far
saving_freq = 10       # write a results checkpoint every this many samples
data = "test"          # dataset split to summarize (matched against sample["type"])
# Output file stem; the running sample count and ".json" are appended later.
output_name = f"Adversary_{model_name}_{WM}_Summarization_{data}_{num_data}_{num_epochs}_{learning_rate_}_"
|
|
|
|
|
|
|
|
|
|
|
def text_summarize(input_text, model, tokenizer, max_output_tokens , min_output_tokens):
    """Generate a short summary of *input_text* with the merged causal LM.

    Builds a one-shot prompt (instruction + example summary), samples a
    continuation, and returns only the text after the final
    "The summary is:" marker.

    Args:
        input_text: article text to summarize.
        model: a causal LM exposing ``.generate`` and ``.device``.
        tokenizer: matching tokenizer (callable, ``decode``, ``eos_token_id``).
        max_output_tokens: cap on newly generated tokens.
        min_output_tokens: minimum number of newly generated tokens.

    Returns:
        The generated summary as a stripped string.
    """
    prompt = f"""
Input: The CNN/Daily Mail dataset is one of the most widely used datasets for text summarization.
It contains news articles and their corresponding highlights, which act as summaries.
State-of-the-art models often use this dataset to fine-tune their summarization capabilities.

Example Summary: The CNN/Daily Mail dataset is commonly used for training summarization models with news articles and highlights.

Now summarize the following text with maximum 60 words: {input_text}
The summary is:"""

    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    # BUGFIX: move tensors to wherever the model actually lives instead of
    # hard-coding .cuda(), which crashed on CPU-only hosts even though the
    # script computes a CPU fallback device.
    inputs_tokens = inputs['input_ids'].to(model.device)
    # Forward the attention mask: eos is reused as pad below, so an explicit
    # mask removes the transformers warning and any pad/eos ambiguity.
    attention_mask = inputs.get('attention_mask')
    if attention_mask is not None:
        attention_mask = attention_mask.to(model.device)

    output = model.generate(
        inputs_tokens,
        attention_mask=attention_mask,
        max_new_tokens=max_output_tokens,
        min_new_tokens=min_output_tokens,
        do_sample=True,            # stochastic sampling (temperature/top-k below)
        temperature=0.9,
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.2
    )

    # Decode prompt + continuation, then keep only the text after the last
    # occurrence of the prompt's marker.
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary.split("The summary is:")[-1].strip()
|
|
|
|
|
# Load the dataset and keep only the samples from the configured split.
with open(data_link, "r", encoding="utf-8") as f:

    data_subset = json.load(f)

# Each sample is expected to carry 'id', 'article', 'highlights' and 'type'
# fields (see the record construction in the loop below); 'type' marks the split.
test_data = [sample for sample in data_subset if sample["type"] == data]
|
|
|
|
|
|
|
|
# ---- Main summarization loop --------------------------------------------
# Summarize up to N test articles. Every `saving_freq` samples the full
# cumulative result list is checkpointed to disk and the previous checkpoint
# file is deleted, so at most one results file exists at a time.
for i, sample in enumerate(test_data[:N]):
    print(f"Processed {i+1}/{len(test_data[:N])}")
    text = sample["article"]
    summary = text_summarize(text, model, tokenizer, max_output_tokens , min_output_tokens)

    # Keep the original fields alongside the generated summary so the output
    # file is self-contained for downstream evaluation.
    data_dict = {
        "id": sample["id"],
        "article": sample["article"],
        "highlights": sample["highlights"],
        "summary": summary,
        "type": data
    }

    output_results.append(data_dict)
    input_counter += 1

    if input_counter % saving_freq == 0:
        # Replace the previous checkpoint with the current cumulative one.
        if os.path.isfile(output_name + "_" + str(input_counter-saving_freq) + ".json"):
            os.remove(output_name + "_" + str(input_counter-saving_freq) + ".json")
        with open(output_name + "_" + str(input_counter) + ".json", "w", encoding="utf-8") as json_file:
            json.dump(output_results, json_file, indent=4)

# BUGFIX: flush the trailing partial batch. Previously, the last
# (input_counter % saving_freq) results were silently lost whenever the
# processed count was not an exact multiple of saving_freq.
if input_counter % saving_freq != 0:
    last_checkpoint = input_counter - (input_counter % saving_freq)
    if last_checkpoint > 0 and os.path.isfile(output_name + "_" + str(last_checkpoint) + ".json"):
        os.remove(output_name + "_" + str(last_checkpoint) + ".json")
    with open(output_name + "_" + str(input_counter) + ".json", "w", encoding="utf-8") as json_file:
        json.dump(output_results, json_file, indent=4)

# NOTE(review): the actual file name also carries "_<count>.json" appended to
# this stem (see the checkpoint writes above).
print(f"Summarization complete. Results saved to {output_name}.")
|
|
|