File size: 5,584 Bytes
40b3335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import os

# Redirect the Hugging Face cache (HF_HOME) to the lab's shared storage.
# NOTE(review): this must run before any huggingface libraries are imported
# below, or the default cache location would already be picked up — confirm.
# os.environ["HF_TOKEN"] = "your_hugging_face_token_here"  # Replace with your Hugging Face token
cache_dir = '/network/rit/lab/Lai_ReSecureAI/kiel/wmm'
os.environ.update({'HF_HOME': cache_dir})

import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json

# Release any cached GPU memory from a previous run before loading models.
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Experiment configuration. The fine-tuning hyper-parameters below are used
# to locate the adapter checkpoint and to name the output files.
model_name = "DeepSeek"  # base LLM; the other supported value is "Llama3"
WM = "SafeSeal"          # watermark scheme tag (part of adapter/output paths)
num_data = 20000         # training samples used for the adversary fine-tune
num_epochs = 5           # fine-tuning epochs
learning_rate_ = 1e-5    # fine-tuning learning rate
N = 1000                 # number of test articles to summarize

# Echo the configuration so each run is identifiable in its logs.
print(f'Device: {device}')
print(f'Model: {model_name}')
print(f'WM: {WM}')
print(f'Number of data: {num_data}')
print(f'Number of epochs: {num_epochs}')
print(f'Learning rate: {learning_rate_}')
print(f'Number of generated data: {N}')

# Resolve the Hugging Face repo id for the selected base model; LLM_name is
# reused later to load the matching tokenizer.
if model_name == 'Llama3':
    LLM_name = "meta-llama/Meta-Llama-3-8B"
elif model_name == 'DeepSeek':
    LLM_name = "deepseek-ai/deepseek-llm-7b-base"
else:
    # Fail fast with a clear message instead of the confusing NameError the
    # original if/elif (with no else) produced further down.
    raise ValueError(f"Unsupported model_name: {model_name!r}; expected 'Llama3' or 'DeepSeek'")

# Load the base model in fp16, placed entirely on GPU 0.
base_model = AutoModelForCausalLM.from_pretrained(
    LLM_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)

# Adapter path
def get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data):
    """Return the directory path of the fine-tuned adversary adapter.

    The path encodes the fine-tuning hyper-parameters so checkpoints from
    different runs do not collide on disk.
    """
    return f"./adversary_models/{model_name}_{WM}_epoch{num_epochs}_lr{learning_rate_}_data{num_data}_"

adapter = get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data)

# Report whether the adapter checkpoint directory is present before loading.
# (Loading proceeds either way; PeftModel will raise if it is truly missing.)
print(f'Path to adapter: {adapter}')
print("Path exists." if os.path.exists(adapter) else "Path does not exist.")

# Attach the adapter to the base model, then fold its weights into the base
# so generation runs on a single plain model.
model = PeftModel.from_pretrained(base_model, adapter)
print("Model loaded successfully.")
model = model.merge_and_unload()
print("Model merged successfully.")
model.to(device)

# Tokenizer matching the selected base model (slow implementation, cached in
# the same directory as the model weights).
tokenizer = AutoTokenizer.from_pretrained(LLM_name, cache_dir=cache_dir, use_fast=False)

# Generation and evaluation settings.
max_output_tokens = 90   # upper bound on newly generated summary tokens
min_output_tokens = 10   # lower bound on newly generated summary tokens
data_link = "/network/rit/lab/Lai_ReSecureAI/kiel/New_WM/Summarization/cnn.json"
output_results = []      # accumulated per-sample result records
input_counter = 0        # number of samples processed so far
saving_freq = 10         # write results to disk every this many samples
data = "test"            # dataset split to evaluate
output_name = f"Adversary_{model_name}_{WM}_Summarization_{data}_{num_data}_{num_epochs}_{learning_rate_}_"

# torch.clear_cache()

def text_summarize(input_text, model, tokenizer, max_output_tokens, min_output_tokens):
    """Summarize *input_text* with a one-shot prompt and return the summary text.

    Args:
        input_text: Article text to summarize.
        model: Causal LM used for generation (the merged adversary model).
        tokenizer: Tokenizer matching *model*.
        max_output_tokens: Upper bound on newly generated tokens.
        min_output_tokens: Lower bound on newly generated tokens.

    Returns:
        The generated text after the final "The summary is:" marker, stripped.
    """
    # One-shot prompt: a worked dataset/summary example, then the article.
    prompt = f"""
    Input: The CNN/Daily Mail dataset is one of the most widely used datasets for text summarization. 
    It contains news articles and their corresponding highlights, which act as summaries.
    State-of-the-art models often use this dataset to fine-tune their summarization capabilities.

    Example Summary: The CNN/Daily Mail dataset is commonly used for training summarization models with news articles and highlights.

    Now summarize the following text with maximum 60 words: {input_text}
    The summary is:"""
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    # Bug fix: move inputs to the model's own device instead of hard-coding
    # .cuda(), which crashed on CPU-only runs despite the script's device
    # fallback at the top of the file.
    inputs_tokens = inputs['input_ids'].to(model.device)

    output = model.generate(
        inputs_tokens,
        max_new_tokens=max_output_tokens,
        min_new_tokens=min_output_tokens,
        do_sample=True,  # stochastic decoding
        temperature=0.9,
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # Prevents truncation issues
        repetition_penalty=1.2,  # Ensures complete sentence termination
    )

    # Decode the whole sequence and keep only the text generated after the
    # last occurrence of the prompt's answer marker.
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary.split("The summary is:")[-1].strip()

# Load the dataset and keep only samples from the requested split.
with open(data_link, "r", encoding="utf-8") as f:
    data_subset = json.load(f)

test_data = [sample for sample in data_subset if sample["type"] == data]

# Summarize up to N articles, checkpointing results every `saving_freq`
# samples so a crash loses at most one batch of work.
subset = test_data[:N]
for i, sample in enumerate(subset):
    print(f"Processed {i+1}/{len(subset)}")
    summary = text_summarize(sample["article"], model, tokenizer, max_output_tokens, min_output_tokens)

    output_results.append({
        "id": sample["id"],
        "article": sample["article"],
        "highlights": sample["highlights"],
        "summary": summary,
        "type": data,
    })
    input_counter += 1

    # Save the results frequently; remove the previous checkpoint so only the
    # newest file remains on disk.
    if input_counter % saving_freq == 0:
        stale = output_name + "_" + str(input_counter - saving_freq) + ".json"
        if os.path.isfile(stale):
            os.remove(stale)
        with open(output_name + "_" + str(input_counter) + ".json", "w", encoding="utf-8") as json_file:
            json.dump(output_results, json_file, indent=4)

# Bug fix: flush the trailing partial batch. The original only saved inside
# the loop, so when the processed count was not a multiple of saving_freq the
# last few results were silently lost.
if input_counter % saving_freq != 0:
    stale = output_name + "_" + str(input_counter - (input_counter % saving_freq)) + ".json"
    if os.path.isfile(stale):
        os.remove(stale)
    with open(output_name + "_" + str(input_counter) + ".json", "w", encoding="utf-8") as json_file:
        json.dump(output_results, json_file, indent=4)

print(f"Summarization complete. Results saved to {output_name}_{input_counter}.json")