import os
# Set HF_HOME before importing transformers so the Hugging Face cache lands in the desired location
# os.environ["HF_TOKEN"] = "your_hugging_face_token_here"  # Uncomment and set your Hugging Face token if the model is gated
cache_dir = '/network/rit/lab/Lai_ReSecureAI/kiel/wmm'
os.environ['HF_HOME'] = cache_dir
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
# Clear cache
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize parameters
model_name = "DeepSeek" #"Llama3" #'DeepSeek' #
WM = "SafeSeal"
num_data = 20000
num_epochs = 5
learning_rate_ = 1e-5
N = 1000
# Print parameters
print(f'Device: {device}')
print(f'Model: {model_name}')
print(f'WM: {WM}')
print(f'Number of data: {num_data}')
print(f'Number of epochs: {num_epochs}')
print(f'Learning rate: {learning_rate_}')
print(f'Number of generated data: {N}')
# Base model
if model_name == 'Llama3':
    LLM_name = "meta-llama/Meta-Llama-3-8B"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
elif model_name == 'DeepSeek':
    LLM_name = "deepseek-ai/deepseek-llm-7b-base"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
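else:
    # Added guard (assumption: only these two backbones are supported): fail fast
    # instead of hitting a NameError on LLM_name/base_model further down.
    raise ValueError(f"Unsupported model_name: {model_name!r}; expected 'Llama3' or 'DeepSeek'.")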
# Adapter path
def get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data):
    return f"./adversary_models/{model_name}_{WM}_epoch{num_epochs}_lr{learning_rate_}_data{num_data}_"
    # Earlier variant kept for reference:
    # return f"/network/rit/lab/Lai_ReSecureAI/phung/adversary_models/{model_name}_epoch{num_epochs}_lr{learning_rate_}_K{K}_Threshold{Threshold}_data{num_data}_testing_batch{batch_no}_"
adapter = get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data)
# Check if the adapter path exists
print(f'Path to adapter: {adapter}')
if os.path.exists(adapter):
    print("Path exists.")
else:
    print("Path does not exist.")
# Load the adapter and merge it into the base model
model = PeftModel.from_pretrained(base_model, adapter)
print("Model loaded successfully.")
model = model.merge_and_unload()
print("Model merged successfully.")
model.to(device)
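# Hedged addition: put the merged model in eval mode; dropout is off by default
# for most causal LMs, but this makes the inference-only intent explicit.
model.eval()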
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLM_name, cache_dir=cache_dir, use_fast=False)
# # Add special tokens
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.pad_token = tokenizer.eos_token
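# Hedged addition: make sure a pad token is defined. Llama-style tokenizers often
# ship without one; reusing EOS as PAD is a common convention for generation-only use.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token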
# Generation length limits and input data location
max_output_tokens = 90
min_output_tokens = 10
data_link = "/network/rit/lab/Lai_ReSecureAI/kiel/New_WM/Summarization/cnn.json"
output_results = []
input_counter = 0
saving_freq = 10
data = "test"
output_name = f"Adversary_{model_name}_{WM}_Summarization_{data}_{num_data}_{num_epochs}_{learning_rate_}_"
def text_summarize(input_text, model, tokenizer, max_output_tokens, min_output_tokens):
    # One-shot prompt: a short worked example steers the base model toward summarization.
    prompt = f"""
Input: The CNN/Daily Mail dataset is one of the most widely used datasets for text summarization.
It contains news articles and their corresponding highlights, which act as summaries.
State-of-the-art models often use this dataset to fine-tune their summarization capabilities.
Example Summary: The CNN/Daily Mail dataset is commonly used for training summarization models with news articles and highlights.
Now summarize the following text with maximum 60 words: {input_text}
The summary is:"""
    # Earlier prompt variants kept for reference:
    # prompt = f"{input_text}\nThe summary is:"
    # prompt = f"""
    # Now summarize the following text with maximum 60 words: {input_text}
    # The summary is:"""
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    inputs_tokens = inputs['input_ids'].to(device)  # .to(device) instead of .cuda() so the script also runs on CPU
    with torch.no_grad():  # no gradients needed for generation
        output = model.generate(
            inputs_tokens,
            max_new_tokens=max_output_tokens,
            min_new_tokens=min_output_tokens,
            do_sample=True,
            temperature=0.9,
            top_k=50,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
            repetition_penalty=1.2,  # discourages degenerate repetition
        )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    # Everything after the last "The summary is:" marker is the generated summary.
    return summary.split("The summary is:")[-1].strip()
with open(data_link, "r", encoding="utf-8") as f:
    data_subset = json.load(f)
# Filter test data
test_data = [sample for sample in data_subset if sample["type"] == data]
# Testing loop
for i, sample in enumerate(test_data[:N]):
    print(f"Processing {i+1}/{len(test_data[:N])}")
    text = sample["article"]
    summary = text_summarize(text, model, tokenizer, max_output_tokens, min_output_tokens)
    # Store the input and output in a dictionary
    data_dict = {
        "id": sample["id"],
        "article": sample["article"],
        "highlights": sample["highlights"],
        "summary": summary,
        "type": data,
    }
    output_results.append(data_dict)
    input_counter += 1
    # Save the results frequently, removing the previous checkpoint file
    if input_counter % saving_freq == 0:
        prev_file = f"{output_name}_{input_counter - saving_freq}.json"
        if os.path.isfile(prev_file):
            os.remove(prev_file)
        with open(f"{output_name}_{input_counter}.json", "w", encoding="utf-8") as json_file:
            json.dump(output_results, json_file, indent=4)
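# Added safeguard (not in the original flow): write a final checkpoint so the tail
# of the run is kept even when N is not a multiple of saving_freq.
if input_counter % saving_freq != 0:
    with open(f"{output_name}_{input_counter}.json", "w", encoding="utf-8") as json_file:
        json.dump(output_results, json_file, indent=4)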
print(f"Summarization complete. Results saved to {output_name}.")