import os

# Ensure the HF_HOME environment variable points to your desired cache location.
# It must be set before transformers is imported, or the default cache is used.
# os.environ["HF_TOKEN"] = "your_hugging_face_token_here"  # Replace with your Hugging Face token
cache_dir = '/network/rit/lab/Lai_ReSecureAI/kiel/wmm'
os.environ['HF_HOME'] = cache_dir

import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Free any cached GPU memory before loading the model
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize parameters
model_name = "DeepSeek"  # or "Llama3"
WM = "SafeSeal"
num_data = 20000
num_epochs = 5
learning_rate_ = 1e-5
N = 1000  # number of test samples to summarize

# Print parameters
print(f'Device: {device}')
print(f'Model: {model_name}')
print(f'WM: {WM}')
print(f'Number of data: {num_data}')
print(f'Number of epochs: {num_epochs}')
print(f'Learning rate: {learning_rate_}')
print(f'Number of generated data: {N}')

# Base model
if model_name == 'Llama3':
    LLM_name = "meta-llama/Meta-Llama-3-8B"
elif model_name == 'DeepSeek':
    LLM_name = "deepseek-ai/deepseek-llm-7b-base"
else:
    raise ValueError(f"Unsupported model name: {model_name}")

base_model = AutoModelForCausalLM.from_pretrained(
    LLM_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)


# Adapter path
def get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data):
    return f"./adversary_models/{model_name}_{WM}_epoch{num_epochs}_lr{learning_rate_}_data{num_data}_"
    # return f"/network/rit/lab/Lai_ReSecureAI/phung/adversary_models/{model_name}_epoch{num_epochs}_lr{learning_rate_}_K{K}_Threshold{Threshold}_data{num_data}_testing_batch{batch_no}_"


adapter = get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data)

# Check that the adapter path exists
print(f'Path to adapter: {adapter}')
if os.path.exists(adapter):
    print("Path exists.")
else:
    print("Path does not exist.")

# Merge the base model and adapter
model = PeftModel.from_pretrained(base_model, adapter)
print("Model loaded successfully.")
model = model.merge_and_unload()
print("Model merged successfully.")
model.to(device)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLM_name, cache_dir=cache_dir, use_fast=False)
# # Add special tokens
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.pad_token = tokenizer.eos_token

# Generation and I/O settings
max_output_tokens = 90
min_output_tokens = 10
data_link = "/network/rit/lab/Lai_ReSecureAI/kiel/New_WM/Summarization/cnn.json"
output_results = []
input_counter = 0
saving_freq = 10  # write a checkpoint file every `saving_freq` inputs
data = "test"     # which split of the dataset to summarize
output_name = f"Adversary_{model_name}_{WM}_Summarization_{data}_{num_data}_{num_epochs}_{learning_rate_}_"


def text_summarize(input_text, model, tokenizer, max_output_tokens, min_output_tokens):
    # One-shot prompt: a worked example followed by the article to summarize
    # prompt = f"{input_text}\nThe summary is:"
    prompt = f"""
Input: The CNN/Daily Mail dataset is one of the most widely used datasets for text summarization. It contains news articles and their corresponding highlights, which act as summaries. State-of-the-art models often use this dataset to fine-tune their summarization capabilities.
Example Summary: The CNN/Daily Mail dataset is commonly used for training summarization models with news articles and highlights.
Now summarize the following text with maximum 60 words: {input_text}
The summary is:"""
    # prompt = f"""
    # Now summarize the following text with maximum 60 words: {input_text}
    # The summary is:"""
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    inputs_tokens = inputs['input_ids'].to(device)
    output = model.generate(
        inputs_tokens,
        max_new_tokens=max_output_tokens,
        min_new_tokens=min_output_tokens,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # tokenizer has no pad token; reuse EOS to avoid the warning
        repetition_penalty=1.2,               # discourage repeated phrases in the sampled output
    )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    # Everything after the final prompt marker is the generated summary
    return summary.split("The summary is:")[-1].strip()


with open(data_link, "r", encoding="utf-8") as f:
    data_subset = json.load(f)

# Filter for the requested split
test_data = [sample for sample in data_subset if sample["type"] == data]
test_subset = test_data[:N]

# Testing loop
for i, sample in enumerate(test_subset):
    print(f"Processing {i + 1}/{len(test_subset)}")
    text = sample["article"]
    summary = text_summarize(text, model, tokenizer, max_output_tokens, min_output_tokens)

    # Store the input and output in a dictionary
    data_dict = {
        "id": sample["id"],
        "article": sample["article"],
        "highlights": sample["highlights"],
        "summary": summary,
        "type": data,
    }
    output_results.append(data_dict)
    input_counter += 1

    # Save the results frequently, keeping only the latest checkpoint
    if input_counter % saving_freq == 0:
        # Check if the previous checkpoint file exists and remove it
        previous_file = output_name + "_" + str(input_counter - saving_freq) + ".json"
        if os.path.isfile(previous_file):
            os.remove(previous_file)
        with open(output_name + "_" + str(input_counter) + ".json", "w", encoding="utf-8") as json_file:
            json.dump(output_results, json_file, indent=4)

# Write the full result set once more at the end: the periodic checkpoints
# above only fire every `saving_freq` inputs, so this covers a final partial batch.
final_file = output_name + "_" + str(input_counter) + ".json"
with open(final_file, "w", encoding="utf-8") as json_file:
    json.dump(output_results, json_file, indent=4)
print(f"Summarization complete. Results saved to {final_file}.")
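
# Optional sanity check: a minimal sketch (it assumes the run above completed
# and `final_file` was written) that reloads the saved JSON and reports how
# many article/summary records it holds, so a truncated run is easy to spot.
if os.path.isfile(final_file):
    with open(final_file, "r", encoding="utf-8") as f:
        saved_records = json.load(f)
    print(f"Sanity check: {len(saved_records)} records in {final_file}.")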