import os
# Ensure the HF_HOME environment variable points to your desired cache location
# os.environ["HF_TOKEN"] = "your_hugging_face_token_here" # Replace with your Hugging Face token
cache_dir = '/network/rit/lab/Lai_ReSecureAI/kiel/wmm'
os.environ['HF_HOME'] = cache_dir
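# NOTE: HF_HOME must be set before importing transformers/huggingface_hub,
# since the cache location is resolved when those libraries are first loaded.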
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import json
# Clear cache
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"
# Initialize parameters
model_name = "DeepSeek" #"Llama3" #'DeepSeek' #
WM = "SafeSeal"
num_data = 20000
num_epochs = 5
learning_rate_ = 1e-5
N = 1000
# Print parameters
print(f'Device: {device}')
print(f'Model: {model_name}')
print(f'WM: {WM}')
print(f'Number of data: {num_data}')
print(f'Number of epochs: {num_epochs}')
print(f'Learning rate: {learning_rate_}')
print(f'Number of generated data: {N}')
# Base model
if model_name == 'Llama3':
    LLM_name = "meta-llama/Meta-Llama-3-8B"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
elif model_name == 'DeepSeek':
    LLM_name = "deepseek-ai/deepseek-llm-7b-base"
    base_model = AutoModelForCausalLM.from_pretrained(LLM_name,
                                                      low_cpu_mem_usage=True,
                                                      return_dict=True,
                                                      torch_dtype=torch.float16,
                                                      device_map={"": 0})
else:
    # Fail fast rather than hitting a NameError on LLM_name/base_model later
    raise ValueError(f"Unknown model_name: {model_name}")
# Adapter path
def get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data):
    return f"./adversary_models/{model_name}_{WM}_epoch{num_epochs}_lr{learning_rate_}_data{num_data}_"
    # Previous path scheme, kept for reference:
    # return f"/network/rit/lab/Lai_ReSecureAI/phung/adversary_models/{model_name}_epoch{num_epochs}_lr{learning_rate_}_K{K}_Threshold{Threshold}_data{num_data}_testing_batch{batch_no}_"
adapter = get_adapter_path(model_name, WM, num_epochs, learning_rate_, num_data)
# Check if the adapter path exists
print(f'Path to adapter: {adapter}')
if os.path.exists(adapter):
    print("Path exists.")
else:
    print("Path does not exist.")
# Merge the base model and adapter
model = PeftModel.from_pretrained(base_model, adapter)
print("Model loaded successfully.")
model = model.merge_and_unload()
print("Model merged successfully.")
model.to(device)
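# Inference only: eval mode disables dropout and other train-time behavior.
model.eval()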
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(LLM_name, cache_dir=cache_dir, use_fast=False)
# # Add special tokens
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# tokenizer.pad_token = tokenizer.eos_token
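# Fallback (a common workaround, not part of the original script): if the tokenizer
# ships without a pad token, reuse EOS so padded/batched inputs would not error.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token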
# Generation length limits and data/output settings
max_output_tokens = 90
min_output_tokens = 10
data_link = "/network/rit/lab/Lai_ReSecureAI/kiel/New_WM/Summarization/cnn.json"
output_results = []
input_counter = 0
saving_freq = 10
data = "test"
output_name = f"Adversary_{model_name}_{WM}_Summarization_{data}_{num_data}_{num_epochs}_{learning_rate_}_"
def text_summarize(input_text, model, tokenizer, max_output_tokens, min_output_tokens):
    # Simple alternative: prompt = f"{input_text}\nThe summary is:"
    # One-shot prompt: the example summary steers the base model toward the task format.
    prompt = f"""
Input: The CNN/Daily Mail dataset is one of the most widely used datasets for text summarization.
It contains news articles and their corresponding highlights, which act as summaries.
State-of-the-art models often use this dataset to fine-tune their summarization capabilities.
Example Summary: The CNN/Daily Mail dataset is commonly used for training summarization models with news articles and highlights.
Now summarize the following text with maximum 60 words: {input_text}
The summary is:"""
    # Zero-shot alternative, kept for reference:
    # prompt = f"""
    # Now summarize the following text with maximum 60 words: {input_text}
    # The summary is:"""
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    inputs_tokens = inputs['input_ids'].to(model.device)  # works on CPU as well as CUDA
    output = model.generate(
        inputs_tokens,
        max_new_tokens=max_output_tokens,
        min_new_tokens=min_output_tokens,
        do_sample=True,
        temperature=0.9,
        top_k=50,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,  # avoids the missing-pad-token warning
        repetition_penalty=1.2                # discourages degenerate repetition
    )
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    # The decoded output includes the prompt; keep only what follows the final marker.
    return summary.split("The summary is:")[-1].strip()
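# Quick smoke test (hypothetical input text); uncomment to sanity-check generation:
# print(text_summarize("The quick brown fox jumps over the lazy dog. " * 5,
#                      model, tokenizer, max_output_tokens, min_output_tokens))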
with open(data_link, "r", encoding="utf-8") as f:
    data_subset = json.load(f)

# Filter test data
test_data = [sample for sample in data_subset if sample["type"] == data]
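# Sanity check (added): report how many samples matched the selected split
print(f"Loaded {len(test_data)} '{data}' samples from {data_link}")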
# Testing loop
for i, sample in enumerate(test_data[:N]):
    print(f"Processed {i+1}/{len(test_data[:N])}")
    text = sample["article"]
    summary = text_summarize(text, model, tokenizer, max_output_tokens, min_output_tokens)
    # Store the input and output in a dictionary
    data_dict = {
        "id": sample["id"],
        "article": sample["article"],
        "highlights": sample["highlights"],
        "summary": summary,
        "type": data
    }
    output_results.append(data_dict)
    input_counter += 1
    # Save the results frequently
    if input_counter % saving_freq == 0:
        # Remove the previous checkpoint file if it exists
        if os.path.isfile(output_name + "_" + str(input_counter - saving_freq) + ".json"):
            os.remove(output_name + "_" + str(input_counter - saving_freq) + ".json")
        with open(output_name + "_" + str(input_counter) + ".json", "w", encoding="utf-8") as json_file:
            json.dump(output_results, json_file, indent=4)

# Final save: also captures any results accumulated since the last checkpoint
final_path = output_name + "_" + str(input_counter) + ".json"
with open(final_path, "w", encoding="utf-8") as json_file:
    json.dump(output_results, json_file, indent=4)
print(f"Summarization complete. Results saved to {final_path}.")