# Copyright (c) Guangsheng Bao.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
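# Example invocation (a sketch only; every flag below is defined in the argparse
# section at the bottom of this file, and the values shown are this repo's defaults):
#   python scripts/data_builder.py --dataset xsum --base_model_name opt-2.7b \
#       --n_samples 200 --batch_size 50 --do_temperature --temperature 0.8 \
#       --output_file ./exp_main/data/xsum_opt-2.7b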
import time
import numpy as np
import datasets
import torch
import random
import argparse
import os
import json
import custom_datasets
from model import load_tokenizer, load_model
ROLES = {'xsum': 'You are a News writer.',
'writing': 'You are a Fiction writer.',
'pubmed': 'You are a Technical writer.',
'yelp_polarity': 'You are a Review writer on Yelp.',
         'essay': 'You are a student at high school or university level, and you are an Essay writer.'}
PROMPTS = {'xsum': 'Please write an article with about 150 words starting exactly with:',
'writing': 'Please write an article with about 150 words starting exactly with:',
'pubmed': 'Please answer the question in about 50 words.',
'yelp_polarity': 'Please write a review with about 150 words starting exactly with:',
'essay': 'Please write an essay with about 200 words starting exactly with:'}
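# ROLES supplies the system message and PROMPTS prefixes the user message. For
# 'xsum', the chat request built below looks like this (illustrative only):
#   [{'role': 'system', 'content': 'You are a News writer.'},
#    {'role': 'user', 'content': 'Please write an article with about 150 words '
#                                'starting exactly with: <prefix>'}]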
def save_data(output_file, args, data):
# write args to file
args_file = f"{output_file}.args.json"
with open(args_file, "w") as fout:
json.dump(args.__dict__, fout, indent=4)
print(f"Args written into {args_file}")
# write the data to a json file in the save folder
data_file = f"{output_file}.raw_data.json"
with open(data_file, "w") as fout:
json.dump(data, fout, indent=4)
print(f"Raw data written into {data_file}")
def load_data(input_file):
data_file = f"{input_file}.raw_data.json"
with open(data_file, "r") as fin:
data = json.load(fin)
print(f"Raw data loaded from {data_file}")
return data
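# Round-trip sketch (hypothetical output path): save_data('./out/xsum_run', args, data)
# writes './out/xsum_run.args.json' and './out/xsum_run.raw_data.json', and
# load_data('./out/xsum_run') reads the latter back into the same dict.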
class DataBuilder:
def __init__(self, args):
self.args = args
self.base_tokenizer = load_tokenizer(args.base_model_name, args.cache_dir)
self.base_model = None if args.openai_model else load_model(args.base_model_name, args.device, args.cache_dir)
def _openai_sample(self, prefix):
def _drop_last_word(text):
return ' '.join(text.split(' ')[:-1])
        from openai import OpenAI
        assert self.args.openai_key is not None, "Must provide OpenAI API key as --openai_key"
        # the client reads the base URL at construction time, so pass it here rather
        # than setting the legacy module-level 'openai.api_base'
        client_kwargs = {"api_key": self.args.openai_key}
        if self.args.openai_base is not None:
            client_kwargs["base_url"] = self.args.openai_base
        client = OpenAI(**client_kwargs)
if self.args.dataset != 'pubmed': # keep Answer: prefix for pubmed
prefix = _drop_last_word(prefix)
# sample from the openai model
kwargs = {"max_tokens": 200}
if self.args.do_top_p:
kwargs['top_p'] = self.args.top_p
        elif self.args.do_top_k:
            # the OpenAI API has no top_k sampling parameter; warn and skip it
            print('Warning: the OpenAI API does not support top_k; ignoring --do_top_k')
elif self.args.do_temperature:
kwargs['temperature'] = self.args.temperature
if self.args.openai_model == 'davinci':
# kwargs["engine"] = self.args.openai_model
kwargs["model"] = "text-davinci-003"
response = client.completions.create(prompt=f"{prefix}", **kwargs)
return prefix + response.choices[0].text
elif self.args.openai_model in ['gpt-3.5-turbo', 'gpt-4', 'gpt-4o']:
messages = [
{'role': 'system', 'content': ROLES[self.args.dataset]},
{'role': 'user', 'content': f'{PROMPTS[self.args.dataset]} {prefix}'},
]
kwargs["model"] = self.args.openai_model
kwargs["messages"] = messages
response = client.chat.completions.create(**kwargs)
response = response.choices[0].message.content
# ChatGPT may repeat the prefix
if response.startswith(prefix[:20]):
return response
return prefix + ' ' + response
else:
raise NotImplementedError
def _gemini_sample(self, prefix) -> str:
from google import genai
from google.genai import types
        # initialize the client (reads GOOGLE_API_KEY from the environment)
        client = genai.Client()
        # drop the last word for non-pubmed datasets, as in the other samplers
        if self.args.dataset != 'pubmed':
            prefix = ' '.join(prefix.split()[:-1])
        # note the separating space: without it the role and prompt would run together
        instruct = ROLES[self.args.dataset] + ' ' + PROMPTS[self.args.dataset]
        # build the request from self.args
        params = {
            "model": self.args.gemini_model,
            "contents": prefix,
        }
response = client.models.generate_content(
**params,
config=types.GenerateContentConfig(
top_p=self.args.top_p if self.args.do_top_p else None,
top_k=self.args.top_k if self.args.do_top_k else None,
temperature=self.args.temperature if self.args.do_temperature else None,
seed=self.args.seed,
candidate_count=1,
system_instruction=instruct,
),
)
response = response.text.strip()
        # return the response, prepending the prefix if the model did not repeat it
if response.startswith(prefix[:20]):
return response
return prefix + ' ' + response
def _claude_sample(self, prefix: str) -> str:
from anthropic import Anthropic
        # the client reads ANTHROPIC_API_KEY from the environment
        client = Anthropic()
        # for non-pubmed datasets, drop the last word as in the other samplers
        if self.args.dataset != "pubmed":
            prefix = " ".join(prefix.split()[:-1])
        # map the short CLI name to the full Anthropic model name
        model_full_name = {'claude-3-5-haiku': "claude-3-5-haiku-20241022"}
        # same system/user split as the GPT path
        system_instruction = ROLES[self.args.dataset]
        # assemble optional sampling kwargs; None entries are filtered out below
req = {
"system": system_instruction,
"temperature": self.args.temperature if self.args.do_temperature else None,
"top_p": self.args.top_p if self.args.do_top_p else None,
"top_k": self.args.top_k if self.args.do_top_k else None,
}
response = client.messages.create(
model=model_full_name[self.args.claude_model],
max_tokens=200,
messages=[{"role": "user", "content": f'{PROMPTS[self.args.dataset]} {prefix}'}],
**{k: v for k, v in req.items() if v is not None}
)
response = response.content[0].text.strip()
response = response.removeprefix("Here's the article:").lstrip("\r\n")
print(f"Claude response: {response}")
return response
def _sample_rewrite_text_from_model(self, human_texts, min_words, sampling_kwargs):
        revised_statement = "You are a professional rewriting expert and you can help paraphrase this paragraph without missing any of the original details. Please keep the length of the rewritten text similar to the original text. Original text: \"{}\""
texts = [revised_statement.format(o) for o in human_texts]
        self.base_model.eval()
        # the rewrite prompt is fixed, so tokenize it once outside the retry loop
        all_encoded = self.base_tokenizer(texts, return_tensors="pt", padding=True, padding_side='left', return_token_type_ids=False).to(self.args.device)
        prompt_lens = all_encoded['input_ids'].shape[1]
        decoded = ['' for _ in range(len(texts))]
        # regenerate until every rewrite has at least min_words words
        tries = 0
        m = 0
        while m < min_words:
            if tries != 0:
                print()
                print(f"min words: {m}, needed {min_words}, regenerating (try {tries})")
                prefixes = self.base_tokenizer.batch_decode(all_encoded['input_ids'], skip_special_tokens=True)
                for prefix, x in zip(prefixes, decoded):
                    if len(x.split()) == m:
                        print(prefix, '=>', x)
            outputs = self.base_model.generate(**all_encoded, min_new_tokens=min_words, max_length=prompt_lens * 2, do_sample=True, **sampling_kwargs, pad_token_id=self.base_tokenizer.eos_token_id, eos_token_id=self.base_tokenizer.eos_token_id)
            gen_ids = outputs[:, prompt_lens:]
            decoded = self.base_tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
            m = min(len(x.split()) for x in decoded)
            tries += 1
        return decoded
def build_sampling_kwargs(self):
sampling_kwargs = {}
if self.args.do_top_p:
sampling_kwargs['top_p'] = self.args.top_p
elif self.args.do_top_k:
sampling_kwargs['top_k'] = self.args.top_k
elif self.args.do_temperature:
sampling_kwargs['temperature'] = self.args.temperature
if self.args.do_exact_cond_prob:
sampling_kwargs['top_p'] = 1.0
sampling_kwargs['top_k'] = 0
sampling_kwargs['temperature'] = 1.0
return sampling_kwargs
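    # Illustrative mapping (values shown are the CLI defaults): with --do_top_p the
    # result is {'top_p': 0.96}; with --do_exact_cond_prob the kwargs are forced to
    # {'top_p': 1.0, 'top_k': 0, 'temperature': 1.0}, i.e. plain ancestral sampling
    # from the model's unmodified conditional distribution.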
# sample from base_model using ****only**** the first 30 tokens in each example as context
def _sample_from_model(self, texts, min_words=55, prompt_tokens=30):
# encode each text as a list of token ids
        if self.args.dataset == 'pubmed':
            # keep the full question (everything before the answer separator) as the prompt
            texts = [t[:t.index(custom_datasets.SEPARATOR)] for t in texts]
            all_encoded = self.base_tokenizer(texts, return_tensors="pt", padding=True, return_token_type_ids=False).to(self.args.device)
        else:
            # use only the first prompt_tokens tokens of each example as the prompt
            all_encoded = self.base_tokenizer(texts, return_tensors="pt", padding=True, return_token_type_ids=False).to(self.args.device)
            all_encoded = {key: value[:, :prompt_tokens] for key, value in all_encoded.items()}
if self.args.openai_model or self.args.gemini_model or self.args.claude_model:
# decode the prefixes back into text
prefixes = self.base_tokenizer.batch_decode(all_encoded['input_ids'], skip_special_tokens=True)
            decoded = []
            for idx, prefix in enumerate(prefixes):
                # retry this prefix until a sample is appended (API calls may fail)
                while idx >= len(decoded):
try:
if self.args.openai_model:
decoded.append(self._openai_sample(prefix))
elif self.args.gemini_model:
decoded.append(self._gemini_sample(prefix))
elif self.args.claude_model:
decoded.append(self._claude_sample(prefix))
except Exception as ex:
print(ex)
print('Wait 10 minutes before retry ...')
time.sleep(600)
else:
self.base_model.eval()
decoded = ['' for _ in range(len(texts))]
# sample from the model until we get a sample with at least min_words words for each example
# this is an inefficient way to do this (since we regenerate for all inputs if just one is too short), but it works
tries = 0
m = 0
while m < min_words:
if tries != 0:
print()
print(f"min words: {m}, needed {min_words}, regenerating (try {tries})")
prefixes = self.base_tokenizer.batch_decode(all_encoded['input_ids'], skip_special_tokens=True)
for prefix, x in zip(prefixes, decoded):
if len(x.split()) == m:
print(prefix, '=>', x)
sampling_kwargs = self.build_sampling_kwargs()
min_length = 50 if self.args.dataset in ['pubmed'] else 150
outputs = self.base_model.generate(**all_encoded, min_length=min_length, max_new_tokens=None, max_length=self.args.max_length, do_sample=True, **sampling_kwargs, pad_token_id=self.base_tokenizer.eos_token_id, eos_token_id=self.base_tokenizer.eos_token_id)
decoded = self.base_tokenizer.batch_decode(outputs, skip_special_tokens=True)
m = min(len(x.split()) for x in decoded)
tries += 1
return decoded
def generate_samples(self, raw_data, batch_size):
# trim to shorter length
def _trim_to_shorter_length(texta, textb, textc=None):
# truncate to shorter of o and s (optional for textc)
shorter_length = min(len(texta.split(' ')), len(textb.split(' ')))
if textc is not None:
shorter_length = min(shorter_length, len(textc.split(' ')))
texta = ' '.join(texta.split(' ')[:shorter_length])
textb = ' '.join(textb.split(' ')[:shorter_length])
if textc is not None:
textc = ' '.join(textc.split(' ')[:shorter_length])
return texta, textb, textc
else:
return texta, textb
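        # Illustrative example: _trim_to_shorter_length('a b c d', 'x y z') returns
        # ('a b c', 'x y z'); with a third text, all three are cut to the shortest
        # word count so original/sampled/revised stay length-matched.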
        def _trim_human_prompt(texta, n_human_prompts):
            # drop the leading n_human_prompts words (the prompt shared with the model sample)
            return ' '.join(texta.split(' ')[n_human_prompts:])
def _truncate_to_substring(text, substring, idx_occurrence):
# truncate everything after the idx_occurrence occurrence of substring
assert idx_occurrence > 0, 'idx_occurrence must be > 0'
idx = -1
for _ in range(idx_occurrence):
idx = text.find(substring, idx + 1)
if idx == -1:
return text
return text[:idx]
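        # Illustrative example: _truncate_to_substring('Q: a A: b Q: c', 'Q:', 2)
        # returns 'Q: a A: b ', dropping everything from the second 'Q:' onward;
        # if fewer occurrences exist, the text is returned unchanged.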
data = {
"original": [],
"sampled": [],
}
if self.args.revised_human_text:
new_data = {'revised': []}
min_generated_words = 30 if self.args.dataset in ['pubmed'] else 55
for batch in range(len(raw_data) // batch_size):
print('Generating LLM samples for batch', batch, 'of', len(raw_data) // batch_size)
original_text = raw_data[batch * batch_size:(batch + 1) * batch_size]
sampled_text = self._sample_from_model(original_text, min_words=min_generated_words,
prompt_tokens=self.args.n_prompts)
for o, s in zip(original_text, sampled_text):
if self.args.dataset == 'pubmed':
s = _truncate_to_substring(s, 'Question:', 2)
o = o.replace(custom_datasets.SEPARATOR, ' ')
if self.args.trim_human:
o = _trim_human_prompt(o, self.args.n_prompts)
s = _trim_human_prompt(s, self.args.n_prompts)
o, s = _trim_to_shorter_length(o, s)
# add to the data
data["original"].append(o)
data["sampled"].append(s)
            # if requested, also build a revised (LLM-paraphrased) copy of the human texts
            if self.args.revised_human_text:
human_texts = data["original"][(-batch_size):]
machine_texts = data["sampled"][(-batch_size):]
sampling_kwargs = self.build_sampling_kwargs()
revised_original = self._sample_rewrite_text_from_model(human_texts, min_generated_words, sampling_kwargs)
for i, (o, r, s) in enumerate(zip(human_texts, revised_original, machine_texts)):
if self.args.dataset == 'pubmed':
r = r.replace(custom_datasets.SEPARATOR, ' ')
o, r, s = _trim_to_shorter_length(o, r, s)
data['original'][batch * batch_size + i] = o
data['sampled'][batch * batch_size + i] = s
new_data['revised'].append(r)
if self.args.revised_human_text:
# add the revised human text to the data
data['revised'] = new_data['revised']
return data
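# generate_samples returns parallel lists, e.g. {'original': [...], 'sampled': [...]},
# plus an optional 'revised' list when --revised_human_text is set; save_data above
# serializes exactly this dict to <output_file>.raw_data.json.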
def generate_data(args, dataset, key):
# strip newlines from each example; replace one or more newlines with a single space
def _strip_newlines(text):
return ' '.join(text.split())
# load data
if dataset in custom_datasets.DATASETS:
data = custom_datasets.load(dataset, args.cache_dir)
else:
data = custom_datasets.load_dataset(dataset, split='train', cache_dir=args.cache_dir)[key]
# get unique examples, strip whitespace, and remove newlines
# then take just the long examples, shuffle, take the first 5,000 to tokenize to save time
# then take just the examples that are <= 512 tokens (for the base model)
# then generate n_samples samples
# remove duplicates from the data
data = list(dict.fromkeys(data)) # deterministic, as opposed to set()
# strip whitespace around each example
data = [x.strip() for x in data]
# remove newlines from each example
data = [_strip_newlines(x) for x in data]
# try to keep only examples with > 250 words
if dataset in ['writing', 'squad', 'xsum', 'yelp_polarity', "essay"]:
long_data = [x for x in data if len(x.split()) > 250]
if len(long_data) > 0:
data = long_data
random.shuffle(data)
data = data[:5_000]
# keep only examples with <= 512 tokens according to base_tokenizer
# this step has the extra effect of removing examples with low-quality/garbage content
data_builder = DataBuilder(args)
tokenized_data = data_builder.base_tokenizer(data)
data = [x for x, y in zip(data, tokenized_data["input_ids"]) if len(y) <= 512]
# print stats about remaining data
print(f"Total number of samples: {len(data)}")
print(f"Average number of words: {np.mean([len(x.split()) for x in data])}")
return data_builder.generate_samples(data[:args.n_samples], batch_size=args.batch_size)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--output_file', type=str, default="./exp_main/data/yelp_gpt2-xl")
parser.add_argument('--dataset', type=str, default='pubmed', choices=['xsum', 'squad', 'writing', 'pubmed', 'essay', 'yelp'])
parser.add_argument('--n_samples', type=int, default=200)
parser.add_argument('--openai_base', type=str, default=None)
parser.add_argument('--openai_key', type=str, default=None)
parser.add_argument('--openai_model', type=str, default=None, choices=['davinci', 'gpt-3.5-turbo', 'gpt-4', 'gpt-4o'])
parser.add_argument('--gemini_model', type=str, default=None, choices=['gemini-2.5-flash'])
parser.add_argument('--claude_model', type=str, default=None, choices=['claude-3-5-haiku'])
parser.add_argument('--base_model_name', type=str, default="opt-2.7b")
parser.add_argument('--batch_size', type=int, default=50)
parser.add_argument('--do_exact_cond_prob', action='store_true')
# parser.add_argument('--do_exact_cond_prob', type=bool, default=True)
parser.add_argument('--do_top_k', action='store_true')
parser.add_argument('--top_k', type=int, default=40)
parser.add_argument('--do_top_p', action='store_true')
parser.add_argument('--top_p', type=float, default=0.96)
parser.add_argument('--do_temperature', action='store_true')
parser.add_argument('--temperature', type=float, default=0.8)
parser.add_argument('--n_prompts', type=int, default=120)
parser.add_argument('--max_length', type=int, default=200)
parser.add_argument('--trim_human', action='store_true')
parser.add_argument('--revised_human_text', action='store_true')
# parser.add_argument('--revised_human_text', type=bool, default=True)
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--device', type=str, default="cuda")
parser.add_argument('--cache_dir', type=str, default="../cache")
args = parser.parse_args()
os.environ["XDG_CACHE_HOME"] = args.cache_dir
if not os.path.exists(args.cache_dir):
os.makedirs(args.cache_dir)
print(f"Using cache dir {args.cache_dir}")
random.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
if args.dataset == 'yelp':
args.dataset = 'yelp_polarity'
print(f'Loading dataset {args.dataset}...')
dataset_keys = {'xsum': 'document', 'squad': 'context', 'writing': 'document', 'essay': 'document', 'yelp_polarity': 'text'}
    data = generate_data(args, args.dataset, dataset_keys[args.dataset] if args.dataset in dataset_keys else None)
    save_data(args.output_file, args, data)