Spaces:

du-lab
/

MLR-Copilot

Running

App Files Files Community

MLR-Copilot / reactagent /p2m_actions.py

Lim0011

Upload 251 files

85e3d20 verified over 1 year ago

raw

history blame contribute delete

14.6 kB

	import os
	import torch
	import datasets
	import transformers
	import json
	from .schema import ActionInfo, EnvException, EnhancedJSONEncoder

	from reactagent.prompt2model.prompt_parser import MockPromptSpec, TaskType
	from reactagent.prompt2model.dataset_retriever import DescriptionDatasetRetriever
	from reactagent.prompt2model.dataset_generator import PromptBasedDatasetGenerator, DatasetSplit
	from reactagent.prompt2model.dataset_processor import TextualizeProcessor
	from reactagent.prompt2model.model_retriever import DescriptionModelRetriever
	from reactagent.prompt2model.model_trainer import GenerationModelTrainer
	from reactagent.prompt2model.model_executor import GenerationModelExecutor, ModelOutput
	from reactagent.prompt2model.model_evaluator import Seq2SeqEvaluator

	def generate_dataset(instruction, examples, save_dir, num_train, num_valid, num_test, work_dir = '.', **kwargs):
	    """Generate a train/val/test dataset from a prompt and persist it to disk.

	    Args:
	        instruction: Task instruction forwarded to ``MockPromptSpec``.
	        examples: Example text forwarded to ``MockPromptSpec``.
	        save_dir: Directory (joined onto ``work_dir``) the dataset dict is saved to.
	        num_train: Number of training examples; anything ``int()`` accepts.
	        num_valid: Number of validation examples; anything ``int()`` accepts.
	        num_test: Number of test examples; anything ``int()`` accepts.
	        work_dir: Base directory that ``save_dir`` is resolved against.
	        **kwargs: Accepted and ignored.

	    Returns:
	        A success message containing the path the dataset was saved to.

	    Raises:
	        EnvException: If one of the counts makes ``int()`` raise ``ValueError``.
	    """
	    # Counts may arrive as strings; normalise them before doing any real work.
	    try:
	        train_count, valid_count, test_count = int(num_train), int(num_valid), int(num_test)
	    except ValueError:
	        raise EnvException("Number of examples should be an integer")

	    spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples=examples)
	    dataset_generator = PromptBasedDatasetGenerator()
	    split_sizes = {
	        DatasetSplit.TRAIN: train_count,
	        DatasetSplit.VAL: valid_count,
	        DatasetSplit.TEST: test_count,
	    }
	    generated = dataset_generator.generate_dataset_dict(spec, split_sizes)

	    destination = os.path.join(work_dir, save_dir)
	    generated.save_to_disk(destination)

	    return f"Dataset successfully generated and saved to {destination}"

	def retrieve_dataset(instruction, save_dir, work_dir = '.', **kwargs):
	    """Retrieve a dataset matching an instruction and persist it to disk.

	    Args:
	        instruction: Task description forwarded to ``MockPromptSpec``.
	        save_dir: Directory (joined onto ``work_dir``) the dataset dict is saved to.
	        work_dir: Base directory that ``save_dir`` is resolved against.
	        **kwargs: Accepted and ignored.

	    Returns:
	        A success message containing the path the dataset was saved to.

	    Raises:
	        EnvException: If the retriever yields no dataset (``None``).
	    """
	    prompt_spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")
	    retriever = DescriptionDatasetRetriever()
	    dataset_dict = retriever.retrieve_dataset_dict(prompt_spec)

	    # NOTE(review): the retriever presumably returns None when nothing suitable
	    # is found; without this guard save_to_disk would fail with an opaque
	    # AttributeError instead of an actionable environment error.
	    if dataset_dict is None:
	        raise EnvException("No suitable dataset could be retrieved for the given instruction")

	    save_path = os.path.join(work_dir, save_dir)
	    dataset_dict.save_to_disk(save_path)

	    # The message previously said "generated" (copied from generate_dataset).
	    return f"Dataset successfully retrieved and saved to {save_path}"

	def retrieve_model(instruction, work_dir = '.', **kwargs):
	    """Return a numbered, newline-separated listing of models retrieved for an instruction.

	    Args:
	        instruction: Task description forwarded to ``MockPromptSpec``.
	        work_dir: Unused here; kept so all actions share the same call shape.
	        **kwargs: Accepted and ignored.

	    Returns:
	        A string starting with ``"Top Models:"`` followed by one ``"<rank>. <model>"``
	        line per retrieved model, ranks starting at 1.
	    """
	    spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")
	    model_retriever = DescriptionModelRetriever(use_bm25=True, use_HyDE=True)
	    ranked_models = model_retriever.retrieve(spec)

	    report_lines = ["Top Models:\n"]
	    for rank, candidate in enumerate(ranked_models, start=1):
	        report_lines.append(f"{rank}. {candidate}\n")
	    return "".join(report_lines)

	def process_dataset(instruction, load_dirs, save_dirs, work_dir = '.', **kwargs):
	    """Load dataset dicts, run them through ``TextualizeProcessor``, and save the results.

	    Args:
	        instruction: Task description forwarded to ``MockPromptSpec``.
	        load_dirs: Colon-separated directories (relative to ``work_dir``) to load from.
	        save_dirs: Colon-separated directories (relative to ``work_dir``) to save to,
	            paired positionally with ``load_dirs``.
	        work_dir: Base directory both directory lists are resolved against.
	        **kwargs: Accepted and ignored.

	    Returns:
	        A success message containing the list of save paths.

	    Raises:
	        EnvException: If ``load_dirs`` and ``save_dirs`` name a different number of
	            directories.
	    """
	    spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")

	    source_dirs = load_dirs.split(':')
	    target_dirs = save_dirs.split(':')
	    if len(source_dirs) != len(target_dirs):
	        raise EnvException("Number of load directories should match number of save directories")

	    source_paths = [os.path.join(work_dir, name) for name in source_dirs]
	    target_paths = [os.path.join(work_dir, name) for name in target_dirs]

	    # Read every input dataset dict before any processing starts.
	    loaded = []
	    for source_path in source_paths:
	        loaded.append(datasets.load_from_disk(source_path))

	    # Run all loaded dicts through the processor in a single call.
	    textualizer = TextualizeProcessor(has_encoder=True)
	    processed = textualizer.process_dataset_dict(spec, loaded)

	    # Write each processed dict to its positionally matching target path.
	    for processed_dict, target_path in zip(processed, target_paths):
	        processed_dict.save_to_disk(target_path)

	    return f"Data successfully processed and saved to {target_paths}"

	def train_model(model_name, load_dirs, result_dir, epochs, batch_size, warmup_steps, weight_decay, learning_rate, work_dir = '.', **kwargs):
	    """Train a generation model on the saved dataset dicts and persist model and tokenizer.

	    Args:
	        model_name: Model identifier passed to ``GenerationModelTrainer``.
	        load_dirs: Colon-separated directories (relative to ``work_dir``) holding
	            dataset dicts; each must contain ``"train"`` and ``"val"`` splits.
	        result_dir: Directory (relative to ``work_dir``) receiving ``training_output``,
	            ``trained_model`` and ``trained_tokenizer``.
	        epochs: Number of training epochs; anything ``int()`` accepts.
	        batch_size: Batch size; anything ``int()`` accepts.
	        warmup_steps: Warmup steps; anything ``int()`` accepts.
	        weight_decay: Weight decay; anything ``float()`` accepts.
	        learning_rate: Learning rate; anything ``float()`` accepts.
	        work_dir: Base directory all relative paths are resolved against.
	        **kwargs: Accepted and ignored.

	    Returns:
	        A success message naming the model and tokenizer output directories.

	    Raises:
	        EnvException: If a numeric argument makes ``int()``/``float()`` raise
	            ``ValueError``.
	    """
	    try:
	        epochs = int(epochs)
	        batch_size = int(batch_size)
	        warmup_steps = int(warmup_steps)
	        weight_decay = float(weight_decay)
	        learning_rate = float(learning_rate)
	    except ValueError:
	        raise EnvException("Numerical parameters should be integers or floats as appropriate")

	    load_dirs = load_dirs.split(':')
	    result_dir = os.path.join(work_dir, result_dir)

	    # load the datasets
	    load_paths = [os.path.join(work_dir, load_dir) for load_dir in load_dirs]
	    dataset_dicts = [datasets.load_from_disk(load_path) for load_path in load_paths]

	    training_datasets = [dataset_dict["train"] for dataset_dict in dataset_dicts]
	    validation_datasets = [dataset_dict["val"] for dataset_dict in dataset_dicts]

	    trainer = GenerationModelTrainer(
	        model_name,
	        has_encoder=True,
	        executor_batch_size=batch_size,
	        tokenizer_max_length=1024,
	        sequence_max_length=1280,
	    )

	    # This must be a plain dict. A trailing comma after the closing brace used to
	    # turn it into a one-element tuple, so the trainer never received a mapping.
	    hparams = {
	        "output_dir": os.path.join(result_dir, "training_output"),
	        "save_strategy": "epoch",
	        "num_train_epochs": epochs,
	        "per_device_train_batch_size": batch_size,
	        "evaluation_strategy": "epoch",
	        "warmup_steps": warmup_steps,
	        "weight_decay": weight_decay,
	        "learning_rate": learning_rate,
	    }

	    trained_model, trained_tokenizer = trainer.train_model(
	        hyperparameter_choices=hparams,
	        training_datasets=training_datasets,
	        validation_datasets=validation_datasets,
	    )

	    trained_model.save_pretrained(os.path.join(result_dir, "trained_model"))
	    trained_tokenizer.save_pretrained(os.path.join(result_dir, "trained_tokenizer"))

	    return f"Model and Tokenizer successfully trained and saved respectively to {result_dir}/trained_model and {result_dir}/trained_tokenizer"

	def execute_model(result_dir, load_dirs, save_path, batch_size, input_column, work_dir = '.', **kwargs):
	    """Run a previously trained model over the test splits and dump predictions as JSON.

	    Args:
	        result_dir: Directory (relative to ``work_dir``) containing ``trained_model``
	            and ``trained_tokenizer``.
	        load_dirs: Colon-separated directories (relative to ``work_dir``) holding
	            dataset dicts; each must contain a ``"test"`` split.
	        save_path: File (relative to ``work_dir``) the predictions are written to.
	        batch_size: Batch size for the executor; anything ``int()`` accepts.
	        input_column: Name of the dataset column fed to the model.
	        work_dir: Base directory all relative paths are resolved against.
	        **kwargs: Accepted and ignored.

	    Returns:
	        A success message containing the path the predictions were written to.

	    Raises:
	        EnvException: If ``batch_size`` makes ``int()`` raise ``ValueError``.
	    """
	    dataset_dirs = load_dirs.split(':')
	    model_root = os.path.join(work_dir, result_dir)
	    predictions_file = os.path.join(work_dir, save_path)

	    try:
	        batch_size = int(batch_size)
	    except ValueError:
	        raise EnvException("Batch size should be an integer")

	    # Load every dataset dict first, then gather and merge their test splits.
	    dataset_paths = [os.path.join(work_dir, name) for name in dataset_dirs]
	    loaded = []
	    for dataset_path in dataset_paths:
	        loaded.append(datasets.load_from_disk(dataset_path))
	    test_splits = []
	    for dataset_dict in loaded:
	        test_splits.append(dataset_dict["test"])
	    combined_test_set = datasets.concatenate_datasets(test_splits)

	    model_dir = os.path.join(model_root, "trained_model")
	    tokenizer_dir = os.path.join(model_root, "trained_tokenizer")

	    # Use the GPU when one is available, otherwise fall back to the CPU.
	    if torch.cuda.is_available():
	        device = torch.device("cuda")
	    else:
	        device = torch.device("cpu")
	    model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_dir)
	    model = model.to(device)
	    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_dir)

	    executor = GenerationModelExecutor(
	        model,
	        tokenizer,
	        batch_size,
	        tokenizer_max_length=1024,
	        sequence_max_length=1280,
	    )

	    predictions = executor.make_prediction(
	        test_set=combined_test_set,
	        input_column=input_column
	    )

	    with open(predictions_file, 'w') as handle:
	        json.dump(predictions, handle, cls=EnhancedJSONEncoder)

	    return f"Model successfully executed on the test sets of the specified datasets and saved to {predictions_file}"

	def evaluate_model(load_dirs, save_path, output_column, work_dir = '.', **kwargs):
	    """Score saved model predictions against the test splits with ``Seq2SeqEvaluator``.

	    Args:
	        load_dirs: Colon-separated directories (relative to ``work_dir``) holding
	            dataset dicts; each must contain a ``"test"`` split.
	        save_path: JSON file (relative to ``work_dir``) holding the predictions, read
	            as a list of keyword-argument dicts for ``ModelOutput``.
	        output_column: Name of the dataset column holding the ground truth.
	        work_dir: Base directory all relative paths are resolved against.
	        **kwargs: Accepted and ignored.

	    Returns:
	        A message embedding the metric values returned by the evaluator.
	    """
	    dataset_dirs = load_dirs.split(':')

	    # Load every dataset dict first, then gather and merge their test splits.
	    dataset_paths = [os.path.join(work_dir, name) for name in dataset_dirs]
	    loaded = []
	    for dataset_path in dataset_paths:
	        loaded.append(datasets.load_from_disk(dataset_path))
	    test_splits = []
	    for dataset_dict in loaded:
	        test_splits.append(dataset_dict["test"])
	    combined_test_set = datasets.concatenate_datasets(test_splits)

	    predictions_file = os.path.join(work_dir, save_path)
	    with open(predictions_file, 'r') as handle:
	        raw_predictions = json.load(handle)

	    # Rebuild ModelOutput objects from their serialized keyword arguments.
	    predictions = []
	    for record in raw_predictions:
	        predictions.append(ModelOutput(**record))

	    scorer = Seq2SeqEvaluator()
	    scores = scorer.evaluate_model(
	        combined_test_set,
	        gt_column=output_column,
	        predictions=predictions,
	        encoder_model_name="xlm-roberta-base",
	    )

	    return f"Evaluation metrics: {scores}"

	# Actions exposed to the agent. Only model retrieval is registered here.
	P2M_ACTIONS = [
	    ActionInfo(
	        function=retrieve_model,
	        is_primitive=False,
	        name="Retrieve Model",
	        description="Retrieve a suitable model based on a detailed description of the requirements. You can obtain the model given the name using the transformers.AutoModel.from_pretrained function.",
	        usage=dict(
	            instruction="an instruction on how to generate the output from the input",
	        ),
	        return_value="The observation will be a list of suitable models. You can choose one of them based on the requirements.",
	    ),
	]
	# P2M_ACTIONS = [
	# ActionInfo(
	# name="Generate Dataset",
	# description="Generate a dataset based on an instruction and examples. You can load the dataset later from `save_dir` using the load_from_disk function of the HuggingFace datasets library.",
	# usage={
	# "instruction": "an instruction on how to generate the output from the input",
	# "examples": "examples of input-output pairs",
	# "save_dir": "directory to save the generated dataset dict to. We recommend saving to data/generated/",
	# "num_train": "number of examples to generate in the training set",
	# "num_valid": "number of examples to generate in the validation set",
	# "num_test": "number of examples to generate in the test set",
	# },
	# return_value="The observation will be a success message if the dataset was generated successfully. Otherwise, an error message will be returned.",
	# is_primitive=False,
	# function=generate_dataset
	# ),
	# ActionInfo(
	# name="Retrieve Dataset",
	# description="Retrieve a suitable dataset based on a detailed description of the requirements. You can load the dataset later from `save_dir` using the load_from_disk function of the HuggingFace datasets library.",
	# usage={
	# "instruction": "an instruction on how to generate the output from the input",
	# "save_dir": "directory to save the generated dataset dict to. We recommend saving to data/retrieved/",
	# },
	# return_value="The observation will be a success message if the dataset was retrieved successfully. Otherwise, an error message will be returned.",
	# is_primitive=False,
	# function=retrieve_dataset
	# ),
	# ActionInfo(
	# name="Retrieve Model",
	# description="Retrieve a suitable model based on a detailed description of the requirements. You can obtain the model given the name using the transformers.AutoModelForSeq2SeqLM.from_pretrained function.",
	# usage={
	# "instruction": "an instruction on how to generate the output from the input",
	# },
	# return_value="The observation will be a list of suitable models. You can choose one of them based on the requirements.",
	# is_primitive=False,
	# function=retrieve_model
	# ),
	# ActionInfo(
	# name="Process Dataset",
	# description="Process dataset based on a detailed description of the requirements. You can load the processed data later from `save_dirs` using the load_from_disk function of the HuggingFace datasets library. The input text will be in the `model_input` column and the output text will be in the `model_output` column.",
	# usage={
	# "instruction": "an instruction on how to generate the output from the input",
	# "load_dirs": "directories to load the dataset dicts from, separated by colons",
	# "save_dirs": "directories to save the processed dataset dicts to, separated by colons. The order should match the order of the loaded datasets. We recommend saving to data/processed/",
	# },
	# return_value="The observation will be a success message if the data was processed successfully. Otherwise, an error message will be returned.",
	# is_primitive=False,
	# function=process_dataset
	# ),
	# ActionInfo(
	# name="Train Model",
	# description="Train a Seq2Seq model from HuggingFace transformers library using the processed datasets and given hyperparameters.",
	# usage={
	# "model_name": "name of the model to train",
	# "load_dirs": "directories to load the dataset dicts from, separated by colons",
	# "result_dir": "directory to save the trained model and tokenizer to. We recommend using results/{trial_id}/. The trained model will be available as `{result_dir}/trained_model/` and the tokenizer will be available as `{result_dir}/trained_tokenizer/`.",
	# "epochs": "number of epochs to train the model for",
	# "batch_size": "batch size for training the model",
	# "warmup_steps": "number of warmup steps for the optimizer",
	# "weight_decay": "weight decay for the optimizer",
	# "learning_rate": "learning rate for the optimizer",
	# },
	# return_value="The observation will be a success message if the model was trained successfully. Otherwise, an error message will be returned.",
	# is_primitive=False,
	# function=train_model
	# ),
	# ActionInfo(
	# name="Execute Model on Test Set",
	# description="Execute a trained model on the test sets of specified dataset dicts.",
	# usage={
	# "result_dir": "directory where the trained model and tokenizer are saved",
	# "load_dirs": "directories to load the dataset dicts from, separated by colons",
	# "save_path": "file to save the results of the model execution in json format",
	# "batch_size": "batch size for executing the model",
	# "input_column": "column name of the input text",
	# },
	# return_value="The observation will be a success message if the model was executed successfully. Otherwise, an error message will be returned.",
	# is_primitive=False,
	# function=execute_model,
	# ),
	# ActionInfo(
	# name="Evaluate Model",
	# description="Evaluate a trained model on the test sets of specified dataset dicts.",
	# usage={
	# "load_dirs": "directories to load the dataset dicts from, separated by colons",
	# "save_path": "file to load the results of the model execution in json format",
	# "output_column": "column name of the output text",
	# },
	# return_value="The values for various evaluation metrics will be returned.",
	# is_primitive=False,
	# function=evaluate_model,
	# )
	# ]