Clemspace
/

CHEMISTral7Bv0.3

Model card Files Files and versions

Metrics Training metrics Community

CHEMISTral7Bv0.3 / finetune /data /args.py

Clemspace's picture

Initial model upload

cb9e677 almost 2 years ago

history blame contribute delete

2.76 kB

	import logging
	from dataclasses import dataclass, field

	from simple_parsing.helpers import Serializable

	logger = logging.getLogger("data")


	@dataclass()
	class InstructArgs(Serializable):
	shuffle: bool = True

	# For function calling training examples only the last tool call
	# of the assistant message can be used for training. Therefore,
	# we chunk longer function calling conversations into multiple
	# training samples to not loose any data. E.g.:
	# [[
	# UserMessage_1, AssisantToolCallMessage_1, ToolMessage_1, AssisantMessage_1
	# UserMessage_2, AssisantToolCallMessage_2, ToolMessage_2, AssisantMessage_2
	# ]]
	# => is chunked into two training samples:
	# [[
	# UserMessage_1, AssisantToolCallMessage_1, ToolMessage_1, AssisantMessage_1
	# ],
	# [
	# UserMessage_1, AssisantToolCallMessage_1, ToolMessage_1, AssisantMessage_1
	# UserMessage_2, AssisantToolCallMessage_2, ToolMessage_2, AssisantMessage_2
	# ]]
	# NOTE: Only if your data is already pre-chunked should this argument be set to False
	dynamic_chunk_fn_call: bool = True


	@dataclass()
	class DataArgs(Serializable):
	# The data arguments `data` and `instruct_data` are a string in the format
	# "data_source_dir_1:weight_1,data_source_dir_2:weight_2,...". The weight
	# will be used to sample the data sources. If the sum of the weights is
	# not 1 when concatenating the two arguments `data` and `instruct_data`,
	# it will be normalized. The data sources folders must contain jsonl files.
	# If the value is an empty string, no data will be used for the corresponding
	# data type.
	data: str = (
	"" # Each line in the jsonl files inside the data source directories must be a dictionary with a "text" key. See Readme for more details. Can be left empty.
	)
	shuffle: bool = False
	instruct_data: str = (
	"" # Each line in the jsonl files inside the data source directories must be a dictionary with a "interactions" key. See Readme for more details. Can be left empty.
	)
	eval_instruct_data: str = (
	"" # Each line in the jsonl files inside the data source directories must be a dictionary with a "interactions" key. See Readme for more details. Can be left empty.
	)
	instruct: InstructArgs = field(default_factory=InstructArgs)

	def __post_init__(self) -> None:
	if (
	self.instruct.shuffle is False
	and self.instruct.dynamic_chunk_fn_call is True
	):
	raise ValueError(
	"Make sure to either enable `data.instruct.shuffle=True` or `data.instruct.dynamic_chunk_fn_call=False`. Dynamic chunking is only possible if data is loaded and shuffled before training."
	)