| import logging | |
| from dataclasses import dataclass, field | |
| from simple_parsing.helpers import Serializable | |
| logger = logging.getLogger("data") | |
@dataclass()
class InstructArgs(Serializable):
    """Options controlling how instruct (chat / function-calling) data is prepared.

    Declared as a dataclass so the annotated defaults below become real
    fields with a generated ``__init__``.
    """

    # NOTE(review): presumably toggles shuffling of the instruct data; it must
    # stay True whenever `dynamic_chunk_fn_call` is enabled (enforced by
    # `DataArgs.__post_init__`).
    shuffle: bool = True

    # For function calling training examples only the last tool call
    # of the assistant message can be used for training. Therefore,
    # we chunk longer function calling conversations into multiple
    # training samples to not lose any data. E.g.:
    # [[
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    #   UserMessage_2, AssistantToolCallMessage_2, ToolMessage_2, AssistantMessage_2
    # ]]
    # => is chunked into two training samples:
    # [[
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    # ],
    # [
    #   UserMessage_1, AssistantToolCallMessage_1, ToolMessage_1, AssistantMessage_1
    #   UserMessage_2, AssistantToolCallMessage_2, ToolMessage_2, AssistantMessage_2
    # ]]
    # NOTE: Only if your data is already pre-chunked should this argument be set to False
    dynamic_chunk_fn_call: bool = True
@dataclass()
class DataArgs(Serializable):
    """Data-source configuration for plain-text and instruct datasets.

    Declared as a dataclass so that `field(default_factory=...)` is resolved
    into a per-instance `InstructArgs` and `__post_init__` runs on
    construction.

    Raises:
        ValueError: On construction, if `instruct.dynamic_chunk_fn_call` is
            True while `instruct.shuffle` is False.
    """

    # The data arguments `data` and `instruct_data` are a string in the format
    # "data_source_dir_1:weight_1,data_source_dir_2:weight_2,...". The weight
    # will be used to sample the data sources. If the sum of the weights is
    # not 1 when concatenating the two arguments `data` and `instruct_data`,
    # it will be normalized. The data sources folders must contain jsonl files.
    # If the value is an empty string, no data will be used for the corresponding
    # data type.
    data: str = (
        ""  # Each line in the jsonl files inside the data source directories must be a dictionary with a "text" key. See Readme for more details. Can be left empty.
    )
    shuffle: bool = False
    instruct_data: str = (
        ""  # Each line in the jsonl files inside the data source directories must be a dictionary with an "interactions" key. See Readme for more details. Can be left empty.
    )
    eval_instruct_data: str = (
        ""  # Each line in the jsonl files inside the data source directories must be a dictionary with an "interactions" key. See Readme for more details. Can be left empty.
    )
    # A default factory gives every DataArgs its own InstructArgs instance.
    instruct: InstructArgs = field(default_factory=InstructArgs)

    def __post_init__(self) -> None:
        """Validate that dynamic function-call chunking is paired with shuffling.

        Raises:
            ValueError: If `instruct.shuffle` is False while
                `instruct.dynamic_chunk_fn_call` is True.
        """
        if (
            self.instruct.shuffle is False
            and self.instruct.dynamic_chunk_fn_call is True
        ):
            # Adjacent literals concatenate to the original single-line message.
            raise ValueError(
                "Make sure to either enable `data.instruct.shuffle=True` or "
                "`data.instruct.dynamic_chunk_fn_call=False`. Dynamic chunking is "
                "only possible if data is loaded and shuffled before training."
            )