Upload formats.py with huggingface_hub
Browse files- formats.py +28 -9
formats.py
CHANGED
|
@@ -17,15 +17,17 @@ class SystemFormat(Format):
|
|
| 17 |
r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
|
| 18 |
|
| 19 |
SystemFormat expects the input instance to contain:
|
| 20 |
-
1. A field named "
|
|
|
|
| 21 |
from the source dataset), in the context of the underlying task.
|
| 22 |
-
|
| 23 |
-
|
| 24 |
and "target", representing a single demo.
|
|
|
|
| 25 |
|
| 26 |
SystemFormat formats the above fields into a single string to be inputted to the model. This string overwrites
|
| 27 |
field "source" of the instance. Formatting is driven by two args: 'demo_format' and 'model_input_format'.
|
| 28 |
-
SystemFormat also pops
|
| 29 |
|
| 30 |
Args:
|
| 31 |
demos_field (str): the name of the field that contains the demos, being a list of dicts, each with "source" and "target" keys
|
|
@@ -67,10 +69,10 @@ class SystemFormat(Format):
|
|
| 67 |
"""
|
| 68 |
|
| 69 |
demos_field: str = "demos"
|
| 70 |
-
demo_format: str =
|
| 71 |
-
|
|
|
|
| 72 |
)
|
| 73 |
-
model_input_format: str = "{instruction}{demos}{source}\n"
|
| 74 |
|
| 75 |
@staticmethod
|
| 76 |
def _retrieve_field_and_assert_not_none(instance, field_name) -> str:
|
|
@@ -95,9 +97,20 @@ class SystemFormat(Format):
|
|
| 95 |
instruction = self._retrieve_field_and_assert_not_none(
|
| 96 |
instance=instance, field_name="instruction"
|
| 97 |
)
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
if "instruction" in instance:
|
| 100 |
instance.pop("instruction")
|
|
|
|
|
|
|
| 101 |
|
| 102 |
demo_instances = []
|
| 103 |
if self.demos_field is not None and self.demos_field in instance:
|
|
@@ -111,13 +124,19 @@ class SystemFormat(Format):
|
|
| 111 |
|
| 112 |
demos_string = ""
|
| 113 |
for demo_instance in demo_instances:
|
| 114 |
-
demo_str = self.demo_format.format(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
demos_string += demo_str
|
| 116 |
|
| 117 |
output = self.model_input_format.format(
|
|
|
|
| 118 |
instruction=instruction,
|
| 119 |
demos=demos_string,
|
| 120 |
source=source,
|
|
|
|
| 121 |
)
|
| 122 |
instance["source"] = output
|
| 123 |
return instance
|
|
|
|
| 17 |
r"""Generates the whole input to the model, from constant strings that are given as args, and from values found in specified fields of the instance.
|
| 18 |
|
| 19 |
SystemFormat expects the input instance to contain:
|
| 20 |
+
1. A field named "system_prompt" whose value is a string (potentially empty) that delivers a task-independent opening text.
|
| 21 |
+
2. A field named "source" whose value is a string verbalizing the original values in the instance (as read
|
| 22 |
from the source dataset), in the context of the underlying task.
|
| 23 |
+
3. A field named "instruction" that contains a (non-None) string.
|
| 24 |
+
4. A field named with the value in arg 'demos_field', containing a list of dicts, each dict with fields "source"
|
| 25 |
and "target", representing a single demo.
|
| 26 |
+
5. A field named "target_prefix" that contains a string used to prefix the target in each demo, and to end the whole generated prompt.
|
| 27 |
|
| 28 |
SystemFormat formats the above fields into a single string to be inputted to the model. This string overwrites
|
| 29 |
field "source" of the instance. Formatting is driven by two args: 'demo_format' and 'model_input_format'.
|
| 30 |
+
SystemFormat also pops fields "system_prompt", "instruction", "target_prefix", and the field containing the demos out from the input instance.
|
| 31 |
|
| 32 |
Args:
|
| 33 |
demos_field (str): the name of the field that contains the demos, being a list of dicts, each with "source" and "target" keys
|
|
|
|
| 69 |
"""
|
| 70 |
|
| 71 |
demos_field: str = "demos"
|
| 72 |
+
demo_format: str = "{source}\n{target_prefix}{target}\n\n" # example: "User: {source}\nAgent: {target}\n\n"
|
| 73 |
+
model_input_format: str = (
|
| 74 |
+
"{system_prompt}{instruction}{demos}{source}\n{target_prefix}"
|
| 75 |
)
|
|
|
|
| 76 |
|
| 77 |
@staticmethod
|
| 78 |
def _retrieve_field_and_assert_not_none(instance, field_name) -> str:
|
|
|
|
| 97 |
instruction = self._retrieve_field_and_assert_not_none(
|
| 98 |
instance=instance, field_name="instruction"
|
| 99 |
)
|
| 100 |
+
target_prefix = self._retrieve_field_and_assert_not_none(
|
| 101 |
+
instance=instance, field_name="target_prefix"
|
| 102 |
+
)
|
| 103 |
+
system_prompt = self._retrieve_field_and_assert_not_none(
|
| 104 |
+
instance=instance, field_name="system_prompt"
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
# pop "system_prompt", "instruction", and "target_prefix" from instance
|
| 108 |
+
if "target_prefix" in instance:
|
| 109 |
+
instance.pop("target_prefix")
|
| 110 |
if "instruction" in instance:
|
| 111 |
instance.pop("instruction")
|
| 112 |
+
if "system_prompt" in instance:
|
| 113 |
+
instance.pop("system_prompt")
|
| 114 |
|
| 115 |
demo_instances = []
|
| 116 |
if self.demos_field is not None and self.demos_field in instance:
|
|
|
|
| 124 |
|
| 125 |
demos_string = ""
|
| 126 |
for demo_instance in demo_instances:
|
| 127 |
+
demo_str = self.demo_format.format(
|
| 128 |
+
target_prefix=target_prefix,
|
| 129 |
+
source=demo_instance["source"],
|
| 130 |
+
target=demo_instance["target"],
|
| 131 |
+
)
|
| 132 |
demos_string += demo_str
|
| 133 |
|
| 134 |
output = self.model_input_format.format(
|
| 135 |
+
system_prompt=system_prompt,
|
| 136 |
instruction=instruction,
|
| 137 |
demos=demos_string,
|
| 138 |
source=source,
|
| 139 |
+
target_prefix=target_prefix,
|
| 140 |
)
|
| 141 |
instance["source"] = output
|
| 142 |
return instance
|