Rick-AdaptKey committed on
Commit
624b7ed
·
verified ·
1 Parent(s): c8e75a1

Upload configs/teleyaml.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configs/teleyaml.py +98 -0
configs/teleyaml.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Processing functions for TeleYAML dataset - v2 with nested format support."""
15
+ from typing import Any, Optional
16
+ from megatron.bridge.data.builders.hf_dataset import ProcessExampleOutput
17
+ from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
18
+
19
+
20
+ def _flatten_messages(messages: list[dict[str, str]]) -> str:
21
+ """Convert a list of chat messages into a formatted string.
22
+
23
+ Args:
24
+ messages: List of message dicts with 'role' and 'content' keys
25
+
26
+ Returns:
27
+ Formatted string with role tags
28
+ """
29
+ parts = []
30
+ for msg in messages:
31
+ role = msg.get("role", "user")
32
+ content = msg.get("content", "")
33
+ parts.append(f"<{role}>\n{content}\n</{role}>")
34
+ return "\n".join(parts)
35
+
36
+
37
+ def _flatten_output(output_dict: dict[str, Any]) -> str:
38
+ """Convert nested output dict into a formatted string.
39
+
40
+ Args:
41
+ output_dict: Dict with 'reasoning_context' and/or 'content' keys
42
+
43
+ Returns:
44
+ Formatted string combining reasoning and content
45
+ """
46
+ reasoning = output_dict.get("reasoning_context", "")
47
+ content = output_dict.get("content", "")
48
+
49
+ if reasoning and content:
50
+ return f"<reasoning>\n{reasoning}\n</reasoning>\n\n{content}"
51
+ elif reasoning:
52
+ return reasoning
53
+ else:
54
+ return content
55
+
56
+
57
def process_teleyaml_example(
    example: dict[str, Any], tokenizer: Optional[MegatronTokenizer] = None
) -> ProcessExampleOutput:
    """Process a TeleYAML example into the required format.

    Handles both flat format (v1) and nested format (v2):

    Flat (v1):
        {"input": "string", "output": "string"}

    Nested (v2):
        {"input": {"messages": [...]}, "output": {"reasoning_context": "...", "content": "..."}}

    A missing or ``None`` "input"/"output" becomes an empty string. Any other
    value that is not one of the recognised shapes is converted with ``str()``.

    Args:
        example: Raw TeleYAML example
        tokenizer: Optional tokenizer (not used)

    Returns:
        ProcessExampleOutput with formatted input/output and original answers
    """
    raw_input = example.get("input")
    raw_output = example.get("output")

    # Handle input - check if nested messages format.
    if raw_input is None:
        # A null field must not be stringified to the literal "None".
        _input = ""
    elif isinstance(raw_input, dict) and "messages" in raw_input:
        messages = raw_input["messages"]
        # Guard against {"messages": None}, which cannot be iterated.
        _input = _flatten_messages([] if messages is None else messages)
    else:
        # Covers plain strings (str() is a no-op for them) and any other type.
        _input = str(raw_input)

    # Handle output - check if nested dict format.
    if raw_output is None:
        _output = ""
    elif isinstance(raw_output, dict):
        _output = _flatten_output(raw_output)
    else:
        _output = str(raw_output)

    original_answers = [_output]

    return ProcessExampleOutput(input=_input, output=_output, original_answers=original_answers)