Upload configs/teleyaml.py with huggingface_hub

Browse files

Files changed (1) hide show

configs/teleyaml.py +98 -0

configs/teleyaml.py ADDED Viewed

	@@ -0,0 +1,98 @@

+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processing functions for TeleYAML dataset - v2 with nested format support."""
+from typing import Any, Optional
+from megatron.bridge.data.builders.hf_dataset import ProcessExampleOutput
+from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer
+def _flatten_messages(messages: list[dict[str, str]]) -> str:
+    """Convert a list of chat messages into a formatted string.
+    Args:
+        messages: List of message dicts with 'role' and 'content' keys
+    Returns:
+        Formatted string with role tags
+    """
+    parts = []
+    for msg in messages:
+        role = msg.get("role", "user")
+        content = msg.get("content", "")
+        parts.append(f"<{role}>\n{content}\n</{role}>")
+    return "\n".join(parts)
+def _flatten_output(output_dict: dict[str, Any]) -> str:
+    """Convert nested output dict into a formatted string.
+    Args:
+        output_dict: Dict with 'reasoning_context' and/or 'content' keys
+    Returns:
+        Formatted string combining reasoning and content
+    """
+    reasoning = output_dict.get("reasoning_context", "")
+    content = output_dict.get("content", "")
+    if reasoning and content:
+        return f"<reasoning>\n{reasoning}\n</reasoning>\n\n{content}"
+    elif reasoning:
+        return reasoning
+    else:
+        return content
def process_teleyaml_example(
    example: dict[str, Any], tokenizer: Optional[MegatronTokenizer] = None
) -> ProcessExampleOutput:
    """Process a TeleYAML example into the required format.

    Handles both flat format (v1) and nested format (v2):

    Flat (v1):
        {"input": "string", "output": "string"}
    Nested (v2):
        {"input": {"messages": [...]}, "output": {"reasoning_context": "...", "content": "..."}}

    A missing or null 'input'/'output' becomes an empty string. Any other
    value that is not in one of the recognised shapes is converted with
    ``str()``.

    Args:
        example: Raw TeleYAML example.
        tokenizer: Optional tokenizer (not used).

    Returns:
        ProcessExampleOutput with the formatted input and output, and
        ``original_answers`` set to a one-element list holding the output.
    """
    raw_input = example.get("input")
    raw_output = example.get("output")

    # Input: null -> "", nested messages dict -> flattened chat, else str().
    if raw_input is None:
        # str(None) would put the literal text "None" into the prompt.
        _input = ""
    elif isinstance(raw_input, dict) and "messages" in raw_input:
        _input = _flatten_messages(raw_input["messages"])
    else:
        # str() leaves a plain string unchanged, so this also covers flat v1.
        _input = str(raw_input)

    # Output: null -> "", nested dict -> flattened reasoning/content, else str().
    if raw_output is None:
        _output = ""
    elif isinstance(raw_output, dict):
        _output = _flatten_output(raw_output)
    else:
        _output = str(raw_output)

    return ProcessExampleOutput(
        input=_input, output=_output, original_answers=[_output]
    )