| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | """Processing functions for TeleYAML dataset - v2 with nested format support.""" |
| | from typing import Any, Optional |
| | from megatron.bridge.data.builders.hf_dataset import ProcessExampleOutput |
| | from megatron.bridge.training.tokenizers.tokenizer import MegatronTokenizer |
| |
|
| |
|
| | def _flatten_messages(messages: list[dict[str, str]]) -> str: |
| | """Convert a list of chat messages into a formatted string. |
| | |
| | Args: |
| | messages: List of message dicts with 'role' and 'content' keys |
| | |
| | Returns: |
| | Formatted string with role tags |
| | """ |
| | parts = [] |
| | for msg in messages: |
| | role = msg.get("role", "user") |
| | content = msg.get("content", "") |
| | parts.append(f"<{role}>\n{content}\n</{role}>") |
| | return "\n".join(parts) |
| |
|
| |
|
| | def _flatten_output(output_dict: dict[str, Any]) -> str: |
| | """Convert nested output dict into a formatted string. |
| | |
| | Args: |
| | output_dict: Dict with 'reasoning_context' and/or 'content' keys |
| | |
| | Returns: |
| | Formatted string combining reasoning and content |
| | """ |
| | reasoning = output_dict.get("reasoning_context", "") |
| | content = output_dict.get("content", "") |
| | |
| | if reasoning and content: |
| | return f"<reasoning>\n{reasoning}\n</reasoning>\n\n{content}" |
| | elif reasoning: |
| | return reasoning |
| | else: |
| | return content |
| |
|
| |
|
def process_teleyaml_example(
    example: dict[str, Any], tokenizer: Optional[MegatronTokenizer] = None
) -> ProcessExampleOutput:
    """Process a TeleYAML example into the required format.

    Handles both flat format (v1) and nested format (v2):

    Flat (v1):
        {"input": "string", "output": "string"}

    Nested (v2):
        {"input": {"messages": [...]}, "output": {"reasoning_context": "...", "content": "..."}}

    Args:
        example: Raw TeleYAML example.
        tokenizer: Optional tokenizer (not used).

    Returns:
        ProcessExampleOutput with formatted input/output and original answers.
    """

    def _normalize_input(value: Any) -> str:
        # v2 nested input carries a "messages" list of chat turns.
        if isinstance(value, dict) and "messages" in value:
            return _flatten_messages(value["messages"])
        # v1 is already a string; anything else is stringified as a fallback.
        return value if isinstance(value, str) else str(value)

    def _normalize_output(value: Any) -> str:
        # v2 nested output splits reasoning trace and final content.
        if isinstance(value, dict):
            return _flatten_output(value)
        return value if isinstance(value, str) else str(value)

    formatted_input = _normalize_input(example.get("input", ""))
    formatted_output = _normalize_output(example.get("output", ""))

    return ProcessExampleOutput(
        input=formatted_input,
        output=formatted_output,
        original_answers=[formatted_output],
    )
| |
|