"""Turn one raw dataset row into a normalized chat-format record, given a
FieldMapping.

Pure function - no network, no HF imports, easy to test.
"""
from __future__ import annotations

from collections.abc import Mapping, Sequence
from typing import Optional

from models import FieldMapping


def _stringify(value) -> Optional[str]:
    """Coerce a raw cell value to usable text.

    Returns None when ``value`` is None or a blank / whitespace-only string.
    A non-blank string is returned unchanged (it is not stripped); any other
    value is converted with ``str()``.
    """
    if value is None:
        return None
    if isinstance(value, str):
        return value if value.strip() else None
    return str(value)


def _first_exchange(items, role_key, content_key, human_tag, gpt_tag):
    """Return raw ``(user_content, assistant_content)`` for the first exchange.

    The user content comes from the first item whose ``role_key`` equals
    ``human_tag``; the assistant content comes from the first item *after it*
    whose ``role_key`` equals ``gpt_tag``. Either element is None when no such
    item exists. Items that are not mappings are skipped.
    """
    user_content = None
    user_seen = False
    for item in items:
        if not isinstance(item, Mapping):
            # Malformed turn (string, None, number, ...): ignore, don't crash.
            continue
        role = item.get(role_key)
        if not user_seen:
            if role == human_tag:
                user_content = item.get(content_key)
                user_seen = True
        elif role == gpt_tag:
            return user_content, item.get(content_key)
    return user_content, None


def extract_triplet(
    row: dict, mapping: FieldMapping, system_prompt: str
) -> Optional[dict]:
    """Build a system/user/assistant chat record from one dataset row.

    Supported ``mapping.kind`` values:

    * ``"conversation_list"`` - ``mapping.config`` supplies ``list_field``,
      ``role_key``, ``content_key``, ``human_tag`` and ``gpt_tag``. The user
      text is taken from the first turn tagged ``human_tag`` and the
      assistant text from the first turn tagged ``gpt_tag`` that follows it,
      so a reply that precedes the prompt is never paired with it.
    * ``"flat_pair"`` - ``mapping.config`` supplies ``user_field`` and
      ``assistant_field``, read directly from ``row``.

    Returns ``{"messages": [...]}`` in OpenAI/ShareGPT chat format, or None
    if the mapping kind is not recognised or the row has no usable
    user/assistant text. A falsy ``system_prompt`` is emitted as ``""``.

    Raises KeyError if a required key is missing from ``mapping.config``.
    """
    if mapping.kind == "conversation_list":
        config = mapping.config
        items = row.get(config["list_field"])
        # Only real sequences of turns are walked; a missing field, a plain
        # string, or any other non-sequence value is treated as "no turns".
        if isinstance(items, (str, bytes)) or not isinstance(items, Sequence):
            items = ()
        raw_user, raw_asst = _first_exchange(
            items,
            config["role_key"],
            config["content_key"],
            config["human_tag"],
            config["gpt_tag"],
        )
        user_text = _stringify(raw_user)
        asst_text = _stringify(raw_asst)
    elif mapping.kind == "flat_pair":
        user_text = _stringify(row.get(mapping.config["user_field"]))
        asst_text = _stringify(row.get(mapping.config["assistant_field"]))
    else:
        return None

    if not user_text or not asst_text:
        return None

    return {
        "messages": [
            {"role": "system", "content": system_prompt or ""},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": asst_text},
        ]
    }