Spaces:
Running
Running
| """Turn one raw dataset row into a normalized chat-format record, given a | |
| FieldMapping. Pure function - no network, no HF imports, easy to test. | |
| """ | |
| from __future__ import annotations | |
| from typing import Optional | |
| from models import FieldMapping | |
| def _stringify(value) -> Optional[str]: | |
| if value is None: | |
| return None | |
| if isinstance(value, str): | |
| return value if value.strip() else None | |
| return str(value) | |
def extract_triplet(row: dict, mapping: FieldMapping, system_prompt: str) -> Optional[dict]:
    """Build a chat-format record from one dataset row.

    Two mapping kinds are handled, selected by ``mapping.kind``:

    * ``"conversation_list"``: the row holds a list of turn dicts under
      ``config["list_field"]``. The user text is the content of the first
      turn whose ``config["role_key"]`` equals ``config["human_tag"]``; the
      assistant text is the content of the first turn *after it* whose role
      equals ``config["gpt_tag"]``.
    * ``"flat_pair"``: the user and assistant texts are read directly from
      ``config["user_field"]`` and ``config["assistant_field"]``.

    Args:
        row: The raw dataset row.
        mapping: Describes where the user/assistant text lives in ``row``
            via its ``kind`` and ``config`` attributes.
        system_prompt: Content of the leading system message. A falsy value
            is emitted as an empty string.

    Returns:
        ``{"messages": [system, user, assistant]}``, or None if the mapping
        kind is not recognised or the row has no usable user/assistant text.

    Raises:
        KeyError: If ``mapping.config`` lacks a key required by its kind.
    """
    config = mapping.config
    user_text: Optional[str] = None
    asst_text: Optional[str] = None

    if mapping.kind == "conversation_list":
        turns = row.get(config["list_field"])
        role_key = config["role_key"]
        content_key = config["content_key"]
        human_tag = config["human_tag"]
        gpt_tag = config["gpt_tag"]

        # A missing, null or non-list field cannot hold a conversation.
        if not isinstance(turns, (list, tuple)):
            return None

        raw_user = None
        raw_asst = None
        user_seen = False
        for turn in turns:
            # Skip malformed entries (None, bare strings, ...) rather than
            # letting one bad turn raise and abort the caller.
            if not isinstance(turn, dict):
                continue
            role = turn.get(role_key)
            if not user_seen:
                if role == human_tag:
                    raw_user = turn.get(content_key)
                    user_seen = True
            elif role == gpt_tag:
                # Only an assistant turn that follows the user turn can be
                # its reply; earlier assistant turns (e.g. a greeting) are
                # ignored so prompt and answer are never mismatched.
                raw_asst = turn.get(content_key)
                break

        user_text = _stringify(raw_user)
        asst_text = _stringify(raw_asst)
    elif mapping.kind == "flat_pair":
        user_text = _stringify(row.get(config["user_field"]))
        asst_text = _stringify(row.get(config["assistant_field"]))
    else:
        return None

    if not user_text or not asst_text:
        return None

    return {
        "messages": [
            {"role": "system", "content": system_prompt or ""},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": asst_text},
        ]
    }