Dataset-Creator / field_mapper.py
TitleOS's picture
Upload 9 files
390cebe verified
Raw
History Blame Contribute Delete
1.82 kB
"""Turn one raw dataset row into a normalized chat-format record, given a
FieldMapping. Pure function - no network, no HF imports, easy to test.
"""
from __future__ import annotations
from typing import Optional
from models import FieldMapping
def _stringify(value) -> Optional[str]:
if value is None:
return None
if isinstance(value, str):
return value if value.strip() else None
return str(value)
def extract_triplet(row: dict, mapping: FieldMapping, system_prompt: str) -> Optional[dict]:
    """Build one system/user/assistant chat record from a raw dataset row.

    Args:
        row: The raw dataset row.
        mapping: Tells where the text lives. ``mapping.kind`` selects the
            layout and ``mapping.config`` names the fields:

            * ``"conversation_list"`` - ``list_field`` is the row key holding
              a list of turn mappings; ``role_key`` / ``content_key`` are the
              keys read from each turn, and ``human_tag`` / ``gpt_tag`` are
              the role values identifying user and assistant turns.
            * ``"flat_pair"`` - ``user_field`` / ``assistant_field`` are keys
              read directly from the row.
        system_prompt: Content of the leading system message; a falsy value
            is emitted as an empty string.

    Returns:
        ``{"messages": [...]}`` in OpenAI/ShareGPT chat format (system, user,
        assistant - in that order), or ``None`` if ``mapping.kind`` is not
        recognised or the row has no usable user/assistant text.

    Raises:
        KeyError: If ``mapping.config`` lacks a key required by the kind.
    """
    user_text: Optional[str] = None
    asst_text: Optional[str] = None

    if mapping.kind == "conversation_list":
        items = row.get(mapping.config["list_field"]) or []
        role_key = mapping.config["role_key"]
        content_key = mapping.config["content_key"]
        human_tag = mapping.config["human_tag"]
        gpt_tag = mapping.config["gpt_tag"]

        # Skip malformed entries (None, bare strings, ...) instead of crashing
        # on `.get`; a row with no usable turns simply yields None below.
        turns = (item for item in items if callable(getattr(item, "get", None)))

        # Both searches consume the SAME iterator on purpose: the assistant
        # search resumes right after the chosen user turn, so the reply can
        # never be a message that came before the prompt it is paired with.
        user_text = _stringify(
            next((turn.get(content_key) for turn in turns if turn.get(role_key) == human_tag), None)
        )
        asst_text = _stringify(
            next((turn.get(content_key) for turn in turns if turn.get(role_key) == gpt_tag), None)
        )
    elif mapping.kind == "flat_pair":
        user_text = _stringify(row.get(mapping.config["user_field"]))
        asst_text = _stringify(row.get(mapping.config["assistant_field"]))
    else:
        # Unknown mapping kinds are treated as "nothing to extract".
        return None

    if not user_text or not asst_text:
        return None

    return {
        "messages": [
            {"role": "system", "content": system_prompt or ""},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": asst_text},
        ]
    }