File size: 1,816 Bytes
390cebe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
"""Turn one raw dataset row into a normalized chat-format record, given a
FieldMapping. Pure function - no network, no HF imports, easy to test.
"""
from __future__ import annotations

from typing import Optional

from models import FieldMapping


def _stringify(value) -> Optional[str]:
    """Coerce a raw field value to text, or None when nothing usable is there.

    ``None`` and strings that are empty or whitespace-only yield ``None``.
    Any other string is returned unchanged (it is not stripped). Every
    non-string value is converted with ``str()``.
    """
    if not isinstance(value, str):
        return None if value is None else str(value)
    # Blank strings count as missing; real text keeps its surrounding whitespace.
    has_text = bool(value.strip())
    return value if has_text else None


def extract_triplet(row: dict, mapping: FieldMapping, system_prompt: str) -> Optional[dict]:
    """Build a system/user/assistant chat record from one raw dataset row.

    Args:
        row: The raw dataset row; fields are read with ``row.get(...)``.
        mapping: Describes where the text lives. ``mapping.kind`` selects the
            layout and ``mapping.config`` supplies the field names:

            * ``"conversation_list"`` - ``config["list_field"]`` names a
              list of turn dicts, ``role_key`` / ``content_key`` name the
              keys inside each turn, and ``human_tag`` / ``gpt_tag`` are the
              role values marking user and assistant turns. The first user
              turn is paired with the first assistant turn that comes
              *after* it. Entries that are not dicts are skipped.
            * ``"flat_pair"`` - ``config["user_field"]`` and
              ``config["assistant_field"]`` name two top-level row fields.
        system_prompt: Text for the system message; a falsy value becomes
            an empty string.

    Returns:
        ``{"messages": [system, user, assistant]}`` in OpenAI/ShareGPT chat
        format, or ``None`` when the mapping kind is not recognised or the
        row has no usable user/assistant text.

    Raises:
        KeyError: If ``mapping.config`` is a dict that lacks a key required
            by the selected kind.
    """
    config = mapping.config

    if mapping.kind == "conversation_list":
        items = row.get(config["list_field"]) or []
        role_key = config["role_key"]
        content_key = config["content_key"]
        human_tag = config["human_tag"]
        gpt_tag = config["gpt_tag"]

        user_turn = None
        reply_turn = None
        for turn in items:
            # Raw data can contain None or stray scalars in the turn list;
            # skip them instead of failing on a missing ``.get``.
            if not isinstance(turn, dict):
                continue
            role = turn.get(role_key)
            if user_turn is None:
                if role == human_tag:
                    user_turn = turn
            elif role == gpt_tag:
                # Only an assistant turn that follows the user turn can be
                # its reply; earlier assistant turns (e.g. a greeting) are
                # deliberately ignored.
                reply_turn = turn
                break

        if user_turn is None or reply_turn is None:
            return None

        user_text = _stringify(user_turn.get(content_key))
        asst_text = _stringify(reply_turn.get(content_key))

    elif mapping.kind == "flat_pair":
        user_text = _stringify(row.get(config["user_field"]))
        asst_text = _stringify(row.get(config["assistant_field"]))

    else:
        # Unknown mapping kind: nothing can be extracted.
        return None

    # Covers both a missing value (None) and a value whose text form is empty.
    if not user_text or not asst_text:
        return None

    return {
        "messages": [
            {"role": "system", "content": system_prompt or ""},
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": asst_text},
        ]
    }