File size: 4,726 Bytes
d4398e6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""

Output Formatter Module

=========================

Export datasets in multiple JSONL formats:

- OpenAI Chat JSONL

- Completion JSONL

- Classification JSONL

- Custom schema JSONL

"""

import json
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from pathlib import Path
import pandas as pd


@dataclass
class OutputFormatConfig:
    """Settings that select how a dataset is rendered for export.

    Attributes:
        format_type: Name of the output layout. Recognised values are
            "openai_chat", "completion", "classification" and "custom".
        custom_schema: Mapping of output_key -> source_column, e.g.
            {"text": "instruction", "label": "category"}.
    """

    format_type: str = "openai_chat"
    # default_factory gives every instance its own dict instead of a shared one.
    custom_schema: Dict[str, str] = field(default_factory=dict)


def format_openai_chat(
    df: pd.DataFrame,
    system_prompt: str,
    instruction_col: str,
    output_col: str,
    input_col: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Format as OpenAI Chat JSONL.

    Each entry: {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}

    Args:
        df: Source rows; one chat entry is produced per row.
        system_prompt: Text of the leading "system" message. When empty, no
            system message is emitted.
        instruction_col: Column whose stringified value becomes the user message.
        output_col: Column whose stringified value becomes the assistant message.
        input_col: Optional column with extra context. When the column exists
            and the cell holds a value, "\\n\\nContext: <value>" is appended to
            the user message. Missing cells (None, NaN, pd.NA, NaT), empty
            strings and the literal text 'nan' add nothing.

    Returns:
        A list of {"messages": [...]} dicts in row order.
    """
    # Loop-invariant: decide once whether a usable context column is present.
    use_context = bool(input_col) and input_col in df.columns

    data = []
    for _, row in df.iterrows():
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})

        user_content = str(row[instruction_col])
        if use_context:
            raw_context = row[input_col]
            # str(None) is "None" and str(pd.NA) is "<NA>", so a string
            # comparison alone lets missing cells leak into the prompt.
            # pd.isna is only applied to scalars: for list-like cells it
            # returns an array whose truth value is ambiguous.
            is_missing = pd.api.types.is_scalar(raw_context) and pd.isna(raw_context)
            context = '' if is_missing else str(raw_context)
            # The 'nan' comparison is kept so existing callers see the same
            # output for data that was already stringified upstream.
            if context and context != 'nan':
                user_content += f"\n\nContext: {context}"

        messages.append({"role": "user", "content": user_content})
        messages.append({"role": "assistant", "content": str(row[output_col])})

        data.append({"messages": messages})
    return data


def format_completion(
    df: pd.DataFrame,
    instruction_col: str,
    output_col: str,
) -> List[Dict[str, Any]]:
    """
    Build prompt/completion records, one per DataFrame row.

    Both values are converted with str(), so every record has the shape
    {"prompt": "...", "completion": "..."}. Records keep the row order.
    """
    return [
        {
            "prompt": str(record[instruction_col]),
            "completion": str(record[output_col]),
        }
        for _, record in df.iterrows()
    ]


def format_classification(
    df: pd.DataFrame,
    text_col: str,
    label_col: str,
) -> List[Dict[str, Any]]:
    """
    Build text/label records, one per DataFrame row.

    Both values are converted with str(), so every record has the shape
    {"text": "...", "label": "..."}. Records keep the row order.
    """
    return [
        {
            "text": str(record[text_col]),
            "label": str(record[label_col]),
        }
        for _, record in df.iterrows()
    ]


def format_custom(
    df: pd.DataFrame,
    schema: Dict[str, str],
) -> List[Dict[str, Any]]:
    """
    Build records whose keys and source columns are given by `schema`.

    schema maps output_key -> source_column name. For each row, every output
    key receives str() of the source column's value, or "" when the source
    column is not present in the DataFrame.
    """
    available = df.columns
    records = []
    for _, record in df.iterrows():
        records.append({
            target: str(record[source]) if source in available else ""
            for target, source in schema.items()
        })
    return records


def export_jsonl(data: List[Dict[str, Any]], path: str) -> str:
    """Serialise each dict in `data` as one JSON line in the file at `path`.

    Missing parent directories are created, an existing file is overwritten,
    and non-ASCII characters are written as-is in UTF-8. Returns the
    destination path as a string.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    serialised = (json.dumps(record, ensure_ascii=False) + '\n' for record in data)
    with target.open('w', encoding='utf-8') as handle:
        handle.writelines(serialised)

    return str(target)


def generate_preview(data: List[Dict[str, Any]], n: int = 3) -> str:
    """Render the slice data[:n] as indented (2-space) JSON text, keeping non-ASCII characters."""
    sample = data[:n]
    return json.dumps(sample, ensure_ascii=False, indent=2)


def format_dataset(
    df: pd.DataFrame,
    config: OutputFormatConfig,
    system_prompt: str = "",
    instruction_col: str = "",
    output_col: str = "",
    input_col: Optional[str] = None,
    label_col: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Dispatch to the formatter selected by `config.format_type`.

    - "completion": prompt/completion records from instruction_col/output_col.
    - "classification": text/label records. The text column is instruction_col,
      or the DataFrame's first column when instruction_col is empty; the label
      column is label_col, or output_col when label_col is empty.
    - "custom": records built from config.custom_schema.
    - "openai_chat" and any other value: chat-message records.
    """
    kind = config.format_type

    if kind == "completion":
        return format_completion(df, instruction_col, output_col)

    if kind == "classification":
        if instruction_col:
            text_col = instruction_col
        elif len(df.columns) > 0:
            text_col = list(df.columns)[0]
        else:
            text_col = ""
        return format_classification(df, text_col, label_col or output_col)

    if kind == "custom":
        return format_custom(df, config.custom_schema)

    # "openai_chat" is also the fallback for unrecognised format types.
    return format_openai_chat(df, system_prompt, instruction_col, output_col, input_col)