Auto-FineTune-Ops / preprocessing /output_formatter.py
aneeb15's picture
Initial release of Auto-FineTune-Ops
d4398e6
"""
Output Formatter Module
=========================
Export datasets in multiple JSONL formats:
- OpenAI Chat JSONL
- Completion JSONL
- Classification JSONL
- Custom schema JSONL
"""
import json
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
from pathlib import Path
import pandas as pd
@dataclass
class OutputFormatConfig:
    """Settings that select how a dataset is serialised to JSONL.

    Attributes:
        format_type: Name of the target layout. Recognised values are
            "openai_chat", "completion", "classification" and "custom".
        custom_schema: Mapping of output key -> source column name, e.g.
            {"text": "instruction", "label": "category"}. Each instance
            gets its own empty dict by default.
    """

    format_type: str = "openai_chat"
    custom_schema: Dict[str, str] = field(default_factory=dict)
def format_openai_chat(
    df: pd.DataFrame,
    system_prompt: str,
    instruction_col: str,
    output_col: str,
    input_col: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Format as OpenAI Chat JSONL.

    Each entry: {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}

    Args:
        df: Source rows; one chat entry is produced per row.
        system_prompt: Text of the system message. When empty, no system
            message is emitted.
        instruction_col: Column whose stringified value becomes the user message.
        output_col: Column whose stringified value becomes the assistant message.
        input_col: Optional column with extra context. When the column exists
            and the cell holds a value, "\\n\\nContext: <value>" is appended to
            the user message.

    Returns:
        A list with one {"messages": [...]} dict per row of ``df``.
    """
    # Column presence is the same for every row, so decide once up front.
    use_context = bool(input_col) and input_col in df.columns

    data = []
    for _, row in df.iterrows():
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})

        user_content = str(row[instruction_col])
        if use_context:
            raw_context = row[input_col]
            # Missing cells may be None, NaN, pd.NA or NaT. Comparing only the
            # stringified value to 'nan' would leak "Context: None" or
            # "Context: <NA>" into the prompt. is_scalar guards pd.isna
            # against list-like cells, where it returns an array.
            is_missing = pd.api.types.is_scalar(raw_context) and pd.isna(raw_context)
            if not is_missing:
                context = str(raw_context)
                # The literal 'nan' is still skipped so frames that were
                # stringified upstream keep their previous behaviour.
                if context and context != 'nan':
                    user_content += f"\n\nContext: {context}"

        messages.append({"role": "user", "content": user_content})
        messages.append({"role": "assistant", "content": str(row[output_col])})
        data.append({"messages": messages})
    return data
def format_completion(
    df: pd.DataFrame,
    instruction_col: str,
    output_col: str,
) -> List[Dict[str, Any]]:
    """
    Build prompt/completion records for Completion JSONL.

    Every row of ``df`` yields {"prompt": "...", "completion": "..."}, where
    both values are the ``str()`` of the cell in ``instruction_col`` and
    ``output_col`` respectively.
    """
    return [
        {
            "prompt": str(record[instruction_col]),
            "completion": str(record[output_col]),
        }
        for _, record in df.iterrows()
    ]
def format_classification(
    df: pd.DataFrame,
    text_col: str,
    label_col: str,
) -> List[Dict[str, Any]]:
    """
    Build text/label records for Classification JSONL.

    Every row of ``df`` yields {"text": "...", "label": "..."}; both values
    are stringified with ``str()``.
    """

    def to_record(row) -> Dict[str, Any]:
        # One output record per DataFrame row.
        text_value = str(row[text_col])
        label_value = str(row[label_col])
        return {"text": text_value, "label": label_value}

    return [to_record(row) for _, row in df.iterrows()]
def format_custom(
    df: pd.DataFrame,
    schema: Dict[str, str],
) -> List[Dict[str, Any]]:
    """
    Build records shaped by a caller-supplied schema.

    ``schema`` maps each output key to the name of a source column. For every
    row, an output key receives the ``str()`` of that column's cell, or an
    empty string when the column is not present in ``df``.
    """
    available = df.columns
    return [
        {
            out_key: (str(row[src_col]) if src_col in available else "")
            for out_key, src_col in schema.items()
        }
        for _, row in df.iterrows()
    ]
def export_jsonl(data: List[Dict[str, Any]], path: str) -> str:
    """Serialise ``data`` to ``path`` as JSONL and return the path as a string.

    Missing parent directories are created. Each dict is written as one JSON
    document per line, UTF-8 encoded, with non-ASCII characters kept as-is.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )
    return str(target)
def generate_preview(data: List[Dict[str, Any]], n: int = 3) -> str:
    """Render the leading ``n`` entries of ``data`` as indented JSON text.

    The slice ``data[:n]`` is dumped as a JSON array with two-space
    indentation; non-ASCII characters are not escaped.
    """
    head = data[:n]
    preview = json.dumps(head, indent=2, ensure_ascii=False)
    return preview
def format_dataset(
    df: pd.DataFrame,
    config: OutputFormatConfig,
    system_prompt: str = "",
    instruction_col: str = "",
    output_col: str = "",
    input_col: Optional[str] = None,
    label_col: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Dispatch ``df`` to the formatter selected by ``config.format_type``.

    - "completion": prompt/completion records from ``instruction_col`` and
      ``output_col``.
    - "classification": text/label records. The text column is
      ``instruction_col`` or, when that is empty, the first column of ``df``
      (or "" if there are no columns). The label column is ``label_col`` or,
      when that is not given, ``output_col``.
    - "custom": records shaped by ``config.custom_schema``.
    - "openai_chat" and any unrecognised value: chat-message records.
    """
    kind = config.format_type

    if kind == "completion":
        return format_completion(df, instruction_col, output_col)

    if kind == "classification":
        columns = list(df.columns)
        text_col = instruction_col or (columns[0] if columns else "")
        return format_classification(df, text_col, label_col or output_col)

    if kind == "custom":
        return format_custom(df, config.custom_schema)

    # "openai_chat" and every unknown format type share the chat layout.
    return format_openai_chat(df, system_prompt, instruction_col, output_col, input_col)