Spaces:
Configuration error
Configuration error
| """ | |
| Output Formatter Module | |
| ========================= | |
| Export datasets in multiple JSONL formats: | |
| - OpenAI Chat JSONL | |
| - Completion JSONL | |
| - Classification JSONL | |
| - Custom schema JSONL | |
| """ | |
| import json | |
| from dataclasses import dataclass, field | |
| from typing import List, Dict, Any, Optional | |
| from pathlib import Path | |
| import pandas as pd | |
@dataclass
class OutputFormatConfig:
    """Configuration for output formatting.

    Declared as a dataclass so that ``field(default_factory=dict)`` is
    turned into a fresh per-instance dict and the generated ``__init__``
    accepts ``format_type`` / ``custom_schema`` as keyword arguments.

    Attributes:
        format_type: Name of the output format. Expected values are
            "openai_chat", "completion", "classification" and "custom".
        custom_schema: Mapping of output_key -> source_column, e.g.
            {"text": "instruction", "label": "category"}.
    """

    # One of: "openai_chat", "completion", "classification", "custom".
    format_type: str = "openai_chat"
    # Maps output_key -> source_column,
    # e.g. {"text": "instruction", "label": "category"}.
    custom_schema: Dict[str, str] = field(default_factory=dict)
def format_openai_chat(
    df: pd.DataFrame,
    system_prompt: str,
    instruction_col: str,
    output_col: str,
    input_col: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Format as OpenAI Chat JSONL.

    Each entry:
    {"messages": [{"role": "system", ...}, {"role": "user", ...}, {"role": "assistant", ...}]}

    Args:
        df: Source rows; one chat entry is produced per row.
        system_prompt: Content of the system message. When empty, no
            system message is emitted.
        instruction_col: Column whose value (via ``str``) becomes the user
            message.
        output_col: Column whose value (via ``str``) becomes the assistant
            message.
        input_col: Optional column holding extra context. When the column
            exists and the row's value is present and non-empty, it is
            appended to the user message as "\\n\\nContext: <value>".

    Returns:
        A list of ``{"messages": [...]}`` dicts, in row order.
    """
    # Loop-invariant: decide once whether a context column is available.
    has_context_col = bool(input_col) and input_col in df.columns

    data = []
    for _, row in df.iterrows():
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})

        user_content = str(row[instruction_col])
        if has_context_col:
            raw_context = row[input_col]
            # Treat None / NaN / pd.NA / NaT as "no context" instead of
            # leaking their string forms ("None", "<NA>", "NaT") into the
            # prompt. The is_scalar guard keeps pd.isna from returning an
            # array for list-like cells.
            is_missing = pd.api.types.is_scalar(raw_context) and bool(pd.isna(raw_context))
            context = "" if is_missing else str(raw_context)
            # The literal 'nan' is still skipped for backward compatibility
            # (presumably covers frames whose NaNs were already stringified).
            if context and context != 'nan':
                user_content += f"\n\nContext: {context}"

        messages.append({"role": "user", "content": user_content})
        messages.append({"role": "assistant", "content": str(row[output_col])})
        data.append({"messages": messages})
    return data
def format_completion(
    df: pd.DataFrame,
    instruction_col: str,
    output_col: str,
) -> List[Dict[str, Any]]:
    """
    Build prompt/completion records for Completion JSONL.

    Every row of ``df`` yields one dict of the form
    {"prompt": "...", "completion": "..."}, where both values are the
    ``str`` conversion of the row's ``instruction_col`` and ``output_col``
    cells respectively. Row order is preserved.
    """
    return [
        {
            "prompt": str(record[instruction_col]),
            "completion": str(record[output_col]),
        }
        for _, record in df.iterrows()
    ]
def format_classification(
    df: pd.DataFrame,
    text_col: str,
    label_col: str,
) -> List[Dict[str, Any]]:
    """
    Build text/label records for Classification JSONL.

    Every row of ``df`` yields one dict of the form
    {"text": "...", "label": "..."}; both values are stringified with
    ``str``. Row order is preserved.
    """
    return [
        dict(text=str(sample[text_col]), label=str(sample[label_col]))
        for _, sample in df.iterrows()
    ]
def format_custom(
    df: pd.DataFrame,
    schema: Dict[str, str],
) -> List[Dict[str, Any]]:
    """
    Project each row through a caller-supplied schema.

    ``schema`` maps output_key -> source_column name. For every row, each
    output key receives the ``str`` of the row's source-column cell; when
    the source column is not present in ``df`` the key is still emitted,
    with an empty string as its value.
    """
    return [
        {
            target_key: (str(record[source_name]) if source_name in df.columns else "")
            for target_key, source_name in schema.items()
        }
        for _, record in df.iterrows()
    ]
def export_jsonl(data: List[Dict[str, Any]], path: str) -> str:
    """Serialize ``data`` to ``path`` as JSONL and return the path as a string.

    Missing parent directories are created first. Each dict is written as
    one JSON document per line, UTF-8 encoded, with non-ASCII characters
    kept verbatim (``ensure_ascii=False``).
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('w', encoding='utf-8') as handle:
        handle.writelines(
            json.dumps(item, ensure_ascii=False) + '\n' for item in data
        )
    return str(target)
def generate_preview(data: List[Dict[str, Any]], n: int = 3) -> str:
    """Render the leading ``n`` entries of ``data`` as indented JSON text.

    The slice ``data[:n]`` is dumped with a two-space indent and with
    non-ASCII characters left unescaped.
    """
    head = data[:n]
    preview_text = json.dumps(head, indent=2, ensure_ascii=False)
    return preview_text
def format_dataset(
    df: pd.DataFrame,
    config: OutputFormatConfig,
    system_prompt: str = "",
    instruction_col: str = "",
    output_col: str = "",
    input_col: Optional[str] = None,
    label_col: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Dispatch to the formatter selected by ``config.format_type``.

    - "completion" -> ``format_completion``
    - "classification" -> ``format_classification``; the text column is
      ``instruction_col`` or, when that is empty, the first column of
      ``df`` (or "" if ``df`` has no columns), and the label column is
      ``label_col`` falling back to ``output_col``
    - "custom" -> ``format_custom`` with ``config.custom_schema``
    - "openai_chat" and any other value -> ``format_openai_chat``
    """
    kind = config.format_type

    if kind == "completion":
        return format_completion(df, instruction_col, output_col)

    if kind == "classification":
        if instruction_col:
            text_col = instruction_col
        elif len(df.columns) > 0:
            text_col = list(df.columns)[0]
        else:
            text_col = ""
        lbl_col = label_col or output_col
        return format_classification(df, text_col, lbl_col)

    if kind == "custom":
        return format_custom(df, config.custom_schema)

    # "openai_chat" and every unrecognized format type share the chat formatter.
    return format_openai_chat(df, system_prompt, instruction_col, output_col, input_col)