r"""Dataset preprocessing to JSONL instruction format.

Transforms the raw/processed Vivekananda dataset into instruction-style JSONL.
Each output line is one JSON object:

    {"instruction": str, "input": str, "output": str}

Usage:
    python training/data_preprocess.py \
        --source data/processed/vivekananda_dataset_1.json \
        --out data/datasets/sft_train.jsonl \
        --val data/datasets/sft_val.jsonl \
        --val-ratio 0.05

Notes:
    - Heuristics try to form instruction-answer pairs from available fields.
    - Preserves persona: encourages responses in Vivekananda’s voice.
    - Pass --seed to make the train/validation split reproducible.
"""

import argparse
import json
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Instruction used when a passage comes with a title.
_EXPLAIN_TEMPLATE = (
    "Explain the following passage from '{title}' in the voice of Swami "
    "Vivekananda, focusing on its practical message for personal strength "
    "and service."
)

# Instruction used when only a bare passage is available.
_CONVEY_INSTRUCTION = (
    "Convey the core teaching of the passage below in Swami Vivekananda’s "
    "tone—intellectual fire, moral strength, cultural pride, and "
    "universalism—without modern platitudes."
)

# Instruction used when none of the known fields are present.
_FALLBACK_INSTRUCTION = "Answer in Swami Vivekananda’s voice about the given input."


def _as_text(value: Any) -> str:
    """Coerce *value* to ``str``; ``None`` becomes the empty string."""
    if value is None:
        return ""
    return value if isinstance(value, str) else str(value)


def _first_text(example: Dict[str, Any], *keys: str) -> str:
    """Return the first non-blank, stripped value among *keys*, or ``""``."""
    for key in keys:
        text = _as_text(example.get(key)).strip()
        if text:
            return text
    return ""


def to_instruction(example: Dict[str, Any]) -> Tuple[str, str, str]:
    """Convert a raw example into ``(instruction, input, output)``.

    Heuristics, tried in order:
      1. A question (``question``/``prompt``) plus an answer
         (``answer``/``response``/``output``) become instruction and output.
      2. A title plus a passage (``text``/``content``/``body``) become an
         "explain this passage" task with the passage as input.
      3. A passage alone becomes a "convey the core teaching" task.
      4. Otherwise a generic row is built from ``raw`` and ``target``.

    Field values are coerced to text and stripped before the emptiness test,
    so whitespace-only or non-string values neither crash nor win over a
    usable alias. All three returned items are always ``str``.
    """
    question = _first_text(example, "question", "prompt")
    answer = _first_text(example, "answer", "response", "output")
    if question and answer:
        return question, "", answer

    passage = _first_text(example, "text", "content", "body")
    if passage:
        title = _first_text(example, "title")
        instruction = (
            _EXPLAIN_TEMPLATE.format(title=title) if title else _CONVEY_INSTRUCTION
        )
        # Output may be missing; it is left empty so the row is still emitted.
        return instruction, passage, _as_text(example.get("summary") or "")

    # Fallback generic row.
    return (
        _FALLBACK_INSTRUCTION,
        _as_text(example.get("raw", "")),
        _as_text(example.get("target", "")),
    )


def process_dataset(source_path: Path) -> List[Dict[str, str]]:
    """Load the JSON file at *source_path* and convert every example to a row.

    The file may hold either a top-level list of examples or an object whose
    ``"items"`` key holds that list (a missing key yields no rows).
    """
    data = json.loads(source_path.read_text(encoding="utf-8"))
    examples = data if isinstance(data, list) else data.get("items", [])
    rows = []
    for example in examples:
        instruction, input_text, output = to_instruction(example)
        rows.append({"instruction": instruction, "input": input_text, "output": output})
    return rows


def split_train_val(
    rows: List[Dict[str, str]],
    val_ratio: float,
    seed: Optional[int] = None,
) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
    """Shuffle *rows* and split them into ``(train_rows, val_rows)``.

    The validation part gets ``int(len(rows) * val_ratio)`` rows, but never
    fewer than one when any rows exist. The caller's list is left untouched:
    the shuffle runs on a copy.

    Args:
        rows: Rows to split.
        val_ratio: Fraction of rows to put in the validation set.
        seed: If given, the shuffle uses a private ``random.Random(seed)`` and
            is reproducible; if ``None``, the module-level ``random`` state is
            used.
    """
    shuffled = list(rows)
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(shuffled)
    n_val = max(1, int(len(shuffled) * val_ratio))
    return shuffled[n_val:], shuffled[:n_val]


def save_jsonl(rows: List[Dict[str, str]], out_path: Path):
    """Write *rows* to *out_path* as UTF-8 JSONL, creating parent directories."""
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        for row in rows:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.write(json.dumps(row, ensure_ascii=False) + "\n")


def main():
    """Parse CLI arguments, build the rows, and write train/validation files."""
    parser = argparse.ArgumentParser(
        description="Preprocess Vivekananda dataset to JSONL instruction format"
    )
    parser.add_argument("--source", type=str, default="data/processed/vivekananda_dataset_1.json")
    parser.add_argument("--out", type=str, default="data/datasets/sft_train.jsonl")
    parser.add_argument("--val", type=str, default="data/datasets/sft_val.jsonl")
    parser.add_argument("--val-ratio", type=float, default=0.05)
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Optional seed for a reproducible train/validation split",
    )
    args = parser.parse_args()

    source = Path(args.source)
    train_out = Path(args.out)
    val_out = Path(args.val)

    rows = process_dataset(source)
    train_rows, val_rows = split_train_val(rows, args.val_ratio, seed=args.seed)

    save_jsonl(train_rows, train_out)
    save_jsonl(val_rows, val_out)

    print(f"Saved train: {train_out} ({len(train_rows)}) | val: {val_out} ({len(val_rows)})")


if __name__ == "__main__":
    main()