| """ |
| Dataset Preprocessing to JSONL Instruction Format |
| |
| Transforms raw/processed Vivekananda dataset into instruction-style JSONL: |
| - Each line: {"instruction": str, "input": str, "output": str} |
| |
| Usage: |
| python training/data_preprocess.py \ |
| --source data/processed/vivekananda_dataset_1.json \ |
| --out data/datasets/sft_train.jsonl \ |
| --val data/datasets/sft_val.jsonl \ |
| --val-ratio 0.05 |
| |
| Notes: |
| - Heuristics try to form instruction-answer pairs from available fields. |
| - Preserves persona: encourages responses in Vivekananda’s voice. |
| """ |
|
|
import argparse
import json
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
def _first_text(example: Dict[str, Any], *keys: str) -> str:
    """Return the first non-blank string found under ``keys``, stripped.

    Keys are tried in the order given. Values that are missing, ``None``,
    not a ``str``, or whitespace-only are skipped. Returns "" when no key
    yields usable text.
    """
    for key in keys:
        value = example.get(key)
        if isinstance(value, str):
            cleaned = value.strip()
            if cleaned:
                return cleaned
    return ""


def to_instruction(example: Dict[str, Any]) -> Tuple[str, str, str]:
    """Convert a raw example into an (instruction, input, output) triple.

    Heuristics, applied in order:
    - If a question ('question' or 'prompt') and an answer ('answer',
      'response' or 'output') are both present, use them as
      instruction/output with an empty input.
    - Else if 'title' and a passage ('text', 'content' or 'body') are
      present, create an explanation task that names the title.
    - Else if only a passage is present, create a generic "core teaching"
      task.
    - Otherwise fall back to a generic instruction with 'raw' as the input
      and 'target' as the output.

    In the two passage branches the output is the example's 'summary', or ""
    when there is none.

    Every field is read as stripped text: a value only counts as present if
    it is a string with non-whitespace content, so blank or non-string
    values never produce an empty instruction or a non-string output.

    Args:
        example: One raw record from the source dataset.

    Returns:
        A tuple of three strings: (instruction, input, output).
    """
    question = _first_text(example, "question", "prompt")
    answer = _first_text(example, "answer", "response", "output")
    title = _first_text(example, "title")
    text = _first_text(example, "text", "content", "body")
    summary = _first_text(example, "summary")

    # Explicit Q/A pairs take priority over any passage-based task.
    if question and answer:
        return question, "", answer

    if title and text:
        instruction = f"Explain the following passage from '{title}' in the voice of Swami Vivekananda, focusing on its practical message for personal strength and service."
        return instruction, text, summary

    if text:
        instruction = "Convey the core teaching of the passage below in Swami Vivekananda’s tone—intellectual fire, moral strength, cultural pride, and universalism—without modern platitudes."
        return instruction, text, summary

    # Nothing recognisable: fall back to the generic raw/target fields.
    return (
        "Answer in Swami Vivekananda’s voice about the given input.",
        _first_text(example, "raw"),
        _first_text(example, "target"),
    )
|
|
|
|
def process_dataset(source_path: Path) -> List[Dict[str, str]]:
    """Load the source JSON file and convert every example to an instruction row.

    The file must contain either a JSON array of example objects, or a JSON
    object whose "items" key holds such an array (a missing "items" key is
    treated as an empty dataset).

    Args:
        source_path: Path to the UTF-8 encoded JSON dataset.

    Returns:
        One ``{"instruction", "input", "output"}`` dict per example, in file
        order.

    Raises:
        ValueError: If the top-level JSON value, the "items" value, or any
            individual entry does not have the expected shape.
    """
    data = json.loads(source_path.read_text(encoding="utf-8"))

    if isinstance(data, list):
        examples = data
    elif isinstance(data, dict):
        examples = data.get("items", [])
    else:
        raise ValueError(
            f"{source_path}: expected a JSON array or object at the top level, "
            f"got {type(data).__name__}"
        )

    if not isinstance(examples, list):
        raise ValueError(
            f"{source_path}: 'items' must be a JSON array, got {type(examples).__name__}"
        )

    rows = []
    for index, ex in enumerate(examples):
        # Fail with the offending position instead of an opaque AttributeError
        # from the field lookups inside to_instruction.
        if not isinstance(ex, dict):
            raise ValueError(
                f"{source_path}: entry {index} must be a JSON object, got {type(ex).__name__}"
            )
        instr, inp, out = to_instruction(ex)
        rows.append({"instruction": instr, "input": inp, "output": out})
    return rows
|
|
|
|
def split_train_val(
    rows: List[Dict[str, str]],
    val_ratio: float,
    seed: Optional[int] = None,
) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
    """Randomly split ``rows`` into (train, validation) lists.

    The input list is left untouched; shuffling happens on a copy.

    Args:
        rows: Instruction rows to split.
        val_ratio: Fraction of rows to place in the validation split; must be
            within [0, 1]. A ratio of 0 yields an empty validation split.
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` generator is used, so callers that seed
            it globally keep their behaviour.

    Returns:
        ``(train_rows, val_rows)``. When the ratio is positive and there are
        at least two rows, the validation split holds at least one row.

    Raises:
        ValueError: If ``val_ratio`` is outside [0, 1] (including NaN).
    """
    if not 0.0 <= val_ratio <= 1.0:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio!r}")

    shuffled = list(rows)  # copy so the caller's list keeps its order
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    total = len(shuffled)
    n_val = int(total * val_ratio)
    # Guarantee a non-empty validation split for small ratios, but never by
    # taking the only available row away from training.
    if n_val == 0 and val_ratio > 0 and total > 1:
        n_val = 1
    return shuffled[n_val:], shuffled[:n_val]
|
|
|
|
def save_jsonl(rows: List[Dict[str, str]], out_path: Path):
    """Write ``rows`` to ``out_path`` as JSON Lines, one object per line.

    Missing parent directories are created first. The file is opened in
    write mode (replacing any existing content) with UTF-8 encoding, and
    non-ASCII characters are written as-is rather than escaped.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    serialized = (json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    with out_path.open("w", encoding="utf-8") as handle:
        handle.writelines(serialized)
|
|
|
|
def main():
    """Command-line entry point.

    Parses --source, --out, --val and --val-ratio, converts the source
    dataset to instruction rows, splits them into train and validation
    sets, writes each set as JSONL, and prints a one-line summary with the
    output paths and row counts.
    """
    cli = argparse.ArgumentParser(description="Preprocess Vivekananda dataset to JSONL instruction format")
    cli.add_argument("--source", type=str, default="data/processed/vivekananda_dataset_1.json")
    cli.add_argument("--out", type=str, default="data/datasets/sft_train.jsonl")
    cli.add_argument("--val", type=str, default="data/datasets/sft_val.jsonl")
    cli.add_argument("--val-ratio", type=float, default=0.05)
    opts = cli.parse_args()

    train_path, val_path = Path(opts.out), Path(opts.val)

    examples = process_dataset(Path(opts.source))
    train_split, val_split = split_train_val(examples, opts.val_ratio)

    # Train file is written first, then validation.
    for split, destination in ((train_split, train_path), (val_split, val_path)):
        save_jsonl(split, destination)

    print(f"Saved train: {train_path} ({len(train_split)}) | val: {val_path} ({len(val_split)})")
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()