# File size: 3,972 Bytes
# 3c15254
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""
Dataset Preprocessing to JSONL Instruction Format

Transforms raw/processed Vivekananda dataset into instruction-style JSONL:
- Each line: {"instruction": str, "input": str, "output": str}

Usage:
  python training/data_preprocess.py \
    --source data/processed/vivekananda_dataset_1.json \
    --out data/datasets/sft_train.jsonl \
    --val data/datasets/sft_val.jsonl \
    --val-ratio 0.05

Notes:
- Heuristics try to form instruction-answer pairs from available fields.
- Preserves persona: encourages responses in Vivekananda’s voice.
"""

import json
import random
from pathlib import Path
from typing import Dict, Any, List, Tuple
import argparse


def to_instruction(example: Dict[str, Any]) -> Tuple[str, str, str]:
    """Convert a raw example into an ``(instruction, input, output)`` triple.

    Heuristics, tried in order:
    - If a question ('question'/'prompt') and an answer
      ('answer'/'response'/'output') are both present, use them as
      instruction/output with an empty input.
    - Else if 'title' and a passage ('text'/'content'/'body') are present,
      create an explication task that quotes the title.
    - Else if only a passage is present, create a reflection task.
    - Otherwise emit a generic row built from 'raw' and 'target'.

    Every field is normalised the same way: falsy or whitespace-only values
    count as missing, non-string values are converted with ``str()``, and
    surrounding whitespace is stripped. All three returned items are
    therefore always ``str`` (never ``None``).

    Args:
        example: One raw record from the source dataset.

    Returns:
        A tuple ``(instruction, input, output)``; ``input`` and ``output``
        may be empty strings.
    """

    def first_text(*keys: str) -> str:
        """Return the first non-blank value among *keys*, stripped, else ""."""
        for key in keys:
            value = example.get(key)
            if not value:
                continue
            # str() keeps strings untouched and tolerates numbers etc., which
            # would otherwise fail on .strip().
            text = str(value).strip()
            if text:
                return text
        return ""

    question = first_text("question", "prompt")
    answer = first_text("answer", "response", "output")
    title = first_text("title")
    passage = first_text("text", "content", "body")

    if question and answer:
        return question, "", answer

    if title and passage:
        instruction = f"Explain the following passage from '{title}' in the voice of Swami Vivekananda, focusing on its practical message for personal strength and service."
        # Output may be missing; leave empty for SFT where the model learns
        # from labels only when available.
        return instruction, passage, first_text("summary")

    if passage:
        instruction = "Convey the core teaching of the passage below in Swami Vivekananda’s tone—intellectual fire, moral strength, cultural pride, and universalism—without modern platitudes."
        return instruction, passage, first_text("summary")

    # Fallback generic row
    return (
        "Answer in Swami Vivekananda’s voice about the given input.",
        first_text("raw"),
        first_text("target"),
    )


def process_dataset(source_path: Path) -> List[Dict[str, str]]:
    """Load the JSON file at *source_path* and map each example to a row.

    The file is read as UTF-8. If the parsed JSON is a list, its elements are
    the examples; otherwise the examples are taken from its "items" entry
    (an empty list when that key is absent). Each example is passed through
    ``to_instruction`` and stored under the keys "instruction", "input" and
    "output".
    """
    payload = json.loads(source_path.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        examples = payload
    else:
        examples = payload.get("items", [])
    return [
        {"instruction": instruction, "input": input_text, "output": output}
        for instruction, input_text, output in map(to_instruction, examples)
    ]


def split_train_val(rows: List[Dict[str, str]], val_ratio: float) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
    """Randomly split *rows* into ``(train_rows, val_rows)``.

    The split is made on a shuffled copy, so the caller's list keeps its
    original order. Shuffling uses the module-level ``random`` state; seed it
    with ``random.seed`` beforehand for a reproducible split.

    Args:
        rows: Instruction rows to split.
        val_ratio: Fraction of rows to reserve for validation.

    Returns:
        ``(train_rows, val_rows)``. At least one row goes to validation
        whenever *rows* is non-empty (even for a ratio of 0), and never more
        than ``len(rows)``; an empty input yields two empty lists.
    """
    # Copy first: random.shuffle works in place and would otherwise reorder
    # the list owned by the caller.
    shuffled = list(rows)
    random.shuffle(shuffled)
    n_val = min(len(shuffled), max(1, int(len(shuffled) * val_ratio)))
    return shuffled[n_val:], shuffled[:n_val]


def save_jsonl(rows: List[Dict[str, str]], out_path: Path):
    """Write *rows* to *out_path* as JSON Lines, one object per line.

    Missing parent directories are created. The file is opened in write mode
    (replacing any existing content) with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than as escape sequences.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(row, ensure_ascii=False) + "\n" for row in rows
        )


def main():
    """Command-line entry point: parse options, build the rows, write both splits.

    Options (all optional, with defaults): --source (input JSON), --out
    (training JSONL), --val (validation JSONL) and --val-ratio (validation
    fraction). After both files are written, a one-line summary with the
    output paths and row counts is printed.
    """
    cli = argparse.ArgumentParser(description="Preprocess Vivekananda dataset to JSONL instruction format")
    cli.add_argument("--source", type=str, default="data/processed/vivekananda_dataset_1.json")
    cli.add_argument("--out", type=str, default="data/datasets/sft_train.jsonl")
    cli.add_argument("--val", type=str, default="data/datasets/sft_val.jsonl")
    cli.add_argument("--val-ratio", type=float, default=0.05)
    opts = cli.parse_args()

    source_path = Path(opts.source)
    train_out, val_out = Path(opts.out), Path(opts.val)

    all_rows = process_dataset(source_path)
    train_rows, val_rows = split_train_val(all_rows, opts.val_ratio)

    # Training file first, then validation.
    for split_rows, destination in ((train_rows, train_out), (val_rows, val_out)):
        save_jsonl(split_rows, destination)

    print(f"Saved train: {train_out} ({len(train_rows)}) | val: {val_out} ({len(val_rows)})")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()