# VivekanandaAI / data_preprocess.py
# jyotirmoy05's picture
# Upload 18 files
# 3c15254 verified
"""
Dataset Preprocessing to JSONL Instruction Format
Transforms raw/processed Vivekananda dataset into instruction-style JSONL:
- Each line: {"instruction": str, "input": str, "output": str}
Usage:
    python training/data_preprocess.py \
        --source data/processed/vivekananda_dataset_1.json \
        --out data/datasets/sft_train.jsonl \
        --val data/datasets/sft_val.jsonl \
        --val-ratio 0.05
Notes:
- Heuristics try to form instruction-answer pairs from available fields.
- Preserves persona: encourages responses in Vivekananda’s voice.
"""
import argparse
import json
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
def _first_text(example: Dict[str, Any], *keys: str) -> str:
    """Return the first non-blank string stored under ``keys``, stripped.

    Values that are not strings, or that contain only whitespace, are skipped
    so a blank field cannot mask a usable alias. Returns "" when no key
    qualifies.
    """
    for key in keys:
        value = example.get(key)
        if isinstance(value, str):
            value = value.strip()
            if value:
                return value
    return ""


def to_instruction(example: Dict[str, Any]) -> Tuple[str, str, str]:
    """Convert a raw example into an (instruction, input, output) triple.

    Heuristics, tried in order:
    - A question ('question' or 'prompt') together with an answer ('answer',
      'response' or 'output') becomes instruction/output with an empty input.
    - A 'title' plus passage text ('text', 'content' or 'body') becomes an
      explication task whose input is the passage.
    - Passage text alone becomes a "convey the core teaching" task.
    - Otherwise a generic row is built from the 'raw' and 'target' fields.

    Question, answer and passage fields are stripped; blank or non-string
    values are treated as missing rather than raising on ``.strip()``.
    """
    question = _first_text(example, "question", "prompt")
    answer = _first_text(example, "answer", "response", "output")
    title = example.get("title")
    passage = _first_text(example, "text", "content", "body")

    if question and answer:
        return question, "", answer

    if title and passage:
        instruction = f"Explain the following passage from '{title}' in the voice of Swami Vivekananda, focusing on its practical message for personal strength and service."
        # 'summary' is optional; the output stays empty when it is missing.
        return instruction, passage, example.get("summary") or ""

    if passage:
        instruction = "Convey the core teaching of the passage below in Swami Vivekananda’s tone—intellectual fire, moral strength, cultural pride, and universalism—without modern platitudes."
        return instruction, passage, example.get("summary") or ""

    # Fallback generic row; `or ""` keeps an explicit null from leaking out
    # as None in place of a string.
    return (
        "Answer in Swami Vivekananda’s voice about the given input.",
        example.get("raw") or "",
        example.get("target") or "",
    )
def process_dataset(source_path: Path) -> List[Dict[str, str]]:
    """Load a JSON dataset file and convert every example to an instruction row.

    The file is read as UTF-8. A top-level JSON list is used directly; any
    other top-level value is expected to expose its examples under an
    "items" key (an absent key yields no rows). Each example is passed
    through ``to_instruction`` and returned as a dict with the keys
    "instruction", "input" and "output".
    """
    payload = json.loads(source_path.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        examples = payload
    else:
        examples = payload.get("items", [])

    converted = (to_instruction(item) for item in examples)
    return [
        {"instruction": instruction, "input": input_text, "output": output}
        for instruction, input_text, output in converted
    ]
def split_train_val(
    rows: List[Dict[str, str]],
    val_ratio: float,
    seed: Optional[int] = None,
) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
    """Randomly split ``rows`` into (train, validation) lists.

    Args:
        rows: Instruction rows to split. The list is not modified; the
            shuffle is performed on a copy.
        val_ratio: Fraction of rows assigned to validation. At least one row
            is always requested for validation, so a very small dataset may
            end up with an empty training split.
        seed: Optional seed for a reproducible split. When None, the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call still controls the outcome.

    Returns:
        A ``(train_rows, val_rows)`` tuple of new lists.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(rows)
    if seed is None:
        random.shuffle(shuffled)
    else:
        # A private generator avoids disturbing global random state.
        random.Random(seed).shuffle(shuffled)

    n_val = max(1, int(len(shuffled) * val_ratio))
    return shuffled[n_val:], shuffled[:n_val]
def save_jsonl(rows: List[Dict[str, str]], out_path: Path):
    """Write ``rows`` to ``out_path`` as JSON Lines, one object per line.

    Missing parent directories are created first. The file is opened in
    write mode (replacing any existing content) with UTF-8 encoding, and
    non-ASCII characters are written as-is rather than escaped.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Lazily serialised, so each row is encoded as it is written.
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in rows)
    with out_path.open("w", encoding="utf-8") as handle:
        handle.writelines(serialized)
def main():
    """Command-line entry point: convert the dataset and write both splits.

    Parses --source, --out, --val and --val-ratio, converts the source file
    into instruction rows, splits them into train and validation sets, writes
    each set as JSONL and prints a one-line summary with the row counts.
    """
    cli = argparse.ArgumentParser(
        description="Preprocess Vivekananda dataset to JSONL instruction format"
    )
    # The three path options share the same shape, so register them together.
    for flag, default in (
        ("--source", "data/processed/vivekananda_dataset_1.json"),
        ("--out", "data/datasets/sft_train.jsonl"),
        ("--val", "data/datasets/sft_val.jsonl"),
    ):
        cli.add_argument(flag, type=str, default=default)
    cli.add_argument("--val-ratio", type=float, default=0.05)
    opts = cli.parse_args()

    train_path = Path(opts.out)
    val_path = Path(opts.val)

    all_rows = process_dataset(Path(opts.source))
    train_split, val_split = split_train_val(all_rows, opts.val_ratio)

    save_jsonl(train_split, train_path)
    save_jsonl(val_split, val_path)
    print(f"Saved train: {train_path} ({len(train_split)}) | val: {val_path} ({len(val_split)})")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()