VivekanandaAI / data_preprocess.py

Upload 18 files

3c15254 verified about 2 months ago

3.97 kB

	"""
	Dataset Preprocessing to JSONL Instruction Format

	Transforms raw/processed Vivekananda dataset into instruction-style JSONL:
	- Each line: {"instruction": str, "input": str, "output": str}

	Usage:
	python training/data_preprocess.py \
	--source data/processed/vivekananda_dataset_1.json \
	--out data/datasets/sft_train.jsonl \
	--val data/datasets/sft_val.jsonl \
	--val-ratio 0.05

	Notes:
	- Heuristics try to form instruction-answer pairs from available fields.
	- Preserves persona: encourages responses in Vivekananda’s voice.
	"""

	import argparse
	import json
	import random
	from pathlib import Path
	from typing import Any, Dict, List, Optional, Tuple


	def _first_text(example: Dict[str, Any], *keys: str) -> str:
	    """Return the first non-blank string stored under any of *keys*, stripped.

	    Keys are tried in order. Values that are missing, not strings, or made up
	    only of whitespace are skipped. Returns "" when no key yields usable text.
	    """
	    for key in keys:
	        value = example.get(key)
	        if isinstance(value, str):
	            stripped = value.strip()
	            if stripped:
	                return stripped
	    return ""


	def to_instruction(example: Dict[str, Any]) -> Tuple[str, str, str]:
	    """Convert a raw example into (instruction, input, output).

	    Heuristics, tried in order:
	    - If the example has a question ('question' or 'prompt') and an answer
	      ('answer', 'response' or 'output'), use them as instruction/output.
	    - Else if it has 'title' and a passage ('text', 'content' or 'body'),
	      create an explication task with the passage as input.
	    - Else if it only has a passage, create a reflection task.
	    - Otherwise emit a generic row built from 'raw' and 'target'.

	    A field counts as present only if it holds a non-blank string; the
	    presence test is made on the stripped value so that whitespace-only
	    fields neither produce empty instructions nor hide a usable alias key.

	    Args:
	        example: One raw record from the source dataset.

	    Returns:
	        A tuple (instruction, input, output). Every element is a stripped
	        string; 'input' and 'output' may be empty.
	    """
	    question = _first_text(example, "question", "prompt")
	    answer = _first_text(example, "answer", "response", "output")
	    title = _first_text(example, "title")
	    text = _first_text(example, "text", "content", "body")

	    if question and answer:
	        return question, "", answer

	    if title and text:
	        instruction = f"Explain the following passage from '{title}' in the voice of Swami Vivekananda, focusing on its practical message for personal strength and service."
	        # The output may be missing; it is left empty when no summary is available.
	        return instruction, text, _first_text(example, "summary")

	    if text:
	        instruction = "Convey the core teaching of the passage below in Swami Vivekananda’s tone—intellectual fire, moral strength, cultural pride, and universalism—without modern platitudes."
	        return instruction, text, _first_text(example, "summary")

	    # Fallback generic row
	    return (
	        "Answer in Swami Vivekananda’s voice about the given input.",
	        _first_text(example, "raw"),
	        _first_text(example, "target"),
	    )


	def process_dataset(source_path: Path) -> List[Dict[str, str]]:
	    """Load a JSON dataset and turn every example into an instruction row.

	    The file is read as UTF-8 JSON. A top-level list is used as the example
	    collection directly; any other top-level value is expected to expose the
	    examples under its "items" key (an absent key means no examples).

	    Args:
	        source_path: Path to the source JSON file.

	    Returns:
	        One dict per example with the keys "instruction", "input" and
	        "output", in the order the examples appear in the file.
	    """
	    payload = json.loads(source_path.read_text(encoding="utf-8"))
	    if isinstance(payload, list):
	        examples = payload
	    else:
	        examples = payload.get("items", [])

	    triples = map(to_instruction, examples)
	    return [
	        {"instruction": instruction, "input": input_text, "output": output}
	        for instruction, input_text, output in triples
	    ]


	def split_train_val(
	    rows: List[Dict[str, str]],
	    val_ratio: float,
	    *,
	    seed: Optional[int] = None,
	) -> Tuple[List[Dict[str, str]], List[Dict[str, str]]]:
	    """Randomly split *rows* into a training list and a validation list.

	    The input list is left untouched; the split is made on a shuffled copy.

	    Args:
	        rows: Instruction rows to split.
	        val_ratio: Fraction of the rows to place in the validation split.
	        seed: When given, the shuffle uses a private generator seeded with
	            this value so the split is reproducible. When None, the
	            module-level ``random`` generator is used.

	    Returns:
	        A tuple (train_rows, val_rows). At least one row goes to validation
	        whenever there are two or more rows, and the training split is never
	        emptied while rows exist: a single-row dataset is kept for training.
	    """
	    # Work on a copy so the caller's list keeps its original order.
	    shuffled = list(rows)
	    if seed is None:
	        random.shuffle(shuffled)
	    else:
	        random.Random(seed).shuffle(shuffled)

	    n_val = max(1, int(len(shuffled) * val_ratio))
	    # Always leave at least one row for training (and handle the empty list).
	    n_val = min(n_val, max(len(shuffled) - 1, 0))
	    return shuffled[n_val:], shuffled[:n_val]


	def save_jsonl(rows: List[Dict[str, str]], out_path: Path):
	    """Write *rows* to *out_path* as JSON Lines, one JSON object per line.

	    Missing parent directories are created first. The file is opened in
	    write mode as UTF-8, so existing content is replaced, and non-ASCII
	    characters are written as-is rather than as escape sequences.

	    Args:
	        rows: Rows to serialise, in output order.
	        out_path: Destination file path.
	    """
	    target_dir = out_path.parent
	    target_dir.mkdir(parents=True, exist_ok=True)

	    serialised = (json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
	    with out_path.open("w", encoding="utf-8") as handle:
	        handle.writelines(serialised)


	def main():
	    """Command-line entry point: preprocess the dataset and write both splits.

	    Parses --source, --out, --val and --val-ratio, converts the source JSON
	    into instruction rows, splits them into train and validation sets, writes
	    each set as JSONL, and prints a one-line summary with the row counts.
	    """
	    parser = argparse.ArgumentParser(description="Preprocess Vivekananda dataset to JSONL instruction format")
	    parser.add_argument("--source", type=str, default="data/processed/vivekananda_dataset_1.json")
	    parser.add_argument("--out", type=str, default="data/datasets/sft_train.jsonl")
	    parser.add_argument("--val", type=str, default="data/datasets/sft_val.jsonl")
	    parser.add_argument("--val-ratio", type=float, default=0.05)
	    args = parser.parse_args()

	    source = Path(args.source)
	    train_out = Path(args.out)
	    val_out = Path(args.val)

	    rows = process_dataset(source)
	    train_rows, val_rows = split_train_val(rows, args.val_ratio)
	    save_jsonl(train_rows, train_out)
	    save_jsonl(val_rows, val_out)
	    # A plain "|" separator: "\|" is an invalid escape sequence that warns on
	    # recent Python versions and prints a stray backslash.
	    print(f"Saved train: {train_out} ({len(train_rows)}) | val: {val_out} ({len(val_rows)})")


	# Run the CLI only when the file is executed as a script, not when imported.
	if __name__ == "__main__":
	    main()