vipisdeath
/

HGRXv1

text-generation-inference

Model card Files Files and versions

HGRXv1 / prepare_data.py

vipisdeath's picture

Create prepare_data.py

5418f20 verified 15 days ago

history blame contribute delete

708 Bytes

	"""Preview the first few examples of the Stage 1 reasoning SFT dataset.

	Streams the ``stage1`` configuration of the dataset named by
	``DATASET_NAME`` and prints, for each of the first three examples, its
	domain plus the first 200 characters of its input and output fields.

	Requires the third-party ``datasets`` package.
	"""
	from datasets import load_dataset

	# Name of the dataset repository on Hugging Face.
	DATASET_NAME = "Alibaba-Apsara/Superior-Reasoning-SFT-gpt-oss-120b"

	print("Loading Stage 1 of the dataset... this might take a minute.")

	# streaming=True yields examples lazily instead of materializing the whole
	# train split locally before the first example can be read.
	dataset = load_dataset(DATASET_NAME, name="stage1", split="train", streaming=True)

	# Inspect only the first 3 examples; take() keeps the stream from being
	# consumed past what is printed.
	for i, example in enumerate(dataset.take(3)):
	    # The loop body must be indented under the `for`; without it Python
	    # rejects the file with an IndentationError before running anything.
	    print(f"\n--- EXAMPLE {i+1} ---")
	    print(f"DOMAIN: {example['domain']}")
	    # Long fields are truncated to their first 200 characters for display.
	    print(f"INPUT: {example['input'][:200]}...")
	    print(f"REASONING/OUTPUT: {example['output'][:200]}...")