Flair
English
text-generation-inference
HGRXv1 / prepare_data.py
vipisdeath's picture
Create prepare_data.py
5418f20 verified
from datasets import load_dataset
# Preview script: stream the "stage1" configuration of the dataset from the
# Hugging Face Hub and print a truncated view of the first few training
# examples, so their structure can be inspected before further processing.

# 1. Define the name of the dataset from Hugging Face
DATASET_NAME = "Alibaba-Apsara/Superior-Reasoning-SFT-gpt-oss-120b"

# Number of examples to preview, and how many leading characters of each
# text field to print.
PREVIEW_EXAMPLES = 3
PREVIEW_CHARS = 200

print("Loading Stage 1 of the dataset... this might take a minute.")

# 2. Load the dataset with streaming=True so examples are iterated lazily
#    rather than fetching the whole split up front.
dataset = load_dataset(DATASET_NAME, name="stage1", split="train", streaming=True)

# 3. Take the first few examples to see what they look like.
#    NOTE(review): assumes every example has 'domain', 'input' and 'output'
#    keys and that 'input'/'output' are strings — confirm against the
#    dataset card.
for i, example in enumerate(dataset.take(PREVIEW_EXAMPLES)):
    print(f"\n--- EXAMPLE {i+1} ---")
    print(f"DOMAIN: {example['domain']}")
    # Only the leading PREVIEW_CHARS characters are shown to keep output short.
    print(f"INPUT: {example['input'][:PREVIEW_CHARS]}...")
    print(f"REASONING/OUTPUT: {example['output'][:PREVIEW_CHARS]}...")