dpe1/jules-tinyreasoner / src /generate_real_sft_data.py
dpe1's picture
download
raw
3.79 kB
from datasets import load_dataset
import json
import random
import re
import os
from src.capabilities import lookup_dictionary, evaluate_math
def inject_capabilities(text):
    """Return ``text`` with synthetic capability calls woven into it.

    Two augmentations are attempted, in this order:

    1. Dictionary lookup. One lowercase word of 3-8 letters is chosen at
       random and passed to ``lookup_dictionary``. Unless the result contains
       "No definition found", a ``[DEFINE]`` call is added: as an extra
       sentence at a random position when the text splits into several
       sentences on ``'. '``, otherwise wrapped around the whole text.
    2. Math check. If the text (including anything added by step 1) holds at
       least two digit runs, two of them are sampled, their sum expression is
       passed to ``evaluate_math``, and a ``[SYMPY]`` call is prepended.

    The text is returned unchanged when neither augmentation applies.
    """
    # Candidate words: lowercase only, 3-8 characters, for simplicity.
    candidates = re.findall(r'\b[a-z]{3,8}\b', text)
    if candidates:
        chosen = random.choice(candidates)
        meaning = lookup_dictionary(chosen)
        if "No definition found" not in meaning:
            define_call = f" [DEFINE]{chosen}[CAPABILITY_STOP]{meaning}[CAPABILITY_STOP]"
            parts = text.split('. ')
            if len(parts) <= 1:
                # Single sentence: wrap the whole text with the lookup.
                text = f"I need to define {chosen}.{define_call} {text} (Note: {chosen} means {meaning})"
            else:
                # Several sentences: slot the lookup in front of a random one.
                position = random.randint(0, len(parts) - 1)
                aside = f"I should check the meaning of {chosen}.{define_call} The word {chosen} means {meaning}"
                text = '. '.join(parts[:position] + [aside] + parts[position:])

    # The math check needs at least two numbers to combine.
    digit_runs = re.findall(r'\d+', text)
    if len(digit_runs) < 2:
        return text

    left, right = random.sample(digit_runs, 2)
    expression = f"{left} + {right}"
    value = evaluate_math(expression)
    sympy_call = f" [SYMPY]{expression}[CAPABILITY_STOP]{value}[CAPABILITY_STOP]"
    preamble = f"First, let me check a calculation: {expression} = ?{sympy_call} So {expression} is {value}. "
    return preamble + text
def main(max_examples=1000, output_path="data/sft_real_data.json"):
    """Build an SFT dataset with injected capability calls and save it as JSON.

    Streams the "kimi" configuration of ``lambda/hermes-agent-reasoning-traces``
    and falls back to ``fka/awesome-chatgpt-prompts`` if that load fails. Each
    usable record becomes a ``{"prompt", "completion"}`` pair whose completion
    has been passed through ``inject_capabilities`` and is prefixed with
    "Reasoning: " when it does not already contain "Reasoning:".

    Args:
        max_examples: Maximum number of examples to keep. Skipped records do
            not count towards this limit.
        output_path: Destination JSON file. Its parent directory is created
            when missing.
    """
    print("Loading dataset...")
    # Using a subset of hermes-agent-reasoning-traces or similar
    try:
        ds = load_dataset("lambda/hermes-agent-reasoning-traces", "kimi", split="train", streaming=True)
    except Exception as exc:
        # Catch Exception rather than using a bare except so that Ctrl-C and
        # SystemExit still propagate; report the cause instead of hiding it.
        print(f"Failed to load hermes ({exc}), trying a simpler one")
        ds = load_dataset("fka/awesome-chatgpt-prompts", split="train", streaming=True)

    data = []
    count = 0
    print(f"Processing up to {max_examples} examples...")
    for item in ds:
        if count >= max_examples:
            break

        # Extract prompt and response.
        if 'conversations' in item:
            # Hermes-style record: a list of {'from': role, 'value': text}.
            human_turns = []
            gpt_turns = []
            for msg in item['conversations']:
                if msg['from'] == 'human':
                    human_turns.append(msg['value'])
                elif msg['from'] == 'gpt':
                    gpt_turns.append(msg['value'])
            if not human_turns or not gpt_turns:
                continue
            # Every turn is newline-terminated, including the last one.
            prompt = "\n".join(human_turns) + "\n"
            completion = "\n".join(gpt_turns) + "\n"
        else:
            # Fallback for other datasets.
            # NOTE(review): records without a non-empty 'response' or
            # 'completion' field are skipped - confirm the fallback dataset
            # actually provides one of them.
            prompt = item.get('prompt', '')
            completion = item.get('response', item.get('completion', ''))
            if not prompt or not completion:
                continue

        # Inject capabilities into completion
        completion = inject_capabilities(completion)

        # Add Reasoning tag if not present
        if "Reasoning:" not in completion:
            completion = "Reasoning: " + completion

        data.append({
            "prompt": prompt.strip(),
            "completion": completion.strip()
        })
        count += 1
        if count % 100 == 0:
            print(f"Processed {count}...")

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Generated {len(data)} examples in {output_path}")
if __name__ == "__main__":
    import nltk

    # Make sure each required NLTK corpus is available locally, downloading
    # it only when the lookup fails, then run the generator.
    required_corpora = (
        ('corpora/wordnet', 'wordnet'),
        ('corpora/words', 'words'),
    )
    for resource_path, package_name in required_corpora:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package_name)

    main()

Xet Storage Details

Size:
3.79 kB
·
Xet hash:
dbece08aff0091cc4fd22f33379a418c35a23ed3e2d1560314c975415bd60e42

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.