dpe1/jules-tinyreasoner / src /generate_sft_data.py
dpe1's picture
download
raw
2.26 kB
import json
import random
import os
from src.capabilities import lookup_dictionary, evaluate_math
# Process-wide cache of the NLTK "words" corpus. It is filled on first use so
# the corpus is requested from NLTK once instead of once per generated example.
_WORD_LIST_CACHE = []


def _load_word_list():
    """Return the NLTK ``words`` corpus, downloading it if it is missing."""
    if not _WORD_LIST_CACHE:
        import nltk

        try:
            words = nltk.corpus.words.words()
        except LookupError:
            # Corpus is not installed locally: fetch it, then retry once.
            nltk.download('words')
            words = nltk.corpus.words.words()
        _WORD_LIST_CACHE.extend(words)
    return _WORD_LIST_CACHE


def generate_dictionary_task():
    """Build one SFT example that demonstrates the ``[DEFINE]`` capability.

    Up to 10 random, lower-cased corpus words are passed to
    ``lookup_dictionary`` until one returns something other than the
    "No definition found for <word>." message. If every attempt fails,
    the word "apple" is used instead.

    Returns:
        A dict with two string entries: ``"prompt"`` (the question) and
        ``"completion"`` (the reasoning trace with the capability call,
        followed by the final answer).
    """
    word_list = _load_word_list()

    for _ in range(10):
        candidate = random.choice(word_list).lower()
        found = lookup_dictionary(candidate)
        # Anything other than the "not found" message counts as a definition.
        if found != f"No definition found for {candidate}.":
            word, definition = candidate, found
            break
    else:
        # No sampled word had a definition; fall back to a fixed word.
        word = "apple"
        definition = lookup_dictionary(word)

    prompt = f"What is the definition of {word}?"
    reasoning = f"Reasoning: I need to find the definition of {word}. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] The word {word} means {definition}."
    answer = f"Answer: {definition}"
    return {
        "prompt": prompt,
        "completion": f"{reasoning} {answer}",
    }
def generate_math_task():
    """Build one SFT example that demonstrates the ``[SYMPY]`` capability.

    Two integers are drawn from 1..500 and combined with a randomly chosen
    operator (+, - or *). The expression is passed to ``evaluate_math`` and
    its result is embedded in the reasoning trace and the final answer.

    Returns:
        A dict with two string entries: ``"prompt"`` and ``"completion"``.
    """
    left = random.randint(1, 500)
    right = random.randint(1, 500)

    # Each entry pairs an operator symbol with the wording used in the prompt.
    symbol, wording = random.choice(
        [("+", "sum of"), ("-", "difference between"), ("*", "product of")]
    )

    expr = f"{left} {symbol} {right}"
    value = evaluate_math(expr)

    reasoning = f"Reasoning: I need to calculate {expr}. [SYMPY]{expr}[CAPABILITY_STOP]{value}[CAPABILITY_STOP] The result of {expr} is {value}."
    answer = f"Answer: {value}"

    example = {}
    example["prompt"] = f"What is the {wording} {left} and {right}?"
    example["completion"] = " ".join([reasoning, answer])
    return example
def main(num_examples=2000, output_path="data/sft_data.json"):
    """Generate a mixed SFT dataset and write it to disk as JSON.

    Each example is a dictionary task or a math task, chosen with equal
    probability.

    Args:
        num_examples: Number of examples to generate. Defaults to 2000.
        output_path: Destination JSON file. Its parent directory is created
            if it does not exist. Defaults to ``data/sft_data.json``.
    """
    data = [
        generate_dictionary_task() if random.random() < 0.5 else generate_math_task()
        for _ in range(num_examples)
    ]

    # dirname is "" for a bare filename, and makedirs("") would raise.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Generated {len(data)} SFT examples in {output_path}")
if __name__ == "__main__":
    import nltk

    # Make sure the WordNet corpus is available before generating data.
    # NOTE(review): presumably lookup_dictionary relies on WordNet; confirm
    # in src.capabilities.
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        wordnet_missing = True
    else:
        wordnet_missing = False

    if wordnet_missing:
        nltk.download('wordnet')

    main()

Xet Storage Details

Size:
2.26 kB
·
Xet hash:
f7e4b108f10b0effbd66ca260c6fc81a305bfaf74db20aba543d89eb70f02b60

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.