Buckets:

dpe1
/

jules-tinyreasoner

Files

xet

dpe1/jules-tinyreasoner/src/generate_sft_data.py

dpe1

11 days ago

download

raw

2.26 kB

	import json
	import random
	import os
	from src.capabilities import lookup_dictionary, evaluate_math

	# Process-wide cache of the NLTK ``words`` corpus, filled on first use.
	_WORD_LIST = None


	def _load_word_list():
	    """Return the NLTK ``words`` corpus, reading it at most once per process.

	    The corpus is downloaded on demand when NLTK raises ``LookupError``
	    for it. The same list object is handed back on every call, so callers
	    must treat it as read-only.
	    """
	    global _WORD_LIST
	    if _WORD_LIST is None:
	        import nltk

	        try:
	            _WORD_LIST = nltk.corpus.words.words()
	        except LookupError:
	            nltk.download('words')
	            _WORD_LIST = nltk.corpus.words.words()
	    return _WORD_LIST


	def generate_dictionary_task():
	    """Build one dictionary-lookup SFT example.

	    Up to 10 randomly chosen (lower-cased) corpus words are passed to
	    ``lookup_dictionary`` until one yields something other than the
	    "No definition found for <word>." message. If none does, the word
	    "apple" is used with whatever ``lookup_dictionary`` returns for it.

	    Returns:
	        dict: ``{"prompt": ..., "completion": ...}`` where the completion
	        holds the reasoning text (including the ``[DEFINE]`` capability
	        call and its result) followed by the final answer.
	    """
	    # Loading the corpus is expensive, so reuse the cached list across calls.
	    word_list = _load_word_list()

	    word = None
	    definition = None
	    for _ in range(10):
	        candidate = random.choice(word_list).lower()
	        found = lookup_dictionary(candidate)
	        # NOTE(review): assumes lookup_dictionary reports a miss with exactly
	        # this message - confirm against src.capabilities.
	        if found != f"No definition found for {candidate}.":
	            word = candidate
	            definition = found
	            break

	    if word is None:
	        # Every attempt missed; fall back to a fixed word.
	        word = "apple"
	        definition = lookup_dictionary(word)

	    prompt = f"What is the definition of {word}?"
	    reasoning = f"Reasoning: I need to find the definition of {word}. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] The word {word} means {definition}."
	    answer = f"Answer: {definition}"

	    return {
	        "prompt": prompt,
	        "completion": f"{reasoning} {answer}",
	    }

	def generate_math_task():
	    """Build one arithmetic SFT example.

	    Two integers are drawn from 1..500 and combined with a randomly chosen
	    operator (+, - or *). The expression is evaluated by ``evaluate_math``
	    and its result is embedded in the completion.

	    Returns:
	        dict: ``{"prompt": ..., "completion": ...}`` where the completion
	        holds the reasoning text (including the ``[SYMPY]`` capability
	        call and its result) followed by the final answer.
	    """
	    left = random.randint(1, 500)
	    right = random.randint(1, 500)
	    # Each operator symbol is paired with the wording used in the question.
	    symbol, wording = random.choice(
	        [("+", "sum of"), ("-", "difference between"), ("*", "product of")]
	    )

	    expression = f"{left} {symbol} {right}"
	    result = evaluate_math(expression)

	    question = f"What is the {wording} {left} and {right}?"
	    trace = (
	        f"Reasoning: I need to calculate {expression}. "
	        f"[SYMPY]{expression}[CAPABILITY_STOP]{result}[CAPABILITY_STOP] "
	        f"The result of {expression} is {result}."
	    )
	    final = f"Answer: {result}"

	    return {"prompt": question, "completion": f"{trace} {final}"}

	def main(num_examples=2000, output_path="data/sft_data.json"):
	    """Generate a mixed SFT dataset and write it to a JSON file.

	    Args:
	        num_examples: How many examples to generate. Defaults to 2000.
	        output_path: Destination JSON file. Its parent directory is created
	            when missing. Defaults to ``data/sft_data.json``.
	    """
	    data = []
	    for _ in range(num_examples):
	        # Each example is independently a dictionary or a math task (50/50).
	        if random.random() < 0.5:
	            data.append(generate_dictionary_task())
	        else:
	            data.append(generate_math_task())

	    # A bare filename has no parent directory; os.makedirs("") would raise.
	    out_dir = os.path.dirname(output_path)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)

	    with open(output_path, "w", encoding="utf-8") as f:
	        json.dump(data, f, indent=2)
	    print(f"Generated {len(data)} SFT examples in {output_path}")

	if __name__ == "__main__":
	    import nltk

	    # Probe for the WordNet corpus and fetch it only when NLTK cannot find
	    # it locally. NOTE(review): presumably lookup_dictionary relies on
	    # WordNet - confirm against src.capabilities.
	    try:
	        nltk.data.find('corpora/wordnet')
	    except LookupError:
	        nltk.download('wordnet')

	    # Corpus check done; run the dataset generation.
	    main()

Xet Storage Details

Size: 2.26 kB
Xet hash: f7e4b108f10b0effbd66ca260c6fc81a305bfaf74db20aba543d89eb70f02b60

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.