Buckets:

dpe1
/

jules-tinyreasoner

Files

xet

dpe1/jules-tinyreasoner / src /generate_grounding_data.py

dpe1

11 days ago

download

raw

3.46 kB

	import json
	import random
	import os
	import re
	from src.capabilities import lookup_dictionary, evaluate_math

	# Candidate words are loaded lazily and kept here so the NLTK corpus is read
	# and filtered once per process rather than once per generated example.
	_GROUNDING_WORD_CACHE = []


	def _load_grounding_words():
	    """Return the 3-to-6 letter entries of the NLTK ``words`` corpus.

	    The corpus is read on the first call (downloading it if NLTK reports it
	    missing) and the filtered list is reused on every later call.
	    """
	    if not _GROUNDING_WORD_CACHE:
	        import nltk
	        try:
	            word_list = nltk.corpus.words.words()
	        except LookupError:
	            nltk.download('words')
	            word_list = nltk.corpus.words.words()

	        # Level 0/1 style words: 3 to 6 letters
	        _GROUNDING_WORD_CACHE.extend(w for w in word_list if 3 <= len(w) <= 6)
	    return _GROUNDING_WORD_CACHE


	def generate_grounding_dictionary_task():
	    """Build one dictionary-lookup SFT example.

	    A random short word is drawn from the NLTK ``words`` corpus and looked up
	    with ``lookup_dictionary``. Up to 20 draws are attempted; a draw is
	    rejected when the lookup result equals ``"No definition found for <word>."``.
	    If every draw is rejected, the word ``"apple"`` is used instead.

	    Returns:
	        dict: ``{"prompt": ..., "completion": ...}`` where the completion is a
	        randomly chosen reasoning template containing a
	        ``[DEFINE]...[CAPABILITY_STOP]...[CAPABILITY_STOP]`` trace, followed
	        by ``"Answer: <definition>"``.
	    """
	    filtered_words = _load_grounding_words()

	    word = None
	    definition = None
	    for _ in range(20):
	        w = random.choice(filtered_words).lower()
	        d = lookup_dictionary(w)
	        if d != f"No definition found for {w}.":
	            word = w
	            definition = d
	            break

	    if word is None:
	        # NOTE(review): the fallback result is used as-is, even if the lookup
	        # for "apple" also reports no definition -- confirm this is intended.
	        word = "apple"
	        definition = lookup_dictionary(word)

	    prompt = f"What is the definition of {word}?"
	    reasoning_templates = [
	        f"Reasoning: I need to find the definition of {word}. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] The word {word} means {definition}.",
	        f"Reasoning: Let me look up the definition of '{word}'. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] I found that {word} means {definition}.",
	        f"Reasoning: To answer this, I'll check the dictionary for {word}. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] It says {word} is {definition}.",
	    ]
	    reasoning = random.choice(reasoning_templates)
	    answer = f"Answer: {definition}"

	    return {
	        "prompt": prompt,
	        "completion": f"{reasoning} {answer}",
	    }

	def generate_grounding_math_task():
	    """Build one single-digit arithmetic SFT example.

	    Two operands in 1..9 and one of ``+ - * /`` are drawn at random, the
	    expression is evaluated with ``evaluate_math``, and the result is embedded
	    in a randomly chosen reasoning template.

	    Returns:
	        dict: ``{"prompt": ..., "completion": ...}`` where the completion holds
	        a ``[SYMPY]...[CAPABILITY_STOP]...[CAPABILITY_STOP]`` trace followed by
	        ``"Answer: <result>"``.
	    """
	    # Level 0 style: single digits
	    left = random.randint(1, 9)
	    right = random.randint(1, 9)
	    symbol, wording = random.choice([
	        ("+", "sum of"),
	        ("-", "difference between"),
	        ("*", "product of"),
	        ("/", "ratio of"),
	    ])

	    expression = f"{left} {symbol} {right}"
	    result = evaluate_math(expression)

	    # The capability trace is identical in every template, so build it once.
	    tool_call = f"[SYMPY]{expression}[CAPABILITY_STOP]{result}[CAPABILITY_STOP]"
	    reasoning = random.choice([
	        f"Reasoning: I need to calculate {expression}. {tool_call} The result of {expression} is {result}.",
	        f"Reasoning: Let's compute {expression}. {tool_call} This gives {result}.",
	        f"Reasoning: I will use sympy to evaluate {expression}. {tool_call} The answer is {result}.",
	    ])

	    return {
	        "prompt": f"What is the {wording} {left} and {right}?",
	        "completion": f"{reasoning} Answer: {result}",
	    }

	def main():
	    """Generate the grounding dataset and write it to disk.

	    Produces 2500 dictionary examples followed by 2500 math examples, printing
	    progress after every 500 of each kind, then shuffles the combined list and
	    writes it as indented JSON to ``data/grounding_data.json`` (creating the
	    ``data`` directory if needed).
	    """
	    examples = []

	    print("Generating dictionary tasks...")
	    for done in range(1, 2501):
	        examples.append(generate_grounding_dictionary_task())
	        if done % 500 == 0:
	            print(f" Generated {done} dictionary tasks")

	    print("Generating math tasks...")
	    for done in range(1, 2501):
	        examples.append(generate_grounding_math_task())
	        if done % 500 == 0:
	            print(f" Generated {done} math tasks")

	    # Interleave the two task kinds before writing.
	    random.shuffle(examples)

	    os.makedirs("data", exist_ok=True)
	    with open("data/grounding_data.json", "w") as out_file:
	        json.dump(examples, out_file, indent=2)
	    print(f"Generated {len(examples)} grounding-focused SFT examples in data/grounding_data.json")

	if __name__ == "__main__":
	    import nltk

	    # Make sure the WordNet corpus is available before generating data;
	    # presumably lookup_dictionary relies on it -- verify against
	    # src.capabilities.
	    try:
	        nltk.data.find('corpora/wordnet')
	    except LookupError:
	        nltk.download('wordnet')

	    main()

Xet Storage Details

Size: 3.46 kB
Xet hash: 2772c43d672470d427ca2e76632c3b2d085d1906ffd80d76b773fdb57e7aa840

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.