dpe1/jules-tinyreasoner / src /generate_grounding_data.py
dpe1's picture
download
raw
3.46 kB
import json
import random
import os
import re
from src.capabilities import lookup_dictionary, evaluate_math
# Cache of 3-6 letter corpus words, filled on first use and then reused so the
# corpus is not re-read and re-filtered for every generated example.
_GROUNDING_WORD_POOL = []


def _grounding_word_pool():
    """Return the cached list of 3-6 letter entries from the NLTK ``words`` corpus.

    The corpus is read (and downloaded first if NLTK raises ``LookupError``)
    only while the cache is empty; later calls return the same list object.
    """
    if not _GROUNDING_WORD_POOL:
        import nltk
        try:
            corpus_words = nltk.corpus.words.words()
        except LookupError:
            nltk.download('words')
            corpus_words = nltk.corpus.words.words()
        # Level 0/1 style words: 3 to 6 letters
        _GROUNDING_WORD_POOL.extend(w for w in corpus_words if 3 <= len(w) <= 6)
    return _GROUNDING_WORD_POOL


def generate_grounding_dictionary_task():
    """Build one dictionary-lookup SFT example.

    A random 3-6 letter word is drawn (lower-cased) and passed to
    ``lookup_dictionary``. A result equal to ``"No definition found for
    <word>."`` is treated as a miss; up to 20 draws are attempted before
    falling back to the word ``"apple"``.

    Returns:
        dict: ``{"prompt": ..., "completion": ...}`` where the completion is a
        reasoning sentence containing a ``[DEFINE]...[CAPABILITY_STOP]`` call
        trace, followed by ``"Answer: <definition>"``.
    """
    candidates = _grounding_word_pool()

    word = None
    definition = None
    for _ in range(20):
        w = random.choice(candidates).lower()
        d = lookup_dictionary(w)
        if d != f"No definition found for {w}.":
            word = w
            definition = d
            break

    if word is None:
        # No sampled word had a definition; use a fixed fallback word.
        # Its lookup result is used as-is, without the miss check above.
        word = "apple"
        definition = lookup_dictionary(word)

    prompt = f"What is the definition of {word}?"

    # All three phrasings embed the same capability call trace; one is picked at random.
    reasoning_templates = [
        f"Reasoning: I need to find the definition of {word}. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] The word {word} means {definition}.",
        f"Reasoning: Let me look up the definition of '{word}'. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] I found that {word} means {definition}.",
        f"Reasoning: To answer this, I'll check the dictionary for {word}. [DEFINE]{word}[CAPABILITY_STOP]{definition}[CAPABILITY_STOP] It says {word} is {definition}."
    ]

    reasoning = random.choice(reasoning_templates)
    answer = f"Answer: {definition}"

    return {
        "prompt": prompt,
        "completion": f"{reasoning} {answer}"
    }
def generate_grounding_math_task():
    """Build one single-digit arithmetic SFT example.

    Two operands are drawn from 1..9 and one of ``+``, ``-``, ``*``, ``/`` is
    chosen; the expression is passed to ``evaluate_math`` and its result is
    embedded in a ``[SYMPY]...[CAPABILITY_STOP]`` call trace.

    Returns:
        dict: ``{"prompt": ..., "completion": ...}`` where the completion is
        one of three reasoning phrasings followed by ``"Answer: <result>"``.
    """
    # Level 0 style: single digits
    lhs = random.randint(1, 9)
    rhs = random.randint(1, 9)

    # Each operator is paired with the wording used for it in the question.
    operator_phrases = (
        ("+", "sum of"),
        ("-", "difference between"),
        ("*", "product of"),
        ("/", "ratio of"),
    )
    symbol, wording = random.choice(operator_phrases)

    expression = f"{lhs} {symbol} {rhs}"
    result = evaluate_math(expression)

    # Candidate phrasings are all rendered up front; exactly one is kept.
    phrasings = (
        f"Reasoning: I need to calculate {expression}. [SYMPY]{expression}[CAPABILITY_STOP]{result}[CAPABILITY_STOP] The result of {expression} is {result}.",
        f"Reasoning: Let's compute {lhs} {symbol} {rhs}. [SYMPY]{expression}[CAPABILITY_STOP]{result}[CAPABILITY_STOP] This gives {result}.",
        f"Reasoning: I will use sympy to evaluate {expression}. [SYMPY]{expression}[CAPABILITY_STOP]{result}[CAPABILITY_STOP] The answer is {result}.",
    )
    chosen = random.choice(phrasings)

    example = {}
    example["prompt"] = f"What is the {wording} {lhs} and {rhs}?"
    example["completion"] = f"{chosen} " + f"Answer: {result}"
    return example
def main(num_dictionary_tasks: int = 2500,
         num_math_tasks: int = 2500,
         output_path: str = "data/grounding_data.json") -> None:
    """Generate the grounding dataset, shuffle it, and write it as JSON.

    Args:
        num_dictionary_tasks: Number of dictionary-lookup examples to generate.
        num_math_tasks: Number of arithmetic examples to generate.
        output_path: Destination JSON file; its parent directory is created
            if it does not exist.

    With the defaults this produces 2500 + 2500 examples in
    ``data/grounding_data.json``.
    """
    data = []

    # Dictionary examples are generated before math examples, so a seeded
    # `random` consumes its stream in a fixed order.
    task_plan = (
        ("dictionary", num_dictionary_tasks, generate_grounding_dictionary_task),
        ("math", num_math_tasks, generate_grounding_math_task),
    )
    for label, count, make_task in task_plan:
        print(f"Generating {label} tasks...")
        for i in range(count):
            data.append(make_task())
            # Progress line every 500 examples.
            if (i+1) % 500 == 0:
                print(f" Generated {i+1} {label} tasks")

    random.shuffle(data)

    # A bare filename has no directory component; os.makedirs("") would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)

    print(f"Generated {len(data)} grounding-focused SFT examples in {output_path}")
if __name__ == "__main__":
    import nltk

    # Probe for the WordNet corpus first and fetch it only on a miss.
    # NOTE(review): presumably WordNet backs lookup_dictionary — confirm in src.capabilities.
    wordnet_available = True
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        wordnet_available = False

    if not wordnet_available:
        nltk.download('wordnet')

    main()

Xet Storage Details

Size:
3.46 kB
·
Xet hash:
2772c43d672470d427ca2e76632c3b2d085d1906ffd80d76b773fdb57e7aa840

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.