import json
import os
import random
# Template table: topic name -> list of (input, output) string pairs.
# Only the "Math Reasoning" pairs contain str.format placeholders
# ({a}, {b}, {sum}); every other pair is used verbatim.
topics = {
    # Addition problems; the placeholders are filled in per sample.
    "Math Reasoning": [
        ("What is {a} + {b}?", "{a} + {b} is {sum}."),
        (
            "If you have {a} apples and get {b} more, how many?",
            "{a} + {b} = {sum} apples.",
        ),
        ("Solve: {a} + {b}", "The answer is {sum}."),
    ],
    # Cause-and-effect questions.
    "Causality": [
        (
            "If it rains, what might happen?",
            "If it rains, the ground may become wet.",
        ),
        (
            "Why do plants grow towards light?",
            "Because light is a stimulus and plants respond by growing towards it.",
        ),
        (
            "What happens if you drop a glass?",
            "It will likely break due to gravity.",
        ),
    ],
    # Sentences with a grammatical error and their corrected form.
    "Grammar Correction": [
        (
            "Correct this: 'He go to school everyday.'",
            "'He goes to school every day.'",
        ),
        ("Fix this sentence: 'I has two cat.'", "'I have two cats.'"),
        (
            "Can you fix this sentence: 'he have two taco.'",
            "'He has two tacos.'",
        ),
        (
            "What’s the correct form of: 'She don't like it.'",
            "'She doesn't like it.'",
        ),
    ],
    # Everyday practical judgement.
    "Common Sense": [
        (
            "Can a person eat soup with a fork?",
            "No, it is impractical to eat soup with a fork.",
        ),
        ("Should you touch fire?", "No, touching fire can cause burns."),
        ("If you're tired, what should you do?", "You should rest or sleep."),
    ],
    # Factual recall.
    "World Knowledge": [
        ("What is the capital of France?", "Paris is the capital of France."),
        ("Who was the first president of the USA?", "George Washington."),
        ("What currency is used in Japan?", "The Japanese Yen."),
    ],
    # Imperative prompts paired with a compliant response.
    "Instruction Following": [
        (
            "Open the window and turn off the light.",
            "Opening the window. Turning off the light.",
        ),
        ("Sort these numbers in ascending order: 5, 2, 8.", "2, 5, 8."),
        ("Sort these numbers in descending order: 5, 2, 8.", "8, 5, 2."),
        (
            "Describe how to make a sandwich.",
            "Take two slices of bread, add your fillings, and place one slice on top.",
        ),
    ],
}
def generate_sample(id, topic):
    """Build one dataset record for *topic*.

    A template pair is drawn at random from ``topics[topic]``. For the
    "Math Reasoning" topic two integers in [1, 20] are drawn and
    substituted, together with their sum, into both templates; for every
    other topic the pair is used unchanged.

    Args:
        id: Identifier stored in the record. The name shadows the builtin
            but is kept because it is part of the call signature.
        topic: A key of the module-level ``topics`` table.

    Returns:
        A dict with the keys "id", "topic", "input" and "output".

    Raises:
        KeyError: If *topic* is not a key of ``topics``.
    """
    template = random.choice(topics[topic])

    if topic != "Math Reasoning":
        # Non-math templates carry no placeholders and are emitted as-is.
        return {
            "id": id,
            "topic": topic,
            "input": template[0],
            "output": template[1],
        }

    # Draw the operands in a fixed order (a, then b) so that seeded runs
    # consume the random stream the same way every time.
    first = random.randint(1, 20)
    second = random.randint(1, 20)
    fields = {"a": first, "b": second, "sum": first + second}
    return {
        "id": id,
        "topic": topic,
        "input": template[0].format(**fields),
        "output": template[1].format(**fields),
    }
def generate_dataset(n=10000):
    """Generate *n* records, each for a uniformly chosen topic.

    Record ids are the consecutive integers 0 .. n-1. For every record a
    topic is picked with ``random.choice`` from the keys of ``topics`` and
    passed to ``generate_sample``.

    Args:
        n: Number of records to produce.

    Returns:
        A list of the dicts returned by ``generate_sample``.
    """
    available_topics = list(topics.keys())
    # Arguments are evaluated left to right, so the topic is drawn before
    # generate_sample consumes any random numbers of its own.
    return [
        generate_sample(index, random.choice(available_topics))
        for index in range(n)
    ]
def save_as_jsonl(data, path="./data/reasoned_data.jsonl"):
    """Write *data* to *path* in JSON Lines format (one object per line).

    Missing parent directories of *path* are created first, so the default
    destination works on a fresh checkout where ``./data`` does not exist.
    An existing file at *path* is overwritten.

    Args:
        data: Iterable of JSON-serializable items.
        path: Destination file path.

    Raises:
        OSError: If the directory or file cannot be created or written.
        TypeError: If an item is not JSON-serializable.
    """
    parent_dir = os.path.dirname(path)
    # A bare filename has an empty directory part, and os.makedirs("")
    # would raise, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in data
        )
if __name__ == "__main__":
    # Script entry point: generate 10,000 samples and write them to the
    # default output path of save_as_jsonl, then report where they went.
    save_as_jsonl(generate_dataset(10000))
    print("Saved to ./data/reasoned_data.jsonl")