| """ |
| Prepare data for Lumia-Tiny training. |
| |
| Usage: |
| python3 scripts/prepare_tiny_data.py # print a summary of existing data/data.jsonl |
| python3 scripts/prepare_tiny_data.py --generate 50 # generate N synthetic examples |
| python3 scripts/prepare_tiny_data.py --hf-sample 100 # sample from HF dataset (not implemented: the argument parser does not define this flag) |
| python3 scripts/prepare_tiny_data.py --to-jsonl # convert data/data.jsonl to messages format |
| """ |
|
|
| import os, sys, json, argparse, random |
| from pathlib import Path |
|
|
|
|
# System prompts for synthetic examples. They are selected by index modulo the
# list length, so the order here determines which prompt each example gets.
SYSTEM_PROMPTS = [
    "You are a helpful AI assistant who solves problems step by step.",
    "You are a precise programming assistant who writes clean, correct code.",
    "You are a math tutor who explains concepts clearly.",
    "You are a reasoning assistant who thinks through problems carefully.",
]
|
|
# User questions for synthetic examples. The entry at index k is answered by
# RESPONSES[k], so both lists must keep the same length and ordering.
INSTRUCTIONS = [
    "What is 2+2?",
    "Explain how a binary search works.",
    "Write a Python function to reverse a linked list.",
    "What is the capital of France?",
    "Explain the concept of recursion.",
    "Write a function to check if a string is a palindrome.",
    "What is the difference between TCP and UDP?",
    "Explain how gradient descent works.",
    "Write a quicksort implementation.",
    "What is the time complexity of binary search?",
    "Explain the concept of overfitting in machine learning.",
    "Write a function to find the nth Fibonacci number.",
    "What is the difference between a list and a tuple in Python?",
    "Explain the CAP theorem.",
    "Write a function to merge two sorted arrays.",
    "What is the Pythagorean theorem?",
    "Explain how HTTP works.",
    "Write a function to calculate the factorial of a number.",
    "What is the difference between SQL and NoSQL?",
    "Explain the concept of polymorphism in OOP.",
    "What is 15 * 7?",
    "Write a function to find the maximum subarray sum.",
    "Explain how DNS resolution works.",
    "What is the difference between stack and heap memory?",
    "Write a function to detect cycles in a linked list.",
    "What is the derivative of x^2?",
    "Explain the concept of normalization in databases.",
    "Write a function to perform binary search in a sorted array.",
    "What is the difference between symmetric and asymmetric encryption?",
    "Explain how a hash table works.",
    "What is 144 / 12?",
    "Write a function to find all prime numbers up to n.",
    "Explain the difference between REST and GraphQL.",
    "What is the chain rule in calculus?",
    "Write a function to serialize a binary tree.",
    "What is the difference between HTTP and HTTPS?",
    "Explain the concept of attention in transformers.",
    "Write a function to compute the edit distance between two strings.",
    "What is the integral of 2x?",
    "Explain how a blockchain works.",
]
|
|
|
|
# Reference answers for synthetic examples. RESPONSES[k] answers
# INSTRUCTIONS[k], so both lists must keep the same length and ordering.
#
# Code answers are fenced as ```python blocks and written one source line per
# string fragment, using four-space indentation so that every snippet is
# syntactically valid Python (nested bodies need deeper indentation than their
# enclosing statement).
RESPONSES = [
    "4",
    "Binary search works by repeatedly dividing a sorted array in half, comparing the target value to the middle element, and eliminating the half that cannot contain the target.",
    (
        "```python\n"
        "def reverse_linked_list(head):\n"
        "    prev = None\n"
        "    curr = head\n"
        "    while curr:\n"
        "        next_node = curr.next\n"
        "        curr.next = prev\n"
        "        prev = curr\n"
        "        curr = next_node\n"
        "    return prev\n"
        "```"
    ),
    "Paris",
    "Recursion is a programming technique where a function calls itself to solve a problem by breaking it down into smaller subproblems. Each recursive call works on a smaller instance until reaching a base case.",
    (
        "```python\n"
        "def is_palindrome(s):\n"
        "    s = ''.join(c.lower() for c in s if c.isalnum())\n"
        "    return s == s[::-1]\n"
        "```"
    ),
    "TCP is connection-oriented and guarantees delivery with error checking. UDP is connectionless and faster but does not guarantee delivery.",
    "Gradient descent is an optimization algorithm that iteratively adjusts parameters to minimize a loss function by moving in the direction of the negative gradient.",
    (
        "```python\n"
        "def quicksort(arr):\n"
        "    if len(arr) <= 1:\n"
        "        return arr\n"
        "    pivot = arr[len(arr)//2]\n"
        "    left = [x for x in arr if x < pivot]\n"
        "    middle = [x for x in arr if x == pivot]\n"
        "    right = [x for x in arr if x > pivot]\n"
        "    return quicksort(left) + middle + quicksort(right)\n"
        "```"
    ),
    "O(log n)",
    "Overfitting occurs when a model learns training data too well, including noise, and performs poorly on new data. It can be addressed by regularization, dropout, or early stopping.",
    (
        "```python\n"
        "def fibonacci(n):\n"
        "    if n <= 1:\n"
        "        return n\n"
        "    a, b = 0, 1\n"
        "    for _ in range(2, n+1):\n"
        "        a, b = b, a + b\n"
        "    return b\n"
        "```"
    ),
    "Lists are mutable, tuples are immutable. Lists use square brackets [], tuples use parentheses ().",
    "The CAP theorem states that a distributed system cannot simultaneously guarantee Consistency, Availability, and Partition tolerance.",
    (
        "```python\n"
        "def merge_sorted(a, b):\n"
        "    result = []\n"
        "    i = j = 0\n"
        "    while i < len(a) and j < len(b):\n"
        "        if a[i] < b[j]:\n"
        "            result.append(a[i]); i += 1\n"
        "        else:\n"
        "            result.append(b[j]); j += 1\n"
        "    result.extend(a[i:]); result.extend(b[j:])\n"
        "    return result\n"
        "```"
    ),
    "a² + b² = c², where a and b are legs and c is the hypotenuse.",
    "HTTP is a request-response protocol where a client sends a request to a server, which replies with a status code and body.",
    (
        "```python\n"
        "def factorial(n):\n"
        "    if n <= 1:\n"
        "        return 1\n"
        "    return n * factorial(n-1)\n"
        "```"
    ),
    "SQL databases are relational with structured schemas and ACID properties. NoSQL databases are non-relational, schema-flexible, and scale horizontally.",
    "Polymorphism allows objects of different types to respond to the same interface. In Python, duck typing lets any object with the required methods be used.",
    "105",
    (
        "```python\n"
        "def max_subarray_sum(arr):\n"
        "    max_ending = max_sofar = arr[0]\n"
        "    for x in arr[1:]:\n"
        "        max_ending = max(x, max_ending + x)\n"
        "        max_sofar = max(max_sofar, max_ending)\n"
        "    return max_sofar\n"
        "```"
    ),
    "DNS converts domain names to IP addresses. The resolver queries root -> TLD -> authoritative servers to find the IP.",
    "Stack is LIFO (last-in-first-out) for local variables and function calls. Heap is for dynamically allocated memory with longer lifespan.",
    (
        "```python\n"
        "def has_cycle(head):\n"
        "    slow = fast = head\n"
        "    while fast and fast.next:\n"
        "        slow = slow.next\n"
        "        fast = fast.next.next\n"
        "        if slow == fast:\n"
        "            return True\n"
        "    return False\n"
        "```"
    ),
    "2x",
    "Normalization organizes data to reduce redundancy. Forms: 1NF (atomic columns), 2NF (no partial dependency), 3NF (no transitive dependency).",
    (
        "```python\n"
        "def binary_search(arr, target):\n"
        "    lo, hi = 0, len(arr) - 1\n"
        "    while lo <= hi:\n"
        "        mid = (lo + hi) // 2\n"
        "        if arr[mid] == target:\n"
        "            return mid\n"
        "        elif arr[mid] < target:\n"
        "            lo = mid + 1\n"
        "        else:\n"
        "            hi = mid - 1\n"
        "    return -1\n"
        "```"
    ),
    "Symmetric encryption uses the same key for both encryption and decryption. Asymmetric uses a public key to encrypt and a private key to decrypt.",
    "A hash table uses a hash function to map keys to array indices, providing O(1) average lookup. Collisions are handled by chaining or open addressing.",
    "12",
    (
        "```python\n"
        "def sieve(n):\n"
        "    primes = [True] * (n+1)\n"
        "    primes[0] = primes[1] = False\n"
        "    for i in range(2, int(n**0.5)+1):\n"
        "        if primes[i]:\n"
        "            for j in range(i*i, n+1, i):\n"
        "                primes[j] = False\n"
        "    return [i for i, p in enumerate(primes) if p]\n"
        "```"
    ),
    "REST uses standard HTTP methods with resource-based URLs. GraphQL uses a single endpoint with a query language for flexible data fetching.",
    "d/dx f(g(x)) = f'(g(x)) * g'(x)",
    (
        "```python\n"
        "def serialize_tree(root):\n"
        "    def encode(node):\n"
        "        if not node:\n"
        "            return 'null'\n"
        "        return f\"{node.val},{encode(node.left)},{encode(node.right)}\"\n"
        "    return encode(root)\n"
        "```"
    ),
    "HTTPS adds TLS encryption on top of HTTP, providing confidentiality and integrity.",
    "Attention computes weighted combinations of values based on query-key similarity. The Transformer uses multi-head attention to capture different relationship types.",
    (
        "```python\n"
        "def edit_distance(a, b):\n"
        "    m, n = len(a), len(b)\n"
        "    dp = [[0]*(n+1) for _ in range(m+1)]\n"
        "    for i in range(m+1): dp[i][0] = i\n"
        "    for j in range(n+1): dp[0][j] = j\n"
        "    for i in range(1, m+1):\n"
        "        for j in range(1, n+1):\n"
        "            if a[i-1] == b[j-1]:\n"
        "                dp[i][j] = dp[i-1][j-1]\n"
        "            else:\n"
        "                dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])\n"
        "    return dp[m][n]\n"
        "```"
    ),
    "x² + C",
    "A blockchain is a distributed ledger where data is stored in blocks linked by cryptographic hashes. Each block contains a hash of the previous block, forming a chain.",
]
|
|
|
|
def generate_synthetic(n=50):
    """Build ``n`` synthetic training examples from the canned tables.

    Example ``k`` takes its system prompt, instruction and response from
    ``SYSTEM_PROMPTS``, ``INSTRUCTIONS`` and ``RESPONSES`` respectively, each
    indexed by ``k`` modulo that table's length, so the tables are cycled
    through in order once ``n`` exceeds their size.

    Args:
        n: Number of examples to produce. Values of zero or below yield an
            empty list.

    Returns:
        A list of dicts with the keys ``system``, ``instruction``, ``input``
        (always the empty string) and ``output``.
    """
    # Selection below is purely index-based; the seed call is retained so the
    # global RNG ends up in the same state as it always has after this call.
    random.seed(42)
    return [
        {
            "system": SYSTEM_PROMPTS[k % len(SYSTEM_PROMPTS)],
            "instruction": INSTRUCTIONS[k % len(INSTRUCTIONS)],
            "input": "",
            "output": RESPONSES[k % len(RESPONSES)],
        }
        for k in range(n)
    ]
|
|
|
|
def load_and_split(data_path, test_ratio=0.1):
    """Load a JSON Lines file and split it into train and test portions.

    Blank or whitespace-only lines are ignored, so a file with empty lines
    between or after records loads cleanly. The file is read as UTF-8
    regardless of the platform's locale encoding.

    Args:
        data_path: Path to a file holding one JSON document per line.
        test_ratio: Fraction of the records to place in the test portion.

    Returns:
        A ``(train, test)`` tuple of lists. The records are shuffled first and
        the first ``int(len(data) * (1 - test_ratio))`` of them form ``train``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(data_path, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Fixed seed makes the split reproducible across runs. Note that this
    # reseeds the module-level RNG shared with the rest of the process.
    random.seed(42)
    random.shuffle(data)
    split = int(len(data) * (1 - test_ratio))
    return data[:split], data[split:]
|
|
|
|
def convert_to_messages(data):
    """Turn instruction-style records into chat ``messages`` records.

    Args:
        data: Iterable of dicts. Each must have ``instruction`` and ``output``;
            ``input`` and ``system`` are optional.

    Returns:
        A list with one ``{"messages": [...]}`` dict per record, each holding a
        system, a user and an assistant message in that order. The system
        content is the empty string when the record has no ``system`` key.
    """

    def _as_chat(record):
        prompt = record["instruction"]
        extra = record.get("input", "")
        if extra:
            # A non-empty input goes on its own line below the instruction.
            prompt = prompt + "\n" + extra
        system_msg = {"role": "system", "content": record.get("system", "")}
        user_msg = {"role": "user", "content": prompt}
        assistant_msg = {"role": "assistant", "content": record["output"]}
        return {"messages": [system_msg, user_msg, assistant_msg]}

    return [_as_chat(record) for record in data]
|
|
|
|
def _read_jsonl(path):
    """Load a JSON Lines file into a list of objects, skipping blank lines."""
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _write_jsonl(records, path):
    """Write each record to ``path`` as one JSON document per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(record) + "\n" for record in records)


def main():
    """Command-line entry point.

    Modes, checked in this order:

    * ``--generate N``: write N synthetic examples to ``--output``.
    * ``--to-jsonl``: convert ``data/data.jsonl`` to messages format and write
      it next to ``--output`` with a ``_messages.jsonl`` suffix.
    * no mode flag: print a summary of ``data/data.jsonl`` if it exists,
      otherwise a hint on how to generate data.

    The directory of ``--output`` is created in every mode. Exits with status
    1 when ``--to-jsonl`` is requested but the data file is missing, and with
    argparse's usage error when ``--generate`` is not a positive integer.
    """
    parser = argparse.ArgumentParser(description="Prepare data for Lumia-Tiny")
    parser.add_argument("--generate", type=int, default=None,
                        help="Generate N synthetic examples")
    parser.add_argument("--to-jsonl", action="store_true",
                        help="Convert data/data.jsonl to messages format")
    parser.add_argument("--output", default="data/tiny_data.jsonl",
                        help="Output path")
    args = parser.parse_args()

    # Test against None rather than truthiness so that "--generate 0" is
    # reported as an error instead of silently falling through to the summary.
    if args.generate is not None and args.generate < 1:
        parser.error("--generate must be a positive integer")

    data_path = "data/data.jsonl"
    output_path = args.output
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

    if args.generate is not None:
        samples = generate_synthetic(args.generate)
        _write_jsonl(samples, output_path)
        print(f" Generated {len(samples)} synthetic examples → {output_path}")
        return

    if args.to_jsonl:
        if not os.path.exists(data_path):
            print(f" Error: {data_path} not found")
            sys.exit(1)
        messages = convert_to_messages(_read_jsonl(data_path))
        base = os.path.splitext(output_path)[0]
        _write_jsonl(messages, f"{base}_messages.jsonl")
        print(f" Converted {len(messages)} examples → {base}_messages.jsonl")
        return

    if os.path.exists(data_path):
        # Only the first record is parsed for the summary; the rest are merely
        # counted, so malformed later lines do not stop the summary.
        with open(data_path, encoding="utf-8") as f:
            lines = [line for line in f if line.strip()]
        print(f" Data file: {data_path}")
        print(f" Samples: {len(lines)}")
        print(f" Size: {os.path.getsize(data_path):,} bytes")
        if lines:
            sample = json.loads(lines[0])
            print(f" Fields: {list(sample.keys())}")
        else:
            # An empty file has no first record to inspect.
            print(" Fields: (none, file is empty)")
        print(" Keys: system, instruction, input, output")
        print()
        print(" To generate synthetic data:")
        print(" python3 scripts/prepare_tiny_data.py --generate 200")
        print()
        print(" To convert to messages format:")
        print(" python3 scripts/prepare_tiny_data.py --to-jsonl")
    else:
        print(" No data found. Generate with:")
        print(" python3 scripts/prepare_tiny_data.py --generate 200")


if __name__ == "__main__":
    main()
|
|