bbkdevops's picture
download
raw
36.2 kB
"""
Code Forge — สร้าง code training data ระดับลึกสุด
ครอบคลุม 8 หมวด:
L1: Python Basics (functions, loops, conditions)
L2: Data Structures (list, dict, set, deque)
L3: String Manipulation
L4: Sorting & Searching Algorithms
L5: Dynamic Programming
L6: Graph Algorithms (BFS/DFS)
L7: Mathematical Algorithms
L8: Competitive Programming (hard)
แต่ละ problem มี:
- question: โจทย์ภาษาอังกฤษ+ไทย
- thinking: แนวคิด step-by-step (CoT)
- answer: code solution พร้อม type hints
- test_cases: unit tests สำหรับ verify
- ground_truth: expected outputs (JSON)
- complexity: O(?) notation
"""
from __future__ import annotations
import json
import textwrap
from dataclasses import dataclass
from pathlib import Path
OUTPUT_DIR = Path(__file__).parent / "filtered"
OUTPUT_DIR.mkdir(exist_ok=True)
CODE_OUTPUT = OUTPUT_DIR / "code_grpo.jsonl"
@dataclass
class CodeProblem:
question: str
thinking: str
answer: str # complete Python solution
test_cases: str # pytest-style assertions
ground_truth: str # JSON expected outputs for simple verification
level: int
category: str
time_complexity: str
space_complexity: str
lang: str = "en"
source: str = "code_forge"
topic: str = "programming"
context: str = ""
def _dedent(code: str) -> str:
return textwrap.dedent(code).strip()
# ─── Level 1: Python Basics ───────────────────────────────────────────────────
L1_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Write `def fizzbuzz(n: int) -> list[str]` that returns FizzBuzz from 1 to n: 'Fizz' (÷3), 'Buzz' (÷5), 'FizzBuzz' (÷15), else the number as string.",
thinking=_dedent("""
Iterate 1..n inclusive.
Check divisibility: test 15 first (LCM of 3,5), then 3, then 5, else str(i).
Return as list.
"""),
answer=_dedent("""
def fizzbuzz(n: int) -> list[str]:
result = []
for i in range(1, n + 1):
if i % 15 == 0:
result.append("FizzBuzz")
elif i % 3 == 0:
result.append("Fizz")
elif i % 5 == 0:
result.append("Buzz")
else:
result.append(str(i))
return result
"""),
test_cases=_dedent("""
assert fizzbuzz(5) == ['1', '2', 'Fizz', '4', 'Buzz']
assert fizzbuzz(15)[-1] == 'FizzBuzz'
assert fizzbuzz(1) == ['1']
"""),
ground_truth='["1","2","Fizz","4","Buzz"]',
level=1, category="basics", time_complexity="O(n)", space_complexity="O(n)",
),
CodeProblem(
question="Write `def two_sum(nums: list[int], target: int) -> list[int]` that returns indices of two numbers that add to target. Assume exactly one solution.",
thinking=_dedent("""
Naive O(n²): try all pairs. But O(n) with a hash map.
For each num at index i, check if (target - num) already in seen.
Store {value: index} as we go.
"""),
answer=_dedent("""
def two_sum(nums: list[int], target: int) -> list[int]:
seen: dict[int, int] = {}
for i, num in enumerate(nums):
complement = target - num
if complement in seen:
return [seen[complement], i]
seen[num] = i
return []
"""),
test_cases=_dedent("""
assert two_sum([2, 7, 11, 15], 9) == [0, 1]
assert two_sum([3, 2, 4], 6) == [1, 2]
assert two_sum([3, 3], 6) == [0, 1]
"""),
ground_truth="[0,1]",
level=1, category="basics", time_complexity="O(n)", space_complexity="O(n)",
),
CodeProblem(
question="Write `def flatten_once(lst: list) -> list` that flattens a list one level deep.",
thinking="Use list comprehension: for each sublist, iterate its elements.",
answer=_dedent("""
def flatten_once(lst: list) -> list:
return [item for sublist in lst for item in sublist]
"""),
test_cases=_dedent("""
assert flatten_once([[1,2],[3,4],[5]]) == [1,2,3,4,5]
assert flatten_once([[1],[2,3],[4,5,6]]) == [1,2,3,4,5,6]
assert flatten_once([]) == []
"""),
ground_truth="[1,2,3,4,5]",
level=1, category="basics", time_complexity="O(n)", space_complexity="O(n)",
),
CodeProblem(
question="Write `def count_vowels(s: str) -> int` that counts vowels (a,e,i,o,u) case-insensitively.",
thinking="Convert to lower, count chars in vowel set.",
answer=_dedent("""
def count_vowels(s: str) -> int:
return sum(1 for c in s.lower() if c in 'aeiou')
"""),
test_cases=_dedent("""
assert count_vowels("Hello World") == 3
assert count_vowels("AEIOU") == 5
assert count_vowels("xyz") == 0
"""),
ground_truth="3",
level=1, category="basics", time_complexity="O(n)", space_complexity="O(1)",
),
CodeProblem(
question="Write `def rotate_list(lst: list, k: int) -> list` that rotates a list right by k positions.",
thinking="k can be > len. Use k % len to normalize. Slice: lst[-k:] + lst[:-k].",
answer=_dedent("""
def rotate_list(lst: list, k: int) -> list:
if not lst:
return lst
k = k % len(lst)
return lst[-k:] + lst[:-k] if k else lst[:]
"""),
test_cases=_dedent("""
assert rotate_list([1,2,3,4,5], 2) == [4,5,1,2,3]
assert rotate_list([1,2,3], 0) == [1,2,3]
assert rotate_list([1,2,3], 4) == [3,1,2]
"""),
ground_truth="[4,5,1,2,3]",
level=1, category="basics", time_complexity="O(n)", space_complexity="O(n)",
),
]
# ─── Level 2: Data Structures ─────────────────────────────────────────────────
L2_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Write `def lru_cache_impl(capacity: int)` as a class `LRUCache` with `get(key)` and `put(key, value)` both O(1). Use OrderedDict.",
thinking=_dedent("""
OrderedDict maintains insertion order. On get/put, move key to end (most recent).
On put when at capacity, pop first item (least recently used).
get returns -1 if key absent.
"""),
answer=_dedent("""
from collections import OrderedDict
class LRUCache:
def __init__(self, capacity: int):
self.cap = capacity
self.cache: OrderedDict[int, int] = OrderedDict()
def get(self, key: int) -> int:
if key not in self.cache:
return -1
self.cache.move_to_end(key)
return self.cache[key]
def put(self, key: int, value: int) -> None:
if key in self.cache:
self.cache.move_to_end(key)
self.cache[key] = value
if len(self.cache) > self.cap:
self.cache.popitem(last=False)
"""),
test_cases=_dedent("""
cache = LRUCache(2)
cache.put(1, 1); cache.put(2, 2)
assert cache.get(1) == 1
cache.put(3, 3)
assert cache.get(2) == -1
cache.put(4, 4)
assert cache.get(1) == -1
assert cache.get(3) == 3
assert cache.get(4) == 4
"""),
ground_truth="-1",
level=2, category="data_structures", time_complexity="O(1)", space_complexity="O(capacity)",
),
CodeProblem(
question="Write `def valid_parentheses(s: str) -> bool` that returns True if brackets are balanced. Handle '(', ')', '{', '}', '[', ']'.",
thinking=_dedent("""
Use a stack. For each char:
- Opening bracket → push
- Closing bracket → check stack top is matching opening
Empty stack at end = balanced.
"""),
answer=_dedent("""
def valid_parentheses(s: str) -> bool:
stack: list[str] = []
matching = {')': '(', '}': '{', ']': '['}
for ch in s:
if ch in '({[':
stack.append(ch)
elif ch in matching:
if not stack or stack[-1] != matching[ch]:
return False
stack.pop()
return len(stack) == 0
"""),
test_cases=_dedent("""
assert valid_parentheses("()[]{}") == True
assert valid_parentheses("([{}])") == True
assert valid_parentheses("(]") == False
assert valid_parentheses("{[}]") == False
assert valid_parentheses("") == True
"""),
ground_truth="true",
level=2, category="data_structures", time_complexity="O(n)", space_complexity="O(n)",
),
CodeProblem(
question="Write `def top_k_frequent(nums: list[int], k: int) -> list[int]` that returns k most frequent elements.",
thinking=_dedent("""
Count frequencies with Counter.
Use heap or just sorted.
sorted(count.items(), key=lambda x: -x[1])[:k] gives top-k.
For O(n log k) use heapq.nlargest.
"""),
answer=_dedent("""
from collections import Counter
import heapq
def top_k_frequent(nums: list[int], k: int) -> list[int]:
count = Counter(nums)
return [x for x, _ in heapq.nlargest(k, count.items(), key=lambda p: p[1])]
"""),
test_cases=_dedent("""
assert set(top_k_frequent([1,1,1,2,2,3], 2)) == {1, 2}
assert top_k_frequent([1], 1) == [1]
"""),
ground_truth="[1,2]",
level=2, category="data_structures", time_complexity="O(n log k)", space_complexity="O(n)",
),
]
# ─── Level 3: String Manipulation ─────────────────────────────────────────────
L3_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Write `def longest_palindromic_substring(s: str) -> str` using Expand-Around-Center O(n²).",
thinking=_dedent("""
For each center (and between chars), expand outward while chars match.
Track max palindrome seen. Handle both odd and even length palindromes.
"""),
answer=_dedent("""
def longest_palindromic_substring(s: str) -> str:
def expand(l: int, r: int) -> str:
while l >= 0 and r < len(s) and s[l] == s[r]:
l -= 1; r += 1
return s[l+1:r]
best = ""
for i in range(len(s)):
odd = expand(i, i)
even = expand(i, i+1)
if len(odd) > len(best): best = odd
if len(even) > len(best): best = even
return best
"""),
test_cases=_dedent("""
assert longest_palindromic_substring("babad") in ("bab", "aba")
assert longest_palindromic_substring("cbbd") == "bb"
assert longest_palindromic_substring("a") == "a"
assert longest_palindromic_substring("racecar") == "racecar"
"""),
ground_truth='"racecar"',
level=3, category="strings", time_complexity="O(n²)", space_complexity="O(1)",
),
CodeProblem(
question="Write `def group_anagrams(words: list[str]) -> list[list[str]]` that groups anagrams together.",
thinking="Sort each word as key, group by key using defaultdict(list).",
answer=_dedent("""
from collections import defaultdict
def group_anagrams(words: list[str]) -> list[list[str]]:
groups: dict[str, list[str]] = defaultdict(list)
for w in words:
key = ''.join(sorted(w))
groups[key].append(w)
return list(groups.values())
"""),
test_cases=_dedent("""
result = group_anagrams(["eat","tea","tan","ate","nat","bat"])
assert sorted(sorted(g) for g in result) == [['ate','eat','tea'],['bat'],['nat','tan']]
"""),
ground_truth='[["ate","eat","tea"],["bat"],["nat","tan"]]',
level=3, category="strings", time_complexity="O(n·k·log k)", space_complexity="O(n·k)",
),
]
# ─── Level 4: Sorting & Searching ─────────────────────────────────────────────
L4_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Implement `def merge_sort(arr: list[int]) -> list[int]` from scratch.",
thinking=_dedent("""
Divide: split array in half recursively until size 1.
Conquer: merge two sorted halves.
Merge: two-pointer comparison, append remainder.
"""),
answer=_dedent("""
def merge_sort(arr: list[int]) -> list[int]:
if len(arr) <= 1:
return arr
mid = len(arr) // 2
left = merge_sort(arr[:mid])
right = merge_sort(arr[mid:])
return _merge(left, right)
def _merge(a: list[int], b: list[int]) -> list[int]:
result: list[int] = []
i = j = 0
while i < len(a) and j < len(b):
if a[i] <= b[j]:
result.append(a[i]); i += 1
else:
result.append(b[j]); j += 1
return result + a[i:] + b[j:]
"""),
test_cases=_dedent("""
assert merge_sort([5,2,8,1,9,3]) == [1,2,3,5,8,9]
assert merge_sort([1]) == [1]
assert merge_sort([]) == []
assert merge_sort([3,1,4,1,5,9,2,6]) == sorted([3,1,4,1,5,9,2,6])
"""),
ground_truth="[1,2,3,5,8,9]",
level=4, category="sorting", time_complexity="O(n log n)", space_complexity="O(n)",
),
CodeProblem(
question="Write `def binary_search(arr: list[int], target: int) -> int` that returns the index or -1 if not found. Array is sorted.",
thinking=_dedent("""
Maintain lo, hi pointers. Mid = (lo+hi)//2.
If arr[mid]==target → found. If arr[mid]<target → search right. Else left.
"""),
answer=_dedent("""
def binary_search(arr: list[int], target: int) -> int:
lo, hi = 0, len(arr) - 1
while lo <= hi:
mid = (lo + hi) // 2
if arr[mid] == target:
return mid
elif arr[mid] < target:
lo = mid + 1
else:
hi = mid - 1
return -1
"""),
test_cases=_dedent("""
assert binary_search([1,3,5,7,9,11], 7) == 3
assert binary_search([1,3,5,7,9,11], 6) == -1
assert binary_search([], 1) == -1
assert binary_search([5], 5) == 0
"""),
ground_truth="3",
level=4, category="searching", time_complexity="O(log n)", space_complexity="O(1)",
),
]
# ─── Level 5: Dynamic Programming ─────────────────────────────────────────────
L5_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Write `def longest_common_subsequence(s1: str, s2: str) -> int` using DP bottom-up.",
thinking=_dedent("""
dp[i][j] = LCS of s1[:i] and s2[:j].
If s1[i-1]==s2[j-1]: dp[i][j] = dp[i-1][j-1] + 1
Else: dp[i][j] = max(dp[i-1][j], dp[i][j-1])
Time O(mn), Space O(mn) — can reduce to O(n).
"""),
answer=_dedent("""
def longest_common_subsequence(s1: str, s2: str) -> int:
m, n = len(s1), len(s2)
dp = [[0] * (n + 1) for _ in range(m + 1)]
for i in range(1, m + 1):
for j in range(1, n + 1):
if s1[i-1] == s2[j-1]:
dp[i][j] = dp[i-1][j-1] + 1
else:
dp[i][j] = max(dp[i-1][j], dp[i][j-1])
return dp[m][n]
"""),
test_cases=_dedent("""
assert longest_common_subsequence("abcde", "ace") == 3
assert longest_common_subsequence("abc", "abc") == 3
assert longest_common_subsequence("abc", "def") == 0
assert longest_common_subsequence("AGGTAB", "GXTXAYB") == 4
"""),
ground_truth="3",
level=5, category="dynamic_programming", time_complexity="O(mn)", space_complexity="O(mn)",
),
CodeProblem(
question="Write `def coin_change(coins: list[int], amount: int) -> int` — minimum number of coins to make amount, or -1 if impossible.",
thinking=_dedent("""
Classic unbounded knapsack DP.
dp[i] = min coins to make amount i.
dp[0] = 0, dp[i] = min(dp[i-c]+1) for c in coins if i>=c.
Initialize dp[1..amount] = infinity.
"""),
answer=_dedent("""
def coin_change(coins: list[int], amount: int) -> int:
dp = [float('inf')] * (amount + 1)
dp[0] = 0
for i in range(1, amount + 1):
for c in coins:
if c <= i:
dp[i] = min(dp[i], dp[i - c] + 1)
return int(dp[amount]) if dp[amount] != float('inf') else -1
"""),
test_cases=_dedent("""
assert coin_change([1,5,6,9], 11) == 2
assert coin_change([1,2,5], 11) == 3
assert coin_change([2], 3) == -1
assert coin_change([1], 0) == 0
"""),
ground_truth="3",
level=5, category="dynamic_programming", time_complexity="O(amount × len(coins))", space_complexity="O(amount)",
),
CodeProblem(
question="Write `def max_subarray(nums: list[int]) -> int` — maximum subarray sum (Kadane's Algorithm).",
thinking=_dedent("""
Kadane's: track current_sum and max_sum.
current_sum = max(num, current_sum + num) — restart if adding is worse.
max_sum = max(max_sum, current_sum).
"""),
answer=_dedent("""
def max_subarray(nums: list[int]) -> int:
if not nums:
return 0
current = best = nums[0]
for num in nums[1:]:
current = max(num, current + num)
best = max(best, current)
return best
"""),
test_cases=_dedent("""
assert max_subarray([-2,1,-3,4,-1,2,1,-5,4]) == 6
assert max_subarray([1]) == 1
assert max_subarray([-1,-2,-3]) == -1
assert max_subarray([5,4,-1,7,8]) == 23
"""),
ground_truth="6",
level=5, category="dynamic_programming", time_complexity="O(n)", space_complexity="O(1)",
),
CodeProblem(
question="Write `def knapsack_01(weights: list[int], values: list[int], capacity: int) -> int` — 0/1 Knapsack max value.",
thinking=_dedent("""
dp[i][w] = max value using first i items with capacity w.
If weight[i-1] > w: dp[i][w] = dp[i-1][w]
Else: dp[i][w] = max(dp[i-1][w], dp[i-1][w-weight[i-1]] + value[i-1])
"""),
answer=_dedent("""
def knapsack_01(weights: list[int], values: list[int], capacity: int) -> int:
n = len(weights)
dp = [[0] * (capacity + 1) for _ in range(n + 1)]
for i in range(1, n + 1):
for w in range(capacity + 1):
dp[i][w] = dp[i-1][w]
if weights[i-1] <= w:
dp[i][w] = max(dp[i][w],
dp[i-1][w - weights[i-1]] + values[i-1])
return dp[n][capacity]
"""),
test_cases=_dedent("""
assert knapsack_01([1,3,4,5],[1,4,5,7], 7) == 9
assert knapsack_01([2,3,4,5],[3,4,5,6], 8) == 10
assert knapsack_01([1],[10], 5) == 10
"""),
ground_truth="9",
level=5, category="dynamic_programming", time_complexity="O(n × capacity)", space_complexity="O(n × capacity)",
),
]
# ─── Level 6: Graph Algorithms ────────────────────────────────────────────────
L6_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Write `def bfs_shortest_path(graph: dict[int,list[int]], start: int, end: int) -> list[int]` — returns shortest path or [].",
thinking=_dedent("""
BFS explores nodes level by level → guarantees shortest path in unweighted graph.
Use deque for O(1) popleft. Track parent to reconstruct path.
visited set prevents cycles.
"""),
answer=_dedent("""
from collections import deque
def bfs_shortest_path(graph: dict[int, list[int]], start: int, end: int) -> list[int]:
if start == end:
return [start]
visited = {start}
parent: dict[int, int] = {}
queue = deque([start])
while queue:
node = queue.popleft()
for neighbor in graph.get(node, []):
if neighbor not in visited:
visited.add(neighbor)
parent[neighbor] = node
if neighbor == end:
path = []
cur = end
while cur != start:
path.append(cur)
cur = parent[cur]
path.append(start)
return path[::-1]
queue.append(neighbor)
return []
"""),
test_cases=_dedent("""
g = {1:[2,3], 2:[4], 3:[4,5], 4:[6], 5:[6], 6:[]}
path = bfs_shortest_path(g, 1, 6)
assert path[0] == 1 and path[-1] == 6
assert len(path) == 4 # shortest: 1→2→4→6 or 1→3→4→6
assert bfs_shortest_path(g, 1, 99) == []
"""),
ground_truth="[1,2,4,6]",
level=6, category="graphs", time_complexity="O(V+E)", space_complexity="O(V)",
),
CodeProblem(
question="Write `def num_islands(grid: list[list[str]]) -> int` — count connected components of '1's. Use DFS.",
thinking=_dedent("""
Iterate all cells. If '1', increment count and DFS to mark all connected '1's as '0'.
DFS explores 4 directions. No extra visited needed — modify grid in place.
"""),
answer=_dedent("""
def num_islands(grid: list[list[str]]) -> int:
if not grid:
return 0
rows, cols = len(grid), len(grid[0])
def dfs(r: int, c: int) -> None:
if r < 0 or r >= rows or c < 0 or c >= cols or grid[r][c] != '1':
return
grid[r][c] = '0'
dfs(r+1, c); dfs(r-1, c); dfs(r, c+1); dfs(r, c-1)
count = 0
for r in range(rows):
for c in range(cols):
if grid[r][c] == '1':
dfs(r, c)
count += 1
return count
"""),
test_cases=_dedent("""
g1 = [["1","1","0"],["0","1","0"],["0","0","1"]]
assert num_islands([row[:] for row in g1]) == 2
g2 = [["1","1","1"],["1","1","1"],["1","1","1"]]
assert num_islands([row[:] for row in g2]) == 1
"""),
ground_truth="2",
level=6, category="graphs", time_complexity="O(V+E)", space_complexity="O(V)",
),
CodeProblem(
question="Write `def topological_sort(n: int, edges: list[tuple[int,int]]) -> list[int]` — return topo order or [] if cycle. Kahn's algorithm.",
thinking=_dedent("""
Kahn's (BFS): compute in-degree for each node.
Start with all nodes with in-degree 0 in queue.
Pop node → add to result → decrement neighbors' in-degree → push if 0.
If result length < n → cycle detected.
"""),
answer=_dedent("""
from collections import deque, defaultdict
def topological_sort(n: int, edges: list[tuple[int,int]]) -> list[int]:
adj: dict[int, list[int]] = defaultdict(list)
in_degree = [0] * n
for u, v in edges:
adj[u].append(v)
in_degree[v] += 1
queue = deque(i for i in range(n) if in_degree[i] == 0)
result: list[int] = []
while queue:
node = queue.popleft()
result.append(node)
for nb in adj[node]:
in_degree[nb] -= 1
if in_degree[nb] == 0:
queue.append(nb)
return result if len(result) == n else []
"""),
test_cases=_dedent("""
assert topological_sort(4, [(0,1),(0,2),(1,3),(2,3)]) in [[0,1,2,3],[0,2,1,3]]
assert topological_sort(3, [(0,1),(1,2),(2,0)]) == [] # cycle
assert topological_sort(1, []) == [0]
"""),
ground_truth="[0,1,2,3]",
level=6, category="graphs", time_complexity="O(V+E)", space_complexity="O(V+E)",
),
]
# ─── Level 7: Mathematical Algorithms ────────────────────────────────────────
L7_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Write `def sieve_of_eratosthenes(n: int) -> list[int]` — return all primes ≤ n.",
thinking=_dedent("""
Create boolean array size n+1, init True.
Mark 0,1 as not prime. For p from 2 to √n:
if prime[p]: mark p², p²+p, p²+2p, ... as not prime.
Collect indices still True.
"""),
answer=_dedent("""
def sieve_of_eratosthenes(n: int) -> list[int]:
if n < 2:
return []
is_prime = [True] * (n + 1)
is_prime[0] = is_prime[1] = False
p = 2
while p * p <= n:
if is_prime[p]:
for multiple in range(p * p, n + 1, p):
is_prime[multiple] = False
p += 1
return [i for i in range(2, n + 1) if is_prime[i]]
"""),
test_cases=_dedent("""
assert sieve_of_eratosthenes(10) == [2,3,5,7]
assert sieve_of_eratosthenes(1) == []
assert sieve_of_eratosthenes(30) == [2,3,5,7,11,13,17,19,23,29]
"""),
ground_truth="[2,3,5,7]",
level=7, category="math_algorithms", time_complexity="O(n log log n)", space_complexity="O(n)",
),
CodeProblem(
question="Write `def fast_power(base: int, exp: int, mod: int) -> int` — compute (base^exp) % mod using binary exponentiation.",
thinking=_dedent("""
Binary exponentiation: if exp is odd, multiply base once, then square base and halve exp.
Result = (result * base) % mod when bit is set.
O(log exp) multiplications.
"""),
answer=_dedent("""
def fast_power(base: int, exp: int, mod: int) -> int:
result = 1
base %= mod
while exp > 0:
if exp & 1:
result = result * base % mod
base = base * base % mod
exp >>= 1
return result
"""),
test_cases=_dedent("""
assert fast_power(2, 10, 1000) == 24
assert fast_power(3, 0, 7) == 1
assert fast_power(2, 100, 1_000_000_007) == pow(2, 100, 1_000_000_007)
"""),
ground_truth="24",
level=7, category="math_algorithms", time_complexity="O(log exp)", space_complexity="O(1)",
),
]
# ─── Level 8: Competitive Programming ────────────────────────────────────────
L8_PROBLEMS: list[CodeProblem] = [
CodeProblem(
question="Write `def sliding_window_max(nums: list[int], k: int) -> list[int]` — max in each window of size k. O(n) using deque.",
thinking=_dedent("""
Use monotonic deque (decreasing). Store indices.
For each index i:
1. Remove from front if out of window (index <= i-k)
2. Remove from back while nums[back] <= nums[i] (smaller elements useless)
3. Append i
4. If i >= k-1, front of deque is max of current window.
"""),
answer=_dedent("""
from collections import deque
def sliding_window_max(nums: list[int], k: int) -> list[int]:
result: list[int] = []
dq: deque[int] = deque() # indices, decreasing values
for i, num in enumerate(nums):
while dq and dq[0] <= i - k:
dq.popleft()
while dq and nums[dq[-1]] <= num:
dq.pop()
dq.append(i)
if i >= k - 1:
result.append(nums[dq[0]])
return result
"""),
test_cases=_dedent("""
assert sliding_window_max([1,3,-1,-3,5,3,6,7], 3) == [3,3,5,5,6,7]
assert sliding_window_max([1], 1) == [1]
assert sliding_window_max([9,8,7,6,5], 2) == [9,8,7,6]
"""),
ground_truth="[3,3,5,5,6,7]",
level=8, category="competitive", time_complexity="O(n)", space_complexity="O(k)",
),
CodeProblem(
question="Write `def trapping_rain_water(height: list[int]) -> int` — total water trapped. Two-pointer O(n) O(1).",
thinking=_dedent("""
Two pointers l=0, r=n-1. Track max_left, max_right.
Move the pointer with smaller height:
- If height[l] < height[r]:
water += max(0, max_left - height[l])
max_left = max(max_left, height[l])
l++
- Else mirror for right side.
Water = bounded by the shorter side.
"""),
answer=_dedent("""
def trapping_rain_water(height: list[int]) -> int:
l, r = 0, len(height) - 1
max_l = max_r = water = 0
while l < r:
if height[l] < height[r]:
if height[l] >= max_l:
max_l = height[l]
else:
water += max_l - height[l]
l += 1
else:
if height[r] >= max_r:
max_r = height[r]
else:
water += max_r - height[r]
r -= 1
return water
"""),
test_cases=_dedent("""
assert trapping_rain_water([0,1,0,2,1,0,1,3,2,1,2,1]) == 6
assert trapping_rain_water([4,2,0,3,2,5]) == 9
assert trapping_rain_water([1,1]) == 0
assert trapping_rain_water([3,0,2,0,4]) == 7
"""),
ground_truth="6",
level=8, category="competitive", time_complexity="O(n)", space_complexity="O(1)",
),
CodeProblem(
question="Write `def word_break(s: str, word_dict: list[str]) -> bool` — can s be segmented using words in dict? DP O(n²).",
thinking=_dedent("""
dp[i] = True if s[:i] can be segmented.
dp[0] = True (empty string).
For i in 1..n: dp[i] = any(dp[j] and s[j:i] in word_set for j in 0..i)
Use set for O(1) word lookup.
"""),
answer=_dedent("""
def word_break(s: str, word_dict: list[str]) -> bool:
word_set = set(word_dict)
n = len(s)
dp = [False] * (n + 1)
dp[0] = True
for i in range(1, n + 1):
for j in range(i):
if dp[j] and s[j:i] in word_set:
dp[i] = True
break
return dp[n]
"""),
test_cases=_dedent("""
assert word_break("leetcode", ["leet","code"]) == True
assert word_break("applepenapple", ["apple","pen"]) == True
assert word_break("catsandog", ["cats","dog","sand","and","cat"]) == False
"""),
ground_truth="true",
level=8, category="competitive", time_complexity="O(n²)", space_complexity="O(n)",
),
]
# ─── Wrap into training format ────────────────────────────────────────────────
def _to_training_record(prob: CodeProblem) -> dict:
"""แปลง CodeProblem → training record สำหรับ GRPO / SFT"""
system = (
"You are an expert Python programmer. Think step-by-step in <think>...</think>, "
"then provide the complete, correct Python solution in <answer>...</answer>.\n"
"Include type hints and follow PEP 8."
)
full_answer = (
f"<think>\n{prob.thinking}\n"
f"Time: {prob.time_complexity}, Space: {prob.space_complexity}\n</think>\n"
f"<answer>\n{prob.answer}\n</answer>"
)
return {
"question": prob.question,
"thinking": prob.thinking,
"answer": full_answer,
"test_cases": prob.test_cases,
"ground_truth": prob.ground_truth,
"level": prob.level,
"category": prob.category,
"time_complexity": prob.time_complexity,
"lang": prob.lang,
"source": prob.source,
"topic": prob.topic,
"context": f"<system>{system}</system>",
}
def generate_code_dataset(output_path: Path = CODE_OUTPUT) -> int:
all_problems = (
L1_PROBLEMS + L2_PROBLEMS + L3_PROBLEMS + L4_PROBLEMS +
L5_PROBLEMS + L6_PROBLEMS + L7_PROBLEMS + L8_PROBLEMS
)
total = 0
with open(output_path, "w", encoding="utf-8") as f:
for prob in all_problems:
record = _to_training_record(prob)
f.write(json.dumps(record, ensure_ascii=False) + "\n")
total += 1
cats = {}
for p in all_problems:
cats[p.category] = cats.get(p.category, 0) + 1
print(f"Code dataset: {total} problems")
for cat, cnt in sorted(cats.items()):
print(f" {cat}: {cnt}")
print(f"→ {output_path}")
return total
if __name__ == "__main__":
generate_code_dataset()

Xet Storage Details

Size:
36.2 kB
·
Xet hash:
02048edfb18dd39753070139f1616eb592d1d6743d3e63569f00cf1ea600aefe

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.