Spaces:
Running
Running
Spec-compliance overhaul: remove difficulty_multiplier, weighted blend scoring, dep_hard fix, [END] format
# server/datasets/dependency_cases.py
#
# Ground truth cases for PyTorch Migration Time-Machine tasks.
#
# CRITICAL FIX:
# dep_hard previously had:
#     done_conditions: {min_actions: 2, required_sequence: ['migrate_api', 'migrate_api']}
#
# This caused a cascade of three problems:
#   1. The agent called migrate_api once. Router checked Counter: needs 2, has 1 → not done.
#   2. Agent called migrate_api again → repetition_penalty fires (-0.20), tanking the score.
#   3. Episode only ends at max_steps with a broken accumulated score.
#
# FIX: dep_hard now uses min_actions=1, required_sequence=['migrate_api'].
# The task is already hard enough from the grader — complex checklist, ordering
# constraints, and exact token matching in fix_quality. The done condition
# should not add extra difficulty on top of this.
#
# DATA FIX (dep_medium_003): the original initial pins
#     transformers==4.20.0, tokenizers==0.11.0, datasets==1.18.0
# satisfied every constraint in the case's own compatibility_matrix, so there
# was nothing to resolve — unlike dep_medium_001/002, whose initial states
# genuinely violate their matrices. tokenizers is now pinned to 0.14.0, which
# violates transformers==4.20.0 ('<0.14'), datasets==1.18.0 ('<=0.13'), and
# tokenizers==0.14.0's own floor ('transformers >=4.25'), producing a real
# three-way conflict with a clean resolution (4.35.0 / 0.14.0+ / 2.14.0).
#
# ALL dep_easy, dep_medium, dep_hard done conditions verified below.
DEPENDENCY_CASES = {
    # ── DEP EASY ─────────────────────────────────────────────────────────
    # Task: flag outdated packages and deprecated API usage.
    # Done: after 1 flag_outdated action.
    # Grader: F1 on packages (precision+recall) × 0.55 + deprecated_api_match × 0.45
    # ─────────────────────────────────────────────────────────────────────
    'dep_easy': [
        {
            'case_id': 'dep_easy_001',
            'task_subtype': 'flag',
            'completion_threshold': 0.75,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['flag_outdated']},
            'task_description': (
                'This codebase uses torch==1.9.0 and relies on torch.autograd.Variable. '
                'Flag all outdated packages and the deprecated API.'
            ),
            'code_snippet': (
                'import torch\n'
                'from torch.autograd import Variable\n'
                'x = Variable(torch.randn(3, 4))\n'
                'model = torch.nn.Linear(4, 2)\n'
                'out = model(x)'
            ),
            'requirements': {'torch': '1.9.0', 'torchvision': '0.10.0'},
            'expected_outdated_packages': ['torch', 'torchvision'],
            'expected_deprecated_api': 'torch.autograd.Variable',
            'expected_replacement': 'plain tensor with requires_grad=True',
        },
        {
            'case_id': 'dep_easy_002',
            'task_subtype': 'flag',
            'completion_threshold': 0.75,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['flag_outdated']},
            'task_description': (
                'This codebase uses torch==1.4.0 and calls .cuda() directly. '
                'Flag outdated packages and the deprecated device assignment pattern.'
            ),
            'code_snippet': (
                'import torch\n'
                'model = MyModel()\n'
                'model.cuda()  # deprecated — use .to(device)\n'
                'tensor = torch.randn(2, 3).cuda()'
            ),
            'requirements': {'torch': '1.4.0'},
            'expected_outdated_packages': ['torch'],
            'expected_deprecated_api': '.cuda()',
            'expected_replacement': '.to(device)',
        },
        {
            'case_id': 'dep_easy_003',
            'task_subtype': 'flag',
            'completion_threshold': 0.75,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['flag_outdated']},
            'task_description': (
                'This codebase uses torch==1.7.0 with DataParallel. '
                'Flag the outdated package and the deprecated multi-GPU API.'
            ),
            'code_snippet': (
                'import torch\n'
                'model = torch.nn.DataParallel(MyModel())\n'
                'model.cuda()'
            ),
            'requirements': {'torch': '1.7.0', 'numpy': '1.18.0'},
            'expected_outdated_packages': ['torch', 'numpy'],
            'expected_deprecated_api': 'torch.nn.DataParallel',
            'expected_replacement': 'DistributedDataParallel',
        },
        {
            'case_id': 'dep_easy_004',
            'task_subtype': 'flag',
            'completion_threshold': 0.75,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['flag_outdated']},
            'task_description': (
                'Flag outdated packages and the deprecated ONNX export API in this code.'
            ),
            'code_snippet': (
                'import torch\n'
                'torch.onnx.export(model, dummy_input, "model.onnx",\n'
                '                  opset_version=9,\n'
                '                  enable_onnx_checker=True)  # deprecated kwarg'
            ),
            'requirements': {'torch': '1.8.0'},
            'expected_outdated_packages': ['torch'],
            'expected_deprecated_api': 'enable_onnx_checker',
            'expected_replacement': 'remove the kwarg (deprecated in 1.9, removed in 2.0)',
        },
        {
            'case_id': 'dep_easy_005',
            'task_subtype': 'flag',
            'completion_threshold': 0.75,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['flag_outdated']},
            'task_description': (
                'Flag outdated packages and the deprecated autocast API.'
            ),
            'code_snippet': (
                'import torch\n'
                'from torch.cuda.amp import autocast\n'
                'with autocast():  # deprecated import path\n'
                '    output = model(input)'
            ),
            'requirements': {'torch': '1.6.0', 'torchaudio': '0.6.0'},
            'expected_outdated_packages': ['torch', 'torchaudio'],
            'expected_deprecated_api': 'torch.cuda.amp.autocast',
            'expected_replacement': 'torch.amp.autocast',
        },
    ],
    # ── DEP MEDIUM ────────────────────────────────────────────────────────
    # Task: resolve version conflicts using the compatibility_matrix.
    # Done: after 1 resolve_conflict action.
    # Grader: valid_pkgs/conflict_count + cross-constraint check - downgrade penalty
    #
    # Invariant for every medium case: the initial 'requirements' pins MUST
    # violate at least one constraint in the case's own compatibility_matrix,
    # otherwise there is no conflict and the grader divides by a conflict
    # count the agent cannot satisfy meaningfully.
    # ─────────────────────────────────────────────────────────────────────
    'dep_medium': [
        {
            'case_id': 'dep_medium_001',
            'task_subtype': 'resolve',
            'completion_threshold': 0.70,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['resolve_conflict']},
            'task_description': (
                'Resolve the version conflict between torch, numpy, and protobuf. '
                'Use the compatibility_matrix to find a compatible set of versions.'
            ),
            'code_snippet': 'requirements.txt with conflicting torch==2.0.0, numpy==1.20.0, protobuf==3.9.0',
            'requirements': {'torch': '2.0.0', 'numpy': '1.20.0', 'protobuf': '3.9.0'},
            'conflict_packages': ['torch', 'numpy', 'protobuf'],
            # Conflict: torch 2.0.0 needs protobuf >=3.19, but 3.9.0 is pinned
            # (and protobuf 3.9.0 itself demands torch <=1.13).
            'compatibility_matrix': {
                'torch': {
                    '2.1.0': {'numpy': '>=1.21,<2.0', 'protobuf': '>=3.20,<5.0'},
                    '2.0.0': {'numpy': '>=1.20,<1.25', 'protobuf': '>=3.19,<4.0'},
                },
                'numpy': {
                    '1.24.0': {},
                    '1.21.0': {},
                    '1.20.0': {},
                },
                'protobuf': {
                    '4.23.0': {},
                    '3.20.0': {},
                    '3.9.0': {'torch': '<=1.13'},
                },
            },
        },
        {
            'case_id': 'dep_medium_002',
            'task_subtype': 'resolve',
            'completion_threshold': 0.70,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['resolve_conflict']},
            'task_description': (
                'Resolve the version conflict between tensorflow, keras, and h5py.'
            ),
            'code_snippet': 'requirements.txt: tensorflow==2.10.0, keras==2.10.0, h5py==2.10.0',
            'requirements': {'tensorflow': '2.10.0', 'keras': '2.10.0', 'h5py': '2.10.0'},
            'conflict_packages': ['tensorflow', 'keras', 'h5py'],
            # Conflict: tensorflow 2.10.0 needs h5py >=3.1, but 2.10.0 is pinned
            # (and h5py 2.10.0 itself demands tensorflow <=2.3).
            'compatibility_matrix': {
                'tensorflow': {
                    '2.13.0': {'keras': '>=2.13,<2.14', 'h5py': '>=3.7'},
                    '2.10.0': {'keras': '==2.10.0', 'h5py': '>=3.1'},
                },
                'keras': {
                    '2.13.0': {'tensorflow': '>=2.13,<2.14'},
                    '2.10.0': {'tensorflow': '==2.10.0'},
                },
                'h5py': {
                    '3.9.0': {},
                    '3.7.0': {},
                    '2.10.0': {'tensorflow': '<=2.3'},
                },
            },
        },
        {
            'case_id': 'dep_medium_003',
            'task_subtype': 'resolve',
            'completion_threshold': 0.70,
            'max_steps': 4,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['resolve_conflict']},
            'task_description': (
                'Resolve the conflict between transformers, tokenizers, and datasets packages.'
            ),
            # FIXED: tokenizers was pinned at 0.11.0, which satisfied every
            # matrix constraint — no conflict existed. 0.14.0 conflicts with
            # transformers 4.20.0 ('<0.14') and datasets 1.18.0 ('<=0.13'),
            # and itself requires transformers >=4.25.
            'code_snippet': 'requirements: transformers==4.20.0, tokenizers==0.14.0, datasets==1.18.0',
            'requirements': {'transformers': '4.20.0', 'tokenizers': '0.14.0', 'datasets': '1.18.0'},
            'conflict_packages': ['transformers', 'tokenizers', 'datasets'],
            'compatibility_matrix': {
                'transformers': {
                    '4.35.0': {'tokenizers': '>=0.14,<0.19', 'datasets': '>=2.14'},
                    '4.20.0': {'tokenizers': '>=0.11,<0.14', 'datasets': '>=1.18'},
                },
                'tokenizers': {
                    '0.15.0': {'transformers': '>=4.28'},
                    '0.14.0': {'transformers': '>=4.25'},
                    '0.11.0': {},
                },
                'datasets': {
                    '2.14.0': {},
                    '2.10.0': {},
                    '1.18.0': {'tokenizers': '<=0.13'},
                },
            },
        },
    ],
    # ── DEP HARD ──────────────────────────────────────────────────────────
    # Task: fix torch.compile graph-break patterns.
    # Done: after 1 migrate_api action (FIXED from 2 → 1).
    #
    # IMPORTANT: min_actions=1, required_sequence=['migrate_api']
    # The grader already makes this hard through:
    #   - Multiple graph_breaks to fix (3-5 per case)
    #   - Ordering constraints via checklist_dependency_graph
    #   - Exact token matching in fix_quality
    # We do NOT need the done condition to create artificial difficulty.
    # ─────────────────────────────────────────────────────────────────────
    'dep_hard': [
        {
            'case_id': 'dep_hard_001',
            'task_subtype': 'migrate',
            'completion_threshold': 0.60,
            'max_steps': 6,
            # FIXED: was min_actions=2, required_sequence=['migrate_api','migrate_api']
            # which caused repetition penalty on the 2nd call and never terminated cleanly
            'done_conditions': {'min_actions': 1, 'required_sequence': ['migrate_api']},
            'task_description': (
                'Fix the torch.compile graph-break patterns in this training loop. '
                'Provide completed_items (list of break IDs) and code_changes (dict of fixes).'
            ),
            'code_snippet': (
                'import torch\n\n'
                'def train_step(model, x):\n'
                '    out = model(x)\n'
                '    if out.shape[0] != x.shape[0]:  # data-dependent branch [break_001]\n'
                '        out = torch.zeros_like(x)\n'
                '    idx = int(out.argmax())  # int() conversion [break_002]\n'
                '    mask = out > 0.5  # dynamic masking [break_003]\n'
                '    return out[mask].sum()\n'
            ),
            'graph_break_report': [
                'break_001: data-dependent control flow (if out.shape[0] != x.shape[0])',
                'break_002: Python int() call on tensor (int(out.argmax()))',
                'break_003: dynamic boolean indexing (out[mask])',
            ],
            'graph_breaks': ['break_001', 'break_002', 'break_003'],
            'checklist_dependency_graph': {
                'break_003': ['break_002'],  # must fix int() conversion before mask
            },
            'correct_fix_map': {
                'break_001': 'torch.where',
                'break_002': 'torch.argmax',
                'break_003': 'torch.masked_select',
            },
        },
        {
            'case_id': 'dep_hard_002',
            'task_subtype': 'migrate',
            'completion_threshold': 0.60,
            'max_steps': 6,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['migrate_api']},
            'task_description': (
                'Fix these torch.compile graph-breaks in a model forward pass.'
            ),
            'code_snippet': (
                'def forward(self, x):\n'
                '    x = self.conv(x)\n'
                '    size = x.size(0)  # .size() with int [break_001]\n'
                '    out = x.numpy()  # .numpy() call [break_002]\n'
                '    out = torch.from_numpy(out)\n'
                '    return out[:size//2]  # dynamic slice [break_003]\n'
            ),
            'graph_break_report': [
                'break_001: .size() call returning Python int',
                'break_002: .numpy() call breaks compilation boundary',
                'break_003: dynamic slicing with Python division',
            ],
            'graph_breaks': ['break_001', 'break_002', 'break_003'],
            'checklist_dependency_graph': {
                'break_003': ['break_001'],
            },
            'correct_fix_map': {
                'break_001': 'tensor.shape[0]',
                'break_002': 'detach',
                'break_003': 'torch.narrow',
            },
        },
        {
            'case_id': 'dep_hard_003',
            'task_subtype': 'migrate',
            'completion_threshold': 0.60,
            'max_steps': 6,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['migrate_api']},
            'task_description': (
                'Fix torch.compile graph-breaks in this attention implementation.'
            ),
            'code_snippet': (
                'def attention(q, k, v):\n'
                '    scores = torch.matmul(q, k.transpose(-2, -1))\n'
                '    if scores.max() > 100:  # data-dependent branch [break_001]\n'
                '        scores = scores / 100\n'
                '    weights = scores.numpy()  # numpy call [break_002]\n'
                '    weights = torch.softmax(torch.tensor(weights), dim=-1)\n'
                '    n = int(q.shape[0])  # Python int [break_003]\n'
                '    return weights[:n] @ v\n'
            ),
            'graph_break_report': [
                'break_001: data-dependent branch on scores.max()',
                'break_002: .numpy() breaks torch.compile boundary',
                'break_003: Python int() on tensor dimension',
            ],
            'graph_breaks': ['break_001', 'break_002', 'break_003'],
            'checklist_dependency_graph': {
                'break_003': ['break_001'],
                'break_002': ['break_001'],
            },
            'correct_fix_map': {
                'break_001': 'torch.clamp',
                'break_002': 'torch.softmax',
                'break_003': 'tensor.shape',
            },
        },
        {
            'case_id': 'dep_hard_004',
            'task_subtype': 'migrate',
            'completion_threshold': 0.60,
            'max_steps': 6,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['migrate_api']},
            'task_description': (
                'Fix four torch.compile graph-breaks in this training utility.'
            ),
            'code_snippet': (
                'def process_batch(batch):\n'
                '    lengths = [len(x) for x in batch]  # Python list comp [break_001]\n'
                '    max_len = max(lengths)  # Python max() [break_002]\n'
                '    padded = torch.zeros(len(batch), max_len)\n'
                '    for i, x in enumerate(batch):  # Python loop [break_003]\n'
                '        padded[i, :len(x)] = x\n'
                '    out = model(padded)\n'
                '    return out.cpu().numpy()  # .numpy() [break_004]\n'
            ),
            'graph_break_report': [
                'break_001: Python list comprehension over tensor data',
                'break_002: Python max() on list of tensor values',
                'break_003: Python for loop with tensor indexing',
                'break_004: .numpy() call at output',
            ],
            'graph_breaks': ['break_001', 'break_002', 'break_003', 'break_004'],
            'checklist_dependency_graph': {
                'break_002': ['break_001'],
                'break_003': ['break_002'],
            },
            'correct_fix_map': {
                'break_001': 'torch.tensor',
                'break_002': 'torch.max',
                'break_003': 'torch.nn.utils.rnn.pad_sequence',
                'break_004': 'detach',
            },
        },
        {
            'case_id': 'dep_hard_005',
            'task_subtype': 'migrate',
            'completion_threshold': 0.60,
            'max_steps': 6,
            'done_conditions': {'min_actions': 1, 'required_sequence': ['migrate_api']},
            'task_description': (
                'Fix torch.compile graph-breaks caused by vmap incompatibilities.'
            ),
            'code_snippet': (
                'from torch._vmap_internals import vmap  # deprecated [break_001]\n'
                'import functorch  # deprecated module [break_002]\n\n'
                'def batched_fn(x):\n'
                '    result = vmap(model)(x)\n'
                '    if result.isnan().any():  # data-dependent check [break_003]\n'
                '        result = torch.zeros_like(result)\n'
                '    return result\n'
            ),
            'graph_break_report': [
                'break_001: torch._vmap_internals.vmap is deprecated (use torch.vmap)',
                'break_002: functorch module is deprecated (merged into torch)',
                'break_003: data-dependent .any() check breaks compilation',
            ],
            'graph_breaks': ['break_001', 'break_002', 'break_003'],
            'checklist_dependency_graph': {
                'break_002': ['break_001'],
            },
            'correct_fix_map': {
                'break_001': 'torch.vmap',
                'break_002': 'torch.func',
                'break_003': 'torch.where',
            },
        },
    ],
}