Spaces:
Running
Running
| """ | |
| Task and Skill Bank for the Skill Invocation Environment. | |
| Contains 10 synthetic task/skill pairs plus 3 tasks adapted from SkillsBench, | |
| covering fictional APIs, data formats, coding standards, deployment procedures, | |
| query languages, flood analysis, economics detrending, and dialogue parsing. | |
| Verifiers use code execution and structural validation — not keyword matching. | |
| """ | |
| import binascii | |
| import hashlib | |
| import hmac | |
| import base64 | |
| import re | |
| import struct | |
| from typing import Callable | |
| try: | |
| import yaml as _yaml | |
| HAS_YAML = True | |
| except ImportError: | |
| HAS_YAML = False | |
| # --------------------------------------------------------------------------- | |
| # Verifier helpers | |
| # --------------------------------------------------------------------------- | |
| def _strip_markdown_fences(code: str) -> str: | |
| """Remove markdown code fences if present.""" | |
| code = code.strip() | |
| # Match ```python ... ``` or ``` ... ``` | |
| match = re.search(r'```(?:python)?\s*\n(.*?)```', code, re.DOTALL) | |
| if match: | |
| return match.group(1) | |
| # Also handle case where entire answer is fenced | |
| if code.startswith("```"): | |
| lines = code.split("\n") | |
| # Remove first and last fence lines | |
| lines = [l for l in lines if not l.strip().startswith("```")] | |
| return "\n".join(lines) | |
| return code | |
# Stdlib modules pre-imported into the exec() namespace so agent-written
# code can use them without supplying its own import statements.
_SAFE_IMPORTS = "import hmac, hashlib, base64, struct, json, re, binascii, math"
def _exec_verifier(func_name: str, test_cases: list[dict]) -> Callable[[str], bool]:
    """
    Build a verifier that executes the agent's code and tests ``func_name``.

    Each test case is a dict {"args": [...], "kwargs": {...}, "check": callable}
    where check(result) -> bool. The returned verifier never raises: any
    missing function, failed check, or exception yields False.
    """
    def verify(answer: str) -> bool:
        ns: dict = {}
        try:
            # NOTE: exec of untrusted agent code — only acceptable inside a
            # sandboxed evaluation environment.
            exec(_SAFE_IMPORTS, ns)
            exec(_strip_markdown_fences(answer), ns)
            if func_name not in ns:
                return False
            target = ns[func_name]
            return all(
                case["check"](target(*case.get("args", []), **case.get("kwargs", {})))
                for case in test_cases
            )
        except Exception:
            # Any failure in agent code or checks counts as an incorrect answer.
            return False
    return verify
| def _structural_verifier(checks: list[Callable[[str], bool]]) -> Callable[[str], bool]: | |
| """All structural checks must pass.""" | |
| def verify(answer: str) -> bool: | |
| return all(check(answer) for check in checks) | |
| return verify | |
| def _multi_part_verifier(part_checks: list[Callable[[str], bool]]) -> Callable[[str], bool]: | |
| """Verify answer has multiple parts, each passing its check.""" | |
| def verify(answer: str) -> bool: | |
| return all(check(answer) for check in part_checks) | |
| return verify | |
| # --------------------------------------------------------------------------- | |
| # Pre-computed test data for exec verifiers | |
| # --------------------------------------------------------------------------- | |
| # task_001: expected Zephyr-3 auth outputs | |
| def _expected_zephyr_auth(api_key: str, timestamp: int) -> dict: | |
| signing_string = f"{api_key}:{timestamp}" | |
| digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.sha256).digest() | |
| b64 = base64.b64encode(digest).decode() | |
| return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"} | |
# Test cases for task_001's exec verifier: positional args for the agent's
# encode_zephyr_auth plus a check(result) predicate.
_ZEPHYR_TEST_CASES = [
    {
        # Exact match against the reference implementation above.
        "args": ["test_key_123", 1700000000],
        "check": lambda result: (
            isinstance(result, dict)
            and "X-Zephyr-Auth" in result
            and result == _expected_zephyr_auth("test_key_123", 1700000000)
        ),
    },
    {
        # Second exact-match case with a different key/timestamp pair.
        "args": ["another_key", 1700000001],
        "check": lambda result: (
            isinstance(result, dict)
            and "X-Zephyr-Auth" in result
            and result == _expected_zephyr_auth("another_key", 1700000001)
        ),
    },
    {
        # Structural-only case: just verifies the ZPH prefix/suffix framing
        # for a degenerate key and timestamp 0.
        "args": ["k", 0],
        "check": lambda result: (
            isinstance(result, dict)
            and "X-Zephyr-Auth" in result
            and result["X-Zephyr-Auth"].startswith("ZPH k:")
            and result["X-Zephyr-Auth"].endswith(":0")
        ),
    },
]
| # task_002: NovaBin header test data | |
# 12-byte header body: magic + version + record count + flags (big-endian).
_NOVABIN_HEADER_RAW = (
    b'NOVB'  # magic
    + struct.pack('>H', 0x0201)  # version 2.1
    + struct.pack('>I', 42)  # 42 records
    + struct.pack('>H', 0b101)  # compressed + checksummed (bits 0 and 2)
)
# Bytes 12-15 of a NovaBin header are the CRC32 of bytes 0-11.
_NOVABIN_HEADER_CRC = binascii.crc32(_NOVABIN_HEADER_RAW) & 0xFFFFFFFF
_NOVABIN_HEADER = _NOVABIN_HEADER_RAW + struct.pack('>I', _NOVABIN_HEADER_CRC)
# Second test header: no flags, 1 record
_NOVABIN_HEADER2_RAW = (
    b'NOVB'
    + struct.pack('>H', 0x0201)
    + struct.pack('>I', 1)
    + struct.pack('>H', 0)  # no flags
)
_NOVABIN_HEADER2_CRC = binascii.crc32(_NOVABIN_HEADER2_RAW) & 0xFFFFFFFF
_NOVABIN_HEADER2 = _NOVABIN_HEADER2_RAW + struct.pack('>I', _NOVABIN_HEADER2_CRC)
# Test cases for task_002: the agent's parse_novabin_header must decode the
# version, record count, and the three flag bits from a 16-byte header.
_NOVABIN_HEADER_TEST_CASES = [
    {
        "args": [_NOVABIN_HEADER],
        "check": lambda r: (
            isinstance(r, dict)
            and r.get("version") in (0x0201, 513)  # same value, either spelling
            and r.get("record_count") == 42
            and r.get("compressed") is True
            and r.get("encrypted") is False
            and r.get("checksummed") is True
        ),
    },
    {
        # Flag-free header: all three booleans must come back False.
        "args": [_NOVABIN_HEADER2],
        "check": lambda r: (
            isinstance(r, dict)
            and r.get("record_count") == 1
            and r.get("compressed") is False
            and r.get("encrypted") is False
            and r.get("checksummed") is False
        ),
    },
]
| # task_008: NovaBin record test data | |
| def _build_test_record() -> bytes: | |
| """Build a test record: 2 fields — int32 'age'=25, string 'name'='Alice'.""" | |
| buf = bytearray() | |
| buf += struct.pack('>H', 2) # field count = 2 | |
| # Field 1: int32, name="age", value=25 | |
| buf += bytes([0x01]) # type tag int32 | |
| name1 = b"age" | |
| buf += struct.pack('>H', len(name1)) | |
| buf += name1 | |
| val1 = struct.pack('>i', 25) | |
| buf += struct.pack('>I', len(val1)) | |
| buf += val1 | |
| # Field 2: string, name="name", value="Alice" | |
| buf += bytes([0x03]) # type tag string | |
| name2 = b"name" | |
| buf += struct.pack('>H', len(name2)) | |
| buf += name2 | |
| val2 = b"Alice" | |
| buf += struct.pack('>I', len(val2)) | |
| buf += val2 | |
| return bytes(buf) | |
| def _build_test_record_2() -> bytes: | |
| """Build a test record: 1 field — bool 'active'=True.""" | |
| buf = bytearray() | |
| buf += struct.pack('>H', 1) | |
| buf += bytes([0x04]) # type tag bool | |
| name = b"active" | |
| buf += struct.pack('>H', len(name)) | |
| buf += name | |
| val = bytes([1]) # True | |
| buf += struct.pack('>I', len(val)) | |
| buf += val | |
| return bytes(buf) | |
# Canonical record fixtures, built once at import time.
_NOVABIN_RECORD = _build_test_record()
_NOVABIN_RECORD_2 = _build_test_record_2()
# Test cases for task_008's record parser. The checks expect a
# (fields_dict, bytes_consumed) tuple; the second positional arg (0) is
# presumably a start offset into the buffer — the consumed count must equal
# the full record length when parsing from 0.
_NOVABIN_RECORD_TEST_CASES = [
    {
        "args": [_NOVABIN_RECORD, 0],
        "check": lambda r: (
            isinstance(r, tuple) and len(r) == 2
            and isinstance(r[0], dict)
            and r[0].get("age") == 25
            and r[0].get("name") == "Alice"
            and isinstance(r[1], int) and r[1] == len(_NOVABIN_RECORD)
        ),
    },
    {
        "args": [_NOVABIN_RECORD_2, 0],
        "check": lambda r: (
            isinstance(r, tuple) and len(r) == 2
            and isinstance(r[0], dict)
            and r[0].get("active") is True
            and isinstance(r[1], int) and r[1] == len(_NOVABIN_RECORD_2)
        ),
    },
]
| # --------------------------------------------------------------------------- | |
| # YAML-based verifier for ArcDeploy config | |
| # --------------------------------------------------------------------------- | |
def _verify_arcdeploy_yaml(answer: str) -> bool:
    """Verify an ArcDeploy canary config answer (task_004).

    Parses the answer as YAML when PyYAML is available and validates the
    canary phase structure; otherwise falls back to regex-based checks.
    Returns True only when every structural requirement holds; a parse
    error or unexpected shape yields False rather than raising.
    """
    try:
        # Extract YAML block if wrapped in markdown fences
        yaml_text = answer
        if '```' in answer:
            # Take the first fenced block (```yaml ... ``` or plain ```).
            blocks = re.findall(r'```(?:yaml)?\s*\n(.*?)```', answer, re.DOTALL)
            if blocks:
                yaml_text = blocks[0]
        if HAS_YAML:
            config = _yaml.safe_load(yaml_text)
            if not isinstance(config, dict):
                return False
            canary = config.get("canary", {})
            phases = canary.get("phases", [])
            if len(phases) < 5:
                return False
            # First phase must be shadow with 0% traffic
            if phases[0].get("name") != "shadow" or phases[0].get("traffic_pct") != 0:
                return False
            # Last phase must be 100%
            if phases[-1].get("traffic_pct") != 100:
                return False
            # Traffic must be monotonically increasing
            traffic = [p.get("traffic_pct", 0) for p in phases]
            if traffic != sorted(traffic):
                return False
            # Must have metrics gates on non-100% phases
            for p in phases:
                if p.get("traffic_pct", 0) < 100 and "metrics_gate" not in p:
                    return False
            # Must have rollback config with auto: true
            rollback = canary.get("rollback", {})
            if not rollback.get("auto"):
                return False
            return True
        else:
            # Fallback: structural regex checks if yaml not installed
            return _arcdeploy_structural_fallback(answer)
    except Exception:
        # Broad catch is deliberate: a malformed answer must score False,
        # never crash the evaluation environment.
        return False
| def _arcdeploy_structural_fallback(answer: str) -> bool: | |
| """Structural fallback for ArcDeploy when yaml isn't available.""" | |
| checks = [ | |
| # Has shadow phase with traffic_pct: 0 | |
| lambda a: bool(re.search(r'name:\s*shadow', a)) and bool(re.search(r'traffic_pct:\s*0\b', a)), | |
| # Has at least canary_1 phase | |
| lambda a: bool(re.search(r'name:\s*canary_1', a)), | |
| # Has full phase with traffic_pct: 100 | |
| lambda a: bool(re.search(r'traffic_pct:\s*100', a)), | |
| # Has metrics gates with error_rate | |
| lambda a: bool(re.search(r'metrics_gate:.*error_rate', a)), | |
| # Has rollback with auto: true | |
| lambda a: bool(re.search(r'auto:\s*true', a, re.IGNORECASE)), | |
| # Has at least 5 phase entries | |
| lambda a: len(re.findall(r'-\s*name:', a)) >= 5, | |
| ] | |
| return all(c(answer) for c in checks) | |
| # --------------------------------------------------------------------------- | |
| # Skill Bank | |
| # --------------------------------------------------------------------------- | |
# Skill reference bank: maps skill_id -> {"name", "short_description",
# "full_content"}. Skills with long full_content are the specs tasks depend
# on; the short entries mostly serve as plausible distractors. The
# full_content strings are surfaced to the agent verbatim, so their text is
# load-bearing — do not edit casually.
SKILL_BANK: dict[str, dict] = {
    # --- Zephyr-3 API domain ---
    "skill_001": {
        "name": "Zephyr-3 API Authentication",
        "short_description": "Authentication protocol and header format for Zephyr-3 API endpoints.",
        "full_content": """## Zephyr-3 API Authentication Protocol
All Zephyr-3 API requests require HMAC-SHA256 authentication. The auth header must be constructed as follows:
1. Create a signing string: `{api_key}:{timestamp_ms}` where timestamp_ms is Unix epoch in milliseconds
2. Compute HMAC-SHA256 of the signing string using the api_key as the secret key
3. Base64-encode the HMAC digest
4. Set the header `X-Zephyr-Auth` to the format: `ZPH {api_key}:{base64_digest}:{timestamp_ms}`
Example:
```python
import hmac, hashlib, base64
def encode_zephyr_auth(api_key: str, timestamp: int) -> dict:
    signing_string = f"{api_key}:{timestamp}"
    digest = hmac.new(api_key.encode(), signing_string.encode(), hashlib.sha256).digest()
    b64 = base64.b64encode(digest).decode()
    return {"X-Zephyr-Auth": f"ZPH {api_key}:{b64}:{timestamp}"}
```
Note: Timestamps must be in milliseconds. Requests with timestamps older than 30 seconds are rejected.
""",
    },
    "skill_002": {
        "name": "Zephyr-3 Rate Limiting",
        "short_description": "Rate limiting rules and backoff strategies for Zephyr-3 API.",
        "full_content": "Zephyr-3 uses a token bucket rate limiter with 100 requests per minute per API key. When rate limited (HTTP 429), use exponential backoff starting at 1 second with a max of 32 seconds. Include the X-Zephyr-Retry-After header value if present.",
    },
    "skill_003": {
        "name": "Zephyr-3 Webhook Configuration",
        "short_description": "Setting up and managing webhook endpoints in Zephyr-3.",
        "full_content": "Webhooks in Zephyr-3 are configured via the /v1/webhooks endpoint. Each webhook requires a target URL, event types array, and an optional secret for signature verification. Events are delivered with at-least-once semantics.",
    },
    # --- NovaBin binary format domain ---
    "skill_004": {
        "name": "NovaBin File Format Specification",
        "short_description": "Binary format spec for NovaBin data files used in NovaDB exports.",
        "full_content": """## NovaBin Binary Format v2.1
NovaBin is a compact binary format for serializing structured records.
### Header (16 bytes):
- Bytes 0-3: Magic number `0x4E4F5642` ("NOVB")
- Bytes 4-5: Version (uint16, big-endian) — current is 0x0201
- Bytes 6-9: Record count (uint32, big-endian)
- Bytes 10-11: Flags (uint16) — bit 0: compressed, bit 1: encrypted, bit 2: checksummed
- Bytes 12-15: Header checksum (CRC32 of bytes 0-11)
### Record format:
- 2 bytes: field count (uint16, big-endian)
- For each field:
  - 1 byte: type tag (0x01=int32, 0x02=float64, 0x03=string, 0x04=bool)
  - 2 bytes: name length (uint16)
  - N bytes: field name (UTF-8)
  - 4 bytes: value length (uint32)
  - M bytes: value data
### Parsing example:
```python
import struct
def parse_novabin_header(data: bytes) -> dict:
    magic = data[0:4]
    assert magic == b'NOVB', f"Invalid magic: {magic}"
    version = struct.unpack('>H', data[4:6])[0]
    record_count = struct.unpack('>I', data[6:10])[0]
    flags = struct.unpack('>H', data[10:12])[0]
    checksum = struct.unpack('>I', data[12:16])[0]
    return {
        "version": version, "record_count": record_count,
        "compressed": bool(flags & 1), "encrypted": bool(flags & 2),
        "checksummed": bool(flags & 4), "checksum": checksum
    }
```
""",
    },
    "skill_005": {
        "name": "NovaBin Compression Options",
        "short_description": "Compression algorithms supported by NovaBin format.",
        "full_content": "NovaBin supports LZ4 (default) and Zstandard compression. When flag bit 0 is set, the record payload is compressed. The first byte of compressed data indicates the algorithm: 0x01 for LZ4, 0x02 for Zstandard.",
    },
    # --- HelixLang domain ---
    "skill_006": {
        "name": "HelixLang Error Handling Conventions",
        "short_description": "Error handling patterns and error code structure in HelixLang.",
        "full_content": """## HelixLang Error Handling Standard v3
HelixLang uses a Result monad pattern with structured error codes.
### Error Code Format:
`HLX-{CATEGORY}-{CODE}` where:
- CATEGORY is one of: IO, NET, AUTH, DATA, SYS
- CODE is a 4-digit number
### Result Type:
```
result<T> = Ok(T) | Err(HelixError)
class HelixError:
    code: str          # e.g., "HLX-NET-4012"
    message: str       # Human-readable description
    context: map       # Key-value pairs with debug info
    retryable: bool    # Whether operation can be retried
    chain: HelixError? # Wrapped inner error
```
### Mandatory Error Handling Rules:
1. All functions returning result<T> must be called with the `try!` operator or explicitly matched
2. Errors must be propagated with context: `try! operation().with_context("step", "fetching user")`
3. Error chains must preserve the original error: `Err(HelixError.wrap(original, "HLX-DATA-2001", "transform failed"))`
4. Retryable errors (retryable=true) should use the `retry_with_backoff` helper: max 3 attempts, exponential backoff 100ms/200ms/400ms
5. All error handlers must log via `helix.log.error(err)` before returning or re-raising
### Standard Error Codes:
- HLX-IO-1001: File not found
- HLX-IO-1002: Permission denied
- HLX-NET-4001: Connection timeout
- HLX-NET-4012: SSL certificate invalid
- HLX-AUTH-3001: Token expired
- HLX-AUTH-3002: Insufficient permissions
- HLX-DATA-2001: Deserialization failed
- HLX-DATA-2002: Schema validation error
- HLX-SYS-5001: Out of memory
- HLX-SYS-5002: Thread pool exhausted
""",
    },
    "skill_007": {
        "name": "HelixLang Module System",
        "short_description": "Module imports, visibility rules, and package management in HelixLang.",
        "full_content": "HelixLang uses a hierarchical module system. Modules are declared with `mod name { }` and imported with `use path::to::module`. Public items use the `pub` keyword. Circular dependencies are detected at compile time.",
    },
    "skill_008": {
        "name": "HelixLang Concurrency Primitives",
        "short_description": "Async/await patterns and thread safety in HelixLang.",
        "full_content": "HelixLang provides green threads via `spawn { }` blocks. Channels are typed: `chan<T>` for unbuffered, `chan<T, N>` for buffered. Mutexes use `lock(resource) { }` syntax. The runtime uses work-stealing scheduling.",
    },
    # --- ArcDeploy domain ---
    "skill_009": {
        "name": "ArcDeploy Canary Rollout Procedure",
        "short_description": "Step-by-step canary deployment process using ArcDeploy CLI.",
        "full_content": """## ArcDeploy Canary Rollout v4.2
ArcDeploy uses a 5-phase canary deployment with automatic rollback.
### Required Configuration (arc-deploy.yaml):
```yaml
canary:
  phases:
    - name: shadow
      traffic_pct: 0
      duration_min: 5
      metrics_gate: error_rate < 0.01
    - name: canary_1
      traffic_pct: 5
      duration_min: 10
      metrics_gate: p99_latency_ms < 200 AND error_rate < 0.005
    - name: canary_2
      traffic_pct: 25
      duration_min: 15
      metrics_gate: p99_latency_ms < 250 AND error_rate < 0.005
    - name: canary_3
      traffic_pct: 50
      duration_min: 20
      metrics_gate: p99_latency_ms < 300 AND error_rate < 0.01
    - name: full
      traffic_pct: 100
      duration_min: 0
  rollback:
    auto: true
    on_metric_breach: immediate
    cooldown_min: 30
```
### CLI Commands:
```bash
# Initialize deployment
arc deploy init --service my-svc --version v2.1.0 --config arc-deploy.yaml
# Start canary (enters shadow phase)
arc deploy start --wait-for-gate
# Advance to next phase (or auto-advance if --auto flag used)
arc deploy advance
# Check current status
arc deploy status --json
# Manual rollback
arc deploy rollback --reason "elevated error rate"
# Complete deployment (skip remaining phases)
arc deploy promote --force
```
### Rollback Triggers:
- Any metrics_gate failure triggers immediate rollback
- Manual `arc deploy rollback` at any phase
- Health check failures (3 consecutive) trigger rollback
- Rollback restores previous version and sets cooldown_min lockout
""",
    },
    "skill_010": {
        "name": "ArcDeploy Service Mesh Integration",
        "short_description": "Configuring ArcDeploy with service mesh for traffic routing.",
        "full_content": "ArcDeploy integrates with Istio and Linkerd for traffic splitting. Use `arc mesh configure --provider istio` to set up. Traffic rules are managed via VirtualService resources that ArcDeploy generates automatically during canary phases.",
    },
    "skill_011": {
        "name": "ArcDeploy Monitoring Dashboard",
        "short_description": "Setting up monitoring dashboards for ArcDeploy deployments.",
        "full_content": "ArcDeploy includes built-in Grafana dashboard templates. Run `arc monitor setup` to deploy. Default panels show: canary vs baseline error rates, latency percentiles, traffic split ratio, and rollback events.",
    },
    # --- CrystalQL domain ---
    "skill_012": {
        "name": "CrystalQL Temporal Query Syntax",
        "short_description": "Writing time-travel and temporal range queries in CrystalQL.",
        "full_content": """## CrystalQL Temporal Queries
CrystalQL extends SQL with temporal operators for querying historical data.
### Time-Travel Queries:
```sql
-- Query data as it existed at a specific point in time
SELECT * FROM users AS OF TIMESTAMP '2024-01-15T10:30:00Z';
-- Query data as of a relative time
SELECT * FROM orders AS OF INTERVAL '2 hours ago';
-- Query a range of historical states
SELECT * FROM inventory
BETWEEN TIMESTAMP '2024-01-01' AND TIMESTAMP '2024-01-31'
VERSIONED;
```
### Temporal Joins:
```sql
-- Join tables at consistent historical points
SELECT u.name, o.total
FROM users u
TEMPORAL JOIN orders o ON u.id = o.user_id
AS OF TIMESTAMP '2024-06-01T00:00:00Z';
```
### Temporal Aggregations:
```sql
-- Aggregate over time windows
SELECT
    product_id,
    TEMPORAL_AVG(price, INTERVAL '1 day') as avg_daily_price,
    TEMPORAL_MAX(stock, INTERVAL '1 week') as max_weekly_stock
FROM products
BETWEEN TIMESTAMP '2024-01-01' AND TIMESTAMP '2024-03-01'
GROUP BY product_id
WINDOW TUMBLING(INTERVAL '1 day');
```
### Important Syntax Rules:
1. `AS OF` requires ISO 8601 timestamps or INTERVAL expressions
2. `BETWEEN...AND...VERSIONED` returns all versions of each row
3. `TEMPORAL JOIN` ensures both sides use the same temporal point
4. `TEMPORAL_AVG`, `TEMPORAL_MAX`, `TEMPORAL_MIN` are temporal aggregate functions
5. `WINDOW TUMBLING(interval)` creates non-overlapping time windows
6. `WINDOW SLIDING(interval, step)` creates overlapping windows with step size
""",
    },
    "skill_013": {
        "name": "CrystalQL Index Optimization",
        "short_description": "Creating and tuning indexes for CrystalQL databases.",
        "full_content": "CrystalQL supports B-tree, Hash, and Temporal indexes. Create temporal indexes with `CREATE TEMPORAL INDEX idx ON table(column) RETENTION 90 DAYS`. Use `EXPLAIN TEMPORAL` to analyze temporal query plans.",
    },
    # --- VaultSync domain ---
    "skill_014": {
        "name": "VaultSync Secret Rotation Protocol",
        "short_description": "Automated secret rotation workflow in VaultSync.",
        "full_content": """## VaultSync Secret Rotation Protocol v2
VaultSync automates credential rotation with zero-downtime guarantees.
### Rotation Lifecycle:
1. **PREPARE**: Generate new credential, store as `pending` version
   ```
   vault-sync rotate prepare --secret db/prod/password --generator alphanumeric:32
   ```
2. **DUAL-WRITE**: Both old and new credentials are active (grace period)
   ```
   vault-sync rotate activate --secret db/prod/password --grace-period 300s
   ```
   During grace period, both versions are valid. Applications using VaultSync SDK
   automatically pick up the new version.
3. **VERIFY**: Confirm new credential works
   ```
   vault-sync rotate verify --secret db/prod/password --probe-endpoint https://db.internal/health
   ```
4. **COMMIT**: Revoke old credential
   ```
   vault-sync rotate commit --secret db/prod/password
   ```
5. **ROLLBACK** (if needed): Revert to previous credential
   ```
   vault-sync rotate rollback --secret db/prod/password --reason "verification failed"
   ```
### SDK Integration:
```python
from vaultsync import SecretClient
client = SecretClient(vault_url="https://vault.internal")
# Auto-refreshing secret reference
db_password = client.secret("db/prod/password")
# Always returns current active version
current = db_password.get()
# Register rotation callback
@db_password.on_rotate
def handle_rotation(new_value, old_value):
    reconnect_database(new_value)
```
### Rotation Policy (vault-sync.yaml):
```yaml
secrets:
  - path: db/prod/password
    rotation_interval: 7d
    generator: alphanumeric:32
    grace_period: 300s
    verify:
      type: http_probe
      endpoint: https://db.internal/health
    notify:
      - slack:#ops-alerts
```
""",
    },
    "skill_015": {
        "name": "VaultSync Access Policies",
        "short_description": "Configuring RBAC access policies for VaultSync secrets.",
        "full_content": "VaultSync uses path-based RBAC policies. Policies are written in HCL format: `path \"secret/data/*\" { capabilities = [\"read\"] }`. Policies are attached to service identities via `vault-sync policy attach`.",
    },
    # --- FluxStream domain ---
    "skill_016": {
        "name": "FluxStream Event Processing Pipeline",
        "short_description": "Building real-time event processing pipelines with FluxStream DSL.",
        "full_content": """## FluxStream Pipeline DSL v3.0
FluxStream uses a declarative DSL for building event processing pipelines.
### Pipeline Definition:
```flux
pipeline user_activity {
  source kafka("user-events", group="analytics") {
    format: json
    watermark: event_time, delay=10s
  }
  // Filter and transform
  |> filter(event.type IN ["click", "purchase", "signup"])
  |> map({
    user_id: event.user_id,
    action: event.type,
    timestamp: event.event_time,
    value: CASE event.type
      WHEN "purchase" THEN event.amount
      ELSE 1.0
    END
  })
  // Windowed aggregation
  |> window(tumbling=5m, on=timestamp) {
    group_by: [user_id]
    aggregate: {
      action_count: count(*),
      total_value: sum(value),
      unique_actions: count_distinct(action)
    }
  }
  // Route to multiple sinks
  |> branch {
    high_value: total_value > 100.0
      -> sink postgres("analytics_db", table="high_value_users")
    default:
      -> sink kafka("user-summaries")
  }
  error_handler {
    on deserialize_error: dead_letter("dlq-user-events")
    on processing_error: retry(max=3, backoff=exponential(100ms))
    on sink_error: circuit_breaker(threshold=5, reset=60s)
  }
}
```
### Key Operators:
- `|> filter(predicate)` — Filter events
- `|> map(expression)` — Transform events
- `|> window(type=duration)` — Windowed aggregation
- `|> branch { condition -> sink }` — Conditional routing
- `|> join(other_stream, on=key, within=duration)` — Stream joins
- `|> deduplicate(key, within=duration)` — Remove duplicates
### CLI:
```bash
flux deploy pipeline.flux --env production
flux status user_activity
flux metrics user_activity --window 1h
```
""",
    },
    "skill_017": {
        "name": "FluxStream Connector Configuration",
        "short_description": "Configuring source and sink connectors in FluxStream.",
        "full_content": "FluxStream supports Kafka, PostgreSQL, Redis, and S3 connectors. Configure via `flux connector add --type kafka --config broker=localhost:9092`. Each connector has health checks and auto-reconnection with configurable backoff.",
    },
    "skill_018": {
        "name": "FluxStream Schema Registry",
        "short_description": "Managing event schemas and compatibility in FluxStream.",
        "full_content": "FluxStream integrates with a schema registry for Avro/Protobuf/JSON Schema. Register schemas with `flux schema register --file event.avsc`. Compatibility modes: BACKWARD, FORWARD, FULL. Breaking changes are blocked by default.",
    },
}
| # --------------------------------------------------------------------------- | |
| # Task Bank | |
| # --------------------------------------------------------------------------- | |
| TASK_BANK: list[dict] = [ | |
| # --- Task 1: Zephyr-3 Auth (Easy) --- exec verifier | |
| { | |
| "id": "task_001", | |
| "difficulty": "easy", | |
| "description": ( | |
| "Write a Python function called `encode_zephyr_auth` that generates an " | |
| "authentication header for the Zephyr-3 API. The function should take " | |
| "`api_key` (str) and `timestamp` (int) as arguments and return a dict " | |
| "with the header." | |
| ), | |
| "relevant_skills": ["skill_001"], | |
| "distractor_skills": ["skill_002", "skill_003", "skill_007", "skill_013", "skill_017"], | |
| "verifier": _exec_verifier("encode_zephyr_auth", _ZEPHYR_TEST_CASES), | |
| }, | |
| # --- Task 2: NovaBin Header Parser (Easy) --- exec verifier | |
| { | |
| "id": "task_002", | |
| "difficulty": "easy", | |
| "description": ( | |
| "Write a Python function called `parse_novabin_header` that takes a " | |
| "`bytes` object (at least 16 bytes) and returns a dict with keys: " | |
| "'version', 'record_count', 'compressed', 'encrypted', 'checksummed', " | |
| "and 'checksum'. Parse according to the NovaBin file format specification." | |
| ), | |
| "relevant_skills": ["skill_004"], | |
| "distractor_skills": ["skill_005", "skill_017", "skill_003", "skill_008", "skill_013"], | |
| "verifier": _exec_verifier("parse_novabin_header", _NOVABIN_HEADER_TEST_CASES), | |
| }, | |
| # --- Task 3: HelixLang Error Handling (Easy) --- structural verifier | |
| { | |
| "id": "task_003", | |
| "difficulty": "easy", | |
| "description": ( | |
| "Write a code snippet in pseudocode or HelixLang style that demonstrates " | |
| "proper error handling for a function that fetches a user from a database, " | |
| "following HelixLang Error Handling Conventions. Include: the correct error " | |
| "code format, context propagation, retry logic for retryable errors, and " | |
| "error logging." | |
| ), | |
| "relevant_skills": ["skill_006"], | |
| "distractor_skills": ["skill_007", "skill_008", "skill_002", "skill_015", "skill_018"], | |
| "verifier": _structural_verifier([ | |
| # Has a correctly-formatted error code (HLX-CATEGORY-NNNN) | |
| lambda a: bool(re.search(r'HLX-(IO|NET|AUTH|DATA|SYS)-\d{4}', a)), | |
| # Uses try! operator in a call context (not just the word) | |
| lambda a: bool(re.search(r'try!\s*\w+', a)), | |
| # Has context propagation with with_context call | |
| lambda a: bool(re.search(r'with_context\s*\(', a)), | |
| # Has retry logic referencing backoff | |
| lambda a: bool(re.search(r'retry.*backoff|retry_with_backoff', a, re.IGNORECASE)), | |
| # Has error logging via helix.log.error | |
| lambda a: bool(re.search(r'helix\.log\.error\s*\(', a)), | |
| # Has Result monad pattern (Ok or Err) | |
| lambda a: bool(re.search(r'result\s*<|Ok\s*\(|Err\s*\(', a)), | |
| ]), | |
| }, | |
| # --- Task 4: ArcDeploy Canary Config (Easy) --- YAML verifier | |
| { | |
| "id": "task_004", | |
| "difficulty": "easy", | |
| "description": ( | |
| "Write an `arc-deploy.yaml` configuration file for a canary deployment " | |
| "of service 'payment-svc' version 3.0.0. The deployment should have the " | |
| "standard 5-phase canary rollout with appropriate traffic percentages, " | |
| "durations, and metrics gates. Include automatic rollback configuration." | |
| ), | |
| "relevant_skills": ["skill_009"], | |
| "distractor_skills": ["skill_010", "skill_011", "skill_005", "skill_017", "skill_003"], | |
| "verifier": _verify_arcdeploy_yaml, | |
| }, | |
| # --- Task 5: CrystalQL Temporal Query (Easy) --- structural verifier | |
| { | |
| "id": "task_005", | |
| "difficulty": "easy", | |
| "description": ( | |
| "Write a CrystalQL query that retrieves the average daily price and " | |
| "maximum weekly stock for each product between January 1 and March 1, " | |
| "2024. Use temporal aggregation functions and a tumbling window of 1 day." | |
| ), | |
| "relevant_skills": ["skill_012"], | |
| "distractor_skills": ["skill_013", "skill_005", "skill_002", "skill_008", "skill_017"], | |
| "verifier": _structural_verifier([ | |
| # Uses TEMPORAL_AVG function with parenthesized args | |
| lambda a: bool(re.search(r'TEMPORAL_AVG\s*\(', a, re.IGNORECASE)), | |
| # Uses TEMPORAL_MAX function with parenthesized args | |
| lambda a: bool(re.search(r'TEMPORAL_MAX\s*\(', a, re.IGNORECASE)), | |
| # Has BETWEEN clause with two timestamps (2024 dates) | |
| lambda a: bool(re.search( | |
| r'BETWEEN\s+TIMESTAMP\s+[\'"]2024-01-01.*AND\s+TIMESTAMP\s+[\'"]2024-03-01', | |
| a, re.IGNORECASE | re.DOTALL, | |
| )), | |
| # Has WINDOW TUMBLING with an interval | |
| lambda a: bool(re.search(r'WINDOW\s+TUMBLING\s*\(', a, re.IGNORECASE)), | |
| # Has GROUP BY product_id | |
| lambda a: bool(re.search(r'GROUP\s+BY\s+product_id', a, re.IGNORECASE)), | |
| # Uses INTERVAL for the window duration | |
| lambda a: bool(re.search(r"INTERVAL\s+['\"]?1\s*day", a, re.IGNORECASE)), | |
| ]), | |
| }, | |
| # --- Task 6: VaultSync Rotation Script (Medium) --- structural verifier | |
| { | |
| "id": "task_006", | |
| "difficulty": "medium", | |
| "description": ( | |
| "Write a shell script that performs a complete secret rotation for " | |
| "the database credential at path 'db/prod/password' using VaultSync CLI. " | |
| "The script should: prepare a new 32-char alphanumeric credential, activate " | |
| "with a 5-minute grace period, verify via health endpoint, and commit. " | |
| "Include error handling that triggers rollback if verification fails." | |
| ), | |
| "relevant_skills": ["skill_014"], | |
| "distractor_skills": ["skill_015", "skill_003", "skill_010", "skill_007", "skill_018"], | |
| "verifier": _structural_verifier([ | |
| # Commands appear in correct lifecycle order: prepare < activate < verify < commit | |
| lambda a: ( | |
| all(cmd in a for cmd in [ | |
| 'vault-sync rotate prepare', | |
| 'vault-sync rotate activate', | |
| 'vault-sync rotate verify', | |
| 'vault-sync rotate commit', | |
| ]) | |
| and a.index('rotate prepare') < a.index('rotate activate') | |
| < a.index('rotate verify') < a.index('rotate commit') | |
| ), | |
| # Has rollback command | |
| lambda a: 'vault-sync rotate rollback' in a, | |
| # Has grace-period flag | |
| lambda a: bool(re.search(r'--grace-period', a)), | |
| # Has probe-endpoint flag | |
| lambda a: bool(re.search(r'--probe-endpoint', a)), | |
| # Has conditional logic for error handling | |
| lambda a: bool(re.search(r'\bif\b|\bthen\b|\$\?|&&\s*vault-sync|\|\|', a)), | |
| # References the correct secret path | |
| lambda a: 'db/prod/password' in a, | |
| ]), | |
| }, | |
| # --- Task 7: FluxStream Pipeline (Medium) --- structural verifier | |
| { | |
| "id": "task_007", | |
| "difficulty": "medium", | |
| "description": ( | |
| "Write a FluxStream pipeline definition called 'order_analytics' that: " | |
| "1) reads from a Kafka topic 'order-events' in JSON format with a 10-second " | |
| "watermark delay, 2) filters for 'completed' and 'refunded' orders, " | |
| "3) performs a 5-minute tumbling window aggregation grouped by product_id " | |
| "computing count and sum of amount, 4) routes high-value results (total > 500) " | |
| "to PostgreSQL and everything else to a Kafka sink topic, and 5) includes " | |
| "proper error handling with dead letter queue and retry logic." | |
| ), | |
| "relevant_skills": ["skill_016"], | |
| "distractor_skills": ["skill_017", "skill_018", "skill_002", "skill_010", "skill_013"], | |
| "verifier": _structural_verifier([ | |
| # Pipeline declaration with correct name | |
| lambda a: bool(re.search(r'pipeline\s+order_analytics\s*\{', a)), | |
| # Source is kafka with correct topic | |
| lambda a: bool(re.search(r'source\s+kafka\s*\(\s*["\']order-events', a)), | |
| # Has filter referencing completed and refunded | |
| lambda a: bool(re.search(r'filter\s*\(', a)) and 'completed' in a and 'refunded' in a, | |
| # Has tumbling window with 5m | |
| lambda a: bool(re.search(r'window\s*\(\s*tumbling\s*=\s*5m', a)), | |
| # Has group_by with product_id | |
| lambda a: bool(re.search(r'group_by.*product_id', a)), | |
| # Has branch with conditional routing to sink | |
| lambda a: bool(re.search(r'branch\s*\{', a)) and 'sink' in a, | |
| # Has error_handler block with dead_letter | |
| lambda a: bool(re.search(r'error_handler\s*\{', a)) and 'dead_letter' in a, | |
| # Has aggregation (count and sum) | |
| lambda a: bool(re.search(r'\bcount\b', a, re.IGNORECASE)) and bool(re.search(r'\bsum\b', a, re.IGNORECASE)), | |
| ]), | |
| }, | |
| # --- Task 8: NovaBin Record Parser (Medium) --- exec verifier | |
| { | |
| "id": "task_008", | |
| "difficulty": "medium", | |
| "description": ( | |
| "Write a Python function called `parse_novabin_record` that parses a single " | |
| "NovaBin record from a bytes buffer starting at a given offset. The function " | |
| "should take `data: bytes` and `offset: int` and return a tuple of " | |
| "(dict_of_fields, new_offset). Handle all four field types: int32, float64, " | |
| "string, and bool according to the NovaBin format specification." | |
| ), | |
| "relevant_skills": ["skill_004"], | |
| "distractor_skills": ["skill_005", "skill_013", "skill_001", "skill_015", "skill_018"], | |
| "verifier": _exec_verifier("parse_novabin_record", _NOVABIN_RECORD_TEST_CASES), | |
| }, | |
| # --- Task 9: CrystalQL + VaultSync Integration (Hard) --- multi-part verifier | |
| { | |
| "id": "task_009", | |
| "difficulty": "hard", | |
| "description": ( | |
| "Design a system that combines CrystalQL temporal queries with VaultSync " | |
| "secret rotation. Write: 1) A CrystalQL temporal join query that retrieves " | |
| "user orders with prices as they were at order time, using the TEMPORAL JOIN " | |
| "and AS OF syntax. 2) A VaultSync rotation policy YAML for the database " | |
| "credential used by this query service, with 7-day rotation, health probe " | |
| "verification, and Slack notifications. 3) A Python integration snippet using " | |
| "VaultSync SDK that auto-refreshes the database connection on secret rotation." | |
| ), | |
| "relevant_skills": ["skill_012", "skill_014"], | |
| "distractor_skills": ["skill_013", "skill_015", "skill_010", "skill_002", "skill_017"], | |
| "verifier": _multi_part_verifier([ | |
| # Part 1: CrystalQL temporal join query | |
| lambda a: bool(re.search(r'TEMPORAL\s+JOIN', a, re.IGNORECASE)), | |
| lambda a: bool(re.search(r'AS\s+OF\s+(TIMESTAMP|INTERVAL)', a, re.IGNORECASE)), | |
| lambda a: bool(re.search(r'\bON\b.*\bid\b', a, re.IGNORECASE)), | |
| # Part 2: VaultSync rotation policy YAML | |
| lambda a: bool(re.search(r'rotation_interval:\s*7d', a)), | |
| lambda a: bool(re.search(r'grace_period:', a)), | |
| lambda a: bool(re.search(r'http_probe|probe.*endpoint', a, re.IGNORECASE)), | |
| lambda a: bool(re.search(r'slack:', a, re.IGNORECASE)), | |
| # Part 3: Python SDK with on_rotate callback | |
| lambda a: bool(re.search(r'SecretClient|secret\s*\(', a)), | |
| lambda a: bool(re.search(r'on_rotate|\.on_rotate', a)), | |
| ]), | |
| }, | |
| # --- Task 10: Full ArcDeploy + FluxStream Monitoring (Hard) --- multi-part verifier | |
| { | |
| "id": "task_010", | |
| "difficulty": "hard", | |
| "description": ( | |
| "Write a complete deployment and monitoring setup that: 1) Defines an " | |
| "ArcDeploy canary rollout configuration for a streaming service with 5 phases " | |
| "including shadow testing, with metrics gates checking both latency and error " | |
| "rates. 2) Writes a FluxStream pipeline called 'deploy_monitor' that consumes " | |
| "deployment metric events, computes real-time error rates and latency " | |
| "percentiles in 1-minute tumbling windows, and routes alerts (error_rate > 0.01) " | |
| "to a dedicated alert sink. Include CLI commands for both tools to start " | |
| "the deployment and deploy the monitoring pipeline." | |
| ), | |
| "relevant_skills": ["skill_009", "skill_016"], | |
| "distractor_skills": ["skill_010", "skill_011", "skill_018", "skill_003", "skill_013"], | |
| "verifier": _multi_part_verifier([ | |
| # Part 1: ArcDeploy config — shadow phase + metrics gates | |
| lambda a: bool(re.search(r'name:\s*shadow', a)), | |
| lambda a: len(re.findall(r'-\s*name:', a)) >= 5, | |
| lambda a: bool(re.search(r'metrics_gate:.*latency', a, re.IGNORECASE)), | |
| lambda a: bool(re.search(r'metrics_gate:.*error_rate', a, re.IGNORECASE)), | |
| lambda a: bool(re.search(r'traffic_pct:\s*100', a)), | |
| # Part 2: FluxStream pipeline with correct name | |
| lambda a: bool(re.search(r'pipeline\s+deploy_monitor\s*\{', a)), | |
| lambda a: bool(re.search(r'source\s+kafka\s*\(', a)), | |
| lambda a: bool(re.search(r'window\s*\(\s*tumbling\s*=\s*1m', a)), | |
| lambda a: bool(re.search(r'branch\s*\{', a)) and bool(re.search(r'error_rate', a)), | |
| # Part 3: CLI commands for both tools | |
| lambda a: bool(re.search(r'arc\s+deploy\s+(init|start)', a)), | |
| lambda a: bool(re.search(r'flux\s+deploy', a)), | |
| ]), | |
| }, | |
| ] | |
| # --------------------------------------------------------------------------- | |
| # Tasks adapted from SkillsBench (https://github.com/benchflow-ai/skillsbench) | |
| # Apache 2.0 License. Skills content used with attribution. | |
| # Adapted for text-in/text-out OpenEnv Gymnasium-style environment. | |
| # --------------------------------------------------------------------------- | |
| # ---- SkillsBench Skills ---- | |
# SkillsBench-adapted skill: procedural guide for flood detection
# (threshold comparison, daily-max aggregation, severity classification).
# "full_content" is the literal text surfaced to the agent on invocation;
# do not reflow or edit it, or structural verifiers downstream may drift.
SKILL_BANK["skill_sb_001"] = {
    "name": "Flood Detection Guide",
    "short_description": "Detect flood events by comparing water levels to thresholds, aggregating data, and classifying severity.",
    "full_content": """## Flood Detection Guide
### Overview
Flood detection involves comparing observed water levels against established flood stage thresholds. This guide covers how to process water level data and identify flood events.
### Flood Stage Definition
According to the National Weather Service, flood stage is the water level at which overflow of the natural banks begins to cause damage. A flood event occurs when:
```
water_level >= flood_stage_threshold
```
### Aggregating Instantaneous Data to Daily
USGS instantaneous data is recorded at ~15-minute intervals. For flood detection, aggregate to daily maximum:
```python
# df is DataFrame from nwis.get_iv() with datetime index
# gage_col is the column name containing water levels
daily_max = df[gage_col].resample('D').max()
```
### Why Daily Maximum?
| Aggregation | Use Case |
|-------------|----------|
| `max()` | Flood detection - captures peak water level |
| `mean()` | Long-term trends - may miss short flood peaks |
| `min()` | Low flow analysis |
### Detecting Flood Days
Compare daily maximum water level against flood threshold:
```python
flood_threshold = <threshold_from_nws> # feet
# Count days with flooding
flood_days = (daily_max >= flood_threshold).sum()
# Get specific dates with flooding
flood_dates = daily_max[daily_max >= flood_threshold].index.tolist()
```
### Processing Multiple Stations
```python
flood_results = []
for site_id, site_data in all_data.items():
    daily_max = site_data['water_levels'].resample('D').max()
    threshold = thresholds[site_id]['flood']
    days_above = int((daily_max >= threshold).sum())
    if days_above > 0:
        flood_results.append({
            'station_id': site_id,
            'flood_days': days_above
        })
# Sort by flood days descending
flood_results.sort(key=lambda x: x['flood_days'], reverse=True)
```
### Flood Severity Classification
If multiple threshold levels are available:
```python
def classify_flood(water_level, thresholds):
    if water_level >= thresholds['major']:
        return 'major'
    elif water_level >= thresholds['moderate']:
        return 'moderate'
    elif water_level >= thresholds['flood']:
        return 'minor'
    elif water_level >= thresholds['action']:
        return 'action'
    else:
        return 'normal'
```
### Best Practices
- Use daily maximum for flood detection to capture peaks
- Ensure water level and threshold use same units (typically feet)
- Only report stations with at least 1 flood day
- Sort results by flood severity or duration for prioritization
""",
}
# SkillsBench-adapted skill: how to pull USGS water-level data via the
# `dataretrieval` package. Acts as a near-relevant distractor for the
# flood-detection task (task_sb_001 lists it under "distractor_skills").
SKILL_BANK["skill_sb_002"] = {
    "name": "USGS Data Download Guide",
    "short_description": "Download water level data from USGS using the dataretrieval Python package.",
    "full_content": """## USGS Data Download Guide
The `dataretrieval` package allows downloading water level data from USGS.
```python
from dataretrieval import nwis
# Get instantaneous values (15-min intervals)
df, meta = nwis.get_iv(
    sites='<station_id>',
    start='<start_date>',
    end='<end_date>',
    parameterCd='00065'
)
```
Parameter codes: 00065 = Gage height (feet), 00060 = Discharge (cfs).
Station IDs are 8-digit strings with leading zeros (e.g., '04119000').
""",
}
# SkillsBench-adapted skill: where to obtain NWS flood-stage thresholds.
# Also used as a distractor for the flood and HP-filter tasks.
SKILL_BANK["skill_sb_003"] = {
    "name": "NWS Flood Threshold Lookup",
    "short_description": "Look up flood stage thresholds from National Weather Service for USGS stations.",
    "full_content": """## NWS Flood Thresholds
The NWS maintains flood stage thresholds. Download from:
```
https://water.noaa.gov/resources/downloads/reports/nwps_all_gauges_report.csv
```
Key columns: `usgs id`, `flood stage`, `moderate flood stage`, `major flood stage`.
Use the `flood stage` column as threshold. Filter out -9999 values.
""",
}
# SkillsBench-adapted skill: HP-filter detrending for business-cycle analysis.
# This is the relevant skill for task_sb_002; its key facts (lambda=100 for
# annual data, `lamb` keyword, log-before-filter) are what the exec verifier
# implicitly tests via the expected correlation value.
SKILL_BANK["skill_sb_004"] = {
    "name": "Time Series Detrending for Macroeconomic Analysis",
    "short_description": "HP filter detrending techniques for business cycle analysis of economic time series.",
    "full_content": """## Time Series Detrending for Macroeconomic Analysis
This skill provides guidance on decomposing economic time series into trend and cyclical components, a fundamental technique in business cycle analysis.
### The Hodrick-Prescott (HP) Filter
The HP filter decomposes a time series into a trend component and a cyclical component. Given a time series y_t, it finds the trend tau_t that minimizes:
sum((y_t - tau_t)^2) + lambda * sum([(tau_{t+1} - tau_t) - (tau_t - tau_{t-1})]^2)
### Choosing Lambda
**Critical**: The choice of lambda depends on data frequency:
| Data Frequency | Recommended lambda | Rationale |
|----------------|-------------------|-----------|
| Annual | **100** | Standard for yearly data |
| Quarterly | **1600** | Hodrick-Prescott (1997) recommendation |
| Monthly | **14400** | Ravn-Uhlig (2002) adjustment |
**Common mistake**: Using lambda=1600 (quarterly default) for annual data.
### Python Implementation
```python
from statsmodels.tsa.filters.hp_filter import hpfilter
import numpy as np
# Apply HP filter
# Returns: (cyclical_component, trend_component)
cycle, trend = hpfilter(data, lamb=100) # For annual data
```
**Important**: The function parameter is `lamb` (not `lambda`, which is a Python keyword).
### Log Transformation
For most macroeconomic aggregates (GDP, consumption, investment), apply the natural logarithm **before** filtering:
```python
import numpy as np
log_series = np.log(real_series)
cycle, trend = hpfilter(log_series, lamb=100)
# cycle now represents percentage deviations from trend
```
### Complete Workflow: Business Cycle Correlation
```python
import numpy as np
from statsmodels.tsa.filters.hp_filter import hpfilter
# Log transformation
ln_consumption = np.log(real_consumption)
ln_investment = np.log(real_investment)
# HP filter with lambda=100 for annual data
cycle_c, trend_c = hpfilter(ln_consumption, lamb=100)
cycle_i, trend_i = hpfilter(ln_investment, lamb=100)
# Compute correlation of cyclical components
correlation = np.corrcoef(cycle_c, cycle_i)[0, 1]
print(f"Business cycle correlation: {correlation:.5f}")
```
### Dependencies
```bash
pip install statsmodels pandas numpy
```
The HP filter is in `statsmodels.tsa.filters.hp_filter`.
""",
}
# Generic distractor skills: deliberately short, plausible-but-unhelpful
# content used in the "distractor_skills" lists of the SkillsBench tasks.
SKILL_BANK["skill_sb_005"] = {
    "name": "Pandas DataFrame Operations",
    "short_description": "Common pandas operations for data manipulation, merging, and aggregation.",
    "full_content": """## Pandas DataFrame Operations
Common pandas patterns: `pd.read_csv()`, `df.merge()`, `df.groupby().agg()`, `df.pivot_table()`. Use `pd.to_numeric(errors='coerce')` for type conversion. Handle missing data with `df.fillna()` or `df.dropna()`.
""",
}
SKILL_BANK["skill_sb_006"] = {
    "name": "Matplotlib Time Series Visualization",
    "short_description": "Plotting time series data and trends with matplotlib.",
    "full_content": """## Matplotlib Time Series Visualization
Plot time series with `plt.plot(dates, values)`. Add trend lines, annotations, and formatting. Use `fig, axes = plt.subplots()` for multi-panel plots. Save with `plt.savefig('output.png', dpi=150)`.
""",
}
# SkillsBench-adapted skill: dialogue-graph building and the script grammar.
# The "Parsing Script Format" section at the bottom is the part task_sb_003
# actually depends on — its exec verifier checks nodes/edges produced from
# exactly this format.
SKILL_BANK["skill_sb_007"] = {
    "name": "Dialogue Graph Library",
    "short_description": "Build, validate, and serialize dialogue graphs for branching narratives using Graph/Node/Edge classes.",
    "full_content": """## Dialogue Graph Skill
This skill provides a `dialogue_graph` module to build valid dialogue trees/graphs.
### When to use
- **Script Parsers**: Converting text to structured data
- **Dialogue Editors**: Building tools to edit conversation flow
- **Game Logic**: Traversing a dialogue tree
### How to use
```python
from dialogue_graph import Graph, Node, Edge
```
#### The Graph Class
```python
graph = Graph()
```
#### Adding Nodes
```python
# Regular dialogue line
graph.add_node(Node(id="Start", speaker="Guard", text="Halt!", type="line"))
# Choice hub
graph.add_node(Node(id="Choices", type="choice"))
```
#### Adding Edges
```python
# Simple transition
graph.add_edge(Edge(source="Start", target="Choices"))
# Choice transition (with text)
graph.add_edge(Edge(source="Choices", target="End", text="1. Run away"))
```
#### Export
```python
data = graph.to_dict()
# returns {"nodes": [...], "edges": [...]}
```
#### Validation
```python
errors = graph.validate()
# Returns list of strings, e.g., ["Edge 'Start'->'Unk' points to missing node 'Unk'"]
```
#### Loading from JSON
```python
graph = Graph.from_dict({'nodes': [...], 'edges': [...]})
```
### Node format
Each node is a dict with: `id` (str), `text` (str), `speaker` (str), `type` ("line" or "choice")
### Edge format
Each edge is a dict with: `from` (str), `to` (str), `text` (str)
### Parsing Script Format
The standard script format:
```
[SceneName]
Speaker: Dialogue text. -> NextScene
[ChoiceScene]
1. Choice text -> TargetScene
2. Another choice -> OtherTarget
```
Rules:
- Lines starting with `[Name]` define scene nodes
- If a scene has `Speaker: Text -> Target`, it is type="line" with that speaker
- If a scene has numbered options (`1. Text -> Target`), it is type="choice"
- The arrow `->` defines an edge from the current scene to the target
- Choice edges include the choice text (e.g., "1. Choice text")
- For line nodes, the edge text is empty or the dialogue text
""",
}
# Generic distractor skills for the dialogue-parsing task (task_sb_003):
# adjacent topics (graph rendering, JSON validation) that do not help with
# the actual script-parsing grammar.
SKILL_BANK["skill_sb_008"] = {
    "name": "Graphviz DOT Format",
    "short_description": "Generate graph visualizations using Graphviz DOT language.",
    "full_content": """## Graphviz DOT Format
Create directed graphs: `digraph G { A -> B; }`. Set node shapes with `[shape=diamond]`. Colors with `[color=blue]`. Use `rankdir=LR` for left-to-right layout. Render with `dot -Tpng graph.dot -o graph.png`.
""",
}
SKILL_BANK["skill_sb_009"] = {
    "name": "JSON Schema Validation",
    "short_description": "Validate JSON data structures against schemas using jsonschema library.",
    "full_content": """## JSON Schema Validation
Use `jsonschema.validate(data, schema)` to check JSON structure. Define schemas with `type`, `properties`, `required`, `items`. Install: `pip install jsonschema`. Catches `ValidationError` on failure.
""",
}
| # ---- SkillsBench-adapted Test Data ---- | |
# Fixture for task_sb_001 (detect_flood_days). Seven daily-max readings per
# station; a day "floods" when level >= that station's threshold.
_SB_FLOOD_TEST_DATA: dict[str, dict] = {
    "daily_max_levels": {
        "04124000": [15.2, 16.1, 15.8, 14.9, 15.5, 16.0, 15.3],  # 6 days >= 15.0
        "04101800": [22.1, 23.5, 22.8, 24.1, 23.0, 21.9, 22.5],  # 6 days >= 22.0
        "04097500": [10.5, 11.2, 10.8, 11.1, 10.3, 9.8, 9.5],  # 4 days >= 10.5 (boundary counts)
        "04166500": [8.2, 8.5, 7.9, 7.8, 8.0, 7.5, 7.3],  # 2 days >= 8.1
        "04999999": [5.0, 4.8, 5.1, 4.9, 5.0, 4.7, 4.6],  # never reaches 6.0
    },
    "flood_thresholds": {
        "04124000": 15.0,
        "04101800": 22.0,
        "04097500": 10.5,
        "04166500": 8.1,
        "04999999": 6.0,
    },
}
# Expected result: stations with >= 1 flood day mapped to their day counts.
# Station 04999999 never floods, so a correct solution must omit it entirely.
_SB_FLOOD_EXPECTED: dict[str, int] = {"04124000": 6, "04101800": 6, "04097500": 4, "04166500": 2}
# Pre-computed HP filter test data (deterministic, seed=42)
# Generated with: np.random.seed(42); n=50; trend=linspace(10,12,n);
# cycle=0.05*sin(linspace(0,4*pi,n)); noise_a/b=0.01*randn(n);
# series = exp(trend + [1|0.8]*cycle + noise)
# Two synthetic "economic" series sharing a common trend and (scaled) cycle,
# used as input for task_sb_002. Values must stay byte-exact: the expected
# correlation below was computed from precisely these numbers.
_SB_HP_SERIES_A: list[float] = [
    22136.146542207392, 23204.845943423705, 24652.79143953327,
    26172.47083057089, 27002.39677592748, 28273.313580880793,
    30051.396100901988, 31012.171288310117, 31766.445340494465,
    33182.78309985593, 33887.64705958506, 34894.26280732681,
    36149.52370863675, 36385.350397792274, 37514.59011964166,
    39113.01687213975, 40212.14327686968, 42196.670687212085,
    43297.26210449467, 44892.592298244555, 48303.64908257102,
    49801.15821216017, 52508.42643350753, 54489.575008037566,
    57974.245708500996, 61568.844301395045, 64117.839034819415,
    68576.37200943612, 71392.08801561705, 75092.589894759,
    78263.12445218801, 83583.20317625241, 85213.13757625608,
    87323.09714571184, 91894.08758161688, 92786.56912352098,
    96861.20170394408, 97480.13273428561, 100909.88166437816,
    105506.58025009745, 109428.19143887387, 112516.26919621794,
    116352.87005507768, 120827.19859243499, 124641.85636295384,
    131502.23328916836, 138435.59870506602, 147916.42415706386,
    154807.41309002193, 159910.50571503676,
]
_SB_HP_SERIES_B: list[float] = [
    22097.96583697866, 23089.002181075368, 24209.299982962977,
    25755.231345447573, 27113.336616067252, 28331.828906180173,
    29041.61942185999, 30382.41537568266, 31739.18190358319,
    33080.971269261834, 33697.971409879494, 34882.03889214109,
    35642.5482028925, 36717.31912567404, 38645.99707499506,
    40129.947797790956, 40926.54825685394, 42889.14931129554,
    44289.153022361395, 45687.121649986715, 48212.03616086788,
    51085.192717671445, 52768.858896074795, 56354.155802272886,
    56856.12876304818, 61928.90266241154, 64673.42412352394,
    67709.05244322174, 71328.26917352251, 73157.81859575034,
    77790.94757943293, 81527.23479593106, 85685.60547068984,
    87078.29744266604, 89823.2654818476, 93051.57680422119,
    97361.69347175729, 99800.53110013365, 102039.82705754046,
    106419.91537764092, 109536.45736907546, 114426.94522822299,
    116805.80714641871, 122008.49722830504, 127217.1275907411,
    131648.90322204158, 140460.9679752306, 147465.45111253537,
    154676.11432606063, 162373.437098354,
]
# Pearson correlation of the two cyclical components after log + HP filter
# (lambda=100). The verifier accepts results within 0.01 of this value.
_SB_HP_EXPECTED_CORR: float = 0.62487
# Fixture for task_sb_003 (parse_dialogue). Exercises both node types:
# 5 nodes (GateScene, NameChoice, KnightPath, TravelerPath, End) and
# 5 edges; End has no outgoing arrow, so it contributes no edge.
# Content is parsed verbatim by agent solutions — do not re-wrap.
_SB_DIALOGUE_TEST_SCRIPT: str = """[GateScene]
Guard: Halt! State your name. -> NameChoice
[NameChoice]
1. I am Sir Aldric. -> KnightPath
2. Just a traveler. -> TravelerPath
[KnightPath]
Captain: Welcome, knight! -> End
[TravelerPath]
Guard: Move along then. -> End
[End]
Narrator: The story continues...
"""
| # ---- SkillsBench-adapted Tasks ---- | |
# Register the three SkillsBench-adapted tasks. Each entry pairs a natural-
# language task description with an exec-based verifier that runs the agent's
# code and checks concrete outputs (no keyword matching).
TASK_BANK.extend([
    # --- task_sb_001: Flood Risk Analysis (Easy) --- exec verifier
    {
        "id": "task_sb_001",
        "difficulty": "easy",
        "source": "skillsbench",
        "description": (
            "Write a Python function called `detect_flood_days` that takes two arguments: "
            "`daily_max_levels` (dict mapping station_id to list of daily maximum water "
            "levels in feet) and `flood_thresholds` (dict mapping station_id to flood stage "
            "threshold in feet). Return a dict mapping station_id to the number of days "
            "where water level >= threshold. Only include stations with at least 1 flood day."
        ),
        "relevant_skills": ["skill_sb_001"],
        "distractor_skills": ["skill_sb_002", "skill_sb_003", "skill_sb_005", "skill_sb_008", "skill_sb_009"],
        "verifier": _exec_verifier("detect_flood_days", [
            # Main case: exact match against the precomputed expected dict,
            # which also enforces omission of the never-flooding station.
            {
                "args": [
                    _SB_FLOOD_TEST_DATA["daily_max_levels"],
                    _SB_FLOOD_TEST_DATA["flood_thresholds"],
                ],
                "check": lambda r: (
                    isinstance(r, dict)
                    and r == _SB_FLOOD_EXPECTED
                ),
            },
            # Edge case: empty input
            {
                "args": [{}, {}],
                "check": lambda r: isinstance(r, dict) and len(r) == 0,
            },
            # Edge case: station with exactly threshold value (should count)
            {
                "args": [
                    {"ST1": [5.0, 4.9, 5.0]},
                    {"ST1": 5.0},
                ],
                "check": lambda r: isinstance(r, dict) and r.get("ST1") == 2,
            },
        ]),
    },
    # --- task_sb_002: Economics HP Filter Correlation (Medium) --- exec verifier
    # NOTE(review): the description promises numpy/scipy/statsmodels, but the
    # exec sandbox's _SAFE_IMPORTS only preloads stdlib modules — agent code
    # must import these itself, and they must be installed in the runtime.
    # Verify statsmodels is present in the deployment environment.
    {
        "id": "task_sb_002",
        "difficulty": "medium",
        "source": "skillsbench",
        "description": (
            "Write a Python function called `hp_filter_correlation` that takes two arguments: "
            "`series_a` (list of floats) and `series_b` (list of floats) representing two "
            "economic time series. The function should: 1) Take the natural log of each "
            "series, 2) Apply the Hodrick-Prescott filter with lambda=100 to extract "
            "cyclical components, 3) Compute and return the Pearson correlation coefficient "
            "between the two cyclical components as a float rounded to 5 decimal places. "
            "You may use numpy and scipy (statsmodels.api.tsa.filters.hpfilter is available)."
        ),
        "relevant_skills": ["skill_sb_004"],
        "distractor_skills": ["skill_sb_005", "skill_sb_006", "skill_sb_002", "skill_sb_009", "skill_sb_003"],
        "verifier": _exec_verifier("hp_filter_correlation", [
            # Loose tolerance (0.01) so the check is robust to minor
            # numerical differences while still catching a wrong lambda.
            {
                "args": [_SB_HP_SERIES_A, _SB_HP_SERIES_B],
                "check": lambda r: (
                    isinstance(r, (int, float))
                    and abs(float(r) - _SB_HP_EXPECTED_CORR) < 0.01
                ),
            },
        ]),
    },
    # --- task_sb_003: Dialogue Parser (Medium) --- exec verifier
    {
        "id": "task_sb_003",
        "difficulty": "medium",
        "source": "skillsbench",
        "description": (
            "Write a Python function called `parse_dialogue` that takes a dialogue script "
            "string and returns a dict with 'nodes' (list of dicts with 'id', 'text', "
            "'speaker', 'type') and 'edges' (list of dicts with 'from', 'to', 'text'). "
            "Parse this format:\n\n"
            "```\n"
            "[SceneName]\n"
            "Speaker: Dialogue text. -> NextScene\n"
            "\n"
            "[ChoiceScene]\n"
            "1. Choice text -> TargetScene\n"
            "2. Another choice -> OtherTarget\n"
            "```\n\n"
            "Lines with `[Name]` define scene nodes (type='line' if has speaker dialogue, "
            "type='choice' if has numbered options). Arrow `->` defines edges. For line "
            "nodes, text is the dialogue text and speaker is the speaker name. For choice "
            "nodes, text and speaker are empty strings. Edge text is the choice text for "
            "choices or empty string for line transitions."
        ),
        "relevant_skills": ["skill_sb_007"],
        "distractor_skills": ["skill_sb_008", "skill_sb_009", "skill_sb_002", "skill_sb_005", "skill_sb_006"],
        "verifier": _exec_verifier("parse_dialogue", [
            # Single structural check: one large conjunction so any failed
            # clause fails the whole test case. Validates node/edge counts
            # and spot-checks specific nodes and the full edge set against
            # _SB_DIALOGUE_TEST_SCRIPT.
            {
                "args": [_SB_DIALOGUE_TEST_SCRIPT],
                "check": lambda r: (
                    isinstance(r, dict)
                    and "nodes" in r and "edges" in r
                    and isinstance(r["nodes"], list) and isinstance(r["edges"], list)
                    # Must have 5 nodes: GateScene, NameChoice, KnightPath, TravelerPath, End
                    and len(r["nodes"]) == 5
                    # Check specific nodes exist with correct data
                    and any(
                        n.get("id") == "GateScene"
                        and n.get("speaker") == "Guard"
                        and "Halt" in n.get("text", "")
                        and n.get("type") == "line"
                        for n in r["nodes"]
                    )
                    and any(
                        n.get("id") == "NameChoice"
                        and n.get("type") == "choice"
                        for n in r["nodes"]
                    )
                    and any(
                        n.get("id") == "End"
                        for n in r["nodes"]
                    )
                    # Must have 5 edges
                    and len(r["edges"]) == 5
                    # Check specific edges
                    and any(
                        e.get("from") == "GateScene" and e.get("to") == "NameChoice"
                        for e in r["edges"]
                    )
                    and any(
                        e.get("from") == "NameChoice" and e.get("to") == "KnightPath"
                        for e in r["edges"]
                    )
                    and any(
                        e.get("from") == "NameChoice" and e.get("to") == "TravelerPath"
                        for e in r["edges"]
                    )
                    and any(
                        e.get("from") == "KnightPath" and e.get("to") == "End"
                        for e in r["edges"]
                    )
                    and any(
                        e.get("from") == "TravelerPath" and e.get("to") == "End"
                        for e in r["edges"]
                    )
                ),
            },
        ]),
    },
])