# Siteshcodes — commit 703aa57
# v2.0: multi-step episodes, procedural bugs, semantic grading, sessions, 71 tests
# server/task.py
import sys
import random
import hashlib
sys.path.insert(0, "/app")
from typing import Tuple, List, Dict, Any
from model import BugReport, TriageAction
# ---------------------------------------------------------------------------
# LABEL SYNONYM MAP — allows semantic matching
# ---------------------------------------------------------------------------
# Canonical label -> set of alternative spellings that are graded as that
# label. Lookups compare against lower-cased, stripped input, so every entry
# here is written in lower case.
LABEL_SYNONYMS: Dict[str, set] = {
    "bug": {"defect", "issue", "error", "fault", "broken"},
    "security": {"vulnerability", "cve", "exploit", "auth", "injection"},
    "performance": {
        "perf", "slow", "latency", "optimization", "speed", "memory",
    },
    "ux": {"ui", "frontend", "user-experience", "design", "usability"},
    "data-integrity": {"data-loss", "corruption", "data", "consistency"},
    "payments": {"billing", "payment", "stripe", "checkout", "revenue"},
    "documentation": {"docs", "typo", "readme", "wiki"},
    "infrastructure": {"infra", "devops", "deploy", "ci", "cd", "docker"},
    "api": {"endpoint", "rest", "graphql", "http", "request"},
    "database": {"db", "sql", "query", "migration", "schema"},
}
# ---------------------------------------------------------------------------
# BUG TEMPLATE SYSTEM — generates hundreds of unique bugs
# ---------------------------------------------------------------------------
# Procedural bug templates, keyed by category name. Every category holds:
#   "titles" / "bodies"  - text with {placeholder} slots
#   "vars"               - placeholder name -> list of candidate values
#   "answer_template"    - severity level -> expected triage answer
#   "severity_keywords"  - severity level -> keywords that select that level
# generate_bug() picks one value per var, fills one random title and body, and
# _determine_severity() scans title + body + comments case-insensitively.
# It returns the first level (in dict order) with at least one keyword hit, so
# the ORDER of the levels under "severity_keywords" is significant.
_BUG_TEMPLATES = {
# --- crash: service failures; answers route to the backend team ---
"crash": {
"titles": [
"{service} crashes on {trigger}",
"{service} throws {error_type} when {trigger}",
"Fatal error in {service} during {trigger}",
"Unhandled exception in {service}: {error_type}",
"{service} segfaults under {condition}",
],
"bodies": [
"When a user {trigger}, the {service} crashes immediately. "
"Error: {error_type}. Stack trace points to {component}. "
"Affects {impact}. {workaround}",
"The {service} is failing with {error_type} every time a user {trigger}. "
"No error message is shown to the user — the process just dies. "
"Impact: {impact}. {workaround}",
],
"vars": {
"service": ["auth service", "payment gateway", "search API", "notification worker",
"session manager", "user profile service", "file upload handler",
"webhook processor", "background job runner", "cache layer"],
"trigger": ["submits a form with special characters", "uploads a file larger than 10MB",
"logs in with SSO", "resets their password", "exports data to CSV",
"switches between tabs rapidly", "uses the bulk import feature",
"accesses the admin panel", "triggers a webhook", "runs a scheduled job"],
"error_type": ["NullPointerException", "SegmentationFault", "OutOfMemoryError",
"ConnectionTimeoutException", "StackOverflowError",
"IndexOutOfBoundsException", "TypeError", "KeyError"],
"component": ["UserController.java:142", "PaymentService.py:89",
"AuthMiddleware.ts:56", "SearchIndex.go:203",
"NotificationQueue.rb:77", "FileHandler.py:234"],
"impact": ["100% of users on this flow", "all mobile users", "EU region users only",
"users with accounts older than 1 year", "approximately 30% of sessions",
"every request during peak hours"],
"workaround": ["No workaround exists — the feature is completely broken.",
"Workaround: users can retry after clearing browser cache.",
"Temporary fix: restart the service every 2 hours.",
"No known workaround. Users are blocked."],
"condition": ["high concurrent load", "memory pressure above 80%",
"when connection pool is exhausted", "after running for 6+ hours"],
},
"answer_template": {
"severe": {"priority": "P0", "labels": ["bug"], "assigned_team": "backend", "milestone": "hotfix"},
"moderate": {"priority": "P1", "labels": ["bug"], "assigned_team": "backend", "milestone": "v2.1"},
},
"severity_keywords": {
"severe": ["100%", "all mobile", "No workaround", "completely broken", "blocked",
"SegmentationFault", "OutOfMemoryError"],
"moderate": ["retry", "30%", "Temporary fix", "restart"],
},
},
# --- security: single "default" level, so every generated security bug gets
# the same answer; _determine_severity() returns "default" without matching ---
"security": {
"titles": [
"SQL injection vulnerability in {endpoint}",
"XSS attack possible via {input_field}",
"Authentication bypass in {service}",
"Sensitive data exposed in {location}",
"{credential_type} not invalidated after {event}",
"SSRF vulnerability in {endpoint}",
],
"bodies": [
"The {endpoint} endpoint does not sanitize {input_field} inputs. "
"Crafted queries can {exploit_result}. PoC attached and verified on {env}. "
"Treat as confidential — do not discuss publicly until patched. {additional_context}",
"When a user {event}, existing {credential_type} remain valid for {duration}. "
"An attacker who {attack_vector} can continue to access the account. "
"This is a {vuln_category} vulnerability. {additional_context}",
],
"vars": {
"endpoint": ["/api/search", "/api/users", "/api/export", "/admin/query",
"/api/upload", "/graphql", "/api/webhook"],
"input_field": ["search query", "username field", "file upload name",
"comment body", "profile bio", "webhook URL"],
"service": ["login flow", "OAuth callback", "API gateway", "admin panel",
"password reset", "2FA verification"],
"location": ["API error responses", "debug logs shipped to client",
"public S3 bucket", "unencrypted cookies", "localStorage"],
"credential_type": ["JWT tokens", "session cookies", "API keys", "OAuth tokens"],
"event": ["changes their password", "revokes API access",
"is suspended by admin", "enables 2FA"],
"exploit_result": ["dump the entire user table including password hashes",
"execute arbitrary JavaScript in other users' browsers",
"access any user's account without credentials",
"read internal service endpoints via SSRF"],
"env": ["production", "staging", "production replica"],
"duration": ["up to 24 hours", "indefinitely", "until manual cache clear",
"for the full token TTL (7 days)"],
"attack_vector": ["previously stole a token", "intercepted a session cookie",
"obtained a leaked API key"],
"vuln_category": ["session management", "access control",
"injection", "broken authentication"],
"additional_context": [
"OWASP A03 — Injection.",
"OWASP A07 — Identification and Authentication Failures.",
"CVSS score estimated at 9.1 (Critical).",
"Compliance impact: potential GDPR violation if user PII is exfiltrated.",
"Bounty hunter reported this 48 hours ago — disclosure deadline approaching.",
],
},
"answer_template": {
"default": {"priority": "P0", "labels": ["bug", "security"],
"assigned_team": "security", "milestone": "hotfix"},
},
"severity_keywords": {"default": []},
},
# --- performance: slowness, leaks and CPU spikes ---
"performance": {
"titles": [
"{page} loads slowly for {dataset_size}",
"Memory leak in {service} causes OOM after {duration}",
"API response time degrades under {load_condition}",
"{operation} takes {duration} for {dataset_size}",
"CPU spikes to 100% when {trigger}",
],
"bodies": [
"When {condition}, the {page} takes {response_time} to load. "
"{diagnostic_info}. {impact}. {workaround}",
"The {service} allocates memory during {operation} and never frees it. "
"Server runs out of memory every {duration}. {diagnostic_info}. "
"{workaround}",
],
"vars": {
"page": ["dashboard", "analytics page", "user list", "search results",
"audit log", "reports page", "admin overview"],
"service": ["background job processor", "cache warming service",
"log aggregator", "image resizer", "ETL pipeline"],
"dataset_size": ["large datasets (10k+ rows)", "enterprise accounts",
"tables with 100k+ entries", "files over 50MB"],
"duration": ["6 hours", "4 hours", "12 hours", "30+ seconds",
"2+ minutes", "an entire day"],
"load_condition": ["concurrent load", "peak traffic", "batch processing",
"more than 50 simultaneous users"],
"operation": ["bulk export", "report generation", "data migration",
"full-text search", "image processing"],
"trigger": ["running bulk exports", "processing large uploads",
"generating PDF reports", "reindexing search"],
"condition": ["a dataset has more than 10k rows",
"multiple users trigger exports simultaneously",
"the nightly ETL job runs alongside user traffic"],
"response_time": ["30+ seconds", "over a minute", "2-3 minutes",
"timeout after 60 seconds"],
"diagnostic_info": ["CPU spikes to 100%", "Heap profiler confirms the leak",
"Database EXPLAIN shows full table scan",
"N+1 query pattern detected in APM",
"Garbage collector running every 500ms"],
"impact": ["Affects power users with large accounts",
"All users experience slowness during peak hours",
"Requires manual restart to recover",
"Operational overhead: scheduled restarts every 4 hours"],
"workaround": ["Workaround: export data and use offline tools.",
"Workaround: scheduled restarts every 4 hours.",
"No workaround — users just wait.",
"Workaround: paginate results (but UX is degraded)."],
},
"answer_template": {
"severe": {"priority": "P1", "labels": ["bug", "performance"],
"assigned_team": "backend", "milestone": "v2.1"},
"moderate": {"priority": "P2", "labels": ["bug", "performance"],
"assigned_team": "backend", "milestone": "v2.1"},
},
# "severe" is listed first, so text containing "No workaround" resolves to
# severe even though it also contains the moderate keyword "Workaround".
"severity_keywords": {
"severe": ["OOM", "100%", "manual restart", "timeout", "No workaround",
"all users", "never frees"],
"moderate": ["Workaround", "power users", "paginate"],
},
},
# --- ui_bug: layout / rendering / accessibility; answers route to frontend ---
"ui_bug": {
"titles": [
"{ui_element} breaks layout on {browser}",
"{ui_element} not rendering correctly in {mode}",
"Responsive layout broken on {device}",
"{feature} toggle not persisting across {context}",
"Accessibility: {ui_element} missing {a11y_attr}",
],
"bodies": [
"Switching to {mode} on {browser} causes {ui_element} to {visual_issue}. "
"{other_browsers}. {workaround}",
"On {device}, the {ui_element} is {visual_issue}. "
"Tested on {browser}. {impact}. {workaround}",
],
"vars": {
"ui_element": ["navigation bar", "sidebar menu", "modal dialog",
"dropdown selector", "data table", "footer",
"toast notifications", "breadcrumb trail"],
"browser": ["Safari 16", "Firefox ESR", "Chrome on Android",
"Edge on Windows", "iOS Safari", "Samsung Internet"],
"mode": ["dark mode", "high contrast mode", "RTL layout",
"compact view", "print view"],
"device": ["iPhone SE", "tablets in portrait", "screens below 768px",
"ultra-wide monitors", "4K displays"],
"feature": ["dark mode", "compact view", "language preference",
"notification settings"],
"context": ["page reloads", "different tabs", "sessions",
"browser restarts"],
"visual_issue": ["overlap the main content", "disappear entirely",
"render with incorrect colors", "become unclickable",
"overflow beyond the viewport"],
"other_browsers": ["Chrome and Firefox are unaffected.",
"Only reproducible on this specific browser.",
"Affects all WebKit-based browsers."],
"a11y_attr": ["ARIA labels", "keyboard focus indicators",
"screen reader text", "proper heading hierarchy"],
"impact": ["Cosmetic issue, no functional impact.",
"Users cannot access the affected feature.",
"Usability is degraded but the feature works."],
"workaround": ["Workaround: use a different browser.",
"Workaround: manually resize the window.",
"No workaround for this browser.",
"Workaround: disable the feature in settings."],
},
"answer_template": {
"severe": {"priority": "P2", "labels": ["bug", "ux"],
"assigned_team": "frontend", "milestone": "v2.1"},
"moderate": {"priority": "P3", "labels": ["bug", "ux"],
"assigned_team": "frontend", "milestone": "backlog"},
},
"severity_keywords": {
"severe": ["cannot access", "unclickable", "disappear", "No workaround"],
"moderate": ["Cosmetic", "different browser", "resize"],
},
},
# --- data_corruption: races, corrupted exports, stale or duplicate data ---
"data_corruption": {
"titles": [
"Race condition in {feature}: {consequence}",
"Data inconsistency in {feature} under concurrent writes",
"{export_format} export produces corrupted output for {edge_case}",
"Stale data served from cache after {trigger}",
"Duplicate records created when {trigger}",
],
"bodies": [
"Under concurrent load, {feature} can {consequence} due to a race condition "
"in {root_cause}. Frequency: {frequency}. {impact}. {workaround}",
"When {feature} data contains {edge_case}, the exported {export_format} file "
"is corrupted and cannot be {consumer}. {impact}. {workaround}",
],
"vars": {
"feature": ["file upload", "order processing", "user registration",
"inventory update", "comment system", "permission assignment"],
"consequence": ["files occasionally overwrite each other",
"orders are duplicated or lost",
"users get assigned wrong permissions",
"inventory counts become negative"],
"root_cause": ["temp file naming logic", "lack of database locking",
"non-atomic read-modify-write cycle",
"missing unique constraint"],
"frequency": ["approximately 1 in 10,000 operations",
"consistently under 50+ concurrent users",
"intermittently — hard to reproduce",
"every time the batch job runs"],
"edge_case": ["non-ASCII characters (e.g., café, naïve)",
"values containing commas or quotes",
"null or empty fields",
"timestamps crossing DST boundaries"],
"export_format": ["CSV", "Excel", "JSON", "PDF"],
"consumer": ["opened in Excel", "parsed by downstream services",
"imported back into the system"],
"trigger": ["double-clicking the submit button",
"cache TTL expires during a write operation",
"two users edit the same record simultaneously",
"the nightly sync job overlaps with user activity"],
"impact": ["Potential data loss confirmed.",
"No data loss confirmed yet, but risk exists.",
"Affects users with international data.",
"Breaks downstream pipeline processing."],
"workaround": ["Workaround: enable sequential mode in settings.",
"Workaround: manually re-export after cleanup.",
"No reliable workaround — data must be manually verified.",
"Workaround: add a mutex lock externally (operational overhead)."],
},
"answer_template": {
"severe": {"priority": "P1", "labels": ["bug", "data-integrity"],
"assigned_team": "backend", "milestone": "v2.1"},
"moderate": {"priority": "P2", "labels": ["bug", "data-integrity"],
"assigned_team": "backend", "milestone": "v2.1"},
},
# NOTE(review): the severe keyword "data loss" is a substring of the moderate
# keyword "No data loss", and severe is checked first — so text with
# "No data loss confirmed yet" resolves to severe. Confirm this is intended.
"severity_keywords": {
"severe": ["data loss", "No reliable workaround", "consistently",
"permissions", "overwrite", "negative"],
"moderate": ["No data loss", "intermittently", "sequential mode",
"re-export", "non-ASCII"],
},
},
# --- documentation: single "default" level, always the same answer ---
"documentation": {
"titles": [
"Typo in {location}",
"Outdated {doc_type} on {page}",
"Missing documentation for {feature}",
"Incorrect {doc_element} in {location}",
],
"bodies": [
"There is a {issue_type} on the {page}: {detail}. No functional impact, "
"purely cosmetic. {extra}",
"The {doc_type} for {feature} is {issue_type}. {detail}. {extra}",
],
"vars": {
"location": ["homepage docs", "API reference", "README", "changelog",
"contributing guide", "onboarding wiki"],
"doc_type": ["installation guide", "API documentation", "changelog",
"migration guide", "code comments"],
"page": ["landing page", "docs homepage", "getting started page",
"FAQ section", "footer"],
"feature": ["new webhook API", "batch processing endpoint",
"SSO integration", "rate limiting"],
"doc_element": ["code example", "endpoint URL", "parameter description",
"copyright year", "version number"],
"issue_type": ["a typo", "outdated", "missing", "incorrect", "misleading"],
"detail": ["'Welccome' should be 'Welcome'",
"references removed v1.x API that no longer exists",
"completely undocumented despite being a core feature",
"shows '© 2022' but should be '© 2024'",
"the curl example uses the wrong HTTP method"],
"extra": ["", "Low priority — does not block any workflow.",
"New users have reported confusion.",
"Only noticed by contributors reading source code."],
},
"answer_template": {
"default": {"priority": "P3", "labels": ["documentation"],
"assigned_team": "devx", "milestone": "backlog"},
},
"severity_keywords": {"default": []},
},
# --- api_bug: rate limiting, status codes, pagination, webhooks ---
"api_bug": {
"titles": [
"API rate limiter {issue} after {trigger}",
"{endpoint} returns {status_code} instead of {expected_code}",
"Pagination broken on {endpoint}: {symptom}",
"Webhook delivery {issue} for {event_type} events",
"API versioning: {endpoint} behaves differently on v1 vs v2",
],
"bodies": [
"After receiving a {status_code} response, {consequence}. "
"The {root_cause}. {impact}. {workaround}",
"The {endpoint} endpoint {symptom} when {trigger}. "
"Expected behavior: {expected}. Actual: {actual}. {impact}.",
],
"vars": {
"endpoint": ["/api/users", "/api/search", "/api/export",
"/api/webhooks", "/api/billing", "/api/analytics"],
"issue": ["blocks legitimate users", "fails silently",
"returns incorrect retry headers", "drops events"],
"trigger": ["a 429 error", "rate limit window resets",
"a burst of requests from CI/CD", "server restart"],
"status_code": ["429", "500", "502", "504", "403"],
"expected_code": ["200", "201", "204", "404"],
"symptom": ["returns duplicate entries",
"skips items between pages",
"returns empty page despite more data existing"],
"event_type": ["payment.completed", "user.created",
"subscription.cancelled", "deployment.finished"],
"consequence": ["legitimate users remain blocked for 1 hour",
"data is silently lost with no error",
"downstream services receive stale data"],
"root_cause": ["unblock logic has a bug — it never clears the blocked flag",
"cursor-based pagination uses wrong sort order",
"retry-after header reports seconds instead of milliseconds"],
"expected": ["200 OK with paginated results",
"successful delivery with retry on failure",
"proper rate limit reset after window expires"],
"actual": ["empty response with 200 status",
"permanent block until manual intervention",
"events dropped without any error log"],
"impact": ["Affects CI/CD pipelines hitting the API.",
"External integrations break silently.",
"Customer-facing dashboards show wrong data.",
"Retry-After header causes clients to wait too long."],
"workaround": ["Workaround: manually clear Redis key.",
"Workaround: add client-side deduplication.",
"No workaround — requires server-side fix.",
"Workaround: pin API version to v1 in headers."],
},
"answer_template": {
"severe": {"priority": "P1", "labels": ["bug", "api"],
"assigned_team": "backend", "milestone": "v2.1"},
"moderate": {"priority": "P2", "labels": ["bug", "api"],
"assigned_team": "backend", "milestone": "v2.1"},
},
"severity_keywords": {
"severe": ["silently lost", "permanent block", "No workaround",
"dropped", "external integrations"],
"moderate": ["Workaround", "pin API", "deduplication"],
},
},
}
# The original handcrafted bugs — kept as a gold-standard subset.
# Layout: difficulty ("easy" / "medium" / "hard") -> {"bugs": [...], "answers": {...}}
# where "answers" is keyed by BugReport.id. The gold answers carry more fields
# as difficulty rises: easy = priority only; medium = priority, labels and
# assigned_team; hard = those three plus milestone.
_HANDCRAFTED_BUGS = {
# --- easy: only the priority is part of the gold answer ---
"easy": {
"bugs": [
BugReport(
id="easy-001",
title="App crashes on login with correct credentials",
body="When I enter my correct username and password, the app crashes immediately. "
"This started after the v2.0 release. Affects 100% of users. "
"No workaround exists — users cannot log in at all.",
author="user123",
labels_hint=[],
comments=["Confirmed on iOS and Android.", "Happens every time."],
severity_signals=["100% of users", "crashes", "no workaround"],
stack_trace="NullPointerException at AuthController.java:87",
affected_component="auth-service",
),
BugReport(
id="easy-002",
title="Typo in documentation homepage",
body="There is a typo on the homepage docs: 'Welccome' should be 'Welcome'. "
"No functional impact, purely cosmetic.",
author="docs_fan",
labels_hint=["documentation"],
comments=[],
severity_signals=["cosmetic", "no functional impact"],
stack_trace="",
affected_component="docs",
),
BugReport(
id="easy-003",
title="Dashboard loads slowly for large datasets",
body="When a dataset has more than 10k rows, the dashboard takes 30+ seconds to load. "
"Workaround: export data and use offline tools. Affects power users only.",
author="power_user",
labels_hint=["performance"],
comments=["Noticed after the last deploy.", "CPU spikes to 100%."],
severity_signals=["workaround exists", "power users only"],
stack_trace="",
affected_component="dashboard",
),
BugReport(
id="easy-004",
title="Email notifications not sent after password reset",
body="Users who reset their password do not receive the confirmation email. "
"SMTP logs show the job is queued but never dispatched. "
"Affects all users attempting password reset.",
author="support_team",
labels_hint=["bug"],
comments=["Reported by 12 users this week.",
"Started after email service migration."],
severity_signals=["all users", "never dispatched"],
stack_trace="",
affected_component="email-service",
),
BugReport(
id="easy-005",
title="Incorrect copyright year in footer",
body="The footer shows '© 2022' but it should be '© 2024'. "
"No functional impact.",
author="intern_dev",
labels_hint=["documentation"],
comments=[],
severity_signals=["no functional impact"],
stack_trace="",
affected_component="frontend",
),
],
"answers": {
"easy-001": {"priority": "P0"},
"easy-002": {"priority": "P3"},
"easy-003": {"priority": "P2"},
"easy-004": {"priority": "P1"},
"easy-005": {"priority": "P3"},
},
},
# --- medium: gold answers add labels and assigned_team ---
"medium": {
"bugs": [
BugReport(
id="med-001",
title="Payment fails silently on checkout",
body="Checkout completes without error but payment is never charged. "
"No error shown to user. Stripe logs show declined transaction. "
"Direct revenue loss — every failed checkout is a lost sale.",
author="store_owner",
labels_hint=["bug"],
comments=["Revenue impact confirmed.", "Happening since Tuesday."],
severity_signals=["revenue loss", "silently", "every failed checkout"],
stack_trace="Stripe API: card_declined at PaymentService.py:145",
affected_component="payment-service",
),
BugReport(
id="med-002",
title="Search results include deleted posts",
body="Deleted blog posts still appear in search results for up to 24 hours. "
"Users can read content that was explicitly removed by moderators. "
"Potential GDPR violation if deleted content belongs to EU users.",
author="moderator_jane",
labels_hint=[],
comments=["GDPR concern — deleted content still visible."],
severity_signals=["GDPR violation", "deleted content visible"],
stack_trace="",
affected_component="search-index",
),
BugReport(
id="med-003",
title="Dark mode toggle breaks layout on Safari",
body="Switching to dark mode on Safari 16 causes nav bar to overlap content. "
"Chrome and Firefox unaffected. Workaround: use a different browser.",
author="safari_user",
labels_hint=["bug", "ux"],
comments=["Only on Safari, not Chrome/Firefox."],
severity_signals=["workaround exists", "single browser"],
stack_trace="",
affected_component="frontend-css",
),
BugReport(
id="med-004",
title="CSV export produces corrupted file for non-ASCII characters",
body="When table data contains accented characters (e.g. café, naïve), "
"the exported CSV file is corrupted and cannot be opened in Excel. "
"Affects users with international data.",
author="data_analyst",
labels_hint=["bug"],
comments=["Encoding issue — UTF-8 not respected.",
"Workaround: manual copy-paste."],
severity_signals=["corrupted", "workaround exists"],
stack_trace="",
affected_component="export-service",
),
BugReport(
id="med-005",
title="API rate limiter blocks legitimate users after 429 error",
body="After receiving a 429 Too Many Requests response, legitimate users "
"remain blocked for 1 hour even after the rate limit window resets. "
"The unblock logic has a bug — it never clears the blocked flag.",
author="api_user",
labels_hint=["bug"],
comments=["Affects CI/CD pipelines hitting the API.",
"Retry-After header is wrong."],
severity_signals=["permanent block", "never clears", "bug in logic"],
stack_trace="",
affected_component="api-gateway",
),
],
"answers": {
"med-001": {"priority": "P0", "labels": ["bug", "payments"], "assigned_team": "backend"},
"med-002": {"priority": "P1", "labels": ["bug", "security"], "assigned_team": "security"},
"med-003": {"priority": "P2", "labels": ["bug", "ux"], "assigned_team": "frontend"},
"med-004": {"priority": "P2", "labels": ["bug", "data-integrity"], "assigned_team": "backend"},
# NOTE(review): labelled "performance" here, whereas the procedural "api_bug"
# template labels rate-limiter bugs "api" — confirm which is intended.
"med-005": {"priority": "P1", "labels": ["bug", "performance"], "assigned_team": "backend"},
},
},
# --- hard: gold answers additionally carry a milestone ---
"hard": {
"bugs": [
BugReport(
id="hard-001",
title="SQL injection vulnerability in search endpoint",
body="The /api/search endpoint does not sanitize inputs. "
"Crafted queries can dump the entire user table including password hashes. "
"PoC attached. Verified on production. Treat as confidential — "
"do not discuss publicly until patched.",
author="security_researcher",
labels_hint=[],
comments=["Critical. Affects production.", "Do not discuss publicly."],
severity_signals=["SQL injection", "password hashes", "production",
"confidential"],
stack_trace="",
affected_component="search-api",
),
BugReport(
id="hard-002",
title="Memory leak in background job processor causes OOM after 6 hours",
body="The job processor allocates ~50MB per job and never frees it. "
"Server runs out of memory every 6 hours, requiring a manual restart. "
"Heap profiler confirms leak introduced in v1.9. "
"Workaround: scheduled restarts every 4 hours (operational overhead).",
author="devops_alice",
labels_hint=["performance"],
comments=["Verified with heap profiler.", "Started in v1.9."],
severity_signals=["memory leak", "OOM", "manual restart", "never frees"],
stack_trace="HeapDump: JobProcessor.process() -> 50MB/call, never GC'd",
affected_component="job-processor",
),
BugReport(
id="hard-003",
title="Race condition in file upload: files occasionally overwrite each other",
body="Under concurrent load, two users uploading simultaneously can get "
"each other's files due to a race condition in the temp file naming logic. "
"Frequency: approximately 1 in 10,000 uploads under normal load. "
"No data loss confirmed yet and a workaround exists: "
"enable sequential upload mode in settings (disabled by default). "
"Risk is low-probability but affects data integrity.",
author="qa_bot",
labels_hint=["bug"],
comments=["Reproduced with locust at 50 concurrent users.",
"Sequential mode avoids it."],
severity_signals=["race condition", "data integrity",
"workaround exists", "low-probability"],
stack_trace="",
affected_component="file-upload",
),
BugReport(
id="hard-004",
title="Auth token not invalidated after password change",
body="When a user changes their password, existing JWT tokens remain valid "
"for up to 24 hours. An attacker who previously stole a token can "
"continue to access the account even after the password is reset. "
"This is a session management security vulnerability.",
author="pentest_team",
labels_hint=["security"],
comments=["Verified on staging.",
"OWASP A07 — Identification and Authentication Failures."],
severity_signals=["JWT not invalidated", "attacker", "security vulnerability",
"stolen token"],
stack_trace="",
affected_component="auth-service",
),
BugReport(
id="hard-005",
title="Infinite loop in webhook retry logic causes CPU spike",
body="When a webhook endpoint returns a 500 error, the retry logic enters "
"an infinite loop with no backoff or retry cap. "
"This causes CPU to spike to 100% within minutes and starves other services. "
"Triggered in production twice this week. Requires process kill to recover.",
author="oncall_eng",
labels_hint=["bug", "performance"],
comments=["PagerDuty alert fired twice.",
"Needs exponential backoff + max retry cap."],
severity_signals=["infinite loop", "100%", "production",
"process kill", "starves other services"],
stack_trace="Thread dump: WebhookRetrier.retry() → recursive call, no exit",
affected_component="webhook-service",
),
],
"answers": {
"hard-001": {
"priority": "P0", "labels": ["bug", "security"],
"assigned_team": "security", "milestone": "hotfix",
},
"hard-002": {
"priority": "P1", "labels": ["bug", "performance"],
"assigned_team": "backend", "milestone": "v2.1",
},
"hard-003": {
"priority": "P1", "labels": ["bug", "data-integrity"],
"assigned_team": "backend", "milestone": "v2.1",
},
"hard-004": {
"priority": "P0", "labels": ["bug", "security"],
"assigned_team": "security", "milestone": "hotfix",
},
"hard-005": {
"priority": "P0", "labels": ["bug", "performance"],
"assigned_team": "backend", "milestone": "hotfix",
},
},
},
}
# TASKS is kept for backward compatibility. It is an alias of
# _HANDCRAFTED_BUGS (the same dict object, not a copy), so mutating one
# mutates the other.
TASKS = _HANDCRAFTED_BUGS
# ---------------------------------------------------------------------------
# PROCEDURAL BUG GENERATOR
# ---------------------------------------------------------------------------
def _determine_severity(text: str, keywords: Dict[str, list]) -> str:
"""Check which severity level the generated text matches."""
text_lower = text.lower()
for level, kws in keywords.items():
if level == "default":
return "default"
hits = sum(1 for kw in kws if kw.lower() in text_lower)
if hits >= 1:
return level
# fallback to first non-default key
return list(keywords.keys())[0] if keywords else "moderate"
# Relative weight of each template category per difficulty level. A category
# with weight w appears w times in the pool that generate_bug() draws from.
_CATEGORY_WEIGHTS = {
    "easy": {"documentation": 3, "ui_bug": 3, "performance": 2,
             "crash": 1, "api_bug": 1},
    "medium": {"crash": 3, "performance": 3, "api_bug": 2,
               "data_corruption": 2, "ui_bug": 1},
    "hard": {"security": 4, "crash": 3, "data_corruption": 3,
             "performance": 2, "api_bug": 2},
}

# Reporter names a generated bug can be attributed to.
_REPORT_AUTHORS = [
    "user_report", "qa_engineer", "support_team", "dev_oncall",
    "security_bot", "customer_jane", "automated_monitor",
    "intern_dev", "senior_eng", "pm_feedback",
]

# Follow-up comments a generated bug can carry (0-3 are sampled per bug).
_COMMENT_TEMPLATES = [
    "Confirmed on our side.", "Reproduced in staging.",
    "Multiple reports from users.", "Started after last deployment.",
    "Urgent — customer escalation.", "Low priority — no user complaints.",
    "Needs investigation.", "Related to ticket from last sprint.",
]


def _fill_placeholders(template: str, values: Dict[str, str]) -> str:
    """Replace every ``{name}`` slot with ``values[name]``; unknown slots stay as-is."""
    filled = template
    for name, value in values.items():
        filled = filled.replace("{" + name + "}", value)
    return filled


def generate_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Generate a procedural bug report together with its expected answer.

    All randomness comes from a private ``random.Random(seed)``, so the same
    non-None seed always yields the same report.

    Args:
        task_key: Difficulty level. ``"easy"``, ``"medium"`` and ``"hard"``
            select their own category weights; any other value uses the
            ``"medium"`` weights.
        seed: RNG seed. When given (including ``0``) it is also used as the
            numeric part of the bug id; when ``None`` a random id is drawn.

    Returns:
        ``(bug, answer)``. ``answer`` is a fresh dict (with its own ``labels``
        list) so callers may mutate it without touching the templates. For
        ``"easy"`` it holds only ``priority``; for ``"medium"`` the
        ``milestone`` key is dropped; otherwise the full template is returned.
    """
    rng = random.Random(seed)

    # Expand the weights into a flat pool and draw from it. The pool order is
    # part of the seeded behaviour, so it is built exactly as declared.
    task_weights = _CATEGORY_WEIGHTS.get(task_key, _CATEGORY_WEIGHTS["medium"])
    categories = [cat for cat, weight in task_weights.items()
                  for _ in range(weight)]
    category = rng.choice(categories)
    template = _BUG_TEMPLATES[category]

    # One value per placeholder, drawn in the template's declared order.
    chosen_vars = {name: rng.choice(options)
                   for name, options in template["vars"].items()}

    title_tmpl = rng.choice(template["titles"])
    body_tmpl = rng.choice(template["bodies"])
    title = _fill_placeholders(title_tmpl, chosen_vars)
    body = _fill_placeholders(body_tmpl, chosen_vars)

    # `seed or ...` would treat seed=0 as "no seed" and invent a random id;
    # only a missing seed should fall back to a random number.
    id_number = seed if seed is not None else rng.randint(0, 999999)
    bug_id = f"gen-{id_number:06d}"

    author = rng.choice(_REPORT_AUTHORS)

    num_comments = rng.randint(0, 3)
    comments = rng.sample(_COMMENT_TEMPLATES,
                          min(num_comments, len(_COMMENT_TEMPLATES)))

    # Severity is inferred from everything the triager will read.
    full_text = f"{title} {body} {' '.join(comments)}"
    severity = _determine_severity(full_text,
                                   template.get("severity_keywords", {}))
    answer_templates = template["answer_template"]
    fallback = next(iter(answer_templates.values()))
    answer = dict(answer_templates.get(severity, fallback))
    # dict() is a shallow copy: detach the labels list from the template too.
    if "labels" in answer:
        answer["labels"] = list(answer["labels"])

    # Easier tasks are graded on fewer fields.
    if task_key == "easy":
        answer = {"priority": answer["priority"]}
    elif task_key == "medium":
        answer.pop("milestone", None)

    labels_hint = rng.sample(["bug", "needs-triage", "reported"],
                             rng.randint(0, 2))
    bug = BugReport(
        id=bug_id,
        title=title,
        body=body,
        author=author,
        labels_hint=labels_hint,
        comments=comments,
        severity_signals=[],
        stack_trace="",
        affected_component=chosen_vars.get(
            "service", chosen_vars.get("endpoint", "")),
    )
    return bug, answer
# ---------------------------------------------------------------------------
# BUG SAMPLER — uses handcrafted bugs first, then procedural for variety
# ---------------------------------------------------------------------------
def sample_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Return a bug and its expected answer, mixing handcrafted and procedural.

    A private ``random.Random(seed)`` decides the branch: a draw below 0.4
    (and a *task_key* that has handcrafted bugs) selects a handcrafted bug,
    anything else delegates to ``generate_bug``.

    Args:
        task_key: Difficulty level used to look up handcrafted bugs and
            passed on to ``generate_bug``.
        seed: RNG seed. When ``None`` a random seed is drawn for the
            procedural generator.

    Returns:
        ``(bug, answer)``. For handcrafted bugs ``answer`` is a copy of the
        gold answer (list values copied too), so callers cannot corrupt the
        module-level gold data by mutating it. The ``BugReport`` itself is
        the shared module-level instance.
    """
    rng = random.Random(seed)
    # The draw happens first and unconditionally so the RNG stream does not
    # depend on whether task_key has handcrafted bugs.
    if rng.random() < 0.4 and task_key in _HANDCRAFTED_BUGS:
        group = _HANDCRAFTED_BUGS[task_key]
        bug = rng.choice(group["bugs"])
        gold = group["answers"][bug.id]
        answer = {
            field: list(value) if isinstance(value, list) else value
            for field, value in gold.items()
        }
        return bug, answer

    gen_seed = seed if seed is not None else rng.randint(0, 999999)
    return generate_bug(task_key, seed=gen_seed)
# ---------------------------------------------------------------------------
# GRADING — with semantic label matching
# ---------------------------------------------------------------------------
PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3}
def _priority_score(predicted: str, correct: str) -> float:
"""Score priority assignment with partial credit for near-misses."""
if predicted == correct:
return 0.95
pred_rank = PRIORITY_ORDER.get(predicted, 99)
corr_rank = PRIORITY_ORDER.get(correct, 99)
diff = abs(pred_rank - corr_rank)
if diff == 1:
return 0.5
elif diff == 2:
return 0.2
return 0.05
def _normalize_label(label: str) -> str:
    """Map *label* onto its canonical synonym-group name.

    The label is lower-cased and stripped, then checked against each entry of
    ``LABEL_SYNONYMS`` in order. The first group whose canonical name equals
    the label, or whose synonym set contains it, wins. Labels that belong to
    no group are returned in their cleaned form.
    """
    cleaned = label.lower().strip()
    return next(
        (
            group
            for group, aliases in LABEL_SYNONYMS.items()
            if cleaned == group or cleaned in aliases
        ),
        cleaned,
    )
def _label_score(predicted: List[str], correct: List[str]) -> float:
    """Compare two label lists by Jaccard overlap of their canonical forms.

    Every label is passed through ``_normalize_label`` first, so synonyms of
    the same group count as the same label.

    Returns:
        0.95 when the answer key has no labels; otherwise the ratio
        ``|intersection| / |union|`` clamped into ``[0.05, 0.95]``.
    """
    guessed = {_normalize_label(item) for item in predicted}
    expected = {_normalize_label(item) for item in correct}

    if not expected:
        return 0.95

    # ``expected`` is non-empty here, so the union can never be empty.
    shared = guessed & expected
    combined = guessed | expected
    overlap = len(shared) / len(combined)

    return max(0.05, min(0.95, overlap))
def _reasoning_score(reasoning: str, answer: dict) -> float:
"""Bonus for reasoning that mentions relevant signals."""
if not reasoning or len(reasoning.strip()) < 10:
return 0.0
key_signals = {
"P0": ["production", "all users", "data loss", "security", "crash",
"revenue", "injection", "vulnerability", "100%"],
"P1": ["major", "significant", "no workaround", "broken",
"gdpr", "blocked", "leak", "never"],
"P2": ["degraded", "workaround", "partial", "slow",
"affected", "power users"],
"P3": ["minor", "cosmetic", "docs", "typo", "low",
"no functional impact"],
}
expected_priority = answer.get("priority", "P2")
signals = key_signals.get(expected_priority, [])
reasoning_lower = reasoning.lower()
hits = sum(1 for s in signals if s in reasoning_lower)
return min(0.15, hits * 0.05)
def _matches_ci(given, expected) -> bool:
    """Compare two optional strings case-insensitively (``None`` counts as empty)."""
    return (given or "").lower() == (expected or "").lower()


def grade_action(task_key: str, bug: BugReport, action: TriageAction,
                 answer: dict = None) -> Tuple[float, str]:
    """Grade the agent's triage action against the correct answer.

    Args:
        task_key: ``"easy"`` grades priority only; ``"medium"`` grades
            priority, labels and team; any other value is graded as the hard
            task (priority, labels, team and milestone).
        bug: The bug being triaged; only ``bug.id`` is used, to look up a
            handcrafted answer when *answer* is not supplied.
        action: The agent's action. ``assigned_team``, ``milestone`` and
            ``labels`` may be ``None``; they are then treated as empty.
        answer: Answer key. When ``None`` it is looked up in the handcrafted
            answers for *task_key*.

    Returns:
        ``(score, feedback)`` where *score* is rounded to three decimals and
        clamped to ``[0.01, 0.99]``, and *feedback* is a ``" | "``-joined
        summary. If no answer key can be found the result is
        ``(0.5, "No answer key found for this bug.")``.
    """
    # Backward compatibility: look up answer from handcrafted if not provided
    if answer is None:
        if task_key in _HANDCRAFTED_BUGS and bug.id in _HANDCRAFTED_BUGS[task_key]["answers"]:
            answer = _HANDCRAFTED_BUGS[task_key]["answers"][bug.id]
        else:
            return 0.5, "No answer key found for this bug."

    feedback_parts = []
    reasoning_bonus = _reasoning_score(action.reasoning, answer)
    p_score = _priority_score(action.priority, answer["priority"])

    if task_key == "easy":
        symbol = "✓" if p_score >= 0.9 else "~" if p_score >= 0.4 else "✗"
        feedback_parts.append(
            f"Priority: {symbol} (got {action.priority}, expected {answer['priority']})")
        # Report the bonus here as well so feedback explains the final score,
        # as the medium and hard branches already do.
        if reasoning_bonus > 0:
            feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")
        score = p_score + reasoning_bonus

    elif task_key == "medium":
        l_score = _label_score(action.labels or [], answer.get("labels") or [])
        expected_team = answer.get("assigned_team", "")
        # No credit when the answer key names no team at all.
        team_ok = bool(expected_team) and _matches_ci(action.assigned_team, expected_team)
        t_score = 0.95 if team_ok else 0.05
        score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score + reasoning_bonus
        feedback_parts.append(
            f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
        feedback_parts.append(f"Labels: {l_score:.2f} (semantic match)")
        feedback_parts.append(
            f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
        if reasoning_bonus > 0:
            feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")

    else:  # hard
        l_score = _label_score(action.labels or [], answer.get("labels") or [])
        t_score = 0.95 if _matches_ci(action.assigned_team, answer["assigned_team"]) else 0.05
        m_score = 0.95 if _matches_ci(action.milestone, answer["milestone"]) else 0.05
        score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score + reasoning_bonus
        feedback_parts.append(
            f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
        feedback_parts.append(f"Labels: {l_score:.2f} (semantic match)")
        feedback_parts.append(
            f"Team: {t_score:.2f} (got {action.assigned_team}, expected {answer['assigned_team']})")
        feedback_parts.append(
            f"Milestone: {m_score:.2f} (got {action.milestone}, expected {answer['milestone']})")
        if reasoning_bonus > 0:
            feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")
        # Security escalation penalty: compared case-insensitively on both
        # sides so it agrees with how the team score itself is computed.
        if (_matches_ci(answer.get("assigned_team"), "security")
                and not _matches_ci(action.assigned_team, "security")):
            score = max(0.01, score - 0.15)
            feedback_parts.append("⚠ Security escalation missed (-0.15)")

    score = max(0.01, min(0.99, score))
    return round(score, 3), " | ".join(feedback_parts)
# ---------------------------------------------------------------------------
# NAMED GRADER FUNCTIONS — referenced by openenv.yaml
# ---------------------------------------------------------------------------
def priority_match(*args, **kwargs):
    """Named grader for the easy task.

    Expects the bug and the action as the first two positional arguments and
    returns the ``grade_action("easy", ...)`` score as a float. With fewer
    than two positional arguments it returns 0.5. Keyword arguments are
    accepted but ignored.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("easy", bug, action)[0])
    return 0.5
def priority_label_team(*args, **kwargs):
    """Named grader for the medium task.

    Expects the bug and the action as the first two positional arguments and
    returns the ``grade_action("medium", ...)`` score as a float. With fewer
    than two positional arguments it returns 0.5. Keyword arguments are
    accepted but ignored.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("medium", bug, action)[0])
    return 0.5
def full_triage(*args, **kwargs):
    """Named grader for the hard task.

    Expects the bug and the action as the first two positional arguments and
    returns the ``grade_action("hard", ...)`` score as a float. With fewer
    than two positional arguments it returns 0.5. Keyword arguments are
    accepted but ignored.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("hard", bug, action)[0])
    return 0.5
# Public API: the names exported when a caller does a star-import of this module.
__all__ = [
"priority_match",
"priority_label_team",
"full_triage",
"sample_bug",
"generate_bug",
"grade_action",
"TASKS",
"LABEL_SYNONYMS",
]