Spaces:
Sleeping
Sleeping
| # server/task.py | |
| import sys | |
| import random | |
| import hashlib | |
| sys.path.insert(0, "/app") | |
| from typing import Tuple, List, Dict, Any | |
| from model import BugReport, TriageAction | |
| # --------------------------------------------------------------------------- | |
| # LABEL SYNONYM MAP — allows semantic matching | |
| # --------------------------------------------------------------------------- | |
# Canonical triage label -> set of alternative spellings that should be
# treated as the same label. Keys are the canonical names used in the
# expected answers; values are plain lowercase strings.
LABEL_SYNONYMS: Dict[str, set] = {
    "bug": {
        "broken", "defect", "error", "fault", "issue",
    },
    "security": {
        "auth", "cve", "exploit", "injection", "vulnerability",
    },
    "performance": {
        "latency", "memory", "optimization", "perf", "slow", "speed",
    },
    "ux": {
        "design", "frontend", "ui", "usability", "user-experience",
    },
    "data-integrity": {
        "consistency", "corruption", "data", "data-loss",
    },
    "payments": {
        "billing", "checkout", "payment", "revenue", "stripe",
    },
    "documentation": {
        "docs", "readme", "typo", "wiki",
    },
    "infrastructure": {
        "cd", "ci", "deploy", "devops", "docker", "infra",
    },
    "api": {
        "endpoint", "graphql", "http", "request", "rest",
    },
    "database": {
        "db", "migration", "query", "schema", "sql",
    },
}
| # --------------------------------------------------------------------------- | |
| # BUG TEMPLATE SYSTEM — generates hundreds of unique bugs | |
| # --------------------------------------------------------------------------- | |
| _BUG_TEMPLATES = { | |
| "crash": { | |
| "titles": [ | |
| "{service} crashes on {trigger}", | |
| "{service} throws {error_type} when {trigger}", | |
| "Fatal error in {service} during {trigger}", | |
| "Unhandled exception in {service}: {error_type}", | |
| "{service} segfaults under {condition}", | |
| ], | |
| "bodies": [ | |
| "When a user {trigger}, the {service} crashes immediately. " | |
| "Error: {error_type}. Stack trace points to {component}. " | |
| "Affects {impact}. {workaround}", | |
| "The {service} is failing with {error_type} every time a user {trigger}. " | |
| "No error message is shown to the user — the process just dies. " | |
| "Impact: {impact}. {workaround}", | |
| ], | |
| "vars": { | |
| "service": ["auth service", "payment gateway", "search API", "notification worker", | |
| "session manager", "user profile service", "file upload handler", | |
| "webhook processor", "background job runner", "cache layer"], | |
| "trigger": ["submits a form with special characters", "uploads a file larger than 10MB", | |
| "logs in with SSO", "resets their password", "exports data to CSV", | |
| "switches between tabs rapidly", "uses the bulk import feature", | |
| "accesses the admin panel", "triggers a webhook", "runs a scheduled job"], | |
| "error_type": ["NullPointerException", "SegmentationFault", "OutOfMemoryError", | |
| "ConnectionTimeoutException", "StackOverflowError", | |
| "IndexOutOfBoundsException", "TypeError", "KeyError"], | |
| "component": ["UserController.java:142", "PaymentService.py:89", | |
| "AuthMiddleware.ts:56", "SearchIndex.go:203", | |
| "NotificationQueue.rb:77", "FileHandler.py:234"], | |
| "impact": ["100% of users on this flow", "all mobile users", "EU region users only", | |
| "users with accounts older than 1 year", "approximately 30% of sessions", | |
| "every request during peak hours"], | |
| "workaround": ["No workaround exists — the feature is completely broken.", | |
| "Workaround: users can retry after clearing browser cache.", | |
| "Temporary fix: restart the service every 2 hours.", | |
| "No known workaround. Users are blocked."], | |
| "condition": ["high concurrent load", "memory pressure above 80%", | |
| "when connection pool is exhausted", "after running for 6+ hours"], | |
| }, | |
| "answer_template": { | |
| "severe": {"priority": "P0", "labels": ["bug"], "assigned_team": "backend", "milestone": "hotfix"}, | |
| "moderate": {"priority": "P1", "labels": ["bug"], "assigned_team": "backend", "milestone": "v2.1"}, | |
| }, | |
| "severity_keywords": { | |
| "severe": ["100%", "all mobile", "No workaround", "completely broken", "blocked", | |
| "SegmentationFault", "OutOfMemoryError"], | |
| "moderate": ["retry", "30%", "Temporary fix", "restart"], | |
| }, | |
| }, | |
| "security": { | |
| "titles": [ | |
| "SQL injection vulnerability in {endpoint}", | |
| "XSS attack possible via {input_field}", | |
| "Authentication bypass in {service}", | |
| "Sensitive data exposed in {location}", | |
| "{credential_type} not invalidated after {event}", | |
| "SSRF vulnerability in {endpoint}", | |
| ], | |
| "bodies": [ | |
| "The {endpoint} endpoint does not sanitize {input_field} inputs. " | |
| "Crafted queries can {exploit_result}. PoC attached and verified on {env}. " | |
| "Treat as confidential — do not discuss publicly until patched. {additional_context}", | |
| "When a user {event}, existing {credential_type} remain valid for {duration}. " | |
| "An attacker who {attack_vector} can continue to access the account. " | |
| "This is a {vuln_category} vulnerability. {additional_context}", | |
| ], | |
| "vars": { | |
| "endpoint": ["/api/search", "/api/users", "/api/export", "/admin/query", | |
| "/api/upload", "/graphql", "/api/webhook"], | |
| "input_field": ["search query", "username field", "file upload name", | |
| "comment body", "profile bio", "webhook URL"], | |
| "service": ["login flow", "OAuth callback", "API gateway", "admin panel", | |
| "password reset", "2FA verification"], | |
| "location": ["API error responses", "debug logs shipped to client", | |
| "public S3 bucket", "unencrypted cookies", "localStorage"], | |
| "credential_type": ["JWT tokens", "session cookies", "API keys", "OAuth tokens"], | |
| "event": ["changes their password", "revokes API access", | |
| "is suspended by admin", "enables 2FA"], | |
| "exploit_result": ["dump the entire user table including password hashes", | |
| "execute arbitrary JavaScript in other users' browsers", | |
| "access any user's account without credentials", | |
| "read internal service endpoints via SSRF"], | |
| "env": ["production", "staging", "production replica"], | |
| "duration": ["up to 24 hours", "indefinitely", "until manual cache clear", | |
| "for the full token TTL (7 days)"], | |
| "attack_vector": ["previously stole a token", "intercepted a session cookie", | |
| "obtained a leaked API key"], | |
| "vuln_category": ["session management", "access control", | |
| "injection", "broken authentication"], | |
| "additional_context": [ | |
| "OWASP A03 — Injection.", | |
| "OWASP A07 — Identification and Authentication Failures.", | |
| "CVSS score estimated at 9.1 (Critical).", | |
| "Compliance impact: potential GDPR violation if user PII is exfiltrated.", | |
| "Bounty hunter reported this 48 hours ago — disclosure deadline approaching.", | |
| ], | |
| }, | |
| "answer_template": { | |
| "default": {"priority": "P0", "labels": ["bug", "security"], | |
| "assigned_team": "security", "milestone": "hotfix"}, | |
| }, | |
| "severity_keywords": {"default": []}, | |
| }, | |
| "performance": { | |
| "titles": [ | |
| "{page} loads slowly for {dataset_size}", | |
| "Memory leak in {service} causes OOM after {duration}", | |
| "API response time degrades under {load_condition}", | |
| "{operation} takes {duration} for {dataset_size}", | |
| "CPU spikes to 100% when {trigger}", | |
| ], | |
| "bodies": [ | |
| "When {condition}, the {page} takes {response_time} to load. " | |
| "{diagnostic_info}. {impact}. {workaround}", | |
| "The {service} allocates memory during {operation} and never frees it. " | |
| "Server runs out of memory every {duration}. {diagnostic_info}. " | |
| "{workaround}", | |
| ], | |
| "vars": { | |
| "page": ["dashboard", "analytics page", "user list", "search results", | |
| "audit log", "reports page", "admin overview"], | |
| "service": ["background job processor", "cache warming service", | |
| "log aggregator", "image resizer", "ETL pipeline"], | |
| "dataset_size": ["large datasets (10k+ rows)", "enterprise accounts", | |
| "tables with 100k+ entries", "files over 50MB"], | |
| "duration": ["6 hours", "4 hours", "12 hours", "30+ seconds", | |
| "2+ minutes", "an entire day"], | |
| "load_condition": ["concurrent load", "peak traffic", "batch processing", | |
| "more than 50 simultaneous users"], | |
| "operation": ["bulk export", "report generation", "data migration", | |
| "full-text search", "image processing"], | |
| "trigger": ["running bulk exports", "processing large uploads", | |
| "generating PDF reports", "reindexing search"], | |
| "condition": ["a dataset has more than 10k rows", | |
| "multiple users trigger exports simultaneously", | |
| "the nightly ETL job runs alongside user traffic"], | |
| "response_time": ["30+ seconds", "over a minute", "2-3 minutes", | |
| "timeout after 60 seconds"], | |
| "diagnostic_info": ["CPU spikes to 100%", "Heap profiler confirms the leak", | |
| "Database EXPLAIN shows full table scan", | |
| "N+1 query pattern detected in APM", | |
| "Garbage collector running every 500ms"], | |
| "impact": ["Affects power users with large accounts", | |
| "All users experience slowness during peak hours", | |
| "Requires manual restart to recover", | |
| "Operational overhead: scheduled restarts every 4 hours"], | |
| "workaround": ["Workaround: export data and use offline tools.", | |
| "Workaround: scheduled restarts every 4 hours.", | |
| "No workaround — users just wait.", | |
| "Workaround: paginate results (but UX is degraded)."], | |
| }, | |
| "answer_template": { | |
| "severe": {"priority": "P1", "labels": ["bug", "performance"], | |
| "assigned_team": "backend", "milestone": "v2.1"}, | |
| "moderate": {"priority": "P2", "labels": ["bug", "performance"], | |
| "assigned_team": "backend", "milestone": "v2.1"}, | |
| }, | |
| "severity_keywords": { | |
| "severe": ["OOM", "100%", "manual restart", "timeout", "No workaround", | |
| "all users", "never frees"], | |
| "moderate": ["Workaround", "power users", "paginate"], | |
| }, | |
| }, | |
| "ui_bug": { | |
| "titles": [ | |
| "{ui_element} breaks layout on {browser}", | |
| "{ui_element} not rendering correctly in {mode}", | |
| "Responsive layout broken on {device}", | |
| "{feature} toggle not persisting across {context}", | |
| "Accessibility: {ui_element} missing {a11y_attr}", | |
| ], | |
| "bodies": [ | |
| "Switching to {mode} on {browser} causes {ui_element} to {visual_issue}. " | |
| "{other_browsers}. {workaround}", | |
| "On {device}, the {ui_element} is {visual_issue}. " | |
| "Tested on {browser}. {impact}. {workaround}", | |
| ], | |
| "vars": { | |
| "ui_element": ["navigation bar", "sidebar menu", "modal dialog", | |
| "dropdown selector", "data table", "footer", | |
| "toast notifications", "breadcrumb trail"], | |
| "browser": ["Safari 16", "Firefox ESR", "Chrome on Android", | |
| "Edge on Windows", "iOS Safari", "Samsung Internet"], | |
| "mode": ["dark mode", "high contrast mode", "RTL layout", | |
| "compact view", "print view"], | |
| "device": ["iPhone SE", "tablets in portrait", "screens below 768px", | |
| "ultra-wide monitors", "4K displays"], | |
| "feature": ["dark mode", "compact view", "language preference", | |
| "notification settings"], | |
| "context": ["page reloads", "different tabs", "sessions", | |
| "browser restarts"], | |
| "visual_issue": ["overlap the main content", "disappear entirely", | |
| "render with incorrect colors", "become unclickable", | |
| "overflow beyond the viewport"], | |
| "other_browsers": ["Chrome and Firefox are unaffected.", | |
| "Only reproducible on this specific browser.", | |
| "Affects all WebKit-based browsers."], | |
| "a11y_attr": ["ARIA labels", "keyboard focus indicators", | |
| "screen reader text", "proper heading hierarchy"], | |
| "impact": ["Cosmetic issue, no functional impact.", | |
| "Users cannot access the affected feature.", | |
| "Usability is degraded but the feature works."], | |
| "workaround": ["Workaround: use a different browser.", | |
| "Workaround: manually resize the window.", | |
| "No workaround for this browser.", | |
| "Workaround: disable the feature in settings."], | |
| }, | |
| "answer_template": { | |
| "severe": {"priority": "P2", "labels": ["bug", "ux"], | |
| "assigned_team": "frontend", "milestone": "v2.1"}, | |
| "moderate": {"priority": "P3", "labels": ["bug", "ux"], | |
| "assigned_team": "frontend", "milestone": "backlog"}, | |
| }, | |
| "severity_keywords": { | |
| "severe": ["cannot access", "unclickable", "disappear", "No workaround"], | |
| "moderate": ["Cosmetic", "different browser", "resize"], | |
| }, | |
| }, | |
| "data_corruption": { | |
| "titles": [ | |
| "Race condition in {feature}: {consequence}", | |
| "Data inconsistency in {feature} under concurrent writes", | |
| "{export_format} export produces corrupted output for {edge_case}", | |
| "Stale data served from cache after {trigger}", | |
| "Duplicate records created when {trigger}", | |
| ], | |
| "bodies": [ | |
| "Under concurrent load, {feature} can {consequence} due to a race condition " | |
| "in {root_cause}. Frequency: {frequency}. {impact}. {workaround}", | |
| "When {feature} data contains {edge_case}, the exported {export_format} file " | |
| "is corrupted and cannot be {consumer}. {impact}. {workaround}", | |
| ], | |
| "vars": { | |
| "feature": ["file upload", "order processing", "user registration", | |
| "inventory update", "comment system", "permission assignment"], | |
| "consequence": ["files occasionally overwrite each other", | |
| "orders are duplicated or lost", | |
| "users get assigned wrong permissions", | |
| "inventory counts become negative"], | |
| "root_cause": ["temp file naming logic", "lack of database locking", | |
| "non-atomic read-modify-write cycle", | |
| "missing unique constraint"], | |
| "frequency": ["approximately 1 in 10,000 operations", | |
| "consistently under 50+ concurrent users", | |
| "intermittently — hard to reproduce", | |
| "every time the batch job runs"], | |
| "edge_case": ["non-ASCII characters (e.g., café, naïve)", | |
| "values containing commas or quotes", | |
| "null or empty fields", | |
| "timestamps crossing DST boundaries"], | |
| "export_format": ["CSV", "Excel", "JSON", "PDF"], | |
| "consumer": ["opened in Excel", "parsed by downstream services", | |
| "imported back into the system"], | |
| "trigger": ["double-clicking the submit button", | |
| "cache TTL expires during a write operation", | |
| "two users edit the same record simultaneously", | |
| "the nightly sync job overlaps with user activity"], | |
| "impact": ["Potential data loss confirmed.", | |
| "No data loss confirmed yet, but risk exists.", | |
| "Affects users with international data.", | |
| "Breaks downstream pipeline processing."], | |
| "workaround": ["Workaround: enable sequential mode in settings.", | |
| "Workaround: manually re-export after cleanup.", | |
| "No reliable workaround — data must be manually verified.", | |
| "Workaround: add a mutex lock externally (operational overhead)."], | |
| }, | |
| "answer_template": { | |
| "severe": {"priority": "P1", "labels": ["bug", "data-integrity"], | |
| "assigned_team": "backend", "milestone": "v2.1"}, | |
| "moderate": {"priority": "P2", "labels": ["bug", "data-integrity"], | |
| "assigned_team": "backend", "milestone": "v2.1"}, | |
| }, | |
| "severity_keywords": { | |
| "severe": ["data loss", "No reliable workaround", "consistently", | |
| "permissions", "overwrite", "negative"], | |
| "moderate": ["No data loss", "intermittently", "sequential mode", | |
| "re-export", "non-ASCII"], | |
| }, | |
| }, | |
| "documentation": { | |
| "titles": [ | |
| "Typo in {location}", | |
| "Outdated {doc_type} on {page}", | |
| "Missing documentation for {feature}", | |
| "Incorrect {doc_element} in {location}", | |
| ], | |
| "bodies": [ | |
| "There is a {issue_type} on the {page}: {detail}. No functional impact, " | |
| "purely cosmetic. {extra}", | |
| "The {doc_type} for {feature} is {issue_type}. {detail}. {extra}", | |
| ], | |
| "vars": { | |
| "location": ["homepage docs", "API reference", "README", "changelog", | |
| "contributing guide", "onboarding wiki"], | |
| "doc_type": ["installation guide", "API documentation", "changelog", | |
| "migration guide", "code comments"], | |
| "page": ["landing page", "docs homepage", "getting started page", | |
| "FAQ section", "footer"], | |
| "feature": ["new webhook API", "batch processing endpoint", | |
| "SSO integration", "rate limiting"], | |
| "doc_element": ["code example", "endpoint URL", "parameter description", | |
| "copyright year", "version number"], | |
| "issue_type": ["a typo", "outdated", "missing", "incorrect", "misleading"], | |
| "detail": ["'Welccome' should be 'Welcome'", | |
| "references removed v1.x API that no longer exists", | |
| "completely undocumented despite being a core feature", | |
| "shows '© 2022' but should be '© 2024'", | |
| "the curl example uses the wrong HTTP method"], | |
| "extra": ["", "Low priority — does not block any workflow.", | |
| "New users have reported confusion.", | |
| "Only noticed by contributors reading source code."], | |
| }, | |
| "answer_template": { | |
| "default": {"priority": "P3", "labels": ["documentation"], | |
| "assigned_team": "devx", "milestone": "backlog"}, | |
| }, | |
| "severity_keywords": {"default": []}, | |
| }, | |
| "api_bug": { | |
| "titles": [ | |
| "API rate limiter {issue} after {trigger}", | |
| "{endpoint} returns {status_code} instead of {expected_code}", | |
| "Pagination broken on {endpoint}: {symptom}", | |
| "Webhook delivery {issue} for {event_type} events", | |
| "API versioning: {endpoint} behaves differently on v1 vs v2", | |
| ], | |
| "bodies": [ | |
| "After receiving a {status_code} response, {consequence}. " | |
| "The {root_cause}. {impact}. {workaround}", | |
| "The {endpoint} endpoint {symptom} when {trigger}. " | |
| "Expected behavior: {expected}. Actual: {actual}. {impact}.", | |
| ], | |
| "vars": { | |
| "endpoint": ["/api/users", "/api/search", "/api/export", | |
| "/api/webhooks", "/api/billing", "/api/analytics"], | |
| "issue": ["blocks legitimate users", "fails silently", | |
| "returns incorrect retry headers", "drops events"], | |
| "trigger": ["a 429 error", "rate limit window resets", | |
| "a burst of requests from CI/CD", "server restart"], | |
| "status_code": ["429", "500", "502", "504", "403"], | |
| "expected_code": ["200", "201", "204", "404"], | |
| "symptom": ["returns duplicate entries", | |
| "skips items between pages", | |
| "returns empty page despite more data existing"], | |
| "event_type": ["payment.completed", "user.created", | |
| "subscription.cancelled", "deployment.finished"], | |
| "consequence": ["legitimate users remain blocked for 1 hour", | |
| "data is silently lost with no error", | |
| "downstream services receive stale data"], | |
| "root_cause": ["unblock logic has a bug — it never clears the blocked flag", | |
| "cursor-based pagination uses wrong sort order", | |
| "retry-after header reports seconds instead of milliseconds"], | |
| "expected": ["200 OK with paginated results", | |
| "successful delivery with retry on failure", | |
| "proper rate limit reset after window expires"], | |
| "actual": ["empty response with 200 status", | |
| "permanent block until manual intervention", | |
| "events dropped without any error log"], | |
| "impact": ["Affects CI/CD pipelines hitting the API.", | |
| "External integrations break silently.", | |
| "Customer-facing dashboards show wrong data.", | |
| "Retry-After header causes clients to wait too long."], | |
| "workaround": ["Workaround: manually clear Redis key.", | |
| "Workaround: add client-side deduplication.", | |
| "No workaround — requires server-side fix.", | |
| "Workaround: pin API version to v1 in headers."], | |
| }, | |
| "answer_template": { | |
| "severe": {"priority": "P1", "labels": ["bug", "api"], | |
| "assigned_team": "backend", "milestone": "v2.1"}, | |
| "moderate": {"priority": "P2", "labels": ["bug", "api"], | |
| "assigned_team": "backend", "milestone": "v2.1"}, | |
| }, | |
| "severity_keywords": { | |
| "severe": ["silently lost", "permanent block", "No workaround", | |
| "dropped", "external integrations"], | |
| "moderate": ["Workaround", "pin API", "deduplication"], | |
| }, | |
| }, | |
| } | |
| # The original handcrafted bugs — kept as a gold-standard subset | |
# Layout: difficulty tier ("easy" / "medium" / "hard") -> {
#     "bugs":    list of BugReport instances shown to the agent,
#     "answers": bug id -> expected triage fields for that bug,
# }
# The expected answer grows with the tier: "easy" answers carry only
# "priority"; "medium" adds "labels" and "assigned_team"; "hard" additionally
# includes "milestone". Every id in "answers" matches the id of a BugReport in
# the same tier's "bugs" list.
_HANDCRAFTED_BUGS = {
    # ----- easy: priority only -----
    "easy": {
        "bugs": [
            BugReport(
                id="easy-001",
                title="App crashes on login with correct credentials",
                body="When I enter my correct username and password, the app crashes immediately. "
                     "This started after the v2.0 release. Affects 100% of users. "
                     "No workaround exists — users cannot log in at all.",
                author="user123",
                labels_hint=[],
                comments=["Confirmed on iOS and Android.", "Happens every time."],
                severity_signals=["100% of users", "crashes", "no workaround"],
                stack_trace="NullPointerException at AuthController.java:87",
                affected_component="auth-service",
            ),
            BugReport(
                id="easy-002",
                title="Typo in documentation homepage",
                body="There is a typo on the homepage docs: 'Welccome' should be 'Welcome'. "
                     "No functional impact, purely cosmetic.",
                author="docs_fan",
                labels_hint=["documentation"],
                comments=[],
                severity_signals=["cosmetic", "no functional impact"],
                stack_trace="",
                affected_component="docs",
            ),
            BugReport(
                id="easy-003",
                title="Dashboard loads slowly for large datasets",
                body="When a dataset has more than 10k rows, the dashboard takes 30+ seconds to load. "
                     "Workaround: export data and use offline tools. Affects power users only.",
                author="power_user",
                labels_hint=["performance"],
                comments=["Noticed after the last deploy.", "CPU spikes to 100%."],
                severity_signals=["workaround exists", "power users only"],
                stack_trace="",
                affected_component="dashboard",
            ),
            BugReport(
                id="easy-004",
                title="Email notifications not sent after password reset",
                body="Users who reset their password do not receive the confirmation email. "
                     "SMTP logs show the job is queued but never dispatched. "
                     "Affects all users attempting password reset.",
                author="support_team",
                labels_hint=["bug"],
                comments=["Reported by 12 users this week.",
                          "Started after email service migration."],
                severity_signals=["all users", "never dispatched"],
                stack_trace="",
                affected_component="email-service",
            ),
            BugReport(
                id="easy-005",
                title="Incorrect copyright year in footer",
                body="The footer shows '© 2022' but it should be '© 2024'. "
                     "No functional impact.",
                author="intern_dev",
                labels_hint=["documentation"],
                comments=[],
                severity_signals=["no functional impact"],
                stack_trace="",
                affected_component="frontend",
            ),
        ],
        "answers": {
            "easy-001": {"priority": "P0"},
            "easy-002": {"priority": "P3"},
            "easy-003": {"priority": "P2"},
            "easy-004": {"priority": "P1"},
            "easy-005": {"priority": "P3"},
        },
    },
    # ----- medium: priority + labels + assigned_team -----
    "medium": {
        "bugs": [
            BugReport(
                id="med-001",
                title="Payment fails silently on checkout",
                body="Checkout completes without error but payment is never charged. "
                     "No error shown to user. Stripe logs show declined transaction. "
                     "Direct revenue loss — every failed checkout is a lost sale.",
                author="store_owner",
                labels_hint=["bug"],
                comments=["Revenue impact confirmed.", "Happening since Tuesday."],
                severity_signals=["revenue loss", "silently", "every failed checkout"],
                stack_trace="Stripe API: card_declined at PaymentService.py:145",
                affected_component="payment-service",
            ),
            BugReport(
                id="med-002",
                title="Search results include deleted posts",
                body="Deleted blog posts still appear in search results for up to 24 hours. "
                     "Users can read content that was explicitly removed by moderators. "
                     "Potential GDPR violation if deleted content belongs to EU users.",
                author="moderator_jane",
                labels_hint=[],
                comments=["GDPR concern — deleted content still visible."],
                severity_signals=["GDPR violation", "deleted content visible"],
                stack_trace="",
                affected_component="search-index",
            ),
            BugReport(
                id="med-003",
                title="Dark mode toggle breaks layout on Safari",
                body="Switching to dark mode on Safari 16 causes nav bar to overlap content. "
                     "Chrome and Firefox unaffected. Workaround: use a different browser.",
                author="safari_user",
                labels_hint=["bug", "ux"],
                comments=["Only on Safari, not Chrome/Firefox."],
                severity_signals=["workaround exists", "single browser"],
                stack_trace="",
                affected_component="frontend-css",
            ),
            BugReport(
                id="med-004",
                title="CSV export produces corrupted file for non-ASCII characters",
                body="When table data contains accented characters (e.g. café, naïve), "
                     "the exported CSV file is corrupted and cannot be opened in Excel. "
                     "Affects users with international data.",
                author="data_analyst",
                labels_hint=["bug"],
                comments=["Encoding issue — UTF-8 not respected.",
                          "Workaround: manual copy-paste."],
                severity_signals=["corrupted", "workaround exists"],
                stack_trace="",
                affected_component="export-service",
            ),
            BugReport(
                id="med-005",
                title="API rate limiter blocks legitimate users after 429 error",
                body="After receiving a 429 Too Many Requests response, legitimate users "
                     "remain blocked for 1 hour even after the rate limit window resets. "
                     "The unblock logic has a bug — it never clears the blocked flag.",
                author="api_user",
                labels_hint=["bug"],
                comments=["Affects CI/CD pipelines hitting the API.",
                          "Retry-After header is wrong."],
                severity_signals=["permanent block", "never clears", "bug in logic"],
                stack_trace="",
                affected_component="api-gateway",
            ),
        ],
        "answers": {
            "med-001": {"priority": "P0", "labels": ["bug", "payments"], "assigned_team": "backend"},
            "med-002": {"priority": "P1", "labels": ["bug", "security"], "assigned_team": "security"},
            "med-003": {"priority": "P2", "labels": ["bug", "ux"], "assigned_team": "frontend"},
            "med-004": {"priority": "P2", "labels": ["bug", "data-integrity"], "assigned_team": "backend"},
            # NOTE(review): this rate-limiter bug is labelled "performance" even
            # though an "api" label exists in this module — confirm the gold
            # answer is intentional.
            "med-005": {"priority": "P1", "labels": ["bug", "performance"], "assigned_team": "backend"},
        },
    },
    # ----- hard: priority + labels + assigned_team + milestone -----
    "hard": {
        "bugs": [
            BugReport(
                id="hard-001",
                title="SQL injection vulnerability in search endpoint",
                body="The /api/search endpoint does not sanitize inputs. "
                     "Crafted queries can dump the entire user table including password hashes. "
                     "PoC attached. Verified on production. Treat as confidential — "
                     "do not discuss publicly until patched.",
                author="security_researcher",
                labels_hint=[],
                comments=["Critical. Affects production.", "Do not discuss publicly."],
                severity_signals=["SQL injection", "password hashes", "production",
                                  "confidential"],
                stack_trace="",
                affected_component="search-api",
            ),
            BugReport(
                id="hard-002",
                title="Memory leak in background job processor causes OOM after 6 hours",
                body="The job processor allocates ~50MB per job and never frees it. "
                     "Server runs out of memory every 6 hours, requiring a manual restart. "
                     "Heap profiler confirms leak introduced in v1.9. "
                     "Workaround: scheduled restarts every 4 hours (operational overhead).",
                author="devops_alice",
                labels_hint=["performance"],
                comments=["Verified with heap profiler.", "Started in v1.9."],
                severity_signals=["memory leak", "OOM", "manual restart", "never frees"],
                stack_trace="HeapDump: JobProcessor.process() -> 50MB/call, never GC'd",
                affected_component="job-processor",
            ),
            BugReport(
                id="hard-003",
                title="Race condition in file upload: files occasionally overwrite each other",
                body="Under concurrent load, two users uploading simultaneously can get "
                     "each other's files due to a race condition in the temp file naming logic. "
                     "Frequency: approximately 1 in 10,000 uploads under normal load. "
                     "No data loss confirmed yet and a workaround exists: "
                     "enable sequential upload mode in settings (disabled by default). "
                     "Risk is low-probability but affects data integrity.",
                author="qa_bot",
                labels_hint=["bug"],
                comments=["Reproduced with locust at 50 concurrent users.",
                          "Sequential mode avoids it."],
                severity_signals=["race condition", "data integrity",
                                  "workaround exists", "low-probability"],
                stack_trace="",
                affected_component="file-upload",
            ),
            BugReport(
                id="hard-004",
                title="Auth token not invalidated after password change",
                body="When a user changes their password, existing JWT tokens remain valid "
                     "for up to 24 hours. An attacker who previously stole a token can "
                     "continue to access the account even after the password is reset. "
                     "This is a session management security vulnerability.",
                author="pentest_team",
                labels_hint=["security"],
                comments=["Verified on staging.",
                          "OWASP A07 — Identification and Authentication Failures."],
                severity_signals=["JWT not invalidated", "attacker", "security vulnerability",
                                  "stolen token"],
                stack_trace="",
                affected_component="auth-service",
            ),
            BugReport(
                id="hard-005",
                title="Infinite loop in webhook retry logic causes CPU spike",
                body="When a webhook endpoint returns a 500 error, the retry logic enters "
                     "an infinite loop with no backoff or retry cap. "
                     "This causes CPU to spike to 100% within minutes and starves other services. "
                     "Triggered in production twice this week. Requires process kill to recover.",
                author="oncall_eng",
                labels_hint=["bug", "performance"],
                comments=["PagerDuty alert fired twice.",
                          "Needs exponential backoff + max retry cap."],
                severity_signals=["infinite loop", "100%", "production",
                                  "process kill", "starves other services"],
                stack_trace="Thread dump: WebhookRetrier.retry() → recursive call, no exit",
                affected_component="webhook-service",
            ),
        ],
        "answers": {
            "hard-001": {
                "priority": "P0", "labels": ["bug", "security"],
                "assigned_team": "security", "milestone": "hotfix",
            },
            "hard-002": {
                "priority": "P1", "labels": ["bug", "performance"],
                "assigned_team": "backend", "milestone": "v2.1",
            },
            "hard-003": {
                "priority": "P1", "labels": ["bug", "data-integrity"],
                "assigned_team": "backend", "milestone": "v2.1",
            },
            "hard-004": {
                "priority": "P0", "labels": ["bug", "security"],
                "assigned_team": "security", "milestone": "hotfix",
            },
            "hard-005": {
                "priority": "P0", "labels": ["bug", "performance"],
                "assigned_team": "backend", "milestone": "hotfix",
            },
        },
    },
}
# Public name for the task table, kept for backward compatibility.
# This is an alias, not a copy: TASKS and _HANDCRAFTED_BUGS are the same dict
# object, so mutating one mutates the other.
TASKS = _HANDCRAFTED_BUGS
| # --------------------------------------------------------------------------- | |
| # PROCEDURAL BUG GENERATOR | |
| # --------------------------------------------------------------------------- | |
| def _determine_severity(text: str, keywords: Dict[str, list]) -> str: | |
| """Check which severity level the generated text matches.""" | |
| text_lower = text.lower() | |
| for level, kws in keywords.items(): | |
| if level == "default": | |
| return "default" | |
| hits = sum(1 for kw in kws if kw.lower() in text_lower) | |
| if hits >= 1: | |
| return level | |
| # fallback to first non-default key | |
| return list(keywords.keys())[0] if keywords else "moderate" | |
def generate_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Generate a procedural bug report together with its answer key.

    A category is drawn with difficulty-dependent weights, its template
    placeholders are filled with randomly chosen values, and the severity
    detected in the resulting text selects the answer from the template's
    ``answer_template`` table.

    Args:
        task_key: ``"easy"``, ``"medium"`` or ``"hard"``. Any other value
            uses the ``"medium"`` category weights and keeps the full answer.
        seed: Seed for the private RNG. The same seed yields the same bug.
            ``None`` seeds from system entropy and draws a random ID number.

    Returns:
        ``(bug, answer)``. For ``"easy"`` the answer holds only
        ``"priority"``; for ``"medium"`` the ``"milestone"`` key is removed.

    Raises:
        KeyError: If the drawn category or a required template section is
            missing from ``_BUG_TEMPLATES``.
    """
    rng = random.Random(seed)

    # Category weights per difficulty; each category is repeated ``weight``
    # times so a uniform choice over the list is a weighted choice.
    weights = {
        "easy": {"documentation": 3, "ui_bug": 3, "performance": 2,
                 "crash": 1, "api_bug": 1},
        "medium": {"crash": 3, "performance": 3, "api_bug": 2,
                   "data_corruption": 2, "ui_bug": 1},
        "hard": {"security": 4, "crash": 3, "data_corruption": 3,
                 "performance": 2, "api_bug": 2},
    }
    task_weights = weights.get(task_key, weights["medium"])
    categories = [cat for cat, w in task_weights.items() for _ in range(w)]
    category = rng.choice(categories)
    template = _BUG_TEMPLATES[category]

    # Pick one value for every template variable.
    chosen_vars = {
        var_name: rng.choice(options)
        for var_name, options in template["vars"].items()
    }

    title_tmpl = rng.choice(template["titles"])
    body_tmpl = rng.choice(template["bodies"])

    def safe_format(tmpl, vars_dict):
        # Plain replacement instead of str.format so that placeholders with
        # no chosen value are left untouched rather than raising KeyError.
        result = tmpl
        for k, v in vars_dict.items():
            result = result.replace("{" + k + "}", str(v))
        return result

    title = safe_format(title_tmpl, chosen_vars)
    body = safe_format(body_tmpl, chosen_vars)

    # Derive the ID from the seed. ``is not None`` (not ``or``) so that a
    # seed of 0 is honoured instead of being replaced by a random number.
    id_number = seed if seed is not None else rng.randint(0, 999999)
    bug_id = f"gen-{id_number:06d}"

    authors = ["user_report", "qa_engineer", "support_team", "dev_oncall",
               "security_bot", "customer_jane", "automated_monitor",
               "intern_dev", "senior_eng", "pm_feedback"]
    author = rng.choice(authors)

    comment_templates = [
        "Confirmed on our side.", "Reproduced in staging.",
        "Multiple reports from users.", "Started after last deployment.",
        "Urgent — customer escalation.", "Low priority — no user complaints.",
        "Needs investigation.", "Related to ticket from last sprint.",
    ]
    num_comments = rng.randint(0, 3)
    comments = rng.sample(comment_templates, min(num_comments, len(comment_templates)))

    # Severity is detected over everything the agent will read.
    full_text = f"{title} {body} {' '.join(comments)}"
    severity_kws = template.get("severity_keywords", {})
    severity = _determine_severity(full_text, severity_kws)
    answer_templates = template["answer_template"]
    # Copy so callers can never mutate the shared template; an unknown
    # severity falls back to the first declared answer.
    answer = dict(answer_templates.get(severity, list(answer_templates.values())[0]))

    # Trim the answer key to the fields graded at this difficulty.
    if task_key == "easy":
        answer = {"priority": answer["priority"]}
    elif task_key == "medium":
        answer.pop("milestone", None)

    bug = BugReport(
        id=bug_id,
        title=title,
        body=body,
        author=author,
        labels_hint=rng.sample(["bug", "needs-triage", "reported"], rng.randint(0, 2)),
        comments=comments,
        severity_signals=[],
        stack_trace="",
        affected_component=chosen_vars.get("service", chosen_vars.get("endpoint", "")),
    )
    return bug, answer
| # --------------------------------------------------------------------------- | |
| # BUG SAMPLER — uses handcrafted bugs first, then procedural for variety | |
| # --------------------------------------------------------------------------- | |
def sample_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Pick one bug and its answer key for ``task_key``.

    A single draw from an RNG seeded with ``seed`` decides the source: a
    draw below 0.4, for a task that has handcrafted bugs, returns a random
    handcrafted bug with its stored answer. Every other case delegates to
    ``generate_bug``, passing ``seed`` through, or a freshly drawn number
    when ``seed`` is ``None``.
    """
    rng = random.Random(seed)
    use_handcrafted = rng.random() < 0.4 and task_key in _HANDCRAFTED_BUGS

    if not use_handcrafted:
        gen_seed = rng.randint(0, 999999) if seed is None else seed
        return generate_bug(task_key, seed=gen_seed)

    pool = _HANDCRAFTED_BUGS[task_key]
    picked = rng.choice(pool["bugs"])
    return picked, pool["answers"][picked.id]
| # --------------------------------------------------------------------------- | |
| # GRADING — with semantic label matching | |
| # --------------------------------------------------------------------------- | |
| PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3} | |
| def _priority_score(predicted: str, correct: str) -> float: | |
| """Score priority assignment with partial credit for near-misses.""" | |
| if predicted == correct: | |
| return 0.95 | |
| pred_rank = PRIORITY_ORDER.get(predicted, 99) | |
| corr_rank = PRIORITY_ORDER.get(correct, 99) | |
| diff = abs(pred_rank - corr_rank) | |
| if diff == 1: | |
| return 0.5 | |
| elif diff == 2: | |
| return 0.2 | |
| return 0.05 | |
def _normalize_label(label: str) -> str:
    """Map ``label`` onto its canonical name from ``LABEL_SYNONYMS``.

    The label is lower-cased and stripped first. The first synonym group
    (in the table's order) whose canonical name or synonym set contains it
    wins; a label found in no group is returned in its cleaned form.
    """
    cleaned = label.lower().strip()
    return next(
        (
            canonical
            for canonical, synonyms in LABEL_SYNONYMS.items()
            if cleaned == canonical or cleaned in synonyms
        ),
        cleaned,
    )
def _label_score(predicted: List[str], correct: List[str]) -> float:
    """Score predicted labels against the expected ones by set overlap.

    Both lists are reduced to sets of canonical labels, then scored as
    intersection size over union size, clamped to the range [0.05, 0.95].
    When no labels are expected the score is 0.95 regardless of the
    prediction.
    """
    guessed = {_normalize_label(item) for item in predicted}
    expected = {_normalize_label(item) for item in correct}
    if not expected:
        return 0.95

    shared = len(guessed & expected)
    combined = len(guessed | expected)
    overlap = shared / combined if combined else 0.0
    return min(0.95, max(0.05, overlap))
| def _reasoning_score(reasoning: str, answer: dict) -> float: | |
| """Bonus for reasoning that mentions relevant signals.""" | |
| if not reasoning or len(reasoning.strip()) < 10: | |
| return 0.0 | |
| key_signals = { | |
| "P0": ["production", "all users", "data loss", "security", "crash", | |
| "revenue", "injection", "vulnerability", "100%"], | |
| "P1": ["major", "significant", "no workaround", "broken", | |
| "gdpr", "blocked", "leak", "never"], | |
| "P2": ["degraded", "workaround", "partial", "slow", | |
| "affected", "power users"], | |
| "P3": ["minor", "cosmetic", "docs", "typo", "low", | |
| "no functional impact"], | |
| } | |
| expected_priority = answer.get("priority", "P2") | |
| signals = key_signals.get(expected_priority, []) | |
| reasoning_lower = reasoning.lower() | |
| hits = sum(1 for s in signals if s in reasoning_lower) | |
| return min(0.15, hits * 0.05) | |
def _match_score(predicted, expected) -> float:
    """Return 0.95 when ``predicted`` equals ``expected`` ignoring case, else 0.05.

    A missing or empty expected value, or a prediction that is not a string
    (e.g. ``None``), never counts as a match.
    """
    if not expected or not isinstance(predicted, str):
        return 0.05
    return 0.95 if predicted.lower() == expected.lower() else 0.05


def grade_action(task_key: str, bug: BugReport, action: TriageAction,
                 answer: dict = None) -> Tuple[float, str]:
    """Grade the agent's triage action against the correct answer.

    Scoring by task:
      * ``"easy"``   - priority score only.
      * ``"medium"`` - 0.45 priority + 0.40 labels + 0.15 team.
      * otherwise    - graded as hard: 0.35 priority + 0.30 labels
        + 0.20 team + 0.15 milestone, minus 0.15 when a bug that belongs to
        the security team was assigned elsewhere.
    The reasoning bonus is added on top and the total is clamped to
    [0.01, 0.99] and rounded to three decimals.

    Args:
        task_key: Difficulty key selecting the scoring scheme.
        bug: The bug that was triaged; only ``bug.id`` is read, and only
            when ``answer`` is not supplied.
        action: The agent's decision.
        answer: Answer key. When ``None`` it is looked up among the
            handcrafted bugs by ``task_key`` and ``bug.id``.

    Returns:
        ``(score, feedback)``; ``(0.5, ...)`` when no answer key is found.
        An answer key lacking ``"assigned_team"`` or ``"milestone"`` scores
        that field as a miss instead of raising ``KeyError``, and a team or
        milestone of ``None`` on the action is scored as a miss instead of
        raising ``AttributeError``.
    """
    # Backward compatibility: look up answer from handcrafted if not provided
    if answer is None:
        if task_key in _HANDCRAFTED_BUGS and bug.id in _HANDCRAFTED_BUGS[task_key]["answers"]:
            answer = _HANDCRAFTED_BUGS[task_key]["answers"][bug.id]
        else:
            return 0.5, "No answer key found for this bug."

    reasoning_bonus = _reasoning_score(action.reasoning, answer)
    p_score = _priority_score(action.priority, answer["priority"])

    if task_key == "easy":
        symbol = "✓" if p_score >= 0.9 else "~" if p_score >= 0.4 else "✗"
        feedback_parts = [
            f"Priority: {symbol} (got {action.priority}, expected {answer['priority']})"
        ]
        score = max(0.01, min(0.99, p_score + reasoning_bonus))
        return round(score, 3), " | ".join(feedback_parts)

    # Medium and hard share the priority, label and team components.
    l_score = _label_score(action.labels or [], answer.get("labels", []))
    expected_team = answer.get("assigned_team", "")
    t_score = _match_score(action.assigned_team, expected_team)
    feedback_parts = [
        f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})",
        f"Labels: {l_score:.2f} (semantic match)",
        f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})",
    ]

    if task_key == "medium":
        score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score + reasoning_bonus
        if reasoning_bonus > 0:
            feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")
        score = max(0.01, min(0.99, score))
        return round(score, 3), " | ".join(feedback_parts)

    # hard
    expected_milestone = answer.get("milestone", "")
    m_score = _match_score(action.milestone, expected_milestone)
    score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score + reasoning_bonus
    feedback_parts.append(
        f"Milestone: {m_score:.2f} (got {action.milestone}, expected {expected_milestone})")
    if reasoning_bonus > 0:
        feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")

    # Security escalation penalty: a security bug routed to any other team
    # loses a further 0.15 beyond the team component itself.
    security_expected = isinstance(expected_team, str) and expected_team.lower() == "security"
    if security_expected and t_score < 0.5:
        score = max(0.01, score - 0.15)
        feedback_parts.append("⚠ Security escalation missed (-0.15)")

    score = max(0.01, min(0.99, score))
    return round(score, 3), " | ".join(feedback_parts)
| # --------------------------------------------------------------------------- | |
| # NAMED GRADER FUNCTIONS — referenced by openenv.yaml | |
| # --------------------------------------------------------------------------- | |
def priority_match(*args, **kwargs):
    """Named grader for the "easy" task.

    Takes the bug and the action as the first two positional arguments and
    returns the ``grade_action`` score as a float. Extra positional
    arguments and all keyword arguments are ignored. With fewer than two
    positional arguments the neutral score 0.5 is returned.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("easy", bug, action)[0])
    return 0.5
def priority_label_team(*args, **kwargs):
    """Named grader for the "medium" task.

    Takes the bug and the action as the first two positional arguments and
    returns the ``grade_action`` score as a float. Extra positional
    arguments and all keyword arguments are ignored. With fewer than two
    positional arguments the neutral score 0.5 is returned.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("medium", bug, action)[0])
    return 0.5
def full_triage(*args, **kwargs):
    """Named grader for the "hard" task.

    Takes the bug and the action as the first two positional arguments and
    returns the ``grade_action`` score as a float. Extra positional
    arguments and all keyword arguments are ignored. With fewer than two
    positional arguments the neutral score 0.5 is returned.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("hard", bug, action)[0])
    return 0.5
# Names exported by `from <this module> import *`.
__all__ = [
    # Named grader entry points, one per task difficulty.
    "priority_match",
    "priority_label_team",
    "full_triage",
    # Bug sources and the underlying grader.
    "sample_bug",
    "generate_bug",
    "grade_action",
    # Data tables.
    "TASKS",
    "LABEL_SYNONYMS",
]