# Spaces: Siteshcodes / bug-triage-env (Sleeping)
#
# File size: 49,250 Bytes

# server/task.py
import sys
import random
import hashlib
sys.path.insert(0, "/app")

from typing import Tuple, List, Dict, Any
from model import BugReport, TriageAction


# ---------------------------------------------------------------------------
#  LABEL SYNONYM MAP — allows semantic matching
# ---------------------------------------------------------------------------

# Maps each canonical label to the set of alternative terms listed as its
# synonyms. The code that performs the matching is not part of this section.
LABEL_SYNONYMS: Dict[str, set] = {
    "bug": {
        "defect", "issue", "error", "fault", "broken",
    },
    "security": {
        "vulnerability", "cve", "exploit", "auth", "injection",
    },
    "performance": {
        "perf", "slow", "latency", "optimization", "speed", "memory",
    },
    "ux": {
        "ui", "frontend", "user-experience", "design", "usability",
    },
    "data-integrity": {
        "data-loss", "corruption", "data", "consistency",
    },
    "payments": {
        "billing", "payment", "stripe", "checkout", "revenue",
    },
    "documentation": {
        "docs", "typo", "readme", "wiki",
    },
    "infrastructure": {
        "infra", "devops", "deploy", "ci", "cd", "docker",
    },
    "api": {
        "endpoint", "rest", "graphql", "http", "request",
    },
    "database": {
        "db", "sql", "query", "migration", "schema",
    },
}

# ---------------------------------------------------------------------------
#  BUG TEMPLATE SYSTEM — generates hundreds of unique bugs
# ---------------------------------------------------------------------------

# Template table keyed by bug category. Every category entry has the same
# five keys:
#   "titles"            - title strings containing {placeholder} fields
#   "bodies"            - body strings containing {placeholder} fields
#   "vars"              - placeholder name -> list of candidate values; every
#                         placeholder used in that category's titles/bodies
#                         has a matching key here
#   "answer_template"   - tier name -> expected triage fields (priority,
#                         labels, assigned_team, milestone)
#   "severity_keywords" - tier name -> substrings associated with that tier
# Tier names are "severe"/"moderate", except "security" and "documentation",
# which carry a single "default" tier with an empty keyword list.
# NOTE(review): the code that fills the placeholders and scans for severity
# keywords is not visible here — presumably str.format-style substitution and
# substring matching; confirm against the generator. List order may matter if
# values are picked with a seeded RNG, so avoid reordering entries.
_BUG_TEMPLATES = {
    "crash": {
        "titles": [
            "{service} crashes on {trigger}",
            "{service} throws {error_type} when {trigger}",
            "Fatal error in {service} during {trigger}",
            "Unhandled exception in {service}: {error_type}",
            "{service} segfaults under {condition}",
        ],
        "bodies": [
            "When a user {trigger}, the {service} crashes immediately. "
            "Error: {error_type}. Stack trace points to {component}. "
            "Affects {impact}. {workaround}",
            "The {service} is failing with {error_type} every time a user {trigger}. "
            "No error message is shown to the user — the process just dies. "
            "Impact: {impact}. {workaround}",
        ],
        "vars": {
            "service": ["auth service", "payment gateway", "search API", "notification worker",
                        "session manager", "user profile service", "file upload handler",
                        "webhook processor", "background job runner", "cache layer"],
            "trigger": ["submits a form with special characters", "uploads a file larger than 10MB",
                        "logs in with SSO", "resets their password", "exports data to CSV",
                        "switches between tabs rapidly", "uses the bulk import feature",
                        "accesses the admin panel", "triggers a webhook", "runs a scheduled job"],
            "error_type": ["NullPointerException", "SegmentationFault", "OutOfMemoryError",
                           "ConnectionTimeoutException", "StackOverflowError",
                           "IndexOutOfBoundsException", "TypeError", "KeyError"],
            "component": ["UserController.java:142", "PaymentService.py:89",
                          "AuthMiddleware.ts:56", "SearchIndex.go:203",
                          "NotificationQueue.rb:77", "FileHandler.py:234"],
            "impact": ["100% of users on this flow", "all mobile users", "EU region users only",
                       "users with accounts older than 1 year", "approximately 30% of sessions",
                       "every request during peak hours"],
            "workaround": ["No workaround exists — the feature is completely broken.",
                           "Workaround: users can retry after clearing browser cache.",
                           "Temporary fix: restart the service every 2 hours.",
                           "No known workaround. Users are blocked."],
            "condition": ["high concurrent load", "memory pressure above 80%",
                          "when connection pool is exhausted", "after running for 6+ hours"],
        },
        "answer_template": {
            "severe": {"priority": "P0", "labels": ["bug"], "assigned_team": "backend", "milestone": "hotfix"},
            "moderate": {"priority": "P1", "labels": ["bug"], "assigned_team": "backend", "milestone": "v2.1"},
        },
        # NOTE(review): "No workaround" is not a substring of the workaround
        # value "No known workaround. Users are blocked." — that value is only
        # caught via "blocked".
        "severity_keywords": {
            "severe": ["100%", "all mobile", "No workaround", "completely broken", "blocked",
                       "SegmentationFault", "OutOfMemoryError"],
            "moderate": ["retry", "30%", "Temporary fix", "restart"],
        },
    },

    "security": {
        "titles": [
            "SQL injection vulnerability in {endpoint}",
            "XSS attack possible via {input_field}",
            "Authentication bypass in {service}",
            "Sensitive data exposed in {location}",
            "{credential_type} not invalidated after {event}",
            "SSRF vulnerability in {endpoint}",
        ],
        "bodies": [
            "The {endpoint} endpoint does not sanitize {input_field} inputs. "
            "Crafted queries can {exploit_result}. PoC attached and verified on {env}. "
            "Treat as confidential — do not discuss publicly until patched. {additional_context}",
            "When a user {event}, existing {credential_type} remain valid for {duration}. "
            "An attacker who {attack_vector} can continue to access the account. "
            "This is a {vuln_category} vulnerability. {additional_context}",
        ],
        "vars": {
            "endpoint": ["/api/search", "/api/users", "/api/export", "/admin/query",
                         "/api/upload", "/graphql", "/api/webhook"],
            "input_field": ["search query", "username field", "file upload name",
                            "comment body", "profile bio", "webhook URL"],
            "service": ["login flow", "OAuth callback", "API gateway", "admin panel",
                        "password reset", "2FA verification"],
            "location": ["API error responses", "debug logs shipped to client",
                         "public S3 bucket", "unencrypted cookies", "localStorage"],
            "credential_type": ["JWT tokens", "session cookies", "API keys", "OAuth tokens"],
            "event": ["changes their password", "revokes API access",
                      "is suspended by admin", "enables 2FA"],
            "exploit_result": ["dump the entire user table including password hashes",
                               "execute arbitrary JavaScript in other users' browsers",
                               "access any user's account without credentials",
                               "read internal service endpoints via SSRF"],
            "env": ["production", "staging", "production replica"],
            "duration": ["up to 24 hours", "indefinitely", "until manual cache clear",
                         "for the full token TTL (7 days)"],
            "attack_vector": ["previously stole a token", "intercepted a session cookie",
                              "obtained a leaked API key"],
            "vuln_category": ["session management", "access control",
                              "injection", "broken authentication"],
            "additional_context": [
                "OWASP A03 — Injection.",
                "OWASP A07 — Identification and Authentication Failures.",
                "CVSS score estimated at 9.1 (Critical).",
                "Compliance impact: potential GDPR violation if user PII is exfiltrated.",
                "Bounty hunter reported this 48 hours ago — disclosure deadline approaching.",
            ],
        },
        # Single tier: every security template maps to the same P0 answer.
        "answer_template": {
            "default": {"priority": "P0", "labels": ["bug", "security"],
                        "assigned_team": "security", "milestone": "hotfix"},
        },
        "severity_keywords": {"default": []},
    },

    "performance": {
        "titles": [
            "{page} loads slowly for {dataset_size}",
            "Memory leak in {service} causes OOM after {duration}",
            "API response time degrades under {load_condition}",
            "{operation} takes {duration} for {dataset_size}",
            "CPU spikes to 100% when {trigger}",
        ],
        "bodies": [
            "When {condition}, the {page} takes {response_time} to load. "
            "{diagnostic_info}. {impact}. {workaround}",
            "The {service} allocates memory during {operation} and never frees it. "
            "Server runs out of memory every {duration}. {diagnostic_info}. "
            "{workaround}",
        ],
        "vars": {
            "page": ["dashboard", "analytics page", "user list", "search results",
                     "audit log", "reports page", "admin overview"],
            "service": ["background job processor", "cache warming service",
                        "log aggregator", "image resizer", "ETL pipeline"],
            "dataset_size": ["large datasets (10k+ rows)", "enterprise accounts",
                             "tables with 100k+ entries", "files over 50MB"],
            "duration": ["6 hours", "4 hours", "12 hours", "30+ seconds",
                         "2+ minutes", "an entire day"],
            "load_condition": ["concurrent load", "peak traffic", "batch processing",
                               "more than 50 simultaneous users"],
            "operation": ["bulk export", "report generation", "data migration",
                          "full-text search", "image processing"],
            "trigger": ["running bulk exports", "processing large uploads",
                        "generating PDF reports", "reindexing search"],
            "condition": ["a dataset has more than 10k rows",
                          "multiple users trigger exports simultaneously",
                          "the nightly ETL job runs alongside user traffic"],
            "response_time": ["30+ seconds", "over a minute", "2-3 minutes",
                              "timeout after 60 seconds"],
            "diagnostic_info": ["CPU spikes to 100%", "Heap profiler confirms the leak",
                                "Database EXPLAIN shows full table scan",
                                "N+1 query pattern detected in APM",
                                "Garbage collector running every 500ms"],
            "impact": ["Affects power users with large accounts",
                       "All users experience slowness during peak hours",
                       "Requires manual restart to recover",
                       "Operational overhead: scheduled restarts every 4 hours"],
            "workaround": ["Workaround: export data and use offline tools.",
                           "Workaround: scheduled restarts every 4 hours.",
                           "No workaround — users just wait.",
                           "Workaround: paginate results (but UX is degraded)."],
        },
        "answer_template": {
            "severe": {"priority": "P1", "labels": ["bug", "performance"],
                       "assigned_team": "backend", "milestone": "v2.1"},
            "moderate": {"priority": "P2", "labels": ["bug", "performance"],
                         "assigned_team": "backend", "milestone": "v2.1"},
        },
        # NOTE(review): "all users" is lowercase while the impact value reads
        # "All users experience ..."; it only matches if the consumer compares
        # case-insensitively. In that case "Workaround" (moderate) would also
        # match "No workaround — users just wait." — confirm the intended
        # precedence between tiers.
        "severity_keywords": {
            "severe": ["OOM", "100%", "manual restart", "timeout", "No workaround",
                       "all users", "never frees"],
            "moderate": ["Workaround", "power users", "paginate"],
        },
    },

    "ui_bug": {
        "titles": [
            "{ui_element} breaks layout on {browser}",
            "{ui_element} not rendering correctly in {mode}",
            "Responsive layout broken on {device}",
            "{feature} toggle not persisting across {context}",
            "Accessibility: {ui_element} missing {a11y_attr}",
        ],
        "bodies": [
            "Switching to {mode} on {browser} causes {ui_element} to {visual_issue}. "
            "{other_browsers}. {workaround}",
            "On {device}, the {ui_element} is {visual_issue}. "
            "Tested on {browser}. {impact}. {workaround}",
        ],
        "vars": {
            "ui_element": ["navigation bar", "sidebar menu", "modal dialog",
                           "dropdown selector", "data table", "footer",
                           "toast notifications", "breadcrumb trail"],
            "browser": ["Safari 16", "Firefox ESR", "Chrome on Android",
                        "Edge on Windows", "iOS Safari", "Samsung Internet"],
            "mode": ["dark mode", "high contrast mode", "RTL layout",
                     "compact view", "print view"],
            "device": ["iPhone SE", "tablets in portrait", "screens below 768px",
                       "ultra-wide monitors", "4K displays"],
            "feature": ["dark mode", "compact view", "language preference",
                        "notification settings"],
            "context": ["page reloads", "different tabs", "sessions",
                        "browser restarts"],
            "visual_issue": ["overlap the main content", "disappear entirely",
                             "render with incorrect colors", "become unclickable",
                             "overflow beyond the viewport"],
            "other_browsers": ["Chrome and Firefox are unaffected.",
                               "Only reproducible on this specific browser.",
                               "Affects all WebKit-based browsers."],
            "a11y_attr": ["ARIA labels", "keyboard focus indicators",
                          "screen reader text", "proper heading hierarchy"],
            "impact": ["Cosmetic issue, no functional impact.",
                       "Users cannot access the affected feature.",
                       "Usability is degraded but the feature works."],
            "workaround": ["Workaround: use a different browser.",
                           "Workaround: manually resize the window.",
                           "No workaround for this browser.",
                           "Workaround: disable the feature in settings."],
        },
        "answer_template": {
            "severe": {"priority": "P2", "labels": ["bug", "ux"],
                       "assigned_team": "frontend", "milestone": "v2.1"},
            "moderate": {"priority": "P3", "labels": ["bug", "ux"],
                         "assigned_team": "frontend", "milestone": "backlog"},
        },
        "severity_keywords": {
            "severe": ["cannot access", "unclickable", "disappear", "No workaround"],
            "moderate": ["Cosmetic", "different browser", "resize"],
        },
    },

    "data_corruption": {
        "titles": [
            "Race condition in {feature}: {consequence}",
            "Data inconsistency in {feature} under concurrent writes",
            "{export_format} export produces corrupted output for {edge_case}",
            "Stale data served from cache after {trigger}",
            "Duplicate records created when {trigger}",
        ],
        "bodies": [
            "Under concurrent load, {feature} can {consequence} due to a race condition "
            "in {root_cause}. Frequency: {frequency}. {impact}. {workaround}",
            "When {feature} data contains {edge_case}, the exported {export_format} file "
            "is corrupted and cannot be {consumer}. {impact}. {workaround}",
        ],
        "vars": {
            "feature": ["file upload", "order processing", "user registration",
                        "inventory update", "comment system", "permission assignment"],
            "consequence": ["files occasionally overwrite each other",
                            "orders are duplicated or lost",
                            "users get assigned wrong permissions",
                            "inventory counts become negative"],
            "root_cause": ["temp file naming logic", "lack of database locking",
                           "non-atomic read-modify-write cycle",
                           "missing unique constraint"],
            "frequency": ["approximately 1 in 10,000 operations",
                          "consistently under 50+ concurrent users",
                          "intermittently — hard to reproduce",
                          "every time the batch job runs"],
            "edge_case": ["non-ASCII characters (e.g., café, naïve)",
                          "values containing commas or quotes",
                          "null or empty fields",
                          "timestamps crossing DST boundaries"],
            "export_format": ["CSV", "Excel", "JSON", "PDF"],
            "consumer": ["opened in Excel", "parsed by downstream services",
                         "imported back into the system"],
            "trigger": ["double-clicking the submit button",
                        "cache TTL expires during a write operation",
                        "two users edit the same record simultaneously",
                        "the nightly sync job overlaps with user activity"],
            "impact": ["Potential data loss confirmed.",
                       "No data loss confirmed yet, but risk exists.",
                       "Affects users with international data.",
                       "Breaks downstream pipeline processing."],
            "workaround": ["Workaround: enable sequential mode in settings.",
                           "Workaround: manually re-export after cleanup.",
                           "No reliable workaround — data must be manually verified.",
                           "Workaround: add a mutex lock externally (operational overhead)."],
        },
        "answer_template": {
            "severe": {"priority": "P1", "labels": ["bug", "data-integrity"],
                       "assigned_team": "backend", "milestone": "v2.1"},
            "moderate": {"priority": "P2", "labels": ["bug", "data-integrity"],
                         "assigned_team": "backend", "milestone": "v2.1"},
        },
        # NOTE(review): the severe keyword "data loss" is a substring of the
        # moderate keyword "No data loss", and "permissions" also occurs inside
        # no generated value except the "wrong permissions" consequence; check
        # that the consumer resolves the "data loss"/"No data loss" overlap in
        # the intended direction.
        "severity_keywords": {
            "severe": ["data loss", "No reliable workaround", "consistently",
                       "permissions", "overwrite", "negative"],
            "moderate": ["No data loss", "intermittently", "sequential mode",
                         "re-export", "non-ASCII"],
        },
    },

    "documentation": {
        "titles": [
            "Typo in {location}",
            "Outdated {doc_type} on {page}",
            "Missing documentation for {feature}",
            "Incorrect {doc_element} in {location}",
        ],
        "bodies": [
            "There is a {issue_type} on the {page}: {detail}. No functional impact, "
            "purely cosmetic. {extra}",
            "The {doc_type} for {feature} is {issue_type}. {detail}. {extra}",
        ],
        "vars": {
            "location": ["homepage docs", "API reference", "README", "changelog",
                         "contributing guide", "onboarding wiki"],
            "doc_type": ["installation guide", "API documentation", "changelog",
                         "migration guide", "code comments"],
            "page": ["landing page", "docs homepage", "getting started page",
                     "FAQ section", "footer"],
            "feature": ["new webhook API", "batch processing endpoint",
                        "SSO integration", "rate limiting"],
            "doc_element": ["code example", "endpoint URL", "parameter description",
                            "copyright year", "version number"],
            "issue_type": ["a typo", "outdated", "missing", "incorrect", "misleading"],
            "detail": ["'Welccome' should be 'Welcome'",
                       "references removed v1.x API that no longer exists",
                       "completely undocumented despite being a core feature",
                       "shows '© 2022' but should be '© 2024'",
                       "the curl example uses the wrong HTTP method"],
            # The empty string lets a body end without any extra sentence.
            "extra": ["", "Low priority — does not block any workflow.",
                      "New users have reported confusion.",
                      "Only noticed by contributors reading source code."],
        },
        # Single tier: every documentation template maps to the same P3 answer.
        "answer_template": {
            "default": {"priority": "P3", "labels": ["documentation"],
                        "assigned_team": "devx", "milestone": "backlog"},
        },
        "severity_keywords": {"default": []},
    },

    "api_bug": {
        "titles": [
            "API rate limiter {issue} after {trigger}",
            "{endpoint} returns {status_code} instead of {expected_code}",
            "Pagination broken on {endpoint}: {symptom}",
            "Webhook delivery {issue} for {event_type} events",
            "API versioning: {endpoint} behaves differently on v1 vs v2",
        ],
        "bodies": [
            "After receiving a {status_code} response, {consequence}. "
            "The {root_cause}. {impact}. {workaround}",
            "The {endpoint} endpoint {symptom} when {trigger}. "
            "Expected behavior: {expected}. Actual: {actual}. {impact}.",
        ],
        "vars": {
            "endpoint": ["/api/users", "/api/search", "/api/export",
                         "/api/webhooks", "/api/billing", "/api/analytics"],
            "issue": ["blocks legitimate users", "fails silently",
                      "returns incorrect retry headers", "drops events"],
            "trigger": ["a 429 error", "rate limit window resets",
                        "a burst of requests from CI/CD", "server restart"],
            "status_code": ["429", "500", "502", "504", "403"],
            "expected_code": ["200", "201", "204", "404"],
            "symptom": ["returns duplicate entries",
                        "skips items between pages",
                        "returns empty page despite more data existing"],
            "event_type": ["payment.completed", "user.created",
                           "subscription.cancelled", "deployment.finished"],
            "consequence": ["legitimate users remain blocked for 1 hour",
                            "data is silently lost with no error",
                            "downstream services receive stale data"],
            "root_cause": ["unblock logic has a bug — it never clears the blocked flag",
                           "cursor-based pagination uses wrong sort order",
                           "retry-after header reports seconds instead of milliseconds"],
            "expected": ["200 OK with paginated results",
                         "successful delivery with retry on failure",
                         "proper rate limit reset after window expires"],
            "actual": ["empty response with 200 status",
                       "permanent block until manual intervention",
                       "events dropped without any error log"],
            # Each impact value already ends with "."; the second body template
            # appends another "." after {impact}.
            "impact": ["Affects CI/CD pipelines hitting the API.",
                       "External integrations break silently.",
                       "Customer-facing dashboards show wrong data.",
                       "Retry-After header causes clients to wait too long."],
            "workaround": ["Workaround: manually clear Redis key.",
                           "Workaround: add client-side deduplication.",
                           "No workaround — requires server-side fix.",
                           "Workaround: pin API version to v1 in headers."],
        },
        "answer_template": {
            "severe": {"priority": "P1", "labels": ["bug", "api"],
                       "assigned_team": "backend", "milestone": "v2.1"},
            "moderate": {"priority": "P2", "labels": ["bug", "api"],
                         "assigned_team": "backend", "milestone": "v2.1"},
        },
        # NOTE(review): "external integrations" is lowercase while the impact
        # value reads "External integrations break silently."; it only matches
        # under case-insensitive comparison — confirm against the consumer.
        "severity_keywords": {
            "severe": ["silently lost", "permanent block", "No workaround",
                       "dropped", "external integrations"],
            "moderate": ["Workaround", "pin API", "deduplication"],
        },
    },
}


# The original handcrafted bugs — kept as a gold-standard subset
_HANDCRAFTED_BUGS = {
    "easy": {
        "bugs": [
            BugReport(
                id="easy-001",
                title="App crashes on login with correct credentials",
                body="When I enter my correct username and password, the app crashes immediately. "
                     "This started after the v2.0 release. Affects 100% of users. "
                     "No workaround exists — users cannot log in at all.",
                author="user123",
                labels_hint=[],
                comments=["Confirmed on iOS and Android.", "Happens every time."],
                severity_signals=["100% of users", "crashes", "no workaround"],
                stack_trace="NullPointerException at AuthController.java:87",
                affected_component="auth-service",
            ),
            BugReport(
                id="easy-002",
                title="Typo in documentation homepage",
                body="There is a typo on the homepage docs: 'Welccome' should be 'Welcome'. "
                     "No functional impact, purely cosmetic.",
                author="docs_fan",
                labels_hint=["documentation"],
                comments=[],
                severity_signals=["cosmetic", "no functional impact"],
                stack_trace="",
                affected_component="docs",
            ),
            BugReport(
                id="easy-003",
                title="Dashboard loads slowly for large datasets",
                body="When a dataset has more than 10k rows, the dashboard takes 30+ seconds to load. "
                     "Workaround: export data and use offline tools. Affects power users only.",
                author="power_user",
                labels_hint=["performance"],
                comments=["Noticed after the last deploy.", "CPU spikes to 100%."],
                severity_signals=["workaround exists", "power users only"],
                stack_trace="",
                affected_component="dashboard",
            ),
            BugReport(
                id="easy-004",
                title="Email notifications not sent after password reset",
                body="Users who reset their password do not receive the confirmation email. "
                     "SMTP logs show the job is queued but never dispatched. "
                     "Affects all users attempting password reset.",
                author="support_team",
                labels_hint=["bug"],
                comments=["Reported by 12 users this week.",
                           "Started after email service migration."],
                severity_signals=["all users", "never dispatched"],
                stack_trace="",
                affected_component="email-service",
            ),
            BugReport(
                id="easy-005",
                title="Incorrect copyright year in footer",
                body="The footer shows '© 2022' but it should be '© 2024'. "
                     "No functional impact.",
                author="intern_dev",
                labels_hint=["documentation"],
                comments=[],
                severity_signals=["no functional impact"],
                stack_trace="",
                affected_component="frontend",
            ),
        ],
        "answers": {
            "easy-001": {"priority": "P0"},
            "easy-002": {"priority": "P3"},
            "easy-003": {"priority": "P2"},
            "easy-004": {"priority": "P1"},
            "easy-005": {"priority": "P3"},
        },
    },

    "medium": {
        "bugs": [
            BugReport(
                id="med-001",
                title="Payment fails silently on checkout",
                body="Checkout completes without error but payment is never charged. "
                     "No error shown to user. Stripe logs show declined transaction. "
                     "Direct revenue loss — every failed checkout is a lost sale.",
                author="store_owner",
                labels_hint=["bug"],
                comments=["Revenue impact confirmed.", "Happening since Tuesday."],
                severity_signals=["revenue loss", "silently", "every failed checkout"],
                stack_trace="Stripe API: card_declined at PaymentService.py:145",
                affected_component="payment-service",
            ),
            BugReport(
                id="med-002",
                title="Search results include deleted posts",
                body="Deleted blog posts still appear in search results for up to 24 hours. "
                     "Users can read content that was explicitly removed by moderators. "
                     "Potential GDPR violation if deleted content belongs to EU users.",
                author="moderator_jane",
                labels_hint=[],
                comments=["GDPR concern — deleted content still visible."],
                severity_signals=["GDPR violation", "deleted content visible"],
                stack_trace="",
                affected_component="search-index",
            ),
            BugReport(
                id="med-003",
                title="Dark mode toggle breaks layout on Safari",
                body="Switching to dark mode on Safari 16 causes nav bar to overlap content. "
                     "Chrome and Firefox unaffected. Workaround: use a different browser.",
                author="safari_user",
                labels_hint=["bug", "ux"],
                comments=["Only on Safari, not Chrome/Firefox."],
                severity_signals=["workaround exists", "single browser"],
                stack_trace="",
                affected_component="frontend-css",
            ),
            BugReport(
                id="med-004",
                title="CSV export produces corrupted file for non-ASCII characters",
                body="When table data contains accented characters (e.g. café, naïve), "
                     "the exported CSV file is corrupted and cannot be opened in Excel. "
                     "Affects users with international data.",
                author="data_analyst",
                labels_hint=["bug"],
                comments=["Encoding issue — UTF-8 not respected.",
                           "Workaround: manual copy-paste."],
                severity_signals=["corrupted", "workaround exists"],
                stack_trace="",
                affected_component="export-service",
            ),
            BugReport(
                id="med-005",
                title="API rate limiter blocks legitimate users after 429 error",
                body="After receiving a 429 Too Many Requests response, legitimate users "
                     "remain blocked for 1 hour even after the rate limit window resets. "
                     "The unblock logic has a bug — it never clears the blocked flag.",
                author="api_user",
                labels_hint=["bug"],
                comments=["Affects CI/CD pipelines hitting the API.",
                           "Retry-After header is wrong."],
                severity_signals=["permanent block", "never clears", "bug in logic"],
                stack_trace="",
                affected_component="api-gateway",
            ),
        ],
        "answers": {
            "med-001": {"priority": "P0", "labels": ["bug", "payments"],        "assigned_team": "backend"},
            "med-002": {"priority": "P1", "labels": ["bug", "security"],        "assigned_team": "security"},
            "med-003": {"priority": "P2", "labels": ["bug", "ux"],              "assigned_team": "frontend"},
            "med-004": {"priority": "P2", "labels": ["bug", "data-integrity"],  "assigned_team": "backend"},
            "med-005": {"priority": "P1", "labels": ["bug", "performance"],     "assigned_team": "backend"},
        },
    },

    "hard": {
        "bugs": [
            BugReport(
                id="hard-001",
                title="SQL injection vulnerability in search endpoint",
                body="The /api/search endpoint does not sanitize inputs. "
                     "Crafted queries can dump the entire user table including password hashes. "
                     "PoC attached. Verified on production. Treat as confidential — "
                     "do not discuss publicly until patched.",
                author="security_researcher",
                labels_hint=[],
                comments=["Critical. Affects production.", "Do not discuss publicly."],
                severity_signals=["SQL injection", "password hashes", "production",
                                  "confidential"],
                stack_trace="",
                affected_component="search-api",
            ),
            BugReport(
                id="hard-002",
                title="Memory leak in background job processor causes OOM after 6 hours",
                body="The job processor allocates ~50MB per job and never frees it. "
                     "Server runs out of memory every 6 hours, requiring a manual restart. "
                     "Heap profiler confirms leak introduced in v1.9. "
                     "Workaround: scheduled restarts every 4 hours (operational overhead).",
                author="devops_alice",
                labels_hint=["performance"],
                comments=["Verified with heap profiler.", "Started in v1.9."],
                severity_signals=["memory leak", "OOM", "manual restart", "never frees"],
                stack_trace="HeapDump: JobProcessor.process() -> 50MB/call, never GC'd",
                affected_component="job-processor",
            ),
            BugReport(
                id="hard-003",
                title="Race condition in file upload: files occasionally overwrite each other",
                body="Under concurrent load, two users uploading simultaneously can get "
                     "each other's files due to a race condition in the temp file naming logic. "
                     "Frequency: approximately 1 in 10,000 uploads under normal load. "
                     "No data loss confirmed yet and a workaround exists: "
                     "enable sequential upload mode in settings (disabled by default). "
                     "Risk is low-probability but affects data integrity.",
                author="qa_bot",
                labels_hint=["bug"],
                comments=["Reproduced with locust at 50 concurrent users.",
                           "Sequential mode avoids it."],
                severity_signals=["race condition", "data integrity",
                                  "workaround exists", "low-probability"],
                stack_trace="",
                affected_component="file-upload",
            ),
            BugReport(
                id="hard-004",
                title="Auth token not invalidated after password change",
                body="When a user changes their password, existing JWT tokens remain valid "
                     "for up to 24 hours. An attacker who previously stole a token can "
                     "continue to access the account even after the password is reset. "
                     "This is a session management security vulnerability.",
                author="pentest_team",
                labels_hint=["security"],
                comments=["Verified on staging.",
                           "OWASP A07 — Identification and Authentication Failures."],
                severity_signals=["JWT not invalidated", "attacker", "security vulnerability",
                                  "stolen token"],
                stack_trace="",
                affected_component="auth-service",
            ),
            BugReport(
                id="hard-005",
                title="Infinite loop in webhook retry logic causes CPU spike",
                body="When a webhook endpoint returns a 500 error, the retry logic enters "
                     "an infinite loop with no backoff or retry cap. "
                     "This causes CPU to spike to 100% within minutes and starves other services. "
                     "Triggered in production twice this week. Requires process kill to recover.",
                author="oncall_eng",
                labels_hint=["bug", "performance"],
                comments=["PagerDuty alert fired twice.",
                           "Needs exponential backoff + max retry cap."],
                severity_signals=["infinite loop", "100%", "production",
                                  "process kill", "starves other services"],
                stack_trace="Thread dump: WebhookRetrier.retry() → recursive call, no exit",
                affected_component="webhook-service",
            ),
        ],
        "answers": {
            "hard-001": {
                "priority": "P0", "labels": ["bug", "security"],
                "assigned_team": "security", "milestone": "hotfix",
            },
            "hard-002": {
                "priority": "P1", "labels": ["bug", "performance"],
                "assigned_team": "backend", "milestone": "v2.1",
            },
            "hard-003": {
                "priority": "P1", "labels": ["bug", "data-integrity"],
                "assigned_team": "backend", "milestone": "v2.1",
            },
            "hard-004": {
                "priority": "P0", "labels": ["bug", "security"],
                "assigned_team": "security", "milestone": "hotfix",
            },
            "hard-005": {
                "priority": "P0", "labels": ["bug", "performance"],
                "assigned_team": "backend", "milestone": "hotfix",
            },
        },
    },
}


# Public alias kept for backward compatibility. This binds TASKS to the very
# same dict object as _HANDCRAFTED_BUGS (no copy is made), so a mutation made
# through either name is visible through the other.
TASKS = _HANDCRAFTED_BUGS


# ---------------------------------------------------------------------------
#  PROCEDURAL BUG GENERATOR
# ---------------------------------------------------------------------------

def _determine_severity(text: str, keywords: Dict[str, list]) -> str:
    """Check which severity level the generated text matches."""
    text_lower = text.lower()
    for level, kws in keywords.items():
        if level == "default":
            return "default"
        hits = sum(1 for kw in kws if kw.lower() in text_lower)
        if hits >= 1:
            return level
    # fallback to first non-default key
    return list(keywords.keys())[0] if keywords else "moderate"


def generate_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Generate a procedural bug report together with its answer key.

    A template category is drawn with difficulty-dependent weights, its
    placeholder variables are filled with random choices, and the expected
    answer is looked up from the template's ``answer_template`` using the
    severity level detected in the generated text.

    Args:
        task_key: Difficulty name. ``"easy"``, ``"medium"`` and ``"hard"``
            have their own category weights; any other value falls back to
            the ``"medium"`` weights.
        seed: Seed for the private ``random.Random`` instance. The same seed
            yields the same report. ``None`` gives a non-reproducible report.

    Returns:
        ``(bug, answer)``. For ``"easy"`` the answer holds only ``priority``;
        for ``"medium"`` any ``milestone`` entry is removed; otherwise the
        template answer is returned as a shallow copy.
    """
    rng = random.Random(seed)

    # Relative weight of each template category per difficulty.
    weights = {
        "easy": {"documentation": 3, "ui_bug": 3, "performance": 2,
                 "crash": 1, "api_bug": 1},
        "medium": {"crash": 3, "performance": 3, "api_bug": 2,
                   "data_corruption": 2, "ui_bug": 1},
        "hard": {"security": 4, "crash": 3, "data_corruption": 3,
                 "performance": 2, "api_bug": 2},
    }

    task_weights = weights.get(task_key, weights["medium"])
    # Repeat each category `weight` times so a uniform choice is weighted.
    categories = [cat for cat, w in task_weights.items() for _ in range(w)]
    category = rng.choice(categories)

    template = _BUG_TEMPLATES[category]

    # Pick one value per placeholder variable (draw order follows dict order).
    chosen_vars = {
        var_name: rng.choice(options)
        for var_name, options in template["vars"].items()
    }

    # Build title and body
    title_tmpl = rng.choice(template["titles"])
    body_tmpl = rng.choice(template["bodies"])

    # Safe format — placeholders without a chosen value are left untouched
    # instead of raising KeyError as str.format would.
    def safe_format(tmpl, vars_dict):
        result = tmpl
        for k, v in vars_dict.items():
            result = result.replace("{" + k + "}", v)
        return result

    title = safe_format(title_tmpl, chosen_vars)
    body = safe_format(body_tmpl, chosen_vars)

    # Derive the ID from the seed. An explicit `is not None` test is needed:
    # `seed or ...` would treat the valid seed 0 as "no seed" and replace it
    # with a random number.
    id_number = seed if seed is not None else rng.randint(0, 999999)
    bug_id = f"gen-{id_number:06d}"

    # Pick author
    authors = ["user_report", "qa_engineer", "support_team", "dev_oncall",
               "security_bot", "customer_jane", "automated_monitor",
               "intern_dev", "senior_eng", "pm_feedback"]
    author = rng.choice(authors)

    # Build comments (zero to three distinct entries)
    comment_templates = [
        "Confirmed on our side.", "Reproduced in staging.",
        "Multiple reports from users.", "Started after last deployment.",
        "Urgent — customer escalation.", "Low priority — no user complaints.",
        "Needs investigation.", "Related to ticket from last sprint.",
    ]
    num_comments = rng.randint(0, 3)
    comments = rng.sample(comment_templates, min(num_comments, len(comment_templates)))

    # Determine severity from everything the agent will see, then map it to
    # the template's answer; unknown levels fall back to the first answer.
    full_text = f"{title} {body} {' '.join(comments)}"
    severity_kws = template.get("severity_keywords", {})
    severity = _determine_severity(full_text, severity_kws)

    answer_templates = template["answer_template"]
    # Copy so that trimming below never mutates the shared template.
    answer = dict(answer_templates.get(severity, list(answer_templates.values())[0]))

    # For easy tasks, only priority matters; medium tasks have no milestone.
    if task_key == "easy":
        answer = {"priority": answer["priority"]}
    elif task_key == "medium":
        answer.pop("milestone", None)

    bug = BugReport(
        id=bug_id,
        title=title,
        body=body,
        author=author,
        labels_hint=rng.sample(["bug", "needs-triage", "reported"], rng.randint(0, 2)),
        comments=comments,
        severity_signals=[],
        stack_trace="",
        # Prefer the "service" variable, then "endpoint", else leave empty.
        affected_component=chosen_vars.get("service", chosen_vars.get("endpoint", "")),
    )

    return bug, answer


# ---------------------------------------------------------------------------
#  BUG SAMPLER — randomly mixes handcrafted (~40%) and procedural (~60%) bugs
# ---------------------------------------------------------------------------

def sample_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Return one bug and its answer key, handcrafted or procedural.

    A seeded coin flip selects a handcrafted bug with probability 0.4, but
    only when ``task_key`` has handcrafted entries; every other case is
    delegated to ``generate_bug``.

    Args:
        task_key: Difficulty name used to select the bug pool / generator.
        seed: Seed for the coin flip and the handcrafted pick. It is also
            forwarded to ``generate_bug``; when ``None`` a random generator
            seed in ``[0, 999999]`` is drawn instead.

    Returns:
        ``(bug, answer)``. On the handcrafted path both objects are the
        module-level instances themselves, not copies.
    """
    rng = random.Random(seed)
    use_handcrafted = rng.random() < 0.4 and task_key in _HANDCRAFTED_BUGS

    if not use_handcrafted:
        # NOTE: generate_bug builds its own Random from this same seed, so
        # its draws start from the same state as the coin flip above.
        gen_seed = rng.randint(0, 999999) if seed is None else seed
        return generate_bug(task_key, seed=gen_seed)

    pool = _HANDCRAFTED_BUGS[task_key]
    picked = rng.choice(pool["bugs"])
    return picked, pool["answers"][picked.id]


# ---------------------------------------------------------------------------
#  GRADING — with semantic label matching
# ---------------------------------------------------------------------------

PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3}


def _priority_score(predicted: str, correct: str) -> float:
    """Score priority assignment with partial credit for near-misses."""
    if predicted == correct:
        return 0.95
    pred_rank = PRIORITY_ORDER.get(predicted, 99)
    corr_rank = PRIORITY_ORDER.get(correct, 99)
    diff = abs(pred_rank - corr_rank)
    if diff == 1:
        return 0.5
    elif diff == 2:
        return 0.2
    return 0.05


def _normalize_label(label: str) -> str:
    """Map a label onto its canonical name from ``LABEL_SYNONYMS``.

    The label is lower-cased and stripped, then compared against each
    canonical name and its synonym set in the mapping's iteration order.

    Args:
        label: Raw label text.

    Returns:
        The first canonical name that equals the cleaned label or lists it
        as a synonym; the cleaned label itself when nothing matches.
    """
    cleaned = label.lower().strip()
    candidates = (
        canonical
        for canonical, aliases in LABEL_SYNONYMS.items()
        if cleaned == canonical or cleaned in aliases
    )
    return next(candidates, cleaned)


def _label_score(predicted: List[str], correct: List[str]) -> float:
    """Score labels using semantic matching via synonym groups."""
    pred_normalized = set(_normalize_label(l) for l in predicted)
    corr_normalized = set(_normalize_label(l) for l in correct)

    if not corr_normalized:
        return 0.95

    intersection = pred_normalized & corr_normalized
    union = pred_normalized | corr_normalized

    raw = len(intersection) / len(union) if union else 0.0
    return max(0.05, min(0.95, raw))


def _reasoning_score(reasoning: str, answer: dict) -> float:
    """Bonus for reasoning that mentions relevant signals."""
    if not reasoning or len(reasoning.strip()) < 10:
        return 0.0

    key_signals = {
        "P0": ["production", "all users", "data loss", "security", "crash",
               "revenue", "injection", "vulnerability", "100%"],
        "P1": ["major", "significant", "no workaround", "broken",
               "gdpr", "blocked", "leak", "never"],
        "P2": ["degraded", "workaround", "partial", "slow",
               "affected", "power users"],
        "P3": ["minor", "cosmetic", "docs", "typo", "low",
               "no functional impact"],
    }

    expected_priority = answer.get("priority", "P2")
    signals = key_signals.get(expected_priority, [])
    reasoning_lower = reasoning.lower()

    hits = sum(1 for s in signals if s in reasoning_lower)
    return min(0.15, hits * 0.05)


def _same_text(actual, expected) -> bool:
    """Case-insensitive equality that treats ``None`` like an empty string."""
    return (actual or "").lower() == (expected or "").lower()


def grade_action(task_key: str, bug: BugReport, action: TriageAction,
                 answer: dict = None) -> Tuple[float, str]:
    """Grade the agent's triage action against the correct answer.

    Weights per task:
        * ``"easy"``   — priority only.
        * ``"medium"`` — 0.45 priority + 0.40 labels + 0.15 team.
        * anything else (``"hard"``) — 0.35 priority + 0.30 labels +
          0.20 team + 0.15 milestone, minus 0.15 when a bug that belongs to
          the security team was routed elsewhere.
    The reasoning bonus is added on top and the result is clamped to
    [0.01, 0.99] and rounded to three decimals.

    Args:
        task_key: Difficulty name selecting the rubric.
        bug: The bug that was triaged; only ``bug.id`` is read, and only
            when ``answer`` is not supplied.
        action: The agent's decision. ``assigned_team``, ``milestone`` and
            ``labels`` may be ``None``; they are then treated as empty.
        answer: Expected values. When ``None`` the handcrafted answer key is
            consulted by ``task_key`` and ``bug.id``.

    Returns:
        ``(score, feedback)``. If no answer key can be found the result is
        ``(0.5, "No answer key found for this bug.")``.
    """

    # Backward compatibility: look up answer from handcrafted if not provided
    if answer is None:
        if task_key in _HANDCRAFTED_BUGS and bug.id in _HANDCRAFTED_BUGS[task_key]["answers"]:
            answer = _HANDCRAFTED_BUGS[task_key]["answers"][bug.id]
        else:
            return 0.5, "No answer key found for this bug."

    feedback_parts = []
    reasoning_bonus = _reasoning_score(action.reasoning, answer)

    if task_key == "easy":
        score = _priority_score(action.priority, answer["priority"])
        symbol = "✓" if score >= 0.9 else "~" if score >= 0.4 else "✗"
        feedback_parts.append(
            f"Priority: {symbol} (got {action.priority}, expected {answer['priority']})")
        score = score + reasoning_bonus
        score = max(0.01, min(0.99, score))
        return round(score, 3), " | ".join(feedback_parts)

    # Medium and hard share the priority / label / team components.
    p_score = _priority_score(action.priority, answer["priority"])
    l_score = _label_score(action.labels or [], answer.get("labels", []))
    # .get() keeps answers without a team/milestone from raising KeyError.
    expected_team = answer.get("assigned_team", "")

    if task_key == "medium":
        # A team score is only awarded when the answer actually names a team.
        team_ok = bool(expected_team) and _same_text(action.assigned_team, expected_team)
        t_score = 0.95 if team_ok else 0.05

        score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score + reasoning_bonus

        feedback_parts.append(
            f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
        feedback_parts.append(f"Labels: {l_score:.2f} (semantic match)")
        feedback_parts.append(
            f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
        if reasoning_bonus > 0:
            feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")

        score = max(0.01, min(0.99, score))
        return round(score, 3), " | ".join(feedback_parts)

    # hard (and any unrecognised task key)
    expected_milestone = answer.get("milestone", "")
    t_score = 0.95 if _same_text(action.assigned_team, expected_team) else 0.05
    m_score = 0.95 if _same_text(action.milestone, expected_milestone) else 0.05

    score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score + reasoning_bonus

    feedback_parts.append(
        f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
    feedback_parts.append(f"Labels: {l_score:.2f} (semantic match)")
    feedback_parts.append(
        f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
    feedback_parts.append(
        f"Milestone: {m_score:.2f} (got {action.milestone}, expected {expected_milestone})")

    if reasoning_bonus > 0:
        feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")

    # Security escalation penalty
    if answer.get("assigned_team") == "security" and not _same_text(action.assigned_team, "security"):
        score = max(0.01, score - 0.15)
        feedback_parts.append("⚠ Security escalation missed (-0.15)")

    score = max(0.01, min(0.99, score))
    return round(score, 3), " | ".join(feedback_parts)


# ---------------------------------------------------------------------------
#  NAMED GRADER FUNCTIONS — referenced by openenv.yaml
# ---------------------------------------------------------------------------

def priority_match(*args, **kwargs):
    """Named grader that applies the "easy" rubric via ``grade_action``.

    The first two positional arguments are taken as ``(bug, action)``; any
    further positional arguments and all keyword arguments are ignored.

    Returns:
        The score as a float, or 0.5 when fewer than two positional
        arguments are supplied.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("easy", bug, action)[0])
    return 0.5


def priority_label_team(*args, **kwargs):
    """Named grader that applies the "medium" rubric via ``grade_action``.

    The first two positional arguments are taken as ``(bug, action)``; any
    further positional arguments and all keyword arguments are ignored.

    Returns:
        The score as a float, or 0.5 when fewer than two positional
        arguments are supplied.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("medium", bug, action)[0])
    return 0.5


def full_triage(*args, **kwargs):
    """Named grader that applies the "hard" rubric via ``grade_action``.

    The first two positional arguments are taken as ``(bug, action)``; any
    further positional arguments and all keyword arguments are ignored.

    Returns:
        The score as a float, or 0.5 when fewer than two positional
        arguments are supplied.
    """
    if len(args) >= 2:
        bug, action = args[:2]
        return float(grade_action("hard", bug, action)[0])
    return 0.5


# Names exported by a star-import of this module: the three named graders,
# the sampling / generation / grading entry points, and two data tables.
__all__ = [
    "priority_match",
    "priority_label_team",
    "full_triage",
    "sample_bug",
    "generate_bug",
    "grade_action",
    "TASKS",
    "LABEL_SYNONYMS",
]