# server/task.py
import sys
import random
import hashlib
sys.path.insert(0, "/app")
from typing import Tuple, List, Dict, Any
from model import BugReport, TriageAction
# ---------------------------------------------------------------------------
# LABEL SYNONYM MAP — allows semantic matching
# ---------------------------------------------------------------------------
# Canonical label -> set of alternative spellings/terms listed for it.
# NOTE(review): presumably a synonym counts as a match for its canonical
# label when labels are scored; the consumer is not in this part of the file.
LABEL_SYNONYMS: Dict[str, set] = {
    "bug": {
        "defect", "issue", "error", "fault", "broken",
    },
    "security": {
        "vulnerability", "cve", "exploit", "auth", "injection",
    },
    "performance": {
        "perf", "slow", "latency", "optimization", "speed", "memory",
    },
    "ux": {
        "ui", "frontend", "user-experience", "design", "usability",
    },
    "data-integrity": {
        "data-loss", "corruption", "data", "consistency",
    },
    "payments": {
        "billing", "payment", "stripe", "checkout", "revenue",
    },
    "documentation": {
        "docs", "typo", "readme", "wiki",
    },
    "infrastructure": {
        "infra", "devops", "deploy", "ci", "cd", "docker",
    },
    "api": {
        "endpoint", "rest", "graphql", "http", "request",
    },
    "database": {
        "db", "sql", "query", "migration", "schema",
    },
}
# ---------------------------------------------------------------------------
# BUG TEMPLATE SYSTEM — generates hundreds of unique bugs
# ---------------------------------------------------------------------------
_BUG_TEMPLATES = {
"crash": {
"titles": [
"{service} crashes on {trigger}",
"{service} throws {error_type} when {trigger}",
"Fatal error in {service} during {trigger}",
"Unhandled exception in {service}: {error_type}",
"{service} segfaults under {condition}",
],
"bodies": [
"When a user {trigger}, the {service} crashes immediately. "
"Error: {error_type}. Stack trace points to {component}. "
"Affects {impact}. {workaround}",
"The {service} is failing with {error_type} every time a user {trigger}. "
"No error message is shown to the user β the process just dies. "
"Impact: {impact}. {workaround}",
],
"vars": {
"service": ["auth service", "payment gateway", "search API", "notification worker",
"session manager", "user profile service", "file upload handler",
"webhook processor", "background job runner", "cache layer"],
"trigger": ["submits a form with special characters", "uploads a file larger than 10MB",
"logs in with SSO", "resets their password", "exports data to CSV",
"switches between tabs rapidly", "uses the bulk import feature",
"accesses the admin panel", "triggers a webhook", "runs a scheduled job"],
"error_type": ["NullPointerException", "SegmentationFault", "OutOfMemoryError",
"ConnectionTimeoutException", "StackOverflowError",
"IndexOutOfBoundsException", "TypeError", "KeyError"],
"component": ["UserController.java:142", "PaymentService.py:89",
"AuthMiddleware.ts:56", "SearchIndex.go:203",
"NotificationQueue.rb:77", "FileHandler.py:234"],
"impact": ["100% of users on this flow", "all mobile users", "EU region users only",
"users with accounts older than 1 year", "approximately 30% of sessions",
"every request during peak hours"],
"workaround": ["No workaround exists β the feature is completely broken.",
"Workaround: users can retry after clearing browser cache.",
"Temporary fix: restart the service every 2 hours.",
"No known workaround. Users are blocked."],
"condition": ["high concurrent load", "memory pressure above 80%",
"when connection pool is exhausted", "after running for 6+ hours"],
},
"answer_template": {
"severe": {"priority": "P0", "labels": ["bug"], "assigned_team": "backend", "milestone": "hotfix"},
"moderate": {"priority": "P1", "labels": ["bug"], "assigned_team": "backend", "milestone": "v2.1"},
},
"severity_keywords": {
"severe": ["100%", "all mobile", "No workaround", "completely broken", "blocked",
"SegmentationFault", "OutOfMemoryError"],
"moderate": ["retry", "30%", "Temporary fix", "restart"],
},
},
"security": {
"titles": [
"SQL injection vulnerability in {endpoint}",
"XSS attack possible via {input_field}",
"Authentication bypass in {service}",
"Sensitive data exposed in {location}",
"{credential_type} not invalidated after {event}",
"SSRF vulnerability in {endpoint}",
],
"bodies": [
"The {endpoint} endpoint does not sanitize {input_field} inputs. "
"Crafted queries can {exploit_result}. PoC attached and verified on {env}. "
"Treat as confidential β do not discuss publicly until patched. {additional_context}",
"When a user {event}, existing {credential_type} remain valid for {duration}. "
"An attacker who {attack_vector} can continue to access the account. "
"This is a {vuln_category} vulnerability. {additional_context}",
],
"vars": {
"endpoint": ["/api/search", "/api/users", "/api/export", "/admin/query",
"/api/upload", "/graphql", "/api/webhook"],
"input_field": ["search query", "username field", "file upload name",
"comment body", "profile bio", "webhook URL"],
"service": ["login flow", "OAuth callback", "API gateway", "admin panel",
"password reset", "2FA verification"],
"location": ["API error responses", "debug logs shipped to client",
"public S3 bucket", "unencrypted cookies", "localStorage"],
"credential_type": ["JWT tokens", "session cookies", "API keys", "OAuth tokens"],
"event": ["changes their password", "revokes API access",
"is suspended by admin", "enables 2FA"],
"exploit_result": ["dump the entire user table including password hashes",
"execute arbitrary JavaScript in other users' browsers",
"access any user's account without credentials",
"read internal service endpoints via SSRF"],
"env": ["production", "staging", "production replica"],
"duration": ["up to 24 hours", "indefinitely", "until manual cache clear",
"for the full token TTL (7 days)"],
"attack_vector": ["previously stole a token", "intercepted a session cookie",
"obtained a leaked API key"],
"vuln_category": ["session management", "access control",
"injection", "broken authentication"],
"additional_context": [
"OWASP A03 β Injection.",
"OWASP A07 β Identification and Authentication Failures.",
"CVSS score estimated at 9.1 (Critical).",
"Compliance impact: potential GDPR violation if user PII is exfiltrated.",
"Bounty hunter reported this 48 hours ago β disclosure deadline approaching.",
],
},
"answer_template": {
"default": {"priority": "P0", "labels": ["bug", "security"],
"assigned_team": "security", "milestone": "hotfix"},
},
"severity_keywords": {"default": []},
},
"performance": {
"titles": [
"{page} loads slowly for {dataset_size}",
"Memory leak in {service} causes OOM after {duration}",
"API response time degrades under {load_condition}",
"{operation} takes {duration} for {dataset_size}",
"CPU spikes to 100% when {trigger}",
],
"bodies": [
"When {condition}, the {page} takes {response_time} to load. "
"{diagnostic_info}. {impact}. {workaround}",
"The {service} allocates memory during {operation} and never frees it. "
"Server runs out of memory every {duration}. {diagnostic_info}. "
"{workaround}",
],
"vars": {
"page": ["dashboard", "analytics page", "user list", "search results",
"audit log", "reports page", "admin overview"],
"service": ["background job processor", "cache warming service",
"log aggregator", "image resizer", "ETL pipeline"],
"dataset_size": ["large datasets (10k+ rows)", "enterprise accounts",
"tables with 100k+ entries", "files over 50MB"],
"duration": ["6 hours", "4 hours", "12 hours", "30+ seconds",
"2+ minutes", "an entire day"],
"load_condition": ["concurrent load", "peak traffic", "batch processing",
"more than 50 simultaneous users"],
"operation": ["bulk export", "report generation", "data migration",
"full-text search", "image processing"],
"trigger": ["running bulk exports", "processing large uploads",
"generating PDF reports", "reindexing search"],
"condition": ["a dataset has more than 10k rows",
"multiple users trigger exports simultaneously",
"the nightly ETL job runs alongside user traffic"],
"response_time": ["30+ seconds", "over a minute", "2-3 minutes",
"timeout after 60 seconds"],
"diagnostic_info": ["CPU spikes to 100%", "Heap profiler confirms the leak",
"Database EXPLAIN shows full table scan",
"N+1 query pattern detected in APM",
"Garbage collector running every 500ms"],
"impact": ["Affects power users with large accounts",
"All users experience slowness during peak hours",
"Requires manual restart to recover",
"Operational overhead: scheduled restarts every 4 hours"],
"workaround": ["Workaround: export data and use offline tools.",
"Workaround: scheduled restarts every 4 hours.",
"No workaround β users just wait.",
"Workaround: paginate results (but UX is degraded)."],
},
"answer_template": {
"severe": {"priority": "P1", "labels": ["bug", "performance"],
"assigned_team": "backend", "milestone": "v2.1"},
"moderate": {"priority": "P2", "labels": ["bug", "performance"],
"assigned_team": "backend", "milestone": "v2.1"},
},
"severity_keywords": {
"severe": ["OOM", "100%", "manual restart", "timeout", "No workaround",
"all users", "never frees"],
"moderate": ["Workaround", "power users", "paginate"],
},
},
"ui_bug": {
"titles": [
"{ui_element} breaks layout on {browser}",
"{ui_element} not rendering correctly in {mode}",
"Responsive layout broken on {device}",
"{feature} toggle not persisting across {context}",
"Accessibility: {ui_element} missing {a11y_attr}",
],
"bodies": [
"Switching to {mode} on {browser} causes {ui_element} to {visual_issue}. "
"{other_browsers}. {workaround}",
"On {device}, the {ui_element} is {visual_issue}. "
"Tested on {browser}. {impact}. {workaround}",
],
"vars": {
"ui_element": ["navigation bar", "sidebar menu", "modal dialog",
"dropdown selector", "data table", "footer",
"toast notifications", "breadcrumb trail"],
"browser": ["Safari 16", "Firefox ESR", "Chrome on Android",
"Edge on Windows", "iOS Safari", "Samsung Internet"],
"mode": ["dark mode", "high contrast mode", "RTL layout",
"compact view", "print view"],
"device": ["iPhone SE", "tablets in portrait", "screens below 768px",
"ultra-wide monitors", "4K displays"],
"feature": ["dark mode", "compact view", "language preference",
"notification settings"],
"context": ["page reloads", "different tabs", "sessions",
"browser restarts"],
"visual_issue": ["overlap the main content", "disappear entirely",
"render with incorrect colors", "become unclickable",
"overflow beyond the viewport"],
"other_browsers": ["Chrome and Firefox are unaffected.",
"Only reproducible on this specific browser.",
"Affects all WebKit-based browsers."],
"a11y_attr": ["ARIA labels", "keyboard focus indicators",
"screen reader text", "proper heading hierarchy"],
"impact": ["Cosmetic issue, no functional impact.",
"Users cannot access the affected feature.",
"Usability is degraded but the feature works."],
"workaround": ["Workaround: use a different browser.",
"Workaround: manually resize the window.",
"No workaround for this browser.",
"Workaround: disable the feature in settings."],
},
"answer_template": {
"severe": {"priority": "P2", "labels": ["bug", "ux"],
"assigned_team": "frontend", "milestone": "v2.1"},
"moderate": {"priority": "P3", "labels": ["bug", "ux"],
"assigned_team": "frontend", "milestone": "backlog"},
},
"severity_keywords": {
"severe": ["cannot access", "unclickable", "disappear", "No workaround"],
"moderate": ["Cosmetic", "different browser", "resize"],
},
},
"data_corruption": {
"titles": [
"Race condition in {feature}: {consequence}",
"Data inconsistency in {feature} under concurrent writes",
"{export_format} export produces corrupted output for {edge_case}",
"Stale data served from cache after {trigger}",
"Duplicate records created when {trigger}",
],
"bodies": [
"Under concurrent load, {feature} can {consequence} due to a race condition "
"in {root_cause}. Frequency: {frequency}. {impact}. {workaround}",
"When {feature} data contains {edge_case}, the exported {export_format} file "
"is corrupted and cannot be {consumer}. {impact}. {workaround}",
],
"vars": {
"feature": ["file upload", "order processing", "user registration",
"inventory update", "comment system", "permission assignment"],
"consequence": ["files occasionally overwrite each other",
"orders are duplicated or lost",
"users get assigned wrong permissions",
"inventory counts become negative"],
"root_cause": ["temp file naming logic", "lack of database locking",
"non-atomic read-modify-write cycle",
"missing unique constraint"],
"frequency": ["approximately 1 in 10,000 operations",
"consistently under 50+ concurrent users",
"intermittently β hard to reproduce",
"every time the batch job runs"],
"edge_case": ["non-ASCII characters (e.g., cafΓ©, naΓ―ve)",
"values containing commas or quotes",
"null or empty fields",
"timestamps crossing DST boundaries"],
"export_format": ["CSV", "Excel", "JSON", "PDF"],
"consumer": ["opened in Excel", "parsed by downstream services",
"imported back into the system"],
"trigger": ["double-clicking the submit button",
"cache TTL expires during a write operation",
"two users edit the same record simultaneously",
"the nightly sync job overlaps with user activity"],
"impact": ["Potential data loss confirmed.",
"No data loss confirmed yet, but risk exists.",
"Affects users with international data.",
"Breaks downstream pipeline processing."],
"workaround": ["Workaround: enable sequential mode in settings.",
"Workaround: manually re-export after cleanup.",
"No reliable workaround β data must be manually verified.",
"Workaround: add a mutex lock externally (operational overhead)."],
},
"answer_template": {
"severe": {"priority": "P1", "labels": ["bug", "data-integrity"],
"assigned_team": "backend", "milestone": "v2.1"},
"moderate": {"priority": "P2", "labels": ["bug", "data-integrity"],
"assigned_team": "backend", "milestone": "v2.1"},
},
"severity_keywords": {
"severe": ["data loss", "No reliable workaround", "consistently",
"permissions", "overwrite", "negative"],
"moderate": ["No data loss", "intermittently", "sequential mode",
"re-export", "non-ASCII"],
},
},
"documentation": {
"titles": [
"Typo in {location}",
"Outdated {doc_type} on {page}",
"Missing documentation for {feature}",
"Incorrect {doc_element} in {location}",
],
"bodies": [
"There is a {issue_type} on the {page}: {detail}. No functional impact, "
"purely cosmetic. {extra}",
"The {doc_type} for {feature} is {issue_type}. {detail}. {extra}",
],
"vars": {
"location": ["homepage docs", "API reference", "README", "changelog",
"contributing guide", "onboarding wiki"],
"doc_type": ["installation guide", "API documentation", "changelog",
"migration guide", "code comments"],
"page": ["landing page", "docs homepage", "getting started page",
"FAQ section", "footer"],
"feature": ["new webhook API", "batch processing endpoint",
"SSO integration", "rate limiting"],
"doc_element": ["code example", "endpoint URL", "parameter description",
"copyright year", "version number"],
"issue_type": ["a typo", "outdated", "missing", "incorrect", "misleading"],
"detail": ["'Welccome' should be 'Welcome'",
"references removed v1.x API that no longer exists",
"completely undocumented despite being a core feature",
"shows 'Β© 2022' but should be 'Β© 2024'",
"the curl example uses the wrong HTTP method"],
"extra": ["", "Low priority β does not block any workflow.",
"New users have reported confusion.",
"Only noticed by contributors reading source code."],
},
"answer_template": {
"default": {"priority": "P3", "labels": ["documentation"],
"assigned_team": "devx", "milestone": "backlog"},
},
"severity_keywords": {"default": []},
},
"api_bug": {
"titles": [
"API rate limiter {issue} after {trigger}",
"{endpoint} returns {status_code} instead of {expected_code}",
"Pagination broken on {endpoint}: {symptom}",
"Webhook delivery {issue} for {event_type} events",
"API versioning: {endpoint} behaves differently on v1 vs v2",
],
"bodies": [
"After receiving a {status_code} response, {consequence}. "
"The {root_cause}. {impact}. {workaround}",
"The {endpoint} endpoint {symptom} when {trigger}. "
"Expected behavior: {expected}. Actual: {actual}. {impact}.",
],
"vars": {
"endpoint": ["/api/users", "/api/search", "/api/export",
"/api/webhooks", "/api/billing", "/api/analytics"],
"issue": ["blocks legitimate users", "fails silently",
"returns incorrect retry headers", "drops events"],
"trigger": ["a 429 error", "rate limit window resets",
"a burst of requests from CI/CD", "server restart"],
"status_code": ["429", "500", "502", "504", "403"],
"expected_code": ["200", "201", "204", "404"],
"symptom": ["returns duplicate entries",
"skips items between pages",
"returns empty page despite more data existing"],
"event_type": ["payment.completed", "user.created",
"subscription.cancelled", "deployment.finished"],
"consequence": ["legitimate users remain blocked for 1 hour",
"data is silently lost with no error",
"downstream services receive stale data"],
"root_cause": ["unblock logic has a bug β it never clears the blocked flag",
"cursor-based pagination uses wrong sort order",
"retry-after header reports seconds instead of milliseconds"],
"expected": ["200 OK with paginated results",
"successful delivery with retry on failure",
"proper rate limit reset after window expires"],
"actual": ["empty response with 200 status",
"permanent block until manual intervention",
"events dropped without any error log"],
"impact": ["Affects CI/CD pipelines hitting the API.",
"External integrations break silently.",
"Customer-facing dashboards show wrong data.",
"Retry-After header causes clients to wait too long."],
"workaround": ["Workaround: manually clear Redis key.",
"Workaround: add client-side deduplication.",
"No workaround β requires server-side fix.",
"Workaround: pin API version to v1 in headers."],
},
"answer_template": {
"severe": {"priority": "P1", "labels": ["bug", "api"],
"assigned_team": "backend", "milestone": "v2.1"},
"moderate": {"priority": "P2", "labels": ["bug", "api"],
"assigned_team": "backend", "milestone": "v2.1"},
},
"severity_keywords": {
"severe": ["silently lost", "permanent block", "No workaround",
"dropped", "external integrations"],
"moderate": ["Workaround", "pin API", "deduplication"],
},
},
}
# The original handcrafted bugs — kept as a gold-standard subset
# Hand-written bug reports grouped by difficulty ("easy", "medium", "hard").
# Each difficulty holds:
#   "bugs"    - list of BugReport instances
#   "answers" - bug id -> expected triage fields for that bug
# The expected fields grow with difficulty: "easy" lists only a priority,
# "medium" adds labels and assigned_team, "hard" also adds a milestone.
_HANDCRAFTED_BUGS = {
    "easy": {
        "bugs": [
            BugReport(
                id="easy-001",
                title="App crashes on login with correct credentials",
                body="When I enter my correct username and password, the app crashes immediately. "
                     "This started after the v2.0 release. Affects 100% of users. "
                     "No workaround exists — users cannot log in at all.",
                author="user123",
                labels_hint=[],
                comments=["Confirmed on iOS and Android.", "Happens every time."],
                severity_signals=["100% of users", "crashes", "no workaround"],
                stack_trace="NullPointerException at AuthController.java:87",
                affected_component="auth-service",
            ),
            BugReport(
                id="easy-002",
                title="Typo in documentation homepage",
                body="There is a typo on the homepage docs: 'Welccome' should be 'Welcome'. "
                     "No functional impact, purely cosmetic.",
                author="docs_fan",
                labels_hint=["documentation"],
                comments=[],
                severity_signals=["cosmetic", "no functional impact"],
                stack_trace="",
                affected_component="docs",
            ),
            BugReport(
                id="easy-003",
                title="Dashboard loads slowly for large datasets",
                body="When a dataset has more than 10k rows, the dashboard takes 30+ seconds to load. "
                     "Workaround: export data and use offline tools. Affects power users only.",
                author="power_user",
                labels_hint=["performance"],
                comments=["Noticed after the last deploy.", "CPU spikes to 100%."],
                severity_signals=["workaround exists", "power users only"],
                stack_trace="",
                affected_component="dashboard",
            ),
            BugReport(
                id="easy-004",
                title="Email notifications not sent after password reset",
                body="Users who reset their password do not receive the confirmation email. "
                     "SMTP logs show the job is queued but never dispatched. "
                     "Affects all users attempting password reset.",
                author="support_team",
                labels_hint=["bug"],
                comments=["Reported by 12 users this week.",
                          "Started after email service migration."],
                severity_signals=["all users", "never dispatched"],
                stack_trace="",
                affected_component="email-service",
            ),
            BugReport(
                id="easy-005",
                title="Incorrect copyright year in footer",
                body="The footer shows '© 2022' but it should be '© 2024'. "
                     "No functional impact.",
                author="intern_dev",
                labels_hint=["documentation"],
                comments=[],
                severity_signals=["no functional impact"],
                stack_trace="",
                affected_component="frontend",
            ),
        ],
        "answers": {
            "easy-001": {"priority": "P0"},
            "easy-002": {"priority": "P3"},
            "easy-003": {"priority": "P2"},
            "easy-004": {"priority": "P1"},
            "easy-005": {"priority": "P3"},
        },
    },
    "medium": {
        "bugs": [
            BugReport(
                id="med-001",
                title="Payment fails silently on checkout",
                body="Checkout completes without error but payment is never charged. "
                     "No error shown to user. Stripe logs show declined transaction. "
                     "Direct revenue loss — every failed checkout is a lost sale.",
                author="store_owner",
                labels_hint=["bug"],
                comments=["Revenue impact confirmed.", "Happening since Tuesday."],
                severity_signals=["revenue loss", "silently", "every failed checkout"],
                stack_trace="Stripe API: card_declined at PaymentService.py:145",
                affected_component="payment-service",
            ),
            BugReport(
                id="med-002",
                title="Search results include deleted posts",
                body="Deleted blog posts still appear in search results for up to 24 hours. "
                     "Users can read content that was explicitly removed by moderators. "
                     "Potential GDPR violation if deleted content belongs to EU users.",
                author="moderator_jane",
                labels_hint=[],
                comments=["GDPR concern — deleted content still visible."],
                severity_signals=["GDPR violation", "deleted content visible"],
                stack_trace="",
                affected_component="search-index",
            ),
            BugReport(
                id="med-003",
                title="Dark mode toggle breaks layout on Safari",
                body="Switching to dark mode on Safari 16 causes nav bar to overlap content. "
                     "Chrome and Firefox unaffected. Workaround: use a different browser.",
                author="safari_user",
                labels_hint=["bug", "ux"],
                comments=["Only on Safari, not Chrome/Firefox."],
                severity_signals=["workaround exists", "single browser"],
                stack_trace="",
                affected_component="frontend-css",
            ),
            BugReport(
                id="med-004",
                title="CSV export produces corrupted file for non-ASCII characters",
                body="When table data contains accented characters (e.g. café, naïve), "
                     "the exported CSV file is corrupted and cannot be opened in Excel. "
                     "Affects users with international data.",
                author="data_analyst",
                labels_hint=["bug"],
                comments=["Encoding issue — UTF-8 not respected.",
                          "Workaround: manual copy-paste."],
                severity_signals=["corrupted", "workaround exists"],
                stack_trace="",
                affected_component="export-service",
            ),
            BugReport(
                id="med-005",
                title="API rate limiter blocks legitimate users after 429 error",
                body="After receiving a 429 Too Many Requests response, legitimate users "
                     "remain blocked for 1 hour even after the rate limit window resets. "
                     "The unblock logic has a bug — it never clears the blocked flag.",
                author="api_user",
                labels_hint=["bug"],
                comments=["Affects CI/CD pipelines hitting the API.",
                          "Retry-After header is wrong."],
                severity_signals=["permanent block", "never clears", "bug in logic"],
                stack_trace="",
                affected_component="api-gateway",
            ),
        ],
        "answers": {
            "med-001": {"priority": "P0", "labels": ["bug", "payments"], "assigned_team": "backend"},
            "med-002": {"priority": "P1", "labels": ["bug", "security"], "assigned_team": "security"},
            "med-003": {"priority": "P2", "labels": ["bug", "ux"], "assigned_team": "frontend"},
            "med-004": {"priority": "P2", "labels": ["bug", "data-integrity"], "assigned_team": "backend"},
            "med-005": {"priority": "P1", "labels": ["bug", "performance"], "assigned_team": "backend"},
        },
    },
    "hard": {
        "bugs": [
            BugReport(
                id="hard-001",
                title="SQL injection vulnerability in search endpoint",
                body="The /api/search endpoint does not sanitize inputs. "
                     "Crafted queries can dump the entire user table including password hashes. "
                     "PoC attached. Verified on production. Treat as confidential — "
                     "do not discuss publicly until patched.",
                author="security_researcher",
                labels_hint=[],
                comments=["Critical. Affects production.", "Do not discuss publicly."],
                severity_signals=["SQL injection", "password hashes", "production",
                                  "confidential"],
                stack_trace="",
                affected_component="search-api",
            ),
            BugReport(
                id="hard-002",
                title="Memory leak in background job processor causes OOM after 6 hours",
                body="The job processor allocates ~50MB per job and never frees it. "
                     "Server runs out of memory every 6 hours, requiring a manual restart. "
                     "Heap profiler confirms leak introduced in v1.9. "
                     "Workaround: scheduled restarts every 4 hours (operational overhead).",
                author="devops_alice",
                labels_hint=["performance"],
                comments=["Verified with heap profiler.", "Started in v1.9."],
                severity_signals=["memory leak", "OOM", "manual restart", "never frees"],
                stack_trace="HeapDump: JobProcessor.process() -> 50MB/call, never GC'd",
                affected_component="job-processor",
            ),
            BugReport(
                id="hard-003",
                title="Race condition in file upload: files occasionally overwrite each other",
                body="Under concurrent load, two users uploading simultaneously can get "
                     "each other's files due to a race condition in the temp file naming logic. "
                     "Frequency: approximately 1 in 10,000 uploads under normal load. "
                     "No data loss confirmed yet and a workaround exists: "
                     "enable sequential upload mode in settings (disabled by default). "
                     "Risk is low-probability but affects data integrity.",
                author="qa_bot",
                labels_hint=["bug"],
                comments=["Reproduced with locust at 50 concurrent users.",
                          "Sequential mode avoids it."],
                severity_signals=["race condition", "data integrity",
                                  "workaround exists", "low-probability"],
                stack_trace="",
                affected_component="file-upload",
            ),
            BugReport(
                id="hard-004",
                title="Auth token not invalidated after password change",
                body="When a user changes their password, existing JWT tokens remain valid "
                     "for up to 24 hours. An attacker who previously stole a token can "
                     "continue to access the account even after the password is reset. "
                     "This is a session management security vulnerability.",
                author="pentest_team",
                labels_hint=["security"],
                comments=["Verified on staging.",
                          "OWASP A07 — Identification and Authentication Failures."],
                severity_signals=["JWT not invalidated", "attacker", "security vulnerability",
                                  "stolen token"],
                stack_trace="",
                affected_component="auth-service",
            ),
            BugReport(
                id="hard-005",
                title="Infinite loop in webhook retry logic causes CPU spike",
                body="When a webhook endpoint returns a 500 error, the retry logic enters "
                     "an infinite loop with no backoff or retry cap. "
                     "This causes CPU to spike to 100% within minutes and starves other services. "
                     "Triggered in production twice this week. Requires process kill to recover.",
                author="oncall_eng",
                labels_hint=["bug", "performance"],
                comments=["PagerDuty alert fired twice.",
                          "Needs exponential backoff + max retry cap."],
                severity_signals=["infinite loop", "100%", "production",
                                  "process kill", "starves other services"],
                stack_trace="Thread dump: WebhookRetrier.retry() — recursive call, no exit",
                affected_component="webhook-service",
            ),
        ],
        "answers": {
            "hard-001": {
                "priority": "P0", "labels": ["bug", "security"],
                "assigned_team": "security", "milestone": "hotfix",
            },
            "hard-002": {
                "priority": "P1", "labels": ["bug", "performance"],
                "assigned_team": "backend", "milestone": "v2.1",
            },
            "hard-003": {
                "priority": "P1", "labels": ["bug", "data-integrity"],
                "assigned_team": "backend", "milestone": "v2.1",
            },
            "hard-004": {
                "priority": "P0", "labels": ["bug", "security"],
                "assigned_team": "security", "milestone": "hotfix",
            },
            "hard-005": {
                "priority": "P0", "labels": ["bug", "performance"],
                "assigned_team": "backend", "milestone": "hotfix",
            },
        },
    },
}
# Public TASKS name kept for backward compatibility. It is bound to the same
# dict object as _HANDCRAFTED_BUGS (no copy is made), so a change made through
# either name is visible through both.
TASKS = _HANDCRAFTED_BUGS
# ---------------------------------------------------------------------------
# PROCEDURAL BUG GENERATOR
# ---------------------------------------------------------------------------
def _determine_severity(text: str, keywords: Dict[str, list]) -> str:
"""Check which severity level the generated text matches."""
text_lower = text.lower()
for level, kws in keywords.items():
if level == "default":
return "default"
hits = sum(1 for kw in kws if kw.lower() in text_lower)
if hits >= 1:
return level
# fallback to first non-default key
return list(keywords.keys())[0] if keywords else "moderate"
def generate_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Generate a procedural bug report together with its expected answer.

    A category is drawn with difficulty-dependent weights, the matching entry
    of ``_BUG_TEMPLATES`` is filled with randomly chosen variable values, and
    the answer is taken from the template according to the severity that
    ``_determine_severity`` detects in the generated text. All random draws
    come from a single ``random.Random(seed)`` instance.

    Args:
        task_key: ``"easy"``, ``"medium"`` or ``"hard"``. Any other value uses
            the ``"medium"`` category weights and keeps the full answer.
        seed: Seed for the random generator; also the number embedded in the
            bug ID. When ``None``, the ID number is drawn at random instead.

    Returns:
        A ``(bug, answer)`` tuple. For ``"easy"`` the answer holds only
        ``"priority"``; for ``"medium"`` the ``"milestone"`` entry is removed.
    """
    rng = random.Random(seed)

    # Category weights per difficulty level.
    weights = {
        "easy": {"documentation": 3, "ui_bug": 3, "performance": 2,
                 "crash": 1, "api_bug": 1},
        "medium": {"crash": 3, "performance": 3, "api_bug": 2,
                   "data_corruption": 2, "ui_bug": 1},
        "hard": {"security": 4, "crash": 3, "data_corruption": 3,
                 "performance": 2, "api_bug": 2},
    }
    task_weights = weights.get(task_key, weights["medium"])
    # Repeat each category `weight` times so a uniform choice is weighted.
    categories = [
        cat for cat, weight in task_weights.items() for _ in range(weight)
    ]
    category = rng.choice(categories)
    template = _BUG_TEMPLATES[category]

    # One random value per template variable.
    chosen_vars = {
        var_name: rng.choice(options)
        for var_name, options in template["vars"].items()
    }

    title_tmpl = rng.choice(template["titles"])
    body_tmpl = rng.choice(template["bodies"])

    def safe_format(tmpl, vars_dict):
        # Plain substitution rather than str.format: placeholders without a
        # chosen value are left untouched instead of raising KeyError.
        result = tmpl
        for key, value in vars_dict.items():
            # str() keeps non-string template values from raising TypeError.
            result = result.replace("{" + key + "}", str(value))
        return result

    title = safe_format(title_tmpl, chosen_vars)
    body = safe_format(body_tmpl, chosen_vars)

    # Build the ID from the seed. `is not None` (not truthiness) so that
    # seed=0 is honoured, matching how sample_bug treats its seed.
    id_number = seed if seed is not None else rng.randint(0, 999999)
    bug_id = f"gen-{id_number:06d}"

    authors = ["user_report", "qa_engineer", "support_team", "dev_oncall",
               "security_bot", "customer_jane", "automated_monitor",
               "intern_dev", "senior_eng", "pm_feedback"]
    author = rng.choice(authors)

    comment_templates = [
        "Confirmed on our side.", "Reproduced in staging.",
        "Multiple reports from users.", "Started after last deployment.",
        "Urgent β customer escalation.", "Low priority β no user complaints.",
        "Needs investigation.", "Related to ticket from last sprint.",
    ]
    num_comments = rng.randint(0, 3)
    comments = rng.sample(comment_templates,
                          min(num_comments, len(comment_templates)))

    # Severity is derived from everything the agent will read; it selects
    # which answer template applies (first template if the level is unknown).
    full_text = f"{title} {body} {' '.join(comments)}"
    severity_kws = template.get("severity_keywords", {})
    severity = _determine_severity(full_text, severity_kws)
    answer_templates = template["answer_template"]
    answer = dict(answer_templates.get(severity, list(answer_templates.values())[0]))

    # Trim the answer to the fields that are graded at this difficulty.
    if task_key == "easy":
        answer = {"priority": answer["priority"]}
    elif task_key == "medium":
        answer.pop("milestone", None)

    bug = BugReport(
        id=bug_id,
        title=title,
        body=body,
        author=author,
        labels_hint=rng.sample(["bug", "needs-triage", "reported"], rng.randint(0, 2)),
        comments=comments,
        severity_signals=[],
        stack_trace="",
        affected_component=chosen_vars.get("service", chosen_vars.get("endpoint", "")),
    )
    return bug, answer
# ---------------------------------------------------------------------------
# BUG SAMPLER — randomly mixes handcrafted bugs with procedural ones for variety
# ---------------------------------------------------------------------------
def sample_bug(task_key: str, seed: int = None) -> Tuple[BugReport, dict]:
    """Pick one bug and its answer, either handcrafted or procedural.

    A handcrafted bug is returned when a random draw falls below 0.4 and
    ``task_key`` has an entry in ``_HANDCRAFTED_BUGS``; in every other case the
    bug comes from ``generate_bug``. The procedural path reuses ``seed`` when
    one is given and otherwise draws a seed in ``[0, 999999]``.
    """
    picker = random.Random(seed)
    # Draw unconditionally so the random stream does not depend on task_key.
    roll = picker.random()

    if roll >= 0.4 or task_key not in _HANDCRAFTED_BUGS:
        if seed is None:
            gen_seed = picker.randint(0, 999999)
        else:
            gen_seed = seed
        return generate_bug(task_key, seed=gen_seed)

    pool = _HANDCRAFTED_BUGS[task_key]
    chosen = picker.choice(pool["bugs"])
    return chosen, pool["answers"][chosen.id]
# ---------------------------------------------------------------------------
# GRADING — with semantic label matching
# ---------------------------------------------------------------------------
PRIORITY_ORDER = {"P0": 0, "P1": 1, "P2": 2, "P3": 3}
def _priority_score(predicted: str, correct: str) -> float:
"""Score priority assignment with partial credit for near-misses."""
if predicted == correct:
return 0.95
pred_rank = PRIORITY_ORDER.get(predicted, 99)
corr_rank = PRIORITY_ORDER.get(correct, 99)
diff = abs(pred_rank - corr_rank)
if diff == 1:
return 0.5
elif diff == 2:
return 0.2
return 0.05
def _normalize_label(label: str) -> str:
    """Map a label onto its canonical name from ``LABEL_SYNONYMS``.

    The label is lower-cased and stripped, then checked against each synonym
    group in the table's iteration order. The first group whose canonical name
    equals the label, or whose synonyms contain it, supplies the result; a
    label that belongs to no group is returned in its cleaned form.
    """
    cleaned = label.lower().strip()
    candidates = (
        canonical
        for canonical, synonyms in LABEL_SYNONYMS.items()
        if cleaned == canonical or cleaned in synonyms
    )
    return next(candidates, cleaned)
def _label_score(predicted: List[str], correct: List[str]) -> float:
    """Score predicted labels against expected ones after normalization.

    Both lists are normalized with ``_normalize_label`` and compared as sets.
    The score is the size of the intersection divided by the size of the
    union, clamped to ``[0.05, 0.95]``. When no labels are expected, the score
    is 0.95 regardless of the prediction.
    """
    guessed = {_normalize_label(item) for item in predicted}
    expected = {_normalize_label(item) for item in correct}
    if not expected:
        return 0.95

    shared = guessed & expected
    combined = guessed | expected
    if combined:
        overlap = len(shared) / len(combined)
    else:
        overlap = 0.0
    # Keep the score strictly inside the open interval (0, 1).
    return max(0.05, min(0.95, overlap))
def _reasoning_score(reasoning: str, answer: dict) -> float:
"""Bonus for reasoning that mentions relevant signals."""
if not reasoning or len(reasoning.strip()) < 10:
return 0.0
key_signals = {
"P0": ["production", "all users", "data loss", "security", "crash",
"revenue", "injection", "vulnerability", "100%"],
"P1": ["major", "significant", "no workaround", "broken",
"gdpr", "blocked", "leak", "never"],
"P2": ["degraded", "workaround", "partial", "slow",
"affected", "power users"],
"P3": ["minor", "cosmetic", "docs", "typo", "low",
"no functional impact"],
}
expected_priority = answer.get("priority", "P2")
signals = key_signals.get(expected_priority, [])
reasoning_lower = reasoning.lower()
hits = sum(1 for s in signals if s in reasoning_lower)
return min(0.15, hits * 0.05)
def _same_ignoring_case(actual, expected) -> bool:
    """Compare two values case-insensitively, treating ``None`` as ``""``."""
    return str(actual or "").lower() == str(expected or "").lower()


def grade_action(task_key: str, bug: BugReport, action: TriageAction,
                 answer: dict = None) -> Tuple[float, str]:
    """Grade the agent's triage action against the correct answer.

    Scoring by ``task_key``:
        * ``"easy"``: priority score only.
        * ``"medium"``: 0.45 priority + 0.40 labels + 0.15 team.
        * anything else (hard): 0.35 priority + 0.30 labels + 0.20 team
          + 0.15 milestone, minus 0.15 when a bug that belongs to the
          security team was assigned elsewhere.
    The reasoning bonus from ``_reasoning_score`` is added in every mode and
    the final score is clamped to ``[0.01, 0.99]`` and rounded to 3 decimals.

    Args:
        task_key: Difficulty level that selects the scoring formula.
        bug: The bug that was triaged; only ``bug.id`` is read here, and only
            to look up a handcrafted answer when ``answer`` is not given.
        action: The agent's decision (``priority``, ``labels``,
            ``assigned_team``, ``milestone``, ``reasoning``).
        answer: Expected values. When ``None``, the answer is looked up in
            ``_HANDCRAFTED_BUGS``. ``"priority"`` is required; a missing team
            or milestone is treated as an empty expected value rather than
            raising ``KeyError``.

    Returns:
        ``(score, feedback)``. If no answer key can be found the result is
        ``(0.5, "No answer key found for this bug.")``.
    """
    # Backward compatibility: look up answer from handcrafted if not provided.
    if answer is None:
        if task_key in _HANDCRAFTED_BUGS and bug.id in _HANDCRAFTED_BUGS[task_key]["answers"]:
            answer = _HANDCRAFTED_BUGS[task_key]["answers"][bug.id]
        else:
            return 0.5, "No answer key found for this bug."

    feedback_parts = []
    reasoning_bonus = _reasoning_score(action.reasoning, answer)
    p_score = _priority_score(action.priority, answer["priority"])

    if task_key == "easy":
        # Distinct markers for hit / near-miss / miss.
        symbol = "✓" if p_score >= 0.9 else "~" if p_score >= 0.4 else "✗"
        feedback_parts.append(
            f"Priority: {symbol} (got {action.priority}, expected {answer['priority']})")
        if reasoning_bonus > 0:
            feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")
        score = p_score + reasoning_bonus
    else:
        l_score = _label_score(action.labels or [], answer.get("labels") or [])
        expected_team = answer.get("assigned_team") or ""
        feedback_parts.append(
            f"Priority: {p_score:.2f} (got {action.priority}, expected {answer['priority']})")
        feedback_parts.append(f"Labels: {l_score:.2f} (semantic match)")

        if task_key == "medium":
            # Medium only credits the team when the answer key names one.
            team_ok = bool(expected_team) and _same_ignoring_case(
                action.assigned_team, expected_team)
            t_score = 0.95 if team_ok else 0.05
            score = 0.45 * p_score + 0.40 * l_score + 0.15 * t_score + reasoning_bonus
            feedback_parts.append(
                f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
            if reasoning_bonus > 0:
                feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")
        else:  # hard
            expected_milestone = answer.get("milestone") or ""
            t_score = 0.95 if _same_ignoring_case(action.assigned_team, expected_team) else 0.05
            m_score = 0.95 if _same_ignoring_case(action.milestone, expected_milestone) else 0.05
            score = 0.35 * p_score + 0.30 * l_score + 0.20 * t_score + 0.15 * m_score + reasoning_bonus
            feedback_parts.append(
                f"Team: {t_score:.2f} (got {action.assigned_team}, expected {expected_team})")
            feedback_parts.append(
                f"Milestone: {m_score:.2f} (got {action.milestone}, expected {expected_milestone})")
            if reasoning_bonus > 0:
                feedback_parts.append(f"Reasoning bonus: +{reasoning_bonus:.2f}")
            # Security escalation penalty: routing a security bug to another
            # team costs more than an ordinary team mismatch.
            if (expected_team.lower() == "security"
                    and not _same_ignoring_case(action.assigned_team, "security")):
                score -= 0.15
                feedback_parts.append("β Security escalation missed (-0.15)")

    # Keep the score strictly inside the open interval (0, 1).
    score = max(0.01, min(0.99, score))
    return round(score, 3), " | ".join(feedback_parts)
# ---------------------------------------------------------------------------
# NAMED GRADER FUNCTIONS — referenced by openenv.yaml
# ---------------------------------------------------------------------------
def priority_match(*args, **kwargs):
    """Named grader for the easy task: score the priority only.

    Expects ``(bug, action)`` as the first two positional arguments and grades
    them with ``grade_action`` in ``"easy"`` mode. Keyword arguments are
    accepted but ignored. With fewer than two positional arguments the neutral
    score 0.5 is returned.
    """
    if len(args) >= 2:
        graded, _feedback = grade_action("easy", args[0], args[1])
        return float(graded)
    return 0.5
def priority_label_team(*args, **kwargs):
    """Named grader for the medium task: priority, labels and team.

    Expects ``(bug, action)`` as the first two positional arguments and grades
    them with ``grade_action`` in ``"medium"`` mode. Keyword arguments are
    accepted but ignored. With fewer than two positional arguments the neutral
    score 0.5 is returned.
    """
    if len(args) >= 2:
        graded, _feedback = grade_action("medium", args[0], args[1])
        return float(graded)
    return 0.5
def full_triage(*args, **kwargs):
    """Named grader for the hard task: priority, labels, team and milestone.

    Expects ``(bug, action)`` as the first two positional arguments and grades
    them with ``grade_action`` in ``"hard"`` mode. Keyword arguments are
    accepted but ignored. With fewer than two positional arguments the neutral
    score 0.5 is returned.
    """
    if len(args) >= 2:
        graded, _feedback = grade_action("hard", args[0], args[1])
        return float(graded)
    return 0.5
# Names exported by `from <module> import *`: the three named graders, the
# sampling/generation/grading entry points, and the two public data tables.
__all__ = [
    "priority_match",
    "priority_label_team",
    "full_triage",
    "sample_bug",
    "generate_bug",
    "grade_action",
    "TASKS",
    "LABEL_SYNONYMS",
]