Spaces:
Sleeping
Sleeping
"""
Synthetic bug report dataset for the Nexus Platform — a fictional data analytics SaaS.
Issues span 4 severity levels and include duplicate groups for medium/hard tasks.
All ground-truth answers (severity, duplicate sets, expected labels, response keywords)
are embedded here so graders are fully deterministic.
"""
| from typing import Dict, List, Any | |
# ── SEVERITY ordering (for partial-credit distance scoring) ───────────────────
# Listed from most severe to least severe; the index of a severity in this list
# is the position used when comparing two severities.
SEVERITY_ORDER: List[str] = ["critical", "high", "medium", "low"]
# ── AVAILABLE LABELS ──────────────────────────────────────────────────────────
# Closed label vocabulary. Every "labels" / "expected_labels" value used by the
# issue bank and the scenarios in this file is drawn from this list.
AVAILABLE_LABELS: List[str] = [
    "bug", "security", "data-loss", "performance", "ui", "ux",
    "api", "authentication", "database", "memory-leak", "crash",
    "export", "notifications", "mobile", "documentation", "enhancement",
    "duplicate", "wontfix", "needs-investigation", "regression",
    "file-upload", "dashboard", "reporting", "billing", "integrations",
]
# ── CORE ISSUE BANK ───────────────────────────────────────────────────────────
# Each issue has ground-truth severity and labels for grading.
#
# Keyed by issue id. Every entry carries the same fields:
#   id, title, description, reporter, created_at (YYYY-MM-DD), component,
#   reproduction_steps, environment_info, severity, labels.
# "severity" and "labels" are the ground truth; the remaining fields are the
# report text itself.
ISSUE_BANK: Dict[str, Dict[str, Any]] = {

    # ── CRITICAL ──────────────────────────────────────────────────────────────
    "NX-001": {
        "id": "NX-001",
        "title": "SQL injection vulnerability in report search API",
        "description": (
            "The `/api/reports/search?q=` endpoint does not sanitize user input. "
            "Passing `' OR '1'='1` returns all reports regardless of user permissions. "
            "A malicious actor can exfiltrate the entire reports database without authentication."
        ),
        "reporter": "security_researcher_ext",
        "created_at": "2024-03-01",
        "component": "api",
        "reproduction_steps": "curl \"https://app.nexus.io/api/reports/search?q=' OR '1'='1\"",
        "environment_info": "All environments (prod confirmed)",
        "severity": "critical",
        "labels": ["bug", "security", "api"],
    },
    "NX-002": {
        "id": "NX-002",
        "title": "Production data corrupted after v3.2 schema migration",
        "description": (
            "After running the v3.2 database migration on 2024-02-28, approximately 12,000 rows "
            "in the `pipeline_runs` table have NULL values in the `completed_at` column where "
            "valid timestamps previously existed. Affected customers report incorrect billing "
            "calculations and missing audit logs."
        ),
        "reporter": "ops_team",
        "created_at": "2024-02-28",
        "component": "database",
        "reproduction_steps": "SELECT COUNT(*) FROM pipeline_runs WHERE completed_at IS NULL AND created_at < '2024-02-28'",
        "environment_info": "Production only",
        "severity": "critical",
        "labels": ["bug", "data-loss", "database", "regression"],
    },
    "NX-003": {
        "id": "NX-003",
        "title": "JWT tokens from deleted accounts still grant API access",
        "description": (
            "When an admin deletes a user account, their existing JWT tokens remain valid until "
            "natural expiry (24h). A terminated employee can continue accessing all company data "
            "via the API for up to 24 hours after account deletion. Token revocation is not implemented."
        ),
        "reporter": "cto_internal",
        "created_at": "2024-03-05",
        "component": "authentication",
        "reproduction_steps": "1. Create user. 2. Obtain JWT. 3. Delete user via admin panel. 4. Use old JWT → still works.",
        "environment_info": "All environments",
        "severity": "critical",
        "labels": ["bug", "security", "authentication"],
    },
    "NX-004": {
        "id": "NX-004",
        "title": "Scheduled exports silently drop rows when queue exceeds 10k jobs",
        "description": (
            "During peak hours, when the export queue depth exceeds 10,000 jobs, the worker "
            "process silently discards rows from CSV exports to stay within a hardcoded memory "
            "cap. Customers receive 'successful' export emails with incomplete data. No error is logged."
        ),
        "reporter": "enterprise_customer_A",
        "created_at": "2024-03-10",
        "component": "export",
        "reproduction_steps": "Trigger 10,001 concurrent export jobs; inspect row counts in output CSVs.",
        "environment_info": "Production (observed on first Monday of every month)",
        "severity": "critical",
        "labels": ["bug", "data-loss", "export"],
    },

    # ── HIGH ──────────────────────────────────────────────────────────────────
    "NX-010": {
        "id": "NX-010",
        "title": "Application crashes (OOM) when uploading CSV files larger than 500 MB",
        "description": (
            "Uploading any CSV file larger than approximately 500 MB via the web UI causes the "
            "application server to throw an OutOfMemoryError and crash. The upload appears to "
            "buffer the entire file in memory before processing. Server restarts automatically "
            "but the upload is lost and the user receives a generic 502 error."
        ),
        "reporter": "data_team_lead",
        "created_at": "2024-02-10",
        "component": "file-upload",
        "reproduction_steps": "Upload a 600 MB CSV. Observe server log: java.lang.OutOfMemoryError: Java heap space",
        "environment_info": "All environments, JVM heap set to 2GB",
        "severity": "high",
        "labels": ["bug", "crash", "file-upload", "memory-leak"],
    },
    "NX-011": {
        "id": "NX-011",
        "title": "Memory usage grows unbounded in streaming pipeline connector",
        "description": (
            "The Kafka streaming connector leaks memory at approximately 50 MB/hour when "
            "processing high-throughput topics (>100k msgs/sec). After ~8 hours of operation "
            "the pod is OOMKilled. Heapdump analysis points to message offset cache not being "
            "evicted. Requires daily pod restarts as a workaround."
        ),
        "reporter": "platform_engineer",
        "created_at": "2024-02-15",
        "component": "integrations",
        "reproduction_steps": "Run Kafka connector for 8h at >100k msg/sec throughput and monitor RSS.",
        "environment_info": "Kubernetes, all cloud providers",
        "severity": "high",
        "labels": ["bug", "memory-leak", "integrations", "performance"],
    },
    "NX-012": {
        "id": "NX-012",
        "title": "Dashboard chart widgets fail to render for datasets > 1M rows",
        "description": (
            "When a dashboard is connected to a dataset with more than 1 million rows, all "
            "chart widgets display a spinner indefinitely. The browser console shows: "
            "'RangeError: Maximum call stack size exceeded' in chart-renderer.js. "
            "Smaller datasets work fine. This affects ~30% of enterprise customers."
        ),
        "reporter": "product_team",
        "created_at": "2024-02-20",
        "component": "dashboard",
        "reproduction_steps": "Connect any chart widget to a table with >1M rows.",
        "environment_info": "Chrome 121, Firefox 122, Safari 17 — all affected",
        "severity": "high",
        "labels": ["bug", "dashboard", "ui", "performance"],
    },
    "NX-013": {
        "id": "NX-013",
        "title": "Critical alert email notifications not being delivered",
        "description": (
            "Since the SendGrid API key rotation on 2024-02-25, email notifications for "
            "CRITICAL severity pipeline alerts are not being sent. The notification service "
            "logs show 401 Unauthorized errors from SendGrid but silently swallows them "
            "rather than retrying or alerting ops. Customers are missing SLA breach warnings."
        ),
        "reporter": "sre_team",
        "created_at": "2024-02-26",
        "component": "notifications",
        "reproduction_steps": "Trigger a critical pipeline alert; check email — nothing arrives. Check logs for SendGrid 401.",
        "environment_info": "Production only",
        "severity": "high",
        "labels": ["bug", "notifications", "api", "regression"],
    },
    "NX-014": {
        "id": "NX-014",
        "title": "Billing calculation incorrect for pro-rated mid-cycle plan upgrades",
        "description": (
            "When a customer upgrades from Starter to Pro mid-billing-cycle, the system "
            "charges the full Pro price instead of the pro-rated amount. Approximately "
            "230 customers were overcharged in February. Finance has already processed "
            "manual refunds but the bug is still present."
        ),
        "reporter": "billing_team",
        "created_at": "2024-03-02",
        "component": "billing",
        "reproduction_steps": "Upgrade plan on day 15 of a 30-day cycle; inspect next invoice.",
        "environment_info": "Production only",
        "severity": "high",
        "labels": ["bug", "billing", "regression"],
    },

    # ── MEDIUM ────────────────────────────────────────────────────────────────
    "NX-020": {
        "id": "NX-020",
        "title": "Filtered report queries are 10-40x slower after index rebuild",
        "description": (
            "After the maintenance window on 2024-02-18, queries that filter reports by "
            "date range are 10-40x slower (2s → 45s for typical queries). "
            "EXPLAIN ANALYZE shows a seq scan despite the index existing. "
            "A workaround is to run ANALYZE on the affected table manually."
        ),
        "reporter": "backend_dev",
        "created_at": "2024-02-19",
        "component": "database",
        "reproduction_steps": "SELECT * FROM reports WHERE created_at BETWEEN '2024-01-01' AND '2024-02-01' — observe query time.",
        "environment_info": "PostgreSQL 15.2, production",
        "severity": "medium",
        "labels": ["bug", "database", "performance", "regression"],
    },
    "NX-021": {
        "id": "NX-021",
        "title": "Date picker shows incorrect UTC offset for IST (Indian Standard Time) users",
        "description": (
            "Users with their timezone set to IST (UTC+5:30) see the date picker "
            "displaying UTC+5:00 instead of the correct UTC+5:30. This causes scheduled "
            "pipelines to run 30 minutes earlier than configured. Workaround: set timezone "
            "to 'Kolkata' explicitly."
        ),
        "reporter": "customer_support",
        "created_at": "2024-02-22",
        "component": "ui",
        "reproduction_steps": "Set account timezone to IST; create a scheduled pipeline; observe execution time offset.",
        "environment_info": "All browsers, all OSes",
        "severity": "medium",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-022": {
        "id": "NX-022",
        "title": "Export to PDF button missing on mobile devices",
        "description": (
            "The 'Export to PDF' button in the report toolbar is not visible on mobile "
            "viewports (< 768px). The button is present in the DOM but has display:none "
            "applied via a media query with no alternative access path. Mobile users "
            "cannot export reports."
        ),
        "reporter": "mobile_user_feedback",
        "created_at": "2024-02-25",
        "component": "ui",
        "reproduction_steps": "Open any report on a mobile device or resize browser to < 768px; toolbar missing PDF export.",
        "environment_info": "iOS Safari, Android Chrome",
        "severity": "medium",
        "labels": ["bug", "mobile", "ui", "export"],
    },
    "NX-023": {
        "id": "NX-023",
        "title": "Live search results don't update until full page reload",
        "description": (
            "The global search bar (Cmd+K) shows stale results after new content is added. "
            "Search index is only refreshed on full page reload. If a user creates a new "
            "dashboard and immediately searches for it, it won't appear. Refreshing fixes it."
        ),
        "reporter": "power_user",
        "created_at": "2024-03-01",
        "component": "ui",
        "reproduction_steps": "Create a new dashboard; immediately Cmd+K search for it → not found. Reload page; search again → found.",
        "environment_info": "All browsers",
        "severity": "medium",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-024": {
        "id": "NX-024",
        "title": "Bulk delete confirmation modal does not close after action completes",
        "description": (
            "After confirming a bulk delete in the data source manager, the confirmation "
            "modal remains open even though the deletion succeeded. The user must manually "
            "close it. This is a minor UX annoyance but has caused some users to attempt "
            "the deletion twice, leading to a 'not found' error."
        ),
        "reporter": "qa_team",
        "created_at": "2024-03-03",
        "component": "ui",
        "reproduction_steps": "Select 3+ data sources; click Bulk Delete; confirm; observe modal stays open.",
        "environment_info": "Chrome 121",
        "severity": "medium",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-025": {
        "id": "NX-025",
        "title": "API rate limit headers missing on 429 responses",
        "description": (
            "When the API returns a 429 Too Many Requests response, the standard "
            "Retry-After and X-RateLimit-* headers are absent. This forces client "
            "developers to implement fixed backoff instead of respecting server-side "
            "rate limit windows, resulting in thundering herd on resumption."
        ),
        "reporter": "api_integrations_partner",
        "created_at": "2024-03-07",
        "component": "api",
        "reproduction_steps": "Exceed API rate limit; inspect response headers — Retry-After missing.",
        "environment_info": "All environments",
        "severity": "medium",
        "labels": ["bug", "api"],
    },

    # ── LOW ───────────────────────────────────────────────────────────────────
    "NX-030": {
        "id": "NX-030",
        "title": "Typo: 'occured' should be 'occurred' in pipeline error message",
        # The misspelling 'occured' below is the reported defect; do not correct it.
        "description": "Error message reads: 'An unexpected error has occured. Please try again.' — 'occured' is misspelled.",
        "reporter": "attentive_user",
        "created_at": "2024-03-08",
        "component": "ui",
        "reproduction_steps": "Trigger a pipeline failure; observe error toast.",
        "environment_info": "All environments",
        "severity": "low",
        "labels": ["bug", "ui", "documentation"],
    },
    "NX-031": {
        "id": "NX-031",
        "title": "Footer links open in same tab instead of new tab",
        "description": "All links in the footer (Privacy Policy, Terms of Service, Status Page) open in the same browser tab rather than a new tab, interrupting the user's workflow.",
        "reporter": "ux_reviewer",
        "created_at": "2024-03-09",
        "component": "ui",
        "reproduction_steps": "Click any footer link; observe it opens in same tab.",
        "environment_info": "All browsers",
        "severity": "low",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-032": {
        "id": "NX-032",
        "title": "Dark mode: settings panel header uses wrong background color",
        "description": (
            "In dark mode, the settings panel header retains a light grey (#F5F5F5) background "
            "instead of using the dark theme token (#1E1E1E). All other panels are correctly themed."
        ),
        "reporter": "design_team",
        "created_at": "2024-03-10",
        "component": "ui",
        "reproduction_steps": "Enable dark mode; open Settings; observe header background inconsistency.",
        "environment_info": "All browsers",
        "severity": "low",
        "labels": ["bug", "ui"],
    },
    "NX-033": {
        "id": "NX-033",
        "title": "Tooltip delay is 1500ms — should be 300ms per design spec",
        "description": "All UI tooltips have a 1500ms hover delay. The design system spec defines 300ms. This makes the UI feel sluggish.",
        "reporter": "design_system_team",
        "created_at": "2024-03-11",
        "component": "ui",
        "reproduction_steps": "Hover over any button with a tooltip; time the delay.",
        "environment_info": "All environments",
        "severity": "low",
        "labels": ["bug", "ui", "ux"],
    },

    # ── DUPLICATE GROUP A: Large file OOM (same root as NX-010) ───────────────
    "NX-040": {
        "id": "NX-040",
        "title": "Server 502 error when importing large datasets",
        "description": (
            "Every time I try to import my dataset (it's about 700 MB), I get a 502 Bad "
            "Gateway error after about 30 seconds. The import dialog says it's processing "
            "but then just fails with no useful message. Smaller files (under 200 MB) work fine."
        ),
        "reporter": "analyst_user_1",
        "created_at": "2024-02-12",
        "component": "file-upload",
        "reproduction_steps": "Upload a file > 500 MB via the Import Data dialog.",
        "environment_info": "Chrome 121, MacOS 14",
        "severity": "high",
        "labels": ["bug", "crash", "file-upload"],
    },
    "NX-041": {
        "id": "NX-041",
        "title": "Nexus crashes when I upload my Q4 sales data",
        "description": (
            "I have a CSV file with Q4 sales data, about 650 MB. Every single time I try to "
            "upload it the whole app goes down for a few minutes. I see 'Service Unavailable' "
            "in the browser. This is urgent — I need this data loaded for a board presentation tomorrow."
        ),
        "reporter": "sales_director",
        "created_at": "2024-02-14",
        "component": "file-upload",
        "reproduction_steps": "Upload any CSV >= 600MB.",
        "environment_info": "Safari 17, MacOS 14.2",
        "severity": "high",
        "labels": ["bug", "crash", "file-upload"],
    },

    # ── DUPLICATE GROUP B: Kafka memory leak (same root as NX-011) ────────────
    "NX-042": {
        "id": "NX-042",
        "title": "Kafka connector pod gets OOMKilled nightly",
        "description": (
            "Our Kafka connector pod is being killed every night with OOMKilled status in "
            "Kubernetes. We have to restart it every morning. Memory monitoring shows it "
            "steadily grows from ~200 MB at startup to over 2 GB over 8-10 hours. "
            "Topic is running at about 80k messages/sec."
        ),
        "reporter": "devops_lead",
        "created_at": "2024-02-17",
        "component": "integrations",
        "reproduction_steps": "Deploy Kafka connector; monitor memory with kubectl top pod for 8+ hours.",
        "environment_info": "GKE 1.28, k8s OOMKill",
        "severity": "high",
        "labels": ["bug", "memory-leak", "integrations"],
    },
    "NX-043": {
        "id": "NX-043",
        "title": "Stream connector memory keeps growing — possible leak in offset cache",
        "description": (
            "I've been profiling our streaming connector and noticed that the offset tracking "
            "cache never gets cleared. The HashMap holding message offsets just keeps growing "
            "indefinitely. After a thread dump at hour 9, the cache had over 4 million entries. "
            "This seems to be the root cause of the pod OOM issue others have reported."
        ),
        "reporter": "senior_engineer",
        "created_at": "2024-02-19",
        "component": "integrations",
        "reproduction_steps": "Attach a profiler to the connector process; inspect KafkaOffsetCache size over time.",
        "environment_info": "All environments",
        "severity": "high",
        "labels": ["bug", "memory-leak", "integrations", "needs-investigation"],
    },

    # ── DUPLICATE GROUP C: Dashboard chart rendering (same root as NX-012) ────
    "NX-044": {
        "id": "NX-044",
        "title": "Charts just spin forever on our main analytics dashboard",
        "description": (
            "Since upgrading to the Business plan and connecting our full dataset, "
            "every chart on our main analytics dashboard just shows a loading spinner. "
            "We waited 20 minutes and they never loaded. The dataset has about 1.2 million rows. "
            "Dashboard worked fine when we were on the Starter plan with a smaller dataset sample."
        ),
        "reporter": "biz_analyst",
        "created_at": "2024-02-22",
        "component": "dashboard",
        "reproduction_steps": "Connect dashboard to a dataset with > 1M rows; open dashboard.",
        "environment_info": "Chrome 121, Windows 11",
        "severity": "high",
        "labels": ["bug", "dashboard", "ui"],
    },
}
# ── TASK SCENARIOS ────────────────────────────────────────────────────────────
#
# TASK 1 — bug_classify: Agent sees ONE issue, must call classify(severity).
# TASK 2 — duplicate_detection: Agent sees new_issue + backlog; must find duplicates.
# TASK 3 — full_triage: Agent must classify + find dupes + draft response + assign labels.
# Severity-classification scenarios. Each entry is a dict naming one issue
# ("issue_id", a key of ISSUE_BANK) and the severity the agent is expected to
# assign ("correct_severity"); the values repeat that issue's "severity" field.
BUG_CLASSIFY_SCENARIOS: List[Dict[str, str]] = [
    {"issue_id": "NX-001", "correct_severity": "critical"},
    {"issue_id": "NX-003", "correct_severity": "critical"},
    {"issue_id": "NX-010", "correct_severity": "high"},
    {"issue_id": "NX-012", "correct_severity": "high"},
    {"issue_id": "NX-013", "correct_severity": "high"},
    {"issue_id": "NX-014", "correct_severity": "high"},
    {"issue_id": "NX-020", "correct_severity": "medium"},
    {"issue_id": "NX-021", "correct_severity": "medium"},
    {"issue_id": "NX-022", "correct_severity": "medium"},
    {"issue_id": "NX-030", "correct_severity": "low"},
    {"issue_id": "NX-031", "correct_severity": "low"},
    {"issue_id": "NX-032", "correct_severity": "low"},
]
# Duplicate-detection scenarios. Each entry gives the newly filed issue
# ("new_issue_id"), the backlog shown alongside it ("backlog_ids", eight ids),
# and the subset of that backlog that are true duplicates ("actual_duplicates").
DUPLICATE_DETECTION_SCENARIOS: List[Dict[str, Any]] = [
    {
        "new_issue_id": "NX-040",  # Large file upload crash (duplicate of NX-010)
        "backlog_ids": [
            "NX-001", "NX-011", "NX-010", "NX-020",
            "NX-041", "NX-022", "NX-030", "NX-044",
        ],
        "actual_duplicates": ["NX-010", "NX-041"],  # NX-040 dupes these two
    },
    {
        "new_issue_id": "NX-042",  # Kafka OOM (duplicate of NX-011)
        "backlog_ids": [
            "NX-010", "NX-043", "NX-011", "NX-020",
            "NX-012", "NX-031", "NX-001", "NX-023",
        ],
        "actual_duplicates": ["NX-011", "NX-043"],  # NX-042 dupes these two
    },
    {
        "new_issue_id": "NX-044",  # Dashboard infinite spinner (duplicate of NX-012)
        "backlog_ids": [
            "NX-011", "NX-020", "NX-012", "NX-021",
            "NX-040", "NX-001", "NX-033", "NX-013",
        ],
        "actual_duplicates": ["NX-012"],  # NX-044 dupes only NX-012
    },
    {
        "new_issue_id": "NX-041",  # Sales director large-file crash (duplicate of NX-010, NX-040)
        "backlog_ids": [
            "NX-010", "NX-040", "NX-011", "NX-020",
            "NX-012", "NX-022", "NX-001", "NX-025",
        ],
        "actual_duplicates": ["NX-010", "NX-040"],
    },
    {
        "new_issue_id": "NX-043",  # Offset-cache memory leak (duplicate of NX-011, NX-042)
        "backlog_ids": [
            "NX-042", "NX-011", "NX-010", "NX-020",
            "NX-012", "NX-014", "NX-001", "NX-033",
        ],
        "actual_duplicates": ["NX-011", "NX-042"],
    },
]
# Full-triage scenarios. Each entry bundles, for one issue ("issue_id"):
#   correct_severity          - expected severity
#   backlog_ids               - eight backlog issue ids shown to the agent
#   actual_duplicates         - true duplicates within that backlog (may be empty)
#   expected_labels           - expected label set
#   response_quality_keywords - lower-case keywords/stems for judging the drafted response
#   description               - human-readable note on what makes the scenario hard
FULL_TRIAGE_SCENARIOS: List[Dict[str, Any]] = [
    {
        "issue_id": "NX-043",
        "correct_severity": "high",
        "backlog_ids": [
            "NX-010", "NX-042", "NX-011", "NX-020",
            "NX-012", "NX-031", "NX-001", "NX-023",
        ],
        "actual_duplicates": ["NX-011", "NX-042"],
        "expected_labels": ["bug", "memory-leak", "integrations", "needs-investigation"],
        "response_quality_keywords": ["memory", "offset", "cache", "profil", "leak"],
        "description": (
            "Hard: nuanced high-severity issue; two non-obvious duplicates with different "
            "terminology; domain-specific labels required."
        ),
    },
    {
        "issue_id": "NX-041",
        "correct_severity": "high",
        "backlog_ids": [
            "NX-010", "NX-011", "NX-040", "NX-020",
            "NX-012", "NX-022", "NX-030", "NX-001",
        ],
        "actual_duplicates": ["NX-010", "NX-040"],
        "expected_labels": ["bug", "crash", "file-upload"],
        "response_quality_keywords": ["upload", "file", "size", "memory"],
        "description": (
            "Hard: large-file crash with two stylistically different duplicate reports."
        ),
    },
    {
        "issue_id": "NX-013",
        "correct_severity": "high",
        "backlog_ids": [
            "NX-001", "NX-010", "NX-020", "NX-021",
            "NX-022", "NX-030", "NX-031", "NX-025",
        ],
        "actual_duplicates": [],  # No duplicates in backlog — agent must submit with zero marks
        "expected_labels": ["bug", "notifications", "api", "regression"],
        "response_quality_keywords": ["notification", "email", "sendgrid", "alert", "api"],
        "description": (
            "Hard: no duplicates exist — agent must resist false positives; "
            "regression label and API component knowledge required."
        ),
    },
    {
        "issue_id": "NX-002",
        "correct_severity": "critical",
        "backlog_ids": [
            "NX-001", "NX-003", "NX-010", "NX-020",
            "NX-021", "NX-030", "NX-011", "NX-014",
        ],
        "actual_duplicates": [],  # Unique data-loss event, no duplicates
        "expected_labels": ["bug", "data-loss", "database", "regression"],
        "response_quality_keywords": ["migration", "data", "database", "pipeline", "billing", "audit"],
        "description": (
            "Hard: critical data-loss issue; agent must recognise severity correctly "
            "and assign data-loss + regression labels without duplicates to distract it."
        ),
    },
]