# scalerthon / data.py
# (Hosting-page residue: uploaded via huggingface_hub, revision d1b8170.)
"""
Synthetic bug report dataset for the Nexus Platform — a fictional data analytics SaaS.
Issues span 4 severity levels and include duplicate groups for medium/hard tasks.
All ground-truth answers (severity, duplicate sets, expected labels, response keywords)
are embedded here so graders are fully deterministic.
"""
from typing import Dict, List, Any
# ── SEVERITY ordering ─────────────────────────────────────────────────────────
# Severity levels, most severe first. The position of a level in this list is
# what partial-credit distance scoring is based on.
SEVERITY_ORDER = [
    "critical",
    "high",
    "medium",
    "low",
]
# ── AVAILABLE LABELS ──────────────────────────────────────────────────────────
# The closed vocabulary of labels that may be attached to an issue.
# Order is preserved as originally authored.
AVAILABLE_LABELS = [
    "bug",
    "security",
    "data-loss",
    "performance",
    "ui",
    "ux",
    "api",
    "authentication",
    "database",
    "memory-leak",
    "crash",
    "export",
    "notifications",
    "mobile",
    "documentation",
    "enhancement",
    "duplicate",
    "wontfix",
    "needs-investigation",
    "regression",
    "file-upload",
    "dashboard",
    "reporting",
    "billing",
    "integrations",
]
# ── CORE ISSUE BANK ───────────────────────────────────────────────────────────
# Maps an issue id (e.g. "NX-001") to that issue's record. Every record carries
# the same keys:
#   id, title, description, reporter, created_at, component,
#   reproduction_steps, environment_info  -> free-text fields describing the report
#   severity                              -> ground-truth severity used for grading
#   labels                                -> ground-truth labels used for grading
# The dict key always equals the record's own "id" field.
# Text fields use real Unicode punctuation ("—", "→"); earlier revisions held
# mis-decoded byte sequences ("β€”", "β†’") in their place.
ISSUE_BANK: Dict[str, Dict[str, Any]] = {
    # ── CRITICAL ──────────────────────────────────────────────────────────────
    "NX-001": {
        "id": "NX-001",
        "title": "SQL injection vulnerability in report search API",
        "description": (
            "The `/api/reports/search?q=` endpoint does not sanitize user input. "
            "Passing `' OR '1'='1` returns all reports regardless of user permissions. "
            "A malicious actor can exfiltrate the entire reports database without authentication."
        ),
        "reporter": "security_researcher_ext",
        "created_at": "2024-03-01",
        "component": "api",
        "reproduction_steps": "curl \"https://app.nexus.io/api/reports/search?q=' OR '1'='1\"",
        "environment_info": "All environments (prod confirmed)",
        "severity": "critical",
        "labels": ["bug", "security", "api"],
    },
    "NX-002": {
        "id": "NX-002",
        "title": "Production data corrupted after v3.2 schema migration",
        "description": (
            "After running the v3.2 database migration on 2024-02-28, approximately 12,000 rows "
            "in the `pipeline_runs` table have NULL values in the `completed_at` column where "
            "valid timestamps previously existed. Affected customers report incorrect billing "
            "calculations and missing audit logs."
        ),
        "reporter": "ops_team",
        "created_at": "2024-02-28",
        "component": "database",
        "reproduction_steps": "SELECT COUNT(*) FROM pipeline_runs WHERE completed_at IS NULL AND created_at < '2024-02-28'",
        "environment_info": "Production only",
        "severity": "critical",
        "labels": ["bug", "data-loss", "database", "regression"],
    },
    "NX-003": {
        "id": "NX-003",
        "title": "JWT tokens from deleted accounts still grant API access",
        "description": (
            "When an admin deletes a user account, their existing JWT tokens remain valid until "
            "natural expiry (24h). A terminated employee can continue accessing all company data "
            "via the API for up to 24 hours after account deletion. Token revocation is not implemented."
        ),
        "reporter": "cto_internal",
        "created_at": "2024-03-05",
        "component": "authentication",
        "reproduction_steps": "1. Create user. 2. Obtain JWT. 3. Delete user via admin panel. 4. Use old JWT — still works.",
        "environment_info": "All environments",
        "severity": "critical",
        "labels": ["bug", "security", "authentication"],
    },
    "NX-004": {
        "id": "NX-004",
        "title": "Scheduled exports silently drop rows when queue exceeds 10k jobs",
        "description": (
            "During peak hours, when the export queue depth exceeds 10,000 jobs, the worker "
            "process silently discards rows from CSV exports to stay within a hardcoded memory "
            "cap. Customers receive 'successful' export emails with incomplete data. No error is logged."
        ),
        "reporter": "enterprise_customer_A",
        "created_at": "2024-03-10",
        "component": "export",
        "reproduction_steps": "Trigger 10,001 concurrent export jobs; inspect row counts in output CSVs.",
        "environment_info": "Production (observed on first Monday of every month)",
        "severity": "critical",
        "labels": ["bug", "data-loss", "export"],
    },
    # ── HIGH ──────────────────────────────────────────────────────────────────
    "NX-010": {
        "id": "NX-010",
        "title": "Application crashes (OOM) when uploading CSV files larger than 500 MB",
        "description": (
            "Uploading any CSV file larger than approximately 500 MB via the web UI causes the "
            "application server to throw an OutOfMemoryError and crash. The upload appears to "
            "buffer the entire file in memory before processing. Server restarts automatically "
            "but the upload is lost and the user receives a generic 502 error."
        ),
        "reporter": "data_team_lead",
        "created_at": "2024-02-10",
        "component": "file-upload",
        "reproduction_steps": "Upload a 600 MB CSV. Observe server log: java.lang.OutOfMemoryError: Java heap space",
        "environment_info": "All environments, JVM heap set to 2GB",
        "severity": "high",
        "labels": ["bug", "crash", "file-upload", "memory-leak"],
    },
    "NX-011": {
        "id": "NX-011",
        "title": "Memory usage grows unbounded in streaming pipeline connector",
        "description": (
            "The Kafka streaming connector leaks memory at approximately 50 MB/hour when "
            "processing high-throughput topics (>100k msgs/sec). After ~8 hours of operation "
            "the pod is OOMKilled. Heapdump analysis points to message offset cache not being "
            "evicted. Requires daily pod restarts as a workaround."
        ),
        "reporter": "platform_engineer",
        "created_at": "2024-02-15",
        "component": "integrations",
        "reproduction_steps": "Run Kafka connector for 8h at >100k msg/sec throughput and monitor RSS.",
        "environment_info": "Kubernetes, all cloud providers",
        "severity": "high",
        "labels": ["bug", "memory-leak", "integrations", "performance"],
    },
    "NX-012": {
        "id": "NX-012",
        "title": "Dashboard chart widgets fail to render for datasets > 1M rows",
        "description": (
            "When a dashboard is connected to a dataset with more than 1 million rows, all "
            "chart widgets display a spinner indefinitely. The browser console shows: "
            "'RangeError: Maximum call stack size exceeded' in chart-renderer.js. "
            "Smaller datasets work fine. This affects ~30% of enterprise customers."
        ),
        "reporter": "product_team",
        "created_at": "2024-02-20",
        "component": "dashboard",
        "reproduction_steps": "Connect any chart widget to a table with >1M rows.",
        "environment_info": "Chrome 121, Firefox 122, Safari 17 — all affected",
        "severity": "high",
        "labels": ["bug", "dashboard", "ui", "performance"],
    },
    "NX-013": {
        "id": "NX-013",
        "title": "Critical alert email notifications not being delivered",
        "description": (
            "Since the SendGrid API key rotation on 2024-02-25, email notifications for "
            "CRITICAL severity pipeline alerts are not being sent. The notification service "
            "logs show 401 Unauthorized errors from SendGrid but silently swallows them "
            "rather than retrying or alerting ops. Customers are missing SLA breach warnings."
        ),
        "reporter": "sre_team",
        "created_at": "2024-02-26",
        "component": "notifications",
        "reproduction_steps": "Trigger a critical pipeline alert; check email — nothing arrives. Check logs for SendGrid 401.",
        "environment_info": "Production only",
        "severity": "high",
        "labels": ["bug", "notifications", "api", "regression"],
    },
    "NX-014": {
        "id": "NX-014",
        "title": "Billing calculation incorrect for pro-rated mid-cycle plan upgrades",
        "description": (
            "When a customer upgrades from Starter to Pro mid-billing-cycle, the system "
            "charges the full Pro price instead of the pro-rated amount. Approximately "
            "230 customers were overcharged in February. Finance has already processed "
            "manual refunds but the bug is still present."
        ),
        "reporter": "billing_team",
        "created_at": "2024-03-02",
        "component": "billing",
        "reproduction_steps": "Upgrade plan on day 15 of a 30-day cycle; inspect next invoice.",
        "environment_info": "Production only",
        "severity": "high",
        "labels": ["bug", "billing", "regression"],
    },
    # ── MEDIUM ────────────────────────────────────────────────────────────────
    "NX-020": {
        "id": "NX-020",
        "title": "Filtered report queries are 10-40x slower after index rebuild",
        "description": (
            "After the maintenance window on 2024-02-18, queries that filter reports by "
            "date range are 10-40x slower (2s → 45s for typical queries). "
            "EXPLAIN ANALYZE shows a seq scan despite the index existing. "
            "A workaround is to run ANALYZE on the affected table manually."
        ),
        "reporter": "backend_dev",
        "created_at": "2024-02-19",
        "component": "database",
        "reproduction_steps": "SELECT * FROM reports WHERE created_at BETWEEN '2024-01-01' AND '2024-02-01' — observe query time.",
        "environment_info": "PostgreSQL 15.2, production",
        "severity": "medium",
        "labels": ["bug", "database", "performance", "regression"],
    },
    "NX-021": {
        "id": "NX-021",
        "title": "Date picker shows incorrect UTC offset for IST (Indian Standard Time) users",
        "description": (
            "Users with their timezone set to IST (UTC+5:30) see the date picker "
            "displaying UTC+5:00 instead of the correct UTC+5:30. This causes scheduled "
            "pipelines to run 30 minutes earlier than configured. Workaround: set timezone "
            "to 'Kolkata' explicitly."
        ),
        "reporter": "customer_support",
        "created_at": "2024-02-22",
        "component": "ui",
        "reproduction_steps": "Set account timezone to IST; create a scheduled pipeline; observe execution time offset.",
        "environment_info": "All browsers, all OSes",
        "severity": "medium",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-022": {
        "id": "NX-022",
        "title": "Export to PDF button missing on mobile devices",
        "description": (
            "The 'Export to PDF' button in the report toolbar is not visible on mobile "
            "viewports (< 768px). The button is present in the DOM but has display:none "
            "applied via a media query with no alternative access path. Mobile users "
            "cannot export reports."
        ),
        "reporter": "mobile_user_feedback",
        "created_at": "2024-02-25",
        "component": "ui",
        "reproduction_steps": "Open any report on a mobile device or resize browser to < 768px; toolbar missing PDF export.",
        "environment_info": "iOS Safari, Android Chrome",
        "severity": "medium",
        "labels": ["bug", "mobile", "ui", "export"],
    },
    "NX-023": {
        "id": "NX-023",
        "title": "Live search results don't update until full page reload",
        "description": (
            "The global search bar (Cmd+K) shows stale results after new content is added. "
            "Search index is only refreshed on full page reload. If a user creates a new "
            "dashboard and immediately searches for it, it won't appear. Refreshing fixes it."
        ),
        "reporter": "power_user",
        "created_at": "2024-03-01",
        "component": "ui",
        "reproduction_steps": "Create a new dashboard; immediately Cmd+K search for it — not found. Reload page; search again — found.",
        "environment_info": "All browsers",
        "severity": "medium",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-024": {
        "id": "NX-024",
        "title": "Bulk delete confirmation modal does not close after action completes",
        "description": (
            "After confirming a bulk delete in the data source manager, the confirmation "
            "modal remains open even though the deletion succeeded. The user must manually "
            "close it. This is a minor UX annoyance but has caused some users to attempt "
            "the deletion twice, leading to a 'not found' error."
        ),
        "reporter": "qa_team",
        "created_at": "2024-03-03",
        "component": "ui",
        "reproduction_steps": "Select 3+ data sources; click Bulk Delete; confirm; observe modal stays open.",
        "environment_info": "Chrome 121",
        "severity": "medium",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-025": {
        "id": "NX-025",
        "title": "API rate limit headers missing on 429 responses",
        "description": (
            "When the API returns a 429 Too Many Requests response, the standard "
            "Retry-After and X-RateLimit-* headers are absent. This forces client "
            "developers to implement fixed backoff instead of respecting server-side "
            "rate limit windows, resulting in thundering herd on resumption."
        ),
        "reporter": "api_integrations_partner",
        "created_at": "2024-03-07",
        "component": "api",
        "reproduction_steps": "Exceed API rate limit; inspect response headers — Retry-After missing.",
        "environment_info": "All environments",
        "severity": "medium",
        "labels": ["bug", "api"],
    },
    # ── LOW ───────────────────────────────────────────────────────────────────
    "NX-030": {
        "id": "NX-030",
        "title": "Typo: 'occured' should be 'occurred' in pipeline error message",
        "description": "Error message reads: 'An unexpected error has occured. Please try again.' — 'occured' is misspelled.",
        "reporter": "attentive_user",
        "created_at": "2024-03-08",
        "component": "ui",
        "reproduction_steps": "Trigger a pipeline failure; observe error toast.",
        "environment_info": "All environments",
        "severity": "low",
        "labels": ["bug", "ui", "documentation"],
    },
    "NX-031": {
        "id": "NX-031",
        "title": "Footer links open in same tab instead of new tab",
        "description": "All links in the footer (Privacy Policy, Terms of Service, Status Page) open in the same browser tab rather than a new tab, interrupting the user's workflow.",
        "reporter": "ux_reviewer",
        "created_at": "2024-03-09",
        "component": "ui",
        "reproduction_steps": "Click any footer link; observe it opens in same tab.",
        "environment_info": "All browsers",
        "severity": "low",
        "labels": ["bug", "ui", "ux"],
    },
    "NX-032": {
        "id": "NX-032",
        "title": "Dark mode: settings panel header uses wrong background color",
        "description": (
            "In dark mode, the settings panel header retains a light grey (#F5F5F5) background "
            "instead of using the dark theme token (#1E1E1E). All other panels are correctly themed."
        ),
        "reporter": "design_team",
        "created_at": "2024-03-10",
        "component": "ui",
        "reproduction_steps": "Enable dark mode; open Settings; observe header background inconsistency.",
        "environment_info": "All browsers",
        "severity": "low",
        "labels": ["bug", "ui"],
    },
    "NX-033": {
        "id": "NX-033",
        "title": "Tooltip delay is 1500ms — should be 300ms per design spec",
        "description": "All UI tooltips have a 1500ms hover delay. The design system spec defines 300ms. This makes the UI feel sluggish.",
        "reporter": "design_system_team",
        "created_at": "2024-03-11",
        "component": "ui",
        "reproduction_steps": "Hover over any button with a tooltip; time the delay.",
        "environment_info": "All environments",
        "severity": "low",
        "labels": ["bug", "ui", "ux"],
    },
    # ── DUPLICATE GROUP A: Large file OOM (same root as NX-010) ───────────────
    "NX-040": {
        "id": "NX-040",
        "title": "Server 502 error when importing large datasets",
        "description": (
            "Every time I try to import my dataset (it's about 700 MB), I get a 502 Bad "
            "Gateway error after about 30 seconds. The import dialog says it's processing "
            "but then just fails with no useful message. Smaller files (under 200 MB) work fine."
        ),
        "reporter": "analyst_user_1",
        "created_at": "2024-02-12",
        "component": "file-upload",
        "reproduction_steps": "Upload a file > 500 MB via the Import Data dialog.",
        "environment_info": "Chrome 121, MacOS 14",
        "severity": "high",
        "labels": ["bug", "crash", "file-upload"],
    },
    "NX-041": {
        "id": "NX-041",
        "title": "Nexus crashes when I upload my Q4 sales data",
        "description": (
            "I have a CSV file with Q4 sales data, about 650 MB. Every single time I try to "
            "upload it the whole app goes down for a few minutes. I see 'Service Unavailable' "
            "in the browser. This is urgent — I need this data loaded for a board presentation tomorrow."
        ),
        "reporter": "sales_director",
        "created_at": "2024-02-14",
        "component": "file-upload",
        "reproduction_steps": "Upload any CSV >= 600MB.",
        "environment_info": "Safari 17, MacOS 14.2",
        "severity": "high",
        "labels": ["bug", "crash", "file-upload"],
    },
    # ── DUPLICATE GROUP B: Kafka memory leak (same root as NX-011) ────────────
    "NX-042": {
        "id": "NX-042",
        "title": "Kafka connector pod gets OOMKilled nightly",
        "description": (
            "Our Kafka connector pod is being killed every night with OOMKilled status in "
            "Kubernetes. We have to restart it every morning. Memory monitoring shows it "
            "steadily grows from ~200 MB at startup to over 2 GB over 8-10 hours. "
            "Topic is running at about 80k messages/sec."
        ),
        "reporter": "devops_lead",
        "created_at": "2024-02-17",
        "component": "integrations",
        "reproduction_steps": "Deploy Kafka connector; monitor memory with kubectl top pod for 8+ hours.",
        "environment_info": "GKE 1.28, k8s OOMKill",
        "severity": "high",
        "labels": ["bug", "memory-leak", "integrations"],
    },
    "NX-043": {
        "id": "NX-043",
        "title": "Stream connector memory keeps growing — possible leak in offset cache",
        "description": (
            "I've been profiling our streaming connector and noticed that the offset tracking "
            "cache never gets cleared. The HashMap holding message offsets just keeps growing "
            "indefinitely. After a thread dump at hour 9, the cache had over 4 million entries. "
            "This seems to be the root cause of the pod OOM issue others have reported."
        ),
        "reporter": "senior_engineer",
        "created_at": "2024-02-19",
        "component": "integrations",
        "reproduction_steps": "Attach a profiler to the connector process; inspect KafkaOffsetCache size over time.",
        "environment_info": "All environments",
        "severity": "high",
        "labels": ["bug", "memory-leak", "integrations", "needs-investigation"],
    },
    # ── DUPLICATE GROUP C: Dashboard chart rendering (same root as NX-012) ────
    "NX-044": {
        "id": "NX-044",
        "title": "Charts just spin forever on our main analytics dashboard",
        "description": (
            "Since upgrading to the Business plan and connecting our full dataset, "
            "every chart on our main analytics dashboard just shows a loading spinner. "
            "We waited 20 minutes and they never loaded. The dataset has about 1.2 million rows. "
            "Dashboard worked fine when we were on the Starter plan with a smaller dataset sample."
        ),
        "reporter": "biz_analyst",
        "created_at": "2024-02-22",
        "component": "dashboard",
        "reproduction_steps": "Connect dashboard to a dataset with > 1M rows; open dashboard.",
        "environment_info": "Chrome 121, Windows 11",
        "severity": "high",
        "labels": ["bug", "dashboard", "ui"],
    },
}
# ── TASK SCENARIOS ─────────────────────────────────────────────────────────────
#
# TASK 1 — bug_classify: Agent sees ONE issue, must call classify(severity).
# TASK 2 — duplicate_detection: Agent sees new_issue + backlog; must find duplicates.
# TASK 3 — full_triage: Agent must classify + find dupes + draft response + assign labels.
# One scenario per entry: a dict holding the issue to show ("issue_id") and the
# severity the grader expects ("correct_severity"). Built from compact
# (issue id, severity) pairs to keep the table easy to scan.
BUG_CLASSIFY_SCENARIOS = [
    {"issue_id": issue_id, "correct_severity": severity}
    for issue_id, severity in (
        ("NX-001", "critical"),
        ("NX-003", "critical"),
        ("NX-010", "high"),
        ("NX-012", "high"),
        ("NX-013", "high"),
        ("NX-014", "high"),
        ("NX-020", "medium"),
        ("NX-021", "medium"),
        ("NX-022", "medium"),
        ("NX-030", "low"),
        ("NX-031", "low"),
        ("NX-032", "low"),
    )
]
# Each scenario presents one incoming issue ("new_issue_id") alongside a backlog
# of existing issue ids ("backlog_ids"); "actual_duplicates" lists the backlog
# entries that are true duplicates of the incoming issue.
DUPLICATE_DETECTION_SCENARIOS = [
    {
        "new_issue_id": incoming_id,
        "backlog_ids": backlog,
        "actual_duplicates": duplicates,
    }
    for incoming_id, backlog, duplicates in (
        # Large file upload crash (duplicate of NX-010); NX-040 dupes two entries.
        (
            "NX-040",
            ["NX-001", "NX-011", "NX-010", "NX-020", "NX-041", "NX-022", "NX-030", "NX-044"],
            ["NX-010", "NX-041"],
        ),
        # Kafka OOM (duplicate of NX-011); NX-042 dupes two entries.
        (
            "NX-042",
            ["NX-010", "NX-043", "NX-011", "NX-020", "NX-012", "NX-031", "NX-001", "NX-023"],
            ["NX-011", "NX-043"],
        ),
        # Dashboard infinite spinner; NX-044 dupes only NX-012.
        (
            "NX-044",
            ["NX-011", "NX-020", "NX-012", "NX-021", "NX-040", "NX-001", "NX-033", "NX-013"],
            ["NX-012"],
        ),
        # Sales director large-file crash (duplicate of NX-010, NX-040).
        (
            "NX-041",
            ["NX-010", "NX-040", "NX-011", "NX-020", "NX-012", "NX-022", "NX-001", "NX-025"],
            ["NX-010", "NX-040"],
        ),
        # Offset-cache memory leak (duplicate of NX-011, NX-042).
        (
            "NX-043",
            ["NX-042", "NX-011", "NX-010", "NX-020", "NX-012", "NX-014", "NX-001", "NX-033"],
            ["NX-011", "NX-042"],
        ),
    )
]
# Full-triage scenarios. Each entry bundles every ground-truth answer for one
# issue:
#   issue_id                  -> the issue being triaged
#   correct_severity          -> expected severity classification
#   backlog_ids               -> existing issues shown alongside it
#   actual_duplicates         -> backlog entries that are true duplicates (may be empty)
#   expected_labels           -> labels the agent is expected to assign
#   response_quality_keywords -> lowercase keyword stems for the drafted response
#                                (e.g. "profil"); presumably matched as substrings
#                                — confirm against the grader
#   description               -> human-readable note on what makes the scenario hard
FULL_TRIAGE_SCENARIOS = [
    {
        "issue_id": "NX-043",
        "correct_severity": "high",
        "backlog_ids": [
            "NX-010", "NX-042", "NX-011", "NX-020",
            "NX-012", "NX-031", "NX-001", "NX-023",
        ],
        "actual_duplicates": ["NX-011", "NX-042"],
        "expected_labels": ["bug", "memory-leak", "integrations", "needs-investigation"],
        "response_quality_keywords": ["memory", "offset", "cache", "profil", "leak"],
        "description": (
            "Hard: nuanced high-severity issue; two non-obvious duplicates with different "
            "terminology; domain-specific labels required."
        ),
    },
    {
        "issue_id": "NX-041",
        "correct_severity": "high",
        "backlog_ids": [
            "NX-010", "NX-011", "NX-040", "NX-020",
            "NX-012", "NX-022", "NX-030", "NX-001",
        ],
        "actual_duplicates": ["NX-010", "NX-040"],
        "expected_labels": ["bug", "crash", "file-upload"],
        "response_quality_keywords": ["upload", "file", "size", "memory"],
        "description": (
            "Hard: large-file crash with two stylistically different duplicate reports."
        ),
    },
    {
        "issue_id": "NX-013",
        "correct_severity": "high",
        "backlog_ids": [
            "NX-001", "NX-010", "NX-020", "NX-021",
            "NX-022", "NX-030", "NX-031", "NX-025",
        ],
        # No duplicates in backlog — agent must submit without marking any
        # backlog issue as a duplicate.
        "actual_duplicates": [],
        "expected_labels": ["bug", "notifications", "api", "regression"],
        "response_quality_keywords": ["notification", "email", "sendgrid", "alert", "api"],
        "description": (
            "Hard: no duplicates exist — agent must resist false positives; "
            "regression label and API component knowledge required."
        ),
    },
    {
        "issue_id": "NX-002",
        "correct_severity": "critical",
        "backlog_ids": [
            "NX-001", "NX-003", "NX-010", "NX-020",
            "NX-021", "NX-030", "NX-011", "NX-014",
        ],
        "actual_duplicates": [],  # Unique data-loss event, no duplicates
        "expected_labels": ["bug", "data-loss", "database", "regression"],
        "response_quality_keywords": ["migration", "data", "database", "pipeline", "billing", "audit"],
        "description": (
            "Hard: critical data-loss issue; agent must recognise severity correctly "
            "and assign data-loss + regression labels without duplicates to distract it."
        ),
    },
]