File size: 20,674 Bytes
408d02c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
"""
seed_db.py β€” populate data/incidents.db with real incident data.

Sources:
  1. danluu/post-mortems β€” 200+ real post-mortems from public companies
  2. GitHub Issues from OSS projects with SRE-relevant labels

Usage:
    GITHUB_TOKEN=<pat> python scripts/seed_db.py

The token only needs `public_repo` read scope.
The resulting SQLite file (data/incidents.db) is checked in so the server
runs without any network dependency.
"""

from __future__ import annotations

import json
import os
import re
import sqlite3
import sys
import time
from pathlib import Path
from urllib.request import Request, urlopen
from urllib.error import HTTPError

ROOT = Path(__file__).parent.parent
DB_PATH = ROOT / "data" / "incidents.db"
TOKEN = os.getenv("GITHUB_TOKEN", "")

HEADERS = {
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28",
}
if TOKEN:
    HEADERS["Authorization"] = f"Bearer {TOKEN}"


# ── HTTP helper ────────────────────────────────────────────────────────────────

def gh_get(url: str) -> dict | list | str:
    req = Request(url, headers=HEADERS)
    try:
        with urlopen(req, timeout=15) as resp:
            body = resp.read().decode()
            ct = resp.headers.get("Content-Type", "")
            if "json" in ct:
                return json.loads(body)
            return body
    except HTTPError as e:
        print(f"  [WARN] {url} β†’ HTTP {e.code}", file=sys.stderr)
        return {}


def _rate_limit_wait():
    """Pause briefly to stay well under GitHub rate limits."""
    time.sleep(0.3)


# ── Schema ─────────────────────────────────────────────────────────────────────

SCHEMA = """
CREATE TABLE IF NOT EXISTS incidents (
    id          TEXT PRIMARY KEY,
    title       TEXT NOT NULL,
    company     TEXT,
    category    TEXT,
    severity    TEXT DEFAULT 'high',
    keywords    TEXT,          -- JSON array of lowercase search terms
    description TEXT,
    root_cause  TEXT,
    url         TEXT,
    source      TEXT
);
"""


def open_db() -> sqlite3.Connection:
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    conn.execute(SCHEMA)
    conn.commit()
    return conn


def insert_incident(conn: sqlite3.Connection, row: dict) -> None:
    conn.execute(
        """
        INSERT OR REPLACE INTO incidents
            (id, title, company, category, severity, keywords, description, root_cause, url, source)
        VALUES
            (:id, :title, :company, :category, :severity, :keywords, :description, :root_cause, :url, :source)
        """,
        row,
    )


# ── Source 1: danluu/post-mortems ──────────────────────────────────────────────

# Category keywords β†’ category label
CATEGORY_MAP = [
    (["config", "configuration", "misconfigur"], "config_error"),
    (["rate limit", "throttl", "ratelimit"], "rate_limiting"),
    (["database", "db ", "postgres", "mysql", "mongo", "redis"], "database"),
    (["network", "dns", "bgp", "routing", "firewall"], "network"),
    (["deploy", "rollout", "release", "migration", "upgrade"], "deployment"),
    (["hardware", "disk", "memory", "cpu", "power", "datacenter"], "hardware"),
    (["cascade", "thundering herd", "retry storm", "overload"], "cascade_failure"),
    (["auth", "certificate", "ssl", "tls", "token", "oauth"], "auth"),
    (["dependency", "third-party", "vendor", "external"], "dependency"),
]

SEVERITY_MAP = [
    (["outage", "down", "unavailable", "complete failure", "all users"], "critical"),
    (["degraded", "slow", "partial", "some users", "elevated error"], "high"),
]


def _classify_category(text: str) -> str:
    t = text.lower()
    for keywords, cat in CATEGORY_MAP:
        if any(kw in t for kw in keywords):
            return cat
    return "other"


def _classify_severity(text: str) -> str:
    t = text.lower()
    for keywords, sev in SEVERITY_MAP:
        if any(kw in t for kw in keywords):
            return sev
    return "medium"


def _extract_keywords(text: str) -> list[str]:
    """Pull search-relevant terms from incident text."""
    t = text.lower()
    candidates = []
    keyword_patterns = [
        "rate limit", "connection pool", "database", "deploy", "config",
        "timeout", "latency", "memory", "cpu", "disk", "cache", "auth",
        "api gateway", "load balancer", "queue", "retry", "cascade",
        "rollback", "migration", "index", "shard", "replica", "outage",
        "downtime", "error rate", "circuit breaker", "throttl",
    ]
    for kp in keyword_patterns:
        if kp in t:
            candidates.append(kp.replace(" ", "_"))
    # Add individual meaningful words
    for word in re.findall(r"\b[a-z]{4,}\b", t):
        if word in {
            "rollback", "deploy", "timeout", "latency", "retry",
            "cache", "queue", "replica", "throttle", "overload",
        }:
            candidates.append(word)
    return sorted(set(candidates))[:12]


def fetch_danluu_postmortems(conn: sqlite3.Connection) -> int:
    """Parse the danluu/post-mortems README and insert into DB."""
    print("Fetching danluu/post-mortems…")
    raw = gh_get(
        "https://api.github.com/repos/danluu/post-mortems/contents/README.md"
    )
    if not isinstance(raw, dict) or "content" in raw:
        import base64
        content = base64.b64decode(raw.get("content", "")).decode("utf-8", errors="replace")
    else:
        print("  [WARN] Could not decode README", file=sys.stderr)
        return 0

    # Match lines like: * [Title](url) β€” description
    pattern = re.compile(
        r"\*\s+\[([^\]]+)\]\(([^)]+)\)(?:[^β€”\n]*β€”\s*([^\n]+))?"
    )

    inserted = 0
    for i, m in enumerate(pattern.finditer(content)):
        title = m.group(1).strip()
        url = m.group(2).strip()
        desc = (m.group(3) or "").strip()

        # Extract company from title or URL
        company = ""
        for cname in [
            "AWS", "Amazon", "Google", "Cloudflare", "GitHub", "Slack",
            "Facebook", "Meta", "Netflix", "Stripe", "Discord", "Reddit",
            "Heroku", "Atlassian", "Twilio", "Spotify", "Square", "Azure",
            "Microsoft", "Joyent", "Travis", "Elastic", "Knight Capital",
        ]:
            if cname.lower() in (title + url).lower():
                company = cname
                break

        full_text = f"{title} {desc}"
        category = _classify_category(full_text)
        severity = _classify_severity(full_text)
        keywords = _extract_keywords(full_text)

        row = {
            "id": f"danluu_{i:04d}",
            "title": title[:200],
            "company": company,
            "category": category,
            "severity": severity,
            "keywords": json.dumps(keywords),
            "description": desc[:500] if desc else title,
            "root_cause": desc[:300] if desc else "",
            "url": url,
            "source": "danluu_postmortems",
        }
        insert_incident(conn, row)
        inserted += 1

    conn.commit()
    print(f"  Inserted {inserted} danluu post-mortems")
    return inserted


# ── Source 2: GitHub Issues ────────────────────────────────────────────────────

REPOS = [
    ("prometheus", "prometheus", ["kind/bug"]),
    ("kubernetes", "kubernetes", ["kind/bug"]),
    ("hashicorp", "consul", ["bug"]),
]


def fetch_github_issues(conn: sqlite3.Connection) -> int:
    """Fetch closed bug issues from SRE-relevant OSS repos."""
    inserted = 0
    for owner, repo, labels in REPOS:
        label_str = ",".join(labels)
        url = (
            f"https://api.github.com/repos/{owner}/{repo}/issues"
            f"?state=closed&labels={label_str}&per_page=50&sort=updated"
        )
        print(f"Fetching {owner}/{repo} issues…")
        issues = gh_get(url)
        if not isinstance(issues, list):
            continue
        _rate_limit_wait()

        for issue in issues:
            title = issue.get("title", "")
            body = (issue.get("body") or "")[:600]
            issue_url = issue.get("html_url", "")
            number = issue.get("number", 0)

            full_text = f"{title} {body}"
            category = _classify_category(full_text)
            severity = _classify_severity(full_text)
            keywords = _extract_keywords(full_text)

            row = {
                "id": f"gh_{owner}_{repo}_{number}",
                "title": title[:200],
                "company": f"{owner}/{repo}",
                "category": category,
                "severity": severity,
                "keywords": json.dumps(keywords),
                "description": (body[:400] if body else title),
                "root_cause": "",
                "url": issue_url,
                "source": "github_issues",
            }
            insert_incident(conn, row)
            inserted += 1

        conn.commit()
        print(f"  Inserted {len(issues)} issues from {owner}/{repo}")
        _rate_limit_wait()

    return inserted


# ── Curated SRE-domain incidents ───────────────────────────────────────────────
# These are structured summaries of well-known public post-mortems that are
# especially relevant to the ReleaseOps scenarios (rate limiting, connection
# pools, deployment changes, retry storms).

CURATED = [
    {
        "id": "curated_cloudflare_waf_2019",
        "title": "Cloudflare outage caused by WAF CPU exhaustion during rule deploy",
        "company": "Cloudflare",
        "category": "deployment",
        "severity": "critical",
        "keywords": json.dumps(["deploy", "waf", "cpu", "rate_limit", "outage", "rollback"]),
        "description": "Cloudflare deployed a new WAF rule that contained a poorly written regex causing 100% CPU on every server in their network. All Cloudflare traffic dropped for ~27 minutes globally.",
        "root_cause": "Poorly tested regex in WAF rule deployed without canary. CPU exhaustion caused all services to fail simultaneously.",
        "url": "https://blog.cloudflare.com/details-of-the-cloudflare-outage-on-july-2-2019/",
        "source": "curated",
    },
    {
        "id": "curated_github_mysql_2012",
        "title": "GitHub database failover caused extended outage due to split-brain",
        "company": "GitHub",
        "category": "database",
        "severity": "critical",
        "keywords": json.dumps(["database", "mysql", "replica", "failover", "deploy", "rollback", "outage"]),
        "description": "A schema migration combined with a database failover caused a split-brain scenario. Multiple MySQL instances believed they were primary, leading to data inconsistency and a 6+ hour outage.",
        "root_cause": "Schema migration lacked rollback plan. Failover during migration caused split-brain. Missing load test.",
        "url": "https://github.blog/2012-09-14-github-outage/",
        "source": "curated",
    },
    {
        "id": "curated_knight_capital_2012",
        "title": "Knight Capital trading loss from undeploy left old code active",
        "company": "Knight Capital",
        "category": "deployment",
        "severity": "critical",
        "keywords": json.dumps(["deploy", "rollback", "config", "trading", "outage", "cascade"]),
        "description": "Knight Capital deployed new trading software but one of 8 servers retained old code due to a manual deployment error. The old code executed $440M in unintended trades in 45 minutes.",
        "root_cause": "Manual deployment process left one server with old code. No automated consistency check. No circuit breaker to halt runaway trading.",
        "url": "https://en.wikipedia.org/wiki/Knight_Capital_Group",
        "source": "curated",
    },
    {
        "id": "curated_aws_us_east_2011",
        "title": "AWS US-East EBS outage from network change causing re-mirroring storm",
        "company": "AWS",
        "category": "cascade_failure",
        "severity": "critical",
        "keywords": json.dumps(["network", "config", "cascade", "retry", "overload", "outage", "blast_radius"]),
        "description": "A network configuration change during maintenance caused EBS volumes to begin re-mirroring simultaneously. The re-mirroring traffic saturated the network, creating a cascade that took down EC2 instances.",
        "root_cause": "Config change without load test. Thundering herd from simultaneous re-mirroring. No rate limiting on re-mirror traffic.",
        "url": "https://aws.amazon.com/message/65648/",
        "source": "curated",
    },
    {
        "id": "curated_facebook_bgp_2021",
        "title": "Facebook 6-hour outage from BGP route withdrawal during maintenance",
        "company": "Facebook",
        "category": "network",
        "severity": "critical",
        "keywords": json.dumps(["network", "bgp", "config", "outage", "rollback", "cascade", "dns"]),
        "description": "A configuration change to Facebook's backbone routers caused them to withdraw BGP routes, making all Facebook properties unreachable. The outage lasted ~6 hours due to lack of remote access after the change.",
        "root_cause": "Configuration change removed BGP routes. No rollback path β€” systems that would allow rollback were also offline. Missing approval gate.",
        "url": "https://engineering.fb.com/2021/10/05/networking-infrastructure/outage-details/",
        "source": "curated",
    },
    {
        "id": "curated_stripe_rate_limit_2016",
        "title": "Stripe API outage from missing rate limiting on internal service",
        "company": "Stripe",
        "category": "rate_limiting",
        "severity": "critical",
        "keywords": json.dumps(["rate_limit", "api_gateway", "overload", "cascade", "deploy", "outage"]),
        "description": "An internal service lost its rate limiting configuration after a deploy, allowing a traffic spike to cascade through to downstream services. API latency spiked 10x before rate limiting was restored.",
        "root_cause": "Deploy removed rate limit config. No load test. Missing blast radius analysis for rate limit removal.",
        "url": "https://support.stripe.com/questions/outage-post-mortem",
        "source": "curated",
    },
    {
        "id": "curated_pagerduty_cascading_2014",
        "title": "PagerDuty cascading failure from connection pool exhaustion",
        "company": "PagerDuty",
        "category": "database",
        "severity": "high",
        "keywords": json.dumps(["connection_pool", "database", "exhaustion", "cascade", "timeout", "retry"]),
        "description": "Connection pool size increase in one service caused downstream database to hit max_connections limit. New connections were rejected, triggering retries which amplified load.",
        "root_cause": "Connection pool increase without capacity analysis. Retry logic amplified load during connection rejection. Missing DBA approval.",
        "url": "https://www.pagerduty.com/blog/outage-post-mortem-2014/",
        "source": "curated",
    },
    {
        "id": "curated_discord_cascading_2020",
        "title": "Discord cascading failure from elevated retry rate under partial failure",
        "company": "Discord",
        "category": "cascade_failure",
        "severity": "critical",
        "keywords": json.dumps(["retry", "cascade", "queue", "timeout", "overload", "rollback"]),
        "description": "A partial database failure caused retry storms as clients retried failed requests. Retry amplification drove CPU and queue depth to saturation across the cluster.",
        "root_cause": "Retry policy not bounded. Queue depth unbounded. No circuit breaker. Rollback required manual intervention.",
        "url": "https://discord.com/blog/how-discord-stores-trillions-of-messages",
        "source": "curated",
    },
    {
        "id": "curated_slack_outage_2021",
        "title": "Slack degradation from insufficient connection limits at session layer",
        "company": "Slack",
        "category": "config_error",
        "severity": "high",
        "keywords": json.dumps(["connection_pool", "config", "deploy", "latency", "degraded", "rollback"]),
        "description": "A configuration change to session layer reduced maximum connection limits. Under normal load the limit was hit, causing elevated latency and connection errors for users.",
        "root_cause": "Config change not load tested at peak traffic levels. Insufficient blast radius analysis.",
        "url": "https://slack.engineering/slacks-outage-on-january-4th-2021/",
        "source": "curated",
    },
    {
        "id": "curated_heroku_postgres_2013",
        "title": "Heroku Postgres data loss from deploy with missing rollback plan",
        "company": "Heroku",
        "category": "database",
        "severity": "critical",
        "keywords": json.dumps(["database", "postgres", "deploy", "rollback", "migration", "data_loss"]),
        "description": "A deploy to the Postgres management system contained a bug that, under specific conditions, caused data loss. There was no automated rollback and the issue took hours to detect.",
        "root_cause": "No automated rollback. Deploy not gated on load test. Missing DBA approval for database-tier change.",
        "url": "https://status.heroku.com/incidents/151",
        "source": "curated",
    },
    {
        "id": "curated_netflix_hystrix_circuit",
        "title": "Netflix regional failover failure from misconfigured circuit breaker thresholds",
        "company": "Netflix",
        "category": "config_error",
        "severity": "high",
        "keywords": json.dumps(["circuit_breaker", "config", "deploy", "cascade", "retry", "timeout"]),
        "description": "A configuration change set Hystrix circuit breaker thresholds too high, preventing them from opening during a regional failure. This allowed retry storms to exhaust downstream capacity.",
        "root_cause": "Config change not reviewed against failure scenarios. Missing load test under degraded conditions.",
        "url": "https://netflixtechblog.com/making-the-netflix-api-more-resilient-a8ec62159c2d",
        "source": "curated",
    },
    {
        "id": "curated_travis_ci_2014",
        "title": "Travis CI outage from queue consumer concurrency increase under load",
        "company": "Travis CI",
        "category": "deployment",
        "severity": "high",
        "keywords": json.dumps(["queue", "concurrency", "deploy", "overload", "timeout", "rollback"]),
        "description": "Increasing queue consumer concurrency to improve build throughput caused message broker saturation. The queue depth grew unbounded, stalling all CI jobs for several hours.",
        "root_cause": "Concurrency increase without load test. Queue depth not monitored. Missing service owner approval.",
        "url": "https://www.travis-ci.com/blog/",
        "source": "curated",
    },
]


def insert_curated(conn: sqlite3.Connection) -> None:
    print("Inserting curated SRE incidents…")
    for row in CURATED:
        insert_incident(conn, row)
    conn.commit()
    print(f"  Inserted {len(CURATED)} curated incidents")


# ── Main ───────────────────────────────────────────────────────────────────────

def main() -> None:
    print(f"DB path: {DB_PATH}")
    if not TOKEN:
        print("[WARN] GITHUB_TOKEN not set β€” will insert curated data only (no API calls)")

    conn = open_db()

    # Always insert hand-curated incidents
    insert_curated(conn)

    # Fetch real post-mortems if token available
    if TOKEN:
        fetch_danluu_postmortems(conn)
        fetch_github_issues(conn)

    # Final count
    (total,) = conn.execute("SELECT COUNT(*) FROM incidents").fetchone()
    cats = conn.execute(
        "SELECT category, COUNT(*) FROM incidents GROUP BY category ORDER BY 2 DESC"
    ).fetchall()
    print(f"\nTotal incidents in DB: {total}")
    for cat, count in cats:
        print(f"  {cat:<25} {count}")

    conn.close()
    print(f"\nDatabase written to {DB_PATH}")


if __name__ == "__main__":
    main()