bee / scripts /backfill_cve_completions.py
Bee Deploy
HF Space backend deploy [de0cba5]
5e21013
#!/usr/bin/env python3
"""scripts/backfill_cve_completions.py — generate teacher answers for prompted CVE rows.
Sister script to backfill_cve_prompts.py. That one generated the
`payload.prompt` (the question side of a training pair); this one
generates the `payload.completion` (the answer side).
Why a separate script
---------------------
Splitting prompt-generation from completion-generation lets each pass
run independently, restart cleanly on failure, and use a different
teacher per side if quality demands. For Stage-1 cybersec adapter
training the same Mistral mistral-medium-latest tier handles both — it
has the depth to write a senior-engineer answer and the Experiment
tier is free.
Selection criteria
------------------
WHERE kind='cve' AND domain='cybersecurity'
AND payload ? 'prompt'
AND NOT (payload ? 'completion')
ORDER BY KEV-flagged first, then CVSS severity, then recency.
KEV-first ordering matters because those are the CVEs adversaries are
ACTIVELY exploiting in the wild — the highest-signal training data we
have. If the run is interrupted partway, we still get the best rows.
Idempotent: re-run after Ctrl-C; the `NOT (payload ? 'completion')`
predicate is the resume cursor.
Throughput
----------
Mistral Experiment tier: 23 RPM account-wide. We pace at 20.
494 rows / 20 RPM = ~25 minutes for a full backfill.
"""
from __future__ import annotations
import argparse
import json
import os
import sys
import time
import urllib.error
import urllib.request
from pathlib import Path
try:
from dotenv import load_dotenv
load_dotenv(Path(__file__).resolve().parent.parent / ".env")
except ImportError:
pass
import psycopg
from psycopg import rows as psycopg_rows
# Mistral chat-completions endpoint and the teacher model used when neither
# --model nor BEE_BACKFILL_MODEL overrides it.
MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
DEFAULT_MODEL = "mistral-medium-latest"

# Client-side pacing: the request budget per minute, and the minimum gap in
# seconds between two consecutive requests that follows from it.
RATE_LIMIT_RPM = 20
RATE_INTERVAL_S = 60.0 / RATE_LIMIT_RPM

# This system prompt mirrors the role we want the trained adapter to
# embody: a senior security engineer who explains root cause,
# exploitation, mitigation, and detection in concrete technical terms.
# Every completion becomes one (prompt, completion) training pair, so
# the answer style here directly shapes the adapter's voice.
SYSTEM_PROMPT = (
    "You are a senior offensive-and-defensive cybersecurity engineer "
    "answering for a peer-engineer audience. Given a vulnerability "
    "training prompt, write a concrete, technically rigorous answer "
    "that covers (1) root cause, (2) realistic exploitation pattern, "
    "(3) concrete mitigation/patch guidance, and (4) detection signals "
    "(log fields, EDR signatures, network markers). No marketing fluff, "
    "no generic 'apply security best practices' platitudes. Cite "
    "specific configuration keys, function names, or CWE IDs when "
    "relevant. 4-8 sentences total, dense and implementation-ready. "
    # The dash below must be a real em-dash: a mis-decoded byte sequence
    # here would be sent verbatim to the model as part of its instructions.
    "Output the answer body only — no preface, no markdown headings, "
    "no JSON, no fences."
)
def strip_markdown_fences(s: str) -> str:
    """Return *s* without a leading Markdown code fence, if it has one.

    Some models wrap output in ```…``` even when asked not to. Text that
    does not start with a fence (after trimming whitespace) is returned
    trimmed but otherwise untouched. For fenced text only the content of
    the first fenced section is kept, and a first line that is empty or
    purely alphabetic (a language tag such as ``json``) is dropped.

    An opening fence with no closing fence, e.g. a reply cut off by the
    token limit, is unwrapped as well instead of being returned with the
    stray backticks still attached.
    """
    text = s.strip()
    if not text.startswith("```"):
        return text
    # Everything between the opening fence and the next fence; when the
    # closing fence is missing, partition() yields the whole remainder.
    inner, _, _ = text[3:].partition("```")
    first_line, newline, rest = inner.partition("\n")
    tag = first_line.strip()
    # Only treat the first line as a tag when there is a body after it.
    if newline and (not tag or tag.isalpha()):
        return rest.strip()
    return inner.strip()
def _extract_message_text(data: object) -> str:
    """Return the first choice's message text from a decoded response, or ""."""
    if not isinstance(data, dict):
        return ""
    choices = data.get("choices")
    if not isinstance(choices, list) or not choices:
        return ""
    first = choices[0]
    message = first.get("message") if isinstance(first, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Chunked content: keep only the pieces that carry a text string.
        return "".join(
            chunk["text"]
            for chunk in content
            if isinstance(chunk, dict) and isinstance(chunk.get("text"), str)
        )
    # null or any other unexpected shape counts as "no answer".
    return ""


def call_mistral(
    api_key: str,
    model: str,
    user_prompt: str,
    timeout_s: int = 90,
) -> tuple[str | None, str | None]:
    """Ask the teacher model to answer *user_prompt*.

    Sends one chat-completions request with ``SYSTEM_PROMPT`` as the system
    message and never raises for transport or response-shape problems.

    Args:
        api_key: Bearer token for the Mistral API.
        model: Model identifier placed in the request body.
        user_prompt: The training prompt to answer.
        timeout_s: Socket timeout for the HTTP request, in seconds.

    Returns:
        ``(content, None)`` on success, otherwise ``(None, reason)`` where
        *reason* is ``"429"`` (rate limited), ``"http_other"`` (any other
        HTTP error), ``"fetch"`` (network or decoding failure) or
        ``"empty"`` (answer missing or shorter than 80 characters).
    """
    body = json.dumps(
        {
            "model": model,
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            "max_tokens": 900,
            "temperature": 0.4,
        }
    ).encode("utf-8")
    req = urllib.request.Request(
        MISTRAL_ENDPOINT,
        data=body,
        method="POST",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        if e.code == 429:
            # Reported separately so the caller can back off.
            return None, "429"
        msg = ""
        try:
            msg = e.read().decode("utf-8")[:200]
        except Exception:
            # The error body is only diagnostic; log without it.
            msg = ""
        print(f"  ! HTTP {e.code}: {msg}", file=sys.stderr)
        return None, "http_other"
    except Exception as e:
        print(f"  ! fetch error: {e}", file=sys.stderr)
        return None, "fetch"
    # Parsing is defensive: a null message/content or an unexpected payload
    # shape must become an "empty" result, not an uncaught exception that
    # aborts the whole backfill.
    content = strip_markdown_fences(_extract_message_text(data))
    if len(content) < 80:
        return None, "empty"
    return content, None
def count_pending(conn) -> int:
    """Count the CVE rows that have a prompt but no completion yet.

    Uses the same selection predicate as the fetch query, so the result is
    the number of rows a full run would still have to process.
    """
    query = """
        SELECT count(*)
        FROM public.training_queue
        WHERE kind = 'cve'
          AND domain = 'cybersecurity'
          AND payload ? 'prompt'
          AND NOT (payload ? 'completion')
    """
    with conn.cursor() as cur:
        cur.execute(query)
        first_row = cur.fetchone()
        return first_row[0]
def fetch_rows(conn, limit: int) -> list[dict]:
    """Fetch up to *limit* pending rows, highest-signal first.

    Ordering: KEV-flagged rows (actively exploited) come first, then
    CRITICAL, HIGH and MEDIUM severity, then the most recently published.
    A run that stops early has therefore already handled the best rows.

    Each row is returned as a dict with ``id``, ``external_id`` and
    ``payload`` keys.
    """
    query = """
        SELECT id, external_id, payload
        FROM public.training_queue
        WHERE kind = 'cve'
          AND domain = 'cybersecurity'
          AND payload ? 'prompt'
          AND NOT (payload ? 'completion')
        ORDER BY
          CASE WHEN (payload->>'kev')::boolean THEN 0 ELSE 1 END,
          CASE payload->>'cvss_severity'
            WHEN 'CRITICAL' THEN 1
            WHEN 'HIGH' THEN 2
            WHEN 'MEDIUM' THEN 3
            ELSE 9
          END,
          (payload->>'published') DESC NULLS LAST
        LIMIT %s
    """
    params = (limit,)
    with conn.cursor(row_factory=psycopg_rows.dict_row) as cur:
        cur.execute(query, params)
        fetched = cur.fetchall()
        return list(fetched)
def update_row(conn, row_id: int, completion: str, model: str) -> None:
    """Store one generated answer and commit it immediately.

    Concatenates ``completion`` and ``completion_model`` keys onto the
    payload of the row with the given id. Committing per row means an
    interrupted run keeps everything written so far.
    """
    statement = """
        UPDATE public.training_queue
        SET payload = payload
          || jsonb_build_object('completion', %s::text)
          || jsonb_build_object('completion_model', %s::text)
        WHERE id = %s
    """
    params = (completion, model, row_id)
    with conn.cursor() as cur:
        cur.execute(statement, params)
    conn.commit()
def main() -> int:
    """Command-line entry point: generate and store completions for pending rows.

    Reads ``BEE_MISTRAL_API_KEY`` and ``POSTGRES_URL_NON_POOLING`` from the
    environment, counts the rows still missing a completion, then works
    through them in priority order at ``RATE_LIMIT_RPM`` requests per
    minute, committing each answer as soon as it is produced.

    Returns:
        0 on a normal finish (including ``--dry-run`` and an empty queue),
        1 when a required environment variable is missing.
    """
    # __doc__ is None under ``python -OO``; fall back to an empty description.
    parser = argparse.ArgumentParser(description=(__doc__ or "").split("\n\n")[0])
    parser.add_argument("--limit", type=int, default=None, help="cap total rows enriched")
    parser.add_argument("--batch", type=int, default=50)
    parser.add_argument(
        "--model",
        default=os.environ.get("BEE_BACKFILL_MODEL", DEFAULT_MODEL),
    )
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    api_key = (os.environ.get("BEE_MISTRAL_API_KEY") or "").strip()
    if not api_key:
        print("ERROR: BEE_MISTRAL_API_KEY not set", file=sys.stderr)
        return 1
    pg_url = (os.environ.get("POSTGRES_URL_NON_POOLING") or "").strip()
    if not pg_url:
        print("ERROR: POSTGRES_URL_NON_POOLING not set", file=sys.stderr)
        return 1

    print(
        f"Completion backfill — model={args.model} "
        f"batch={args.batch} pace={RATE_LIMIT_RPM} req/min"
    )
    started = time.monotonic()
    enriched = 0
    skipped = 0
    rate_limited = 0
    last_call = 0.0
    # Ids that already failed in this run. They still match the selection
    # predicate, so without this set every later batch would fetch and retry
    # them again, burning API calls and the run's row budget.
    failed_ids: set[int] = set()

    with psycopg.connect(pg_url, autocommit=False) as conn:
        pending = count_pending(conn)
        print(f"  pending rows worth completing: {pending}")
        if args.dry_run:
            print("dry-run; exiting")
            return 0

        # ``is not None`` so that an explicit ``--limit 0`` means zero rows
        # rather than "no limit"; negative limits are clamped to zero.
        target = pending if args.limit is None else max(0, min(args.limit, pending))
        if target == 0:
            print("nothing to do")
            return 0
        print(f"  target this run: {target}")
        print()

        while enriched + skipped < target:
            remaining = target - enriched - skipped
            want = min(args.batch, remaining)
            if want < 1:
                # Non-positive --batch: nothing can be fetched.
                break
            # Over-fetch by the number of known failures, then drop them, so
            # the batch consists of rows not yet attempted unsuccessfully.
            fetched = fetch_rows(conn, want + len(failed_ids))
            rows = [r for r in fetched if r["id"] not in failed_ids][:want]
            if not rows:
                break
            for row in rows:
                # Keep at least RATE_INTERVAL_S between request starts.
                elapsed = time.monotonic() - last_call
                if elapsed < RATE_INTERVAL_S:
                    time.sleep(RATE_INTERVAL_S - elapsed)
                last_call = time.monotonic()
                content, err = call_mistral(api_key, args.model, row["payload"]["prompt"])
                if err == "429":
                    # Not counted as skipped: the row stays pending and is
                    # picked up again by a later fetch.
                    # NOTE(review): a persistent 429 keeps this loop alive
                    # indefinitely; consider capping consecutive retries.
                    rate_limited += 1
                    print("  ! 429 — backing off 12s")
                    time.sleep(12.0)
                    continue
                if not content:
                    skipped += 1
                    failed_ids.add(row["id"])
                    continue
                update_row(conn, row["id"], content, args.model)
                enriched += 1
                if enriched % 10 == 0 or enriched == target:
                    elapsed_min = (time.monotonic() - started) / 60.0
                    rate = enriched / elapsed_min if elapsed_min > 0 else 0
                    eta_min = (target - enriched) / rate if rate > 0 else 0
                    print(
                        f"  enriched {enriched}/{target} "
                        f"(skipped {skipped}, 429s {rate_limited}, "
                        f"~{rate:.1f}/min, ETA {eta_min:.1f}min)"
                    )

    elapsed_total = time.monotonic() - started
    print()
    print(
        f"Done. enriched={enriched} skipped={skipped} "
        f"rate_limited={rate_limited} in {elapsed_total/60:.1f} min"
    )
    return 0
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())