Spaces:

cuilabs
/

bee

Paused

bee / scripts /backfill_cve_completions.py

Bee Deploy

HF Space backend deploy [de0cba5]

5e21013 21 days ago

9.91 kB

	#!/usr/bin/env python3
	"""scripts/backfill_cve_completions.py — generate teacher answers for prompted CVE rows.

	Sister script to backfill_cve_prompts.py. That one generated the
	`payload.prompt` (the question side of a training pair); this one
	generates the `payload.completion` (the answer side).

	Why a separate script
	---------------------
	Splitting prompt-generation from completion-generation lets each pass
	run independently, restart cleanly on failure, and use a different
	teacher per side if quality demands. For Stage-1 cybersec adapter
	training the same Mistral mistral-medium-latest tier handles both — it
	has the depth to write a senior-engineer answer and the Experiment
	tier is free.

	Selection criteria
	------------------
	WHERE kind='cve' AND domain='cybersecurity'
	AND payload ? 'prompt'
	AND NOT (payload ? 'completion')
	ORDER BY KEV-flagged first, then CVSS severity, then recency.

	KEV-first ordering matters because those are the CVEs adversaries are
	ACTIVELY exploiting in the wild — the highest-signal training data we
	have. If the run is interrupted partway, we still get the best rows.

	Idempotent: re-run after Ctrl-C; the `NOT (payload ? 'completion')`
	predicate is the resume cursor.

	Throughput
	----------
	Mistral Experiment tier: 23 RPM account-wide. We pace at 20.
	494 rows / 20 RPM = ~25 minutes for a full backfill.
	"""

	from __future__ import annotations

	import argparse
	import json
	import os
	import sys
	import time
	import urllib.error
	import urllib.request
	from pathlib import Path

	try:
	from dotenv import load_dotenv

	load_dotenv(Path(__file__).resolve().parent.parent / ".env")
	except ImportError:
	pass

	import psycopg
	from psycopg import rows as psycopg_rows

	# Chat-completions URL that call_mistral() POSTs each request to.
	MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
	# Teacher model used unless overridden by --model or BEE_BACKFILL_MODEL.
	DEFAULT_MODEL = "mistral-medium-latest"

	# Requests per minute we pace at; the module docstring cites a 23 RPM
	# account-wide limit, so this leaves a small safety margin.
	RATE_LIMIT_RPM = 20
	# Minimum number of seconds between two consecutive API calls.
	RATE_INTERVAL_S = 60.0 / RATE_LIMIT_RPM

	# This system prompt mirrors the role we want the trained adapter to
	# embody: a senior security engineer who explains root cause,
	# exploitation, mitigation, and detection in concrete technical terms.
	# Every completion becomes one (prompt, completion) training pair, so
	# the answer style here directly shapes the adapter's voice.
	SYSTEM_PROMPT = (
	"You are a senior offensive-and-defensive cybersecurity engineer "
	"answering for a peer-engineer audience. Given a vulnerability "
	"training prompt, write a concrete, technically rigorous answer "
	"that covers (1) root cause, (2) realistic exploitation pattern, "
	"(3) concrete mitigation/patch guidance, and (4) detection signals "
	"(log fields, EDR signatures, network markers). No marketing fluff, "
	"no generic 'apply security best practices' platitudes. Cite "
	"specific configuration keys, function names, or CWE IDs when "
	"relevant. 4-8 sentences total, dense and implementation-ready. "
	"Output the answer body only — no preface, no markdown headings, "
	"no JSON, no fences."
	)


	def strip_markdown_fences(s: str) -> str:
	    """Return *s* without a wrapping Markdown code fence.

	    Some models wrap output in ```…``` even when asked not to. Text that
	    does not start with a fence is returned stripped but otherwise
	    untouched. For fenced text, the body of the first fence is returned:

	    * a first line that is empty or a single info-string token
	      (``json``, ``python3``, ``c++``) is dropped;
	    * a first line containing whitespace is treated as content and kept;
	    * an opening fence with no closing fence (e.g. output truncated by
	      the token limit) is still unwrapped instead of leaking the fence.
	    """
	    s = s.strip()
	    if not s.startswith("```"):
	        return s

	    # s starts with a fence, so split() always yields ["", body, ...];
	    # index 1 is the fenced body whether or not a closing fence exists.
	    inner = s.split("```")[1]
	    first_line, newline, rest = inner.partition("\n")
	    tag = first_line.strip()
	    # An info string is one token made of letters, digits and a few
	    # punctuation characters; anything with spaces is real content.
	    looks_like_tag = all(ch.isalnum() or ch in "+#-_." for ch in tag)
	    if newline and looks_like_tag:
	        return rest.strip()
	    return inner.strip()


	def call_mistral(
	    api_key: str,
	    model: str,
	    user_prompt: str,
	    timeout_s: int = 90,
	) -> tuple[str | None, str | None]:
	    """Request one completion for *user_prompt* from the Mistral chat API.

	    Args:
	        api_key: Bearer token sent in the Authorization header.
	        model: Model identifier placed in the request body.
	        user_prompt: Text sent as the user message after SYSTEM_PROMPT.
	        timeout_s: Socket timeout, in seconds, for the HTTP request.

	    Returns:
	        ``(content, None)`` on success, otherwise ``(None, reason)`` where
	        reason is ``"429"`` (rate limited), ``"http_other"`` (any other
	        HTTP error), ``"fetch"`` (network or decoding failure) or
	        ``"empty"`` (answer missing or shorter than 80 characters).
	        Errors are reported through the return value, never raised.
	    """
	    body = json.dumps(
	        {
	            "model": model,
	            "messages": [
	                {"role": "system", "content": SYSTEM_PROMPT},
	                {"role": "user", "content": user_prompt},
	            ],
	            "max_tokens": 900,
	            "temperature": 0.4,
	        }
	    ).encode("utf-8")
	    req = urllib.request.Request(
	        MISTRAL_ENDPOINT,
	        data=body,
	        method="POST",
	        headers={
	            "Authorization": f"Bearer {api_key}",
	            "Content-Type": "application/json",
	        },
	    )
	    try:
	        with urllib.request.urlopen(req, timeout=timeout_s) as resp:
	            data = json.loads(resp.read().decode("utf-8"))
	    except urllib.error.HTTPError as e:
	        if e.code == 429:
	            # Reported separately so the caller can back off and retry.
	            return None, "429"
	        try:
	            msg = e.read().decode("utf-8")[:200]
	        except Exception:
	            # Best effort only: the error body is purely diagnostic.
	            msg = ""
	        print(f" ! HTTP {e.code}: {msg}", file=sys.stderr)
	        return None, "http_other"
	    except Exception as e:
	        print(f" ! fetch error: {e}", file=sys.stderr)
	        return None, "fetch"

	    content = strip_markdown_fences(_extract_content(data))
	    # Anything under 80 characters cannot be the multi-sentence answer
	    # the system prompt asks for, so treat it as a failed generation.
	    if len(content) < 80:
	        return None, "empty"
	    return content, None


	def _extract_content(data: object) -> str:
	    """Pull the first choice's message text out of a decoded response.

	    Returns "" instead of raising when the payload is not shaped as
	    expected (non-dict body, missing or empty ``choices``, null
	    ``message`` or ``content``).
	    """
	    if not isinstance(data, dict):
	        return ""
	    choices = data.get("choices")
	    if not isinstance(choices, list) or not choices:
	        return ""
	    first = choices[0]
	    message = first.get("message") if isinstance(first, dict) else None
	    content = message.get("content") if isinstance(message, dict) else None
	    if isinstance(content, str):
	        return content
	    if isinstance(content, list):
	        # NOTE(review): content may also arrive as a list of chunk
	        # objects carrying a "text" field; confirm against the API
	        # reference. Non-text chunks are ignored.
	        return "".join(
	            chunk["text"]
	            for chunk in content
	            if isinstance(chunk, dict) and isinstance(chunk.get("text"), str)
	        )
	    return ""


	def count_pending(conn) -> int:
	    """Return the number of CVE rows still waiting for a completion.

	    Counts rows of ``public.training_queue`` with kind 'cve' and domain
	    'cybersecurity' whose payload has a 'prompt' key but no 'completion'
	    key.
	    """
	    query = """
	SELECT count(*)
	FROM public.training_queue
	WHERE kind = 'cve'
	AND domain = 'cybersecurity'
	AND payload ? 'prompt'
	AND NOT (payload ? 'completion')
	"""
	    with conn.cursor() as cursor:
	        cursor.execute(query)
	        first_row = cursor.fetchone()
	    # count(*) always yields exactly one row with a single column.
	    return first_row[0]


	def fetch_rows(conn, limit: int) -> list[dict]:
	    """Fetch up to *limit* pending CVE rows, highest-signal first.

	    Rows are ordered KEV-flagged first, then by CVSS severity
	    (CRITICAL, HIGH, MEDIUM, everything else), then by most recent
	    'published' value, so an interrupted run has already handled the
	    most valuable rows. Each row is a dict with the keys ``id``,
	    ``external_id`` and ``payload``.
	    """
	    query = """
	SELECT id, external_id, payload
	FROM public.training_queue
	WHERE kind = 'cve'
	AND domain = 'cybersecurity'
	AND payload ? 'prompt'
	AND NOT (payload ? 'completion')
	ORDER BY
	CASE WHEN (payload->>'kev')::boolean THEN 0 ELSE 1 END,
	CASE payload->>'cvss_severity'
	WHEN 'CRITICAL' THEN 1
	WHEN 'HIGH' THEN 2
	WHEN 'MEDIUM' THEN 3
	ELSE 9
	END,
	(payload->>'published') DESC NULLS LAST
	LIMIT %s
	"""
	    params = (limit,)
	    # dict_row makes every result addressable by column name.
	    with conn.cursor(row_factory=psycopg_rows.dict_row) as cursor:
	        cursor.execute(query, params)
	        fetched = cursor.fetchall()
	    return list(fetched)


	def update_row(conn, row_id: int, completion: str, model: str) -> None:
	    """Store a generated completion on one training_queue row and commit.

	    Merges ``completion`` and ``completion_model`` into the row's jsonb
	    payload, leaving every other payload key untouched. The commit
	    happens per row, so an interrupted run keeps all finished work.

	    Args:
	        conn: Open database connection (not in autocommit mode).
	        row_id: Primary key of the row to update.
	        completion: Teacher answer text to store.
	        model: Name of the model that produced the answer.
	    """
	    # `||` is the jsonb concatenation operator; keys on the right-hand
	    # side overwrite same-named keys already present in payload.
	    sql = """
	        UPDATE public.training_queue
	        SET payload = payload
	            || jsonb_build_object('completion', %s::text)
	            || jsonb_build_object('completion_model', %s::text)
	        WHERE id = %s
	    """
	    with conn.cursor() as cur:
	        cur.execute(sql, (completion, model, row_id))
	    conn.commit()


	def main() -> int:
	    """Run the completion backfill from the command line.

	    Reads BEE_MISTRAL_API_KEY and POSTGRES_URL_NON_POOLING from the
	    environment, then generates and stores a completion for each pending
	    row until the target count is reached or no candidates remain.

	    Returns:
	        0 on success (including dry-run and nothing-to-do), 1 when a
	        required environment variable is missing or the run was aborted
	        because the API kept answering 429.
	    """
	    parser = argparse.ArgumentParser(description=(__doc__ or "").split("\n\n")[0])
	    parser.add_argument("--limit", type=int, default=None, help="cap total rows enriched")
	    parser.add_argument("--batch", type=int, default=50)
	    parser.add_argument(
	        "--model",
	        default=os.environ.get("BEE_BACKFILL_MODEL", DEFAULT_MODEL),
	    )
	    parser.add_argument("--dry-run", action="store_true")
	    args = parser.parse_args()

	    api_key = (os.environ.get("BEE_MISTRAL_API_KEY") or "").strip()
	    if not api_key:
	        print("ERROR: BEE_MISTRAL_API_KEY not set", file=sys.stderr)
	        return 1
	    pg_url = (os.environ.get("POSTGRES_URL_NON_POOLING") or "").strip()
	    if not pg_url:
	        print("ERROR: POSTGRES_URL_NON_POOLING not set", file=sys.stderr)
	        return 1

	    print(
	        f"Completion backfill — model={args.model} "
	        f"batch={args.batch} pace={RATE_LIMIT_RPM} req/min"
	    )

	    # 429 handling: retry the same row with exponential backoff, and give
	    # up on the whole run once the retries are exhausted so an exhausted
	    # quota cannot spin forever.
	    max_429_retries = 5
	    base_backoff_s = 12.0
	    max_backoff_s = 120.0

	    started = time.monotonic()
	    enriched = 0
	    skipped = 0
	    rate_limited = 0
	    last_call = 0.0
	    # Rows that failed this run still match the selection predicate, so
	    # remember them; otherwise every batch would re-fetch and re-try them.
	    failed_ids = set()
	    aborted = False

	    with psycopg.connect(pg_url, autocommit=False) as conn:
	        pending = count_pending(conn)
	        print(f" pending rows worth completing: {pending}")
	        if args.dry_run:
	            print("dry-run; exiting")
	            return 0

	        target = min(args.limit, pending) if args.limit else pending
	        if target == 0:
	            print("nothing to do")
	            return 0
	        print(f" target this run: {target}")
	        print()

	        while enriched + skipped < target and not aborted:
	            remaining = target - enriched - skipped
	            want = min(args.batch, remaining)
	            if want <= 0:
	                break
	            # Over-fetch by the number of known failures: they sort to
	            # the same positions every time and are filtered out below.
	            fetched = fetch_rows(conn, want + len(failed_ids))
	            # End the read transaction so the connection does not sit
	            # idle-in-transaction during the slow API calls that follow.
	            conn.rollback()
	            rows = [r for r in fetched if r["id"] not in failed_ids][:want]
	            if not rows:
	                break

	            for row in rows:
	                attempts = 0
	                while True:
	                    # Pace every request, retries included.
	                    elapsed = time.monotonic() - last_call
	                    if elapsed < RATE_INTERVAL_S:
	                        time.sleep(RATE_INTERVAL_S - elapsed)
	                    last_call = time.monotonic()

	                    content, err = call_mistral(
	                        api_key, args.model, row["payload"]["prompt"]
	                    )
	                    if err != "429":
	                        break
	                    rate_limited += 1
	                    attempts += 1
	                    if attempts > max_429_retries:
	                        break
	                    backoff = min(base_backoff_s * 2 ** (attempts - 1), max_backoff_s)
	                    print(f" ! 429 — backing off {backoff:.0f}s")
	                    time.sleep(backoff)

	                if err == "429":
	                    print(
	                        f" ! still rate-limited after {max_429_retries} retries — "
	                        "aborting; re-run later to resume",
	                        file=sys.stderr,
	                    )
	                    aborted = True
	                    break
	                if not content:
	                    skipped += 1
	                    failed_ids.add(row["id"])
	                    continue

	                update_row(conn, row["id"], content, args.model)
	                enriched += 1

	                if enriched % 10 == 0 or enriched == target:
	                    elapsed_min = (time.monotonic() - started) / 60.0
	                    rate = enriched / elapsed_min if elapsed_min > 0 else 0
	                    # Skipped rows count toward the target as well.
	                    left = target - enriched - skipped
	                    eta_min = left / rate if rate > 0 else 0
	                    print(
	                        f" enriched {enriched}/{target} "
	                        f"(skipped {skipped}, 429s {rate_limited}, "
	                        f"~{rate:.1f}/min, ETA {eta_min:.1f}min)"
	                    )

	    elapsed_total = time.monotonic() - started
	    print()
	    print(
	        f"Done. enriched={enriched} skipped={skipped} "
	        f"rate_limited={rate_limited} in {elapsed_total/60:.1f} min"
	    )
	    return 1 if aborted else 0


	if __name__ == "__main__":
	    # Propagate main()'s return value as the process exit status.
	    exit_status = main()
	    sys.exit(exit_status)