Spaces:

spectralman
/

router-api

Sleeping

App Files Files Community

router-api / greenrouting /data /cascade.py

spectralman

Initial deploy: classifier + FastAPI router

6f0ff99 verified 26 days ago

raw

history blame contribute delete

9.94 kB

	"""Difficulty cascade: runs each query against an ascending ladder of models,
	grades the response, and derives a continuous `min_capable_log_params` label.

	Memory strategy: load one rung at a time, run all queries, dump checkpoint, free
	the weights, then advance to the next rung. Resumes from per-rung JSONL files.
	"""

	from __future__ import annotations

	import json
	import math
	import time
	from dataclasses import asdict, dataclass
	from pathlib import Path
	from typing import Iterable, Optional

	from greenrouting.data.graders import grade
	from greenrouting.data.schema import LabeledQuery, RawQuery


	@dataclass
	class RungResult:
	    """One graded sample: a single rung's response to a single query.

	    Instances are serialised one per line into the per-rung JSONL checkpoint
	    and rebuilt from it on resume, so field names double as the JSON keys.
	    """

	    # Identifier of the ladder rung (model) that produced the response.
	    rung_id: str
	    # Rung size; multiplied by 1e9 downstream, i.e. expressed in billions.
	    params_b: float
	    # Identifier of the query that was answered.
	    query_id: str
	    # Position of this sample among the k samples drawn for the query.
	    sample_index: int
	    # Decoded model output.
	    response: str
	    # Grade assigned to the response by the grader.
	    score: float
	    # Number of newly generated tokens in the response.
	    response_tokens: int


	def _read_raw_manifest(path: str | Path) -> list[RawQuery]:
	    """Load a JSONL manifest into `RawQuery` objects.

	    Each non-blank line is parsed as a JSON object whose keys are passed as
	    keyword arguments to `RawQuery`. Blank / whitespace-only lines are skipped.
	    File order is preserved.
	    """
	    with open(path, "r", encoding="utf-8") as handle:
	        return [
	            RawQuery(**json.loads(raw_line))
	            for raw_line in handle
	            if raw_line.strip()
	        ]


	def _read_rung_checkpoint(path: Path) -> dict[str, list[RungResult]]:
	if not path.exists():
	return {}
	out: dict[str, list[RungResult]] = {}
	with open(path, "r", encoding="utf-8") as f:
	for line in f:
	if not line.strip():
	continue
	row = json.loads(line)
	r = RungResult(**row)
	out.setdefault(r.query_id, []).append(r)
	return out


	def _append_rung_checkpoint(path: Path, result: RungResult) -> None:
	path.parent.mkdir(parents=True, exist_ok=True)
	with open(path, "a", encoding="utf-8") as f:
	f.write(json.dumps(asdict(result)) + "\n")


	def _load_model_and_tokenizer(hf_model: str):
	    """Load the tokenizer and causal LM named by `hf_model`.

	    With CUDA available the model is loaded in bfloat16 with
	    `device_map="auto"`; otherwise in float32 with no device map. A tokenizer
	    without a pad token reuses its EOS token id for padding. The model is put
	    in eval mode.

	    Returns:
	        `(tokenizer, model)`.
	    """
	    # Heavy dependencies are imported lazily so the module imports without them.
	    import torch
	    from transformers import AutoModelForCausalLM, AutoTokenizer

	    on_gpu = torch.cuda.is_available()

	    tokenizer = AutoTokenizer.from_pretrained(hf_model)
	    if tokenizer.pad_token_id is None:
	        tokenizer.pad_token_id = tokenizer.eos_token_id

	    load_kwargs = {
	        "torch_dtype": torch.bfloat16 if on_gpu else torch.float32,
	        "device_map": "auto" if on_gpu else None,
	    }
	    lm = AutoModelForCausalLM.from_pretrained(hf_model, **load_kwargs)
	    lm.eval()
	    return tokenizer, lm


	def _free_model(model) -> None:
	import gc
	del model
	gc.collect()
	try:
	import torch
	if torch.cuda.is_available():
	torch.cuda.empty_cache()
	except Exception:
	pass


	def _format_prompt(tok, query: str) -> str:
	if hasattr(tok, "apply_chat_template") and tok.chat_template:
	messages = [{"role": "user", "content": query}]
	return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	return f"### Instruction:\n{query}\n\n### Response:\n"


	def _generate(tok, model, prompt: str, max_new_tokens: int, temperature: float) -> tuple[str, int]:
	import torch
	inputs = tok(prompt, return_tensors="pt").to(model.device)
	do_sample = temperature > 0
	with torch.no_grad():
	out = model.generate(
	**inputs,
	max_new_tokens=max_new_tokens,
	do_sample=do_sample,
	temperature=temperature if do_sample else 1.0,
	pad_token_id=tok.pad_token_id,
	)
	new_tokens = out[0][inputs["input_ids"].shape[1]:]
	response = tok.decode(new_tokens, skip_special_tokens=True)
	return response.strip(), int(new_tokens.shape[0])


	def run_rung(
	    rung,
	    queries: list[RawQuery],
	    k_samples: int,
	    max_new_tokens: int,
	    temperature_first: float,
	    temperature_resample: float,
	    checkpoint_path: Path,
	    progress: bool = True,
	) -> list[RungResult]:
	    """Run one ladder rung over `queries`, resuming from its checkpoint.

	    Queries that already have `k_samples` results in the checkpoint are
	    skipped; for the rest only the missing sample indices are generated. Every
	    new result is appended to the checkpoint immediately, so an interrupted run
	    can pick up where it stopped. The model is loaded only if work is pending.

	    Args:
	        rung: Rung descriptor; reads `id`, `params_b` and `hf_model`.
	        queries: Queries to answer; reads `id`, `text` and `grader_metadata`.
	        k_samples: Number of samples wanted per query.
	        max_new_tokens: Generation budget, also forwarded to the grader.
	        temperature_first: Temperature for sample index 0.
	        temperature_resample: Temperature for every later sample index.
	        checkpoint_path: JSONL file holding this rung's results.
	        progress: Print one line per generated sample when true.

	    Returns:
	        All results found in the checkpoint plus those generated by this call.
	    """
	    existing = _read_rung_checkpoint(checkpoint_path)
	    pending = [q for q in queries if len(existing.get(q.id, [])) < k_samples]
	    results: list[RungResult] = [r for rs in existing.values() for r in rs]

	    if not pending:
	        return results

	    tok, model = _load_model_and_tokenizer(rung.hf_model)
	    try:
	        for i, q in enumerate(pending):
	            done = len(existing.get(q.id, []))
	            # The prompt depends only on the query, not on the sample index.
	            prompt = _format_prompt(tok, q.text)
	            for s in range(done, k_samples):
	                temp = temperature_first if s == 0 else temperature_resample
	                start = time.time()
	                response, n_tokens = _generate(tok, model, prompt, max_new_tokens, temp)
	                score = grade(response, q.grader_metadata, max_new_tokens=max_new_tokens)
	                rr = RungResult(
	                    rung_id=rung.id,
	                    params_b=rung.params_b,
	                    query_id=q.id,
	                    sample_index=s,
	                    response=response,
	                    score=score,
	                    response_tokens=n_tokens,
	                )
	                # Persist before anything else so a crash loses at most one sample.
	                _append_rung_checkpoint(checkpoint_path, rr)
	                results.append(rr)
	                if progress:
	                    print(
	                        f" [{rung.id}] {i+1}/{len(pending)} sample={s} "
	                        f"score={score:.2f} tok={n_tokens} t={time.time()-start:.1f}s"
	                    )
	    finally:
	        # Release this frame's references *before* the cleanup pass: while
	        # `model` is still bound here, gc.collect()/empty_cache() inside
	        # _free_model cannot reclaim the weights. _free_model only needs to
	        # run its collection steps, so it is handed None.
	        del model, tok
	        _free_model(None)

	    return results


	def derive_difficulty(
	    per_rung: dict[str, list[float]],
	    rung_params_b: dict[str, float],
	    pass_threshold: float,
	) -> float:
	    """Derive a continuous `min_capable_log_params` label for one query.

	    Rungs are ordered by parameter count and each rung with at least one score
	    is summarised by its mean score. The result is a natural log of a
	    parameter count (`params_b * 1e9`):

	    - no rungs configured: `log(8e9)`
	    - no rung has scores: `log(2 * largest configured rung)`
	    - smallest scored rung already meets the threshold: `log(that rung)`
	    - otherwise, at the first scored rung meeting the threshold: linear
	      interpolation in log-params between it and the scored rung just below,
	      at the point where the mean score would cross the threshold
	    - no scored rung meets the threshold: `log(2 * largest scored rung)`
	    """
	    ladder = sorted(rung_params_b.items(), key=lambda item: item[1])
	    if not ladder:
	        return math.log(8e9)

	    # (params_b, mean score) for every rung that actually produced scores.
	    evaluated: list[tuple[float, float]] = []
	    for name, size_b in ladder:
	        rung_scores = per_rung.get(name, [])
	        if rung_scores:
	            evaluated.append((size_b, sum(rung_scores) / len(rung_scores)))

	    if not evaluated:
	        return math.log(ladder[-1][1] * 1e9 * 2)

	    smallest_size, smallest_mean = evaluated[0]
	    if smallest_mean >= pass_threshold:
	        return math.log(smallest_size * 1e9)

	    # Walk adjacent (failing, candidate) pairs up the ladder.
	    for (lo_size, lo_mean), (hi_size, hi_mean) in zip(evaluated, evaluated[1:]):
	        if hi_mean >= pass_threshold:
	            span = max(hi_mean - lo_mean, 1e-6)
	            frac = max(0.0, min(1.0, (pass_threshold - lo_mean) / span))
	            lo_log = math.log(lo_size * 1e9)
	            hi_log = math.log(hi_size * 1e9)
	            return lo_log + frac * (hi_log - lo_log)

	    return math.log(evaluated[-1][0] * 1e9 * 2)


	def derive_length_bucket(response_token_counts: list[int]) -> str:
	    """Bucket responses by their mean token count.

	    Returns "short" for a mean below 100, "medium" for a mean below 400, and
	    "long" otherwise. With no counts at all the bucket defaults to "medium".
	    """
	    if not response_token_counts:
	        return "medium"

	    mean_tokens = sum(response_token_counts) / len(response_token_counts)
	    # Exclusive upper bounds, checked in ascending order.
	    for ceiling, label in ((100, "short"), (400, "medium")):
	        if mean_tokens < ceiling:
	            return label
	    return "long"


	def run_cascade(
	    config,
	    raw_manifest_path: str | Path,
	    capability_labels_path: str | Path,
	    train_path: str | Path,
	    test_path: str | Path,
	    pass_threshold: float = 0.7,
	) -> None:
	    """Run the full difficulty cascade and write the labeled dataset.

	    Steps:
	    1. Load the raw queries and the capability labels.
	    2. For each rung in `config.cascade.rungs` that runs locally, execute
	       `run_rung` with a checkpoint file under `config.output_dir`; other
	       rungs are skipped with a notice.
	    3. For every query with at least one result, derive the difficulty label
	       and length bucket and build a `LabeledQuery`.
	    4. Hand the rows to `write_labeled_dataset` using `config.test_split` and
	       `config.seed`.
	    """
	    from greenrouting.data.builder import (
	        read_capability_labels,
	        write_labeled_dataset,
	    )

	    queries = _read_raw_manifest(raw_manifest_path)
	    cap_labels = read_capability_labels(capability_labels_path)

	    checkpoint_dir = Path(config.output_dir)
	    # rung id -> query id -> samples produced by that rung for that query
	    samples_by_rung: dict[str, dict[str, list[RungResult]]] = {}

	    for rung in config.cascade.rungs:
	        if not rung.runs_locally:
	            print(f"[skip] {rung.id} marked as not runs_locally; configure remote backend.")
	            continue

	        rung_samples = run_rung(
	            rung,
	            queries,
	            k_samples=config.cascade.k_samples,
	            max_new_tokens=config.cascade.max_new_tokens,
	            temperature_first=config.cascade.temperature_first,
	            temperature_resample=config.cascade.temperature_resample,
	            checkpoint_path=checkpoint_dir / f"cascade_{config.profile_name}_{rung.id}.jsonl",
	        )

	        by_query: dict[str, list[RungResult]] = {}
	        for sample in rung_samples:
	            by_query.setdefault(sample.query_id, []).append(sample)
	        samples_by_rung[rung.id] = by_query
	        print(f"[done] rung {rung.id}: {sum(len(v) for v in by_query.values())} samples")

	    # Sizes of every configured rung, including those skipped above.
	    rung_params: dict[str, float] = {}
	    for rung in config.cascade.rungs:
	        rung_params[rung.id] = rung.params_b

	    labeled: list[LabeledQuery] = []
	    for q in queries:
	        per_rung_scores: dict[str, list[float]] = {}
	        token_counts: list[int] = []
	        for rung_id, rung_by_query in samples_by_rung.items():
	            hits = rung_by_query.get(q.id, [])
	            if not hits:
	                continue
	            per_rung_scores[rung_id] = [hit.score for hit in hits]
	            token_counts.extend(hit.response_tokens for hit in hits)

	        # A query no rung answered gets no label at all.
	        if not per_rung_scores:
	            continue

	        mean_scores = {k: sum(v) / len(v) for k, v in per_rung_scores.items()}
	        row = LabeledQuery(
	            raw=q,
	            capabilities=cap_labels.get(q.id, {}),
	            difficulty_log_params=derive_difficulty(per_rung_scores, rung_params, pass_threshold),
	            length_bucket=derive_length_bucket(token_counts),
	            cascade_results={"per_rung_mean_scores": mean_scores},
	        )
	        labeled.append(row)

	    write_labeled_dataset(
	        train_path=train_path,
	        test_path=test_path,
	        rows=labeled,
	        test_split=config.test_split,
	        seed=config.seed,
	    )
	    print(f"[done] wrote {len(labeled)} labeled rows -> {train_path}, {test_path}")