"""Generate domain training data from teacher API.
This is the single highest-impact thing you can do for Bee.
500 expert-level training samples per domain, generated by the configured
teacher provider chain (DeepSeek V4 Pro primary; Anthropic Haiku 4.5 β†’
Gemini 2.5 Flash β†’ GPT-5 fallback per `bee/teacher_providers.py`).
Cost depends on which provider serves the run. With DeepSeek V4 Pro
on its 75%-off promo (through 2026-05-31) ~200 samples Γ— 6 domains is
roughly $5-8; on Haiku 4.5 fallback it's $15-20.
Then train LoRA adapters on the data (see train_lora.py).
Usage (any of these will work β€” script defers to `resolve_chain` and
picks the highest-priority provider that has a configured key):
# DeepSeek primary (recommended β€” cheapest + strongest):
BEE_DEEPSEEK_API_KEY=sk-... python scripts/distill_domains.py
# Anthropic fallback (only if DeepSeek key isn't set):
BEE_TEACHER_API_KEY=sk-ant-... python scripts/distill_domains.py
# Force a specific provider regardless of which keys are set:
BEE_TEACHER_PROVIDER=deepseek python scripts/distill_domains.py
# Single domain, smaller batch for a fast/cheap test:
BEE_DEEPSEEK_API_KEY=... python scripts/distill_domains.py --domain cui_labs --samples 50
"""
import argparse
import json
import logging
import os
import sys
from pathlib import Path
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
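# Load .env from the project root before importing bee modules so the
# teacher-key checks in main() see the BEE_*_API_KEY values.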
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")
from bee.distillation import DistillationConfig, DistillationPipeline
from bee.teacher_providers import describe_chain, is_any_teacher_configured, resolve_primary
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
logger = logging.getLogger("distill")
def main():
parser = argparse.ArgumentParser(description="Generate domain training data from teacher API")
parser.add_argument("--domain", type=str, default=None, help="Single domain to generate (default: all)")
parser.add_argument("--samples", type=int, default=200, help="Samples per domain")
parser.add_argument("--output", type=str, default="./data/datasets/distilled", help="Output directory")
parser.add_argument("--teacher-model", type=str, default=None, help="Override teacher model")
args = parser.parse_args()
if not is_any_teacher_configured():
print("ERROR: No teacher provider configured. Set ONE of:")
print(" BEE_DEEPSEEK_API_KEY (recommended β€” primary; DeepSeek V4 Pro)")
print(" BEE_TEACHER_API_KEY (Anthropic fallback β€” Haiku 4.5)")
print(" BEE_GOOGLE_API_KEY (Gemini 2.5 Flash fallback)")
print(" BEE_OPENAI_API_KEY (GPT-5 fallback)")
sys.exit(1)
# CUI Labs domains, aligned to cuilabs.io focus areas. `cui_labs`
# runs first so identity + portfolio facts are baked into the LoRA
# adapter early; without that, a fresh model on Bee Cell still
# hallucinates being from HuggingFace / Berkeley on identity prompts.
domains = ["cui_labs", "programming", "cybersecurity", "quantum", "fintech", "general"]
if args.domain:
if args.domain not in domains:
print(f"Unknown domain: {args.domain}. Available: {domains}")
sys.exit(1)
domains = [args.domain]
# Defer provider selection to the canonical resolver so DeepSeek V4
# Pro (primary) gets picked automatically when its key is set, with
# the Anthropic / Google / OpenAI chain queued for retry on 429/5xx.
# Leaving teacher_api_key blank tells DistillationPipeline to call
# ResilientTeacherClient.from_env() rather than build a single-
# provider client.
primary = resolve_primary()
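# Model override precedence: --teacher-model flag, then BEE_TEACHER_MODEL,
# then the resolved primary provider's default model.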
teacher_model_override = args.teacher_model or os.getenv("BEE_TEACHER_MODEL", "")
config = DistillationConfig(
teacher_api_url="",
teacher_api_key="",
teacher_model=teacher_model_override or (primary.model if primary else "deepseek-v4-pro"),
output_dir=args.output,
samples_per_domain=args.samples,
domains=domains,
include_reasoning=True,
include_corrections=True,
)
print("=" * 60)
print("BEE DOMAIN DISTILLATION")
print("=" * 60)
print(f" Provider chain: {describe_chain()}")
print(f" Domains: {', '.join(domains)}")
print(f" Samples/domain: {config.samples_per_domain}")
print(f" Total samples: ~{config.samples_per_domain * len(domains)}")
print(f" Output: {config.output_dir}")
print("=" * 60)
pipeline = DistillationPipeline(config)
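# Always close the pipeline when done so any underlying teacher-client
# resources are released, even if generation raises partway through.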
try:
results = pipeline.run(domains=domains)
print("\n" + "=" * 60)
print("COMPLETE")
print("=" * 60)
print(f" Generated: {results.get('total_generated', 0)} samples")
print(f" Errors: {results.get('total_errors', 0)}")
print(f" Output: {config.output_dir}")
print(f"\n Next step: Train LoRA adapters on this data:")
print(f" python scripts/train_lora.py --data {config.output_dir}")
finally:
pipeline.close()
if __name__ == "__main__":
main()