"""Generate domain training data from teacher API. This is the single highest-impact thing you can do for Bee. 500 expert-level training samples per domain, generated by the configured teacher provider chain (DeepSeek V4 Pro primary; Anthropic Haiku 4.5 → Gemini 2.5 Flash → GPT-5 fallback per `bee/teacher_providers.py`). Cost depends on which provider serves the run. With DeepSeek V4 Pro on its 75%-off promo (through 2026-05-31) ~200 samples × 6 domains is roughly $5-8; on Haiku 4.5 fallback it's $15-20. Then train LoRA adapters on the data (see train_lora.py). Usage (any of these will work — script defers to `resolve_chain` and picks the highest-priority provider that has a configured key): # DeepSeek primary (recommended — cheapest + strongest): BEE_DEEPSEEK_API_KEY=sk-... python scripts/distill_domains.py # Anthropic fallback (only if DeepSeek key isn't set): BEE_TEACHER_API_KEY=sk-ant-... python scripts/distill_domains.py # Force a specific provider regardless of which keys are set: BEE_TEACHER_PROVIDER=deepseek python scripts/distill_domains.py # Single domain, smaller batch for a fast/cheap test: BEE_DEEPSEEK_API_KEY=... python scripts/distill_domains.py --domain cui_labs --samples 50 """ import argparse import json import logging import os import sys from pathlib import Path # Add project root to path PROJECT_ROOT = Path(__file__).parent.parent sys.path.insert(0, str(PROJECT_ROOT)) from dotenv import load_dotenv load_dotenv(PROJECT_ROOT / ".env") from bee.distillation import DistillationConfig, DistillationPipeline from bee.teacher_providers import describe_chain, is_any_teacher_configured, resolve_primary logging.basicConfig( level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s: %(message)s", ) logger = logging.getLogger("distill") def main(): parser = argparse.ArgumentParser(description="Generate domain training data from teacher API") parser.add_argument("--domain", type=str, default=None, help="Single domain to generate (default: all)") parser.add_argument("--samples", type=int, default=200, help="Samples per domain") parser.add_argument("--output", type=str, default="./data/datasets/distilled", help="Output directory") parser.add_argument("--teacher-model", type=str, default=None, help="Override teacher model") args = parser.parse_args() if not is_any_teacher_configured(): print("ERROR: No teacher provider configured. Set ONE of:") print(" BEE_DEEPSEEK_API_KEY (recommended — primary; DeepSeek V4 Pro)") print(" BEE_TEACHER_API_KEY (Anthropic fallback — Haiku 4.5)") print(" BEE_GOOGLE_API_KEY (Gemini 2.5 Flash fallback)") print(" BEE_OPENAI_API_KEY (GPT-5 fallback)") sys.exit(1) # CUI Labs domains — aligned to cuilabs.io focus areas. `cui_labs` # runs first so identity + portfolio facts are baked into the LoRA # adapter early; without that, a fresh model on Bee Cell still # hallucinates being from HuggingFace / Berkeley on identity prompts. domains = ["cui_labs", "programming", "cybersecurity", "quantum", "fintech", "general"] if args.domain: if args.domain not in domains: print(f"Unknown domain: {args.domain}. Available: {domains}") sys.exit(1) domains = [args.domain] # Defer provider selection to the canonical resolver so DeepSeek V4 # Pro (primary) gets picked automatically when its key is set, with # the Anthropic / Google / OpenAI chain queued for retry on 429/5xx. # Leaving teacher_api_key blank tells DistillationPipeline to call # ResilientTeacherClient.from_env() rather than build a single- # provider client. 
    primary = resolve_primary()
    teacher_model_override = args.teacher_model or os.getenv("BEE_TEACHER_MODEL", "")
    config = DistillationConfig(
        teacher_api_url="",
        teacher_api_key="",
        teacher_model=teacher_model_override or (primary.model if primary else "deepseek-v4-pro"),
        output_dir=args.output,
        samples_per_domain=args.samples,
        domains=domains,
        include_reasoning=True,
        include_corrections=True,
    )

    print("=" * 60)
    print("BEE DOMAIN DISTILLATION")
    print("=" * 60)
    print(f"  Provider chain:  {describe_chain()}")
    print(f"  Domains:         {', '.join(domains)}")
    print(f"  Samples/domain:  {config.samples_per_domain}")
    print(f"  Total samples:   ~{config.samples_per_domain * len(domains)}")
    print(f"  Output:          {config.output_dir}")
    print("=" * 60)

    pipeline = DistillationPipeline(config)
    try:
        results = pipeline.run(domains=domains)

        print("\n" + "=" * 60)
        print("COMPLETE")
        print("=" * 60)
        print(f"  Generated: {results.get('total_generated', 0)} samples")
        print(f"  Errors:    {results.get('total_errors', 0)}")
        print(f"  Output:    {config.output_dir}")
        print("\n  Next step: Train LoRA adapters on this data:")
        print(f"  python scripts/train_lora.py --data {config.output_dir}")
    finally:
        pipeline.close()


if __name__ == "__main__":
    main()
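# ---------------------------------------------------------------------------
# Optional sanity check before training. A minimal sketch, assuming the
# pipeline writes one JSONL file per domain under --output; the filename
# `cui_labs.jsonl` and the record fields are illustrative, not a guarantee
# of what bee.distillation actually emits. Kept as a comment so this
# script's behavior is unchanged; run it separately to eyeball a record
# before handing the directory to train_lora.py:
#
#   import json
#   from pathlib import Path
#
#   first_line = (Path("data/datasets/distilled") / "cui_labs.jsonl").read_text().splitlines()[0]
#   sample = json.loads(first_line)
#   print(sorted(sample.keys()))  # inspect the record schema by hand
# ---------------------------------------------------------------------------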