| """Generate domain training data from teacher API. |
| |
| This is the single highest-impact thing you can do for Bee. |
| 500 expert-level training samples per domain, generated by the configured |
| teacher provider chain (DeepSeek V4 Pro primary; Anthropic Haiku 4.5 β |
| Gemini 2.5 Flash β GPT-5 fallback per `bee/teacher_providers.py`). |
| |
| Cost depends on which provider serves the run. With DeepSeek V4 Pro |
| on its 75%-off promo (through 2026-05-31) ~200 samples Γ 6 domains is |
| roughly $5-8; on Haiku 4.5 fallback it's $15-20. |
| |
| Then train LoRA adapters on the data (see train_lora.py). |
| |
| Usage (any of these will work β script defers to `resolve_chain` and |
| picks the highest-priority provider that has a configured key): |
| |
| # DeepSeek primary (recommended β cheapest + strongest): |
| BEE_DEEPSEEK_API_KEY=sk-... python scripts/distill_domains.py |
| |
| # Anthropic fallback (only if DeepSeek key isn't set): |
| BEE_TEACHER_API_KEY=sk-ant-... python scripts/distill_domains.py |
| |
| # Force a specific provider regardless of which keys are set: |
| BEE_TEACHER_PROVIDER=deepseek python scripts/distill_domains.py |
| |
| # Single domain, smaller batch for a fast/cheap test: |
| BEE_DEEPSEEK_API_KEY=... python scripts/distill_domains.py --domain cui_labs --samples 50 |
| """ |
|
|
import argparse
import logging
import os
import sys
from pathlib import Path

# Make the repo root importable when this script is run from scripts/.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Load environment (provider API keys) from the repo-root .env, if present.
from dotenv import load_dotenv

load_dotenv(PROJECT_ROOT / ".env")
|
|
from bee.distillation import DistillationConfig, DistillationPipeline
from bee.teacher_providers import describe_chain, is_any_teacher_configured, resolve_primary
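# NOTE: a sketch of the expected resolution order, inferred from the docstring
# (the authoritative logic lives in bee/teacher_providers.py):
#   BEE_DEEPSEEK_API_KEY -> DeepSeek V4 Pro (primary)
#   BEE_TEACHER_API_KEY  -> Anthropic Haiku 4.5
#   BEE_GOOGLE_API_KEY   -> Gemini 2.5 Flash
#   BEE_OPENAI_API_KEY   -> GPT-5
# resolve_primary() returns the highest-priority provider with a configured
# key; BEE_TEACHER_PROVIDER, when set, forces a specific provider instead.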
|
|
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
logger = logging.getLogger("distill")
|
|
def main():
    parser = argparse.ArgumentParser(description="Generate domain training data from a teacher API")
    parser.add_argument("--domain", type=str, default=None, help="Single domain to generate (default: all)")
    parser.add_argument("--samples", type=int, default=200, help="Samples per domain")
    parser.add_argument("--output", type=str, default="./data/datasets/distilled", help="Output directory")
    parser.add_argument("--teacher-model", type=str, default=None, help="Override the teacher model name")
    args = parser.parse_args()
|
    if not is_any_teacher_configured():
        print("ERROR: No teacher provider configured. Set ONE of:")
        print("  BEE_DEEPSEEK_API_KEY  (recommended primary: DeepSeek V4 Pro)")
        print("  BEE_TEACHER_API_KEY   (Anthropic fallback: Haiku 4.5)")
        print("  BEE_GOOGLE_API_KEY    (Gemini 2.5 Flash fallback)")
        print("  BEE_OPENAI_API_KEY    (GPT-5 fallback)")
        sys.exit(1)
|
| domains = ["cui_labs", "programming", "cybersecurity", "quantum", "fintech", "general"] |
| if args.domain: |
| if args.domain not in domains: |
| print(f"Unknown domain: {args.domain}. Available: {domains}") |
| sys.exit(1) |
| domains = [args.domain] |
|
|
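    # Teacher model precedence, as wired below: the --teacher-model flag wins,
    # then BEE_TEACHER_MODEL from the environment, then the resolved primary
    # provider's default model, then "deepseek-v4-pro" as a last resort.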
    primary = resolve_primary()
    teacher_model_override = args.teacher_model or os.getenv("BEE_TEACHER_MODEL", "")
|
    config = DistillationConfig(
        # Left blank here; presumably resolved by the provider chain at runtime.
        teacher_api_url="",
        teacher_api_key="",
        teacher_model=teacher_model_override or (primary.model if primary else "deepseek-v4-pro"),
        output_dir=args.output,
        samples_per_domain=args.samples,
        domains=domains,
        include_reasoning=True,
        include_corrections=True,
    )
|
| print("=" * 60) |
| print("BEE DOMAIN DISTILLATION") |
| print("=" * 60) |
| print(f" Provider chain: {describe_chain()}") |
| print(f" Domains: {', '.join(domains)}") |
| print(f" Samples/domain: {config.samples_per_domain}") |
| print(f" Total samples: ~{config.samples_per_domain * len(domains)}") |
| print(f" Output: {config.output_dir}") |
| print("=" * 60) |
|
|
    pipeline = DistillationPipeline(config)
|
    try:
        results = pipeline.run(domains=domains)
        print("\n" + "=" * 60)
        print("COMPLETE")
        print("=" * 60)
        print(f"  Generated: {results.get('total_generated', 0)} samples")
        print(f"  Errors:    {results.get('total_errors', 0)}")
        print(f"  Output:    {config.output_dir}")
        print("\n  Next step: train LoRA adapters on this data:")
        print(f"    python scripts/train_lora.py --data {config.output_dir}")
    finally:
        pipeline.close()
|
|
| if __name__ == "__main__": |
| main() |
|
|