"""Generate domain training data from teacher API.
This is the single highest-impact thing you can do for Bee.
500 expert-level training samples per domain, generated by the configured
teacher provider chain (DeepSeek V4 Pro primary; Anthropic Haiku 4.5 β†’
Gemini 2.5 Flash β†’ GPT-5 fallback per `bee/teacher_providers.py`).
Cost depends on which provider serves the run. With DeepSeek V4 Pro
on its 75%-off promo (through 2026-05-31) ~200 samples Γ— 6 domains is
roughly $5-8; on Haiku 4.5 fallback it's $15-20.
Then train LoRA adapters on the data (see train_lora.py).
Usage (any of these will work β€” script defers to `resolve_chain` and
picks the highest-priority provider that has a configured key):
# DeepSeek primary (recommended β€” cheapest + strongest):
BEE_DEEPSEEK_API_KEY=sk-... python scripts/distill_domains.py
# Anthropic fallback (only if DeepSeek key isn't set):
BEE_TEACHER_API_KEY=sk-ant-... python scripts/distill_domains.py
# Force a specific provider regardless of which keys are set:
BEE_TEACHER_PROVIDER=deepseek python scripts/distill_domains.py
# Single domain, smaller batch for a fast/cheap test:
BEE_DEEPSEEK_API_KEY=... python scripts/distill_domains.py --domain cui_labs --samples 50
"""
import argparse
import json
import logging
import os
import sys
from pathlib import Path
# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
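# Load .env from the project root before importing bee modules so the
# teacher-key checks in main() see the BEE_*_API_KEY values.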
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")
from bee.distillation import DistillationConfig, DistillationPipeline
from bee.teacher_providers import describe_chain, is_any_teacher_configured, resolve_primary
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
logger = logging.getLogger("distill")
def main():
parser = argparse.ArgumentParser(description="Generate domain training data from teacher API")
parser.add_argument("--domain", type=str, default=None, help="Single domain to generate (default: all)")
parser.add_argument("--samples", type=int, default=200, help="Samples per domain")
parser.add_argument("--output", type=str, default="./data/datasets/distilled", help="Output directory")
parser.add_argument("--teacher-model", type=str, default=None, help="Override teacher model")
args = parser.parse_args()
if not is_any_teacher_configured():
print("ERROR: No teacher provider configured. Set ONE of:")
print(" BEE_DEEPSEEK_API_KEY (recommended β€” primary; DeepSeek V4 Pro)")
print(" BEE_TEACHER_API_KEY (Anthropic fallback β€” Haiku 4.5)")
print(" BEE_GOOGLE_API_KEY (Gemini 2.5 Flash fallback)")
print(" BEE_OPENAI_API_KEY (GPT-5 fallback)")
sys.exit(1)
# CUI Labs domains, aligned to cuilabs.io focus areas. `cui_labs`
# runs first so identity + portfolio facts are baked into the LoRA
# adapter early; without that, a fresh model on Bee Cell still
# hallucinates being from HuggingFace / Berkeley on identity prompts.
domains = ["cui_labs", "programming", "cybersecurity", "quantum", "fintech", "general"]
if args.domain:
if args.domain not in domains:
print(f"Unknown domain: {args.domain}. Available: {domains}")
sys.exit(1)
domains = [args.domain]
# Defer provider selection to the canonical resolver so DeepSeek V4
# Pro (primary) gets picked automatically when its key is set, with
# the Anthropic / Google / OpenAI chain queued for retry on 429/5xx.
# Leaving teacher_api_key blank tells DistillationPipeline to call
# ResilientTeacherClient.from_env() rather than build a single-
# provider client.
primary = resolve_primary()
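# Model override precedence: --teacher-model flag, then BEE_TEACHER_MODEL,
# then the resolved primary provider's default model.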
teacher_model_override = args.teacher_model or os.getenv("BEE_TEACHER_MODEL", "")
config = DistillationConfig(
teacher_api_url="",
teacher_api_key="",
teacher_model=teacher_model_override or (primary.model if primary else "deepseek-v4-pro"),
output_dir=args.output,
samples_per_domain=args.samples,
domains=domains,
include_reasoning=True,
include_corrections=True,
)
print("=" * 60)
print("BEE DOMAIN DISTILLATION")
print("=" * 60)
print(f" Provider chain: {describe_chain()}")
print(f" Domains: {', '.join(domains)}")
print(f" Samples/domain: {config.samples_per_domain}")
print(f" Total samples: ~{config.samples_per_domain * len(domains)}")
print(f" Output: {config.output_dir}")
print("=" * 60)
pipeline = DistillationPipeline(config)
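# Always close the pipeline when done so any underlying teacher-client
# resources are released, even if generation raises partway through.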
try:
results = pipeline.run(domains=domains)
print("\n" + "=" * 60)
print("COMPLETE")
print("=" * 60)
print(f" Generated: {results.get('total_generated', 0)} samples")
print(f" Errors: {results.get('total_errors', 0)}")
print(f" Output: {config.output_dir}")
print(f"\n Next step: Train LoRA adapters on this data:")
print(f" python scripts/train_lora.py --data {config.output_dir}")
finally:
pipeline.close()
if __name__ == "__main__":
main()