Spaces:

cuilabs
/

bee

Paused

File size: 5,217 Bytes

5e21013

"""Generate domain training data from teacher API.

This is the single highest-impact thing you can do for Bee.
Target: 500 expert-level training samples per domain (note that `--samples`
defaults to 200), generated by the configured teacher provider chain
(DeepSeek V4 Pro primary; Anthropic Haiku 4.5 → Gemini 2.5 Flash → GPT-5
fallback per `bee/teacher_providers.py`).

Cost depends on which provider serves the run. With DeepSeek V4 Pro
on its 75%-off promo (through 2026-05-31) ~200 samples × 6 domains is
roughly $5-8; on Haiku 4.5 fallback it's $15-20.

Then train LoRA adapters on the data (see train_lora.py).

Usage (any of these will work — script defers to `resolve_chain` and
picks the highest-priority provider that has a configured key):

    # DeepSeek primary (recommended — cheapest + strongest):
    BEE_DEEPSEEK_API_KEY=sk-... python scripts/distill_domains.py

    # Anthropic fallback (only if DeepSeek key isn't set):
    BEE_TEACHER_API_KEY=sk-ant-... python scripts/distill_domains.py

    # Force a specific provider regardless of which keys are set:
    BEE_TEACHER_PROVIDER=deepseek python scripts/distill_domains.py

    # Single domain, smaller batch for a fast/cheap test:
    BEE_DEEPSEEK_API_KEY=... python scripts/distill_domains.py --domain cui_labs --samples 50
"""

import argparse
import json
import logging
import os
import sys
from pathlib import Path

# Add project root to path
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")

from bee.distillation import DistillationConfig, DistillationPipeline
from bee.teacher_providers import describe_chain, is_any_teacher_configured, resolve_primary

# Script-wide logging: INFO and above, each record prefixed with a
# timestamp, the emitting logger's name, and the severity.
_LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Named logger for messages emitted by this script itself.
logger = logging.getLogger("distill")


def _positive_int(value: str) -> int:
    """Parse *value* as an integer >= 1 (argparse ``type=`` callable).

    Raises:
        argparse.ArgumentTypeError: if *value* is not an integer or is
            smaller than 1; argparse turns this into a usage error.
    """
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def main() -> None:
    """Parse CLI arguments and run the domain distillation pipeline.

    Builds a ``DistillationConfig`` from the command line and environment,
    prints a run summary, runs ``DistillationPipeline`` over the selected
    domains, and always closes the pipeline afterwards.

    Exits with status 1 (message on stderr) when no teacher provider is
    configured or ``--domain`` is not one of the known domains. argparse
    exits with status 2 on malformed arguments, including a ``--samples``
    value below 1.
    """
    parser = argparse.ArgumentParser(description="Generate domain training data from teacher API")
    parser.add_argument("--domain", type=str, default=None, help="Single domain to generate (default: all)")
    # Reject 0 / negative counts up front instead of handing the pipeline a
    # meaningless batch size and printing a negative total below.
    parser.add_argument("--samples", type=_positive_int, default=200, help="Samples per domain")
    parser.add_argument("--output", type=str, default="./data/datasets/distilled", help="Output directory")
    parser.add_argument("--teacher-model", type=str, default=None, help="Override teacher model")
    args = parser.parse_args()

    if not is_any_teacher_configured():
        # Diagnostics go to stderr so stdout carries only the run report.
        print("ERROR: No teacher provider configured. Set ONE of:", file=sys.stderr)
        print("  BEE_DEEPSEEK_API_KEY  (recommended — primary; DeepSeek V4 Pro)", file=sys.stderr)
        print("  BEE_TEACHER_API_KEY   (Anthropic fallback — Haiku 4.5)", file=sys.stderr)
        print("  BEE_GOOGLE_API_KEY    (Gemini 2.5 Flash fallback)", file=sys.stderr)
        print("  BEE_OPENAI_API_KEY    (GPT-5 fallback)", file=sys.stderr)
        sys.exit(1)

    # CUI Labs domains — aligned to cuilabs.io focus areas. `cui_labs`
    # runs first so identity + portfolio facts are baked into the LoRA
    # adapter early; without that, a fresh model on Bee Cell still
    # hallucinates being from HuggingFace / Berkeley on identity prompts.
    domains = ["cui_labs", "programming", "cybersecurity", "quantum", "fintech", "general"]
    if args.domain:
        if args.domain not in domains:
            print(f"Unknown domain: {args.domain}. Available: {domains}", file=sys.stderr)
            sys.exit(1)
        domains = [args.domain]

    # Defer provider selection to the canonical resolver so DeepSeek V4
    # Pro (primary) gets picked automatically when its key is set, with
    # the Anthropic / Google / OpenAI chain queued for retry on 429/5xx.
    # Leaving teacher_api_key blank tells DistillationPipeline to call
    # ResilientTeacherClient.from_env() rather than build a single-
    # provider client.
    primary = resolve_primary()
    # Precedence: --teacher-model flag, then BEE_TEACHER_MODEL, then the
    # resolved primary provider's model, then a hard-coded default.
    teacher_model_override = args.teacher_model or os.getenv("BEE_TEACHER_MODEL", "")

    config = DistillationConfig(
        teacher_api_url="",
        teacher_api_key="",
        teacher_model=teacher_model_override or (primary.model if primary else "deepseek-v4-pro"),
        output_dir=args.output,
        samples_per_domain=args.samples,
        domains=domains,
        include_reasoning=True,
        include_corrections=True,
    )

    print("=" * 60)
    print("BEE DOMAIN DISTILLATION")
    print("=" * 60)
    print(f"  Provider chain: {describe_chain()}")
    print(f"  Domains:        {', '.join(domains)}")
    print(f"  Samples/domain: {config.samples_per_domain}")
    print(f"  Total samples:  ~{config.samples_per_domain * len(domains)}")
    print(f"  Output:         {config.output_dir}")
    print("=" * 60)

    pipeline = DistillationPipeline(config)

    try:
        results = pipeline.run(domains=domains)
        print("\n" + "=" * 60)
        print("COMPLETE")
        print("=" * 60)
        print(f"  Generated: {results.get('total_generated', 0)} samples")
        print(f"  Errors:    {results.get('total_errors', 0)}")
        print(f"  Output:    {config.output_dir}")
        print("\n  Next step: Train LoRA adapters on this data:")
        print(f"    python scripts/train_lora.py --data {config.output_dir}")
    finally:
        # Release pipeline resources even if the run raised.
        pipeline.close()


# Run the CLI only when executed as a script, not when imported.
# main() returns None, so the process exit status is still 0 on success.
if __name__ == "__main__":
    sys.exit(main())