File size: 5,217 Bytes
5e21013
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Generate domain training data from teacher API.

This is the single highest-impact thing you can do for Bee.
500 expert-level training samples per domain, generated by the configured
teacher provider chain (DeepSeek V4 Pro primary; Anthropic Haiku 4.5 →
Gemini 2.5 Flash → GPT-5 fallback per `bee/teacher_providers.py`).

Cost depends on which provider serves the run. With DeepSeek V4 Pro
on its 75%-off promo (through 2026-05-31) ~200 samples × 6 domains is
roughly $5-8; on Haiku 4.5 fallback it's $15-20.

Then train LoRA adapters on the data (see train_lora.py).

Usage (any of these will work — the script defers to `resolve_chain` and
picks the highest-priority provider that has a configured key):

    # DeepSeek primary (recommended — cheapest + strongest):
    BEE_DEEPSEEK_API_KEY=sk-... python scripts/distill_domains.py

    # Anthropic fallback (only if DeepSeek key isn't set):
    BEE_TEACHER_API_KEY=sk-ant-... python scripts/distill_domains.py

    # Force a specific provider regardless of which keys are set:
    BEE_TEACHER_PROVIDER=deepseek python scripts/distill_domains.py

    # Single domain, smaller batch for a fast/cheap test:
    BEE_DEEPSEEK_API_KEY=... python scripts/distill_domains.py --domain cui_labs --samples 50
"""

import argparse
import json
import logging
import os
import sys
from pathlib import Path

# Add the project root to the import search path. PROJECT_ROOT is the parent
# of the directory that contains this file; inserting it at index 0 makes it
# the first location searched by the imports further down.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Load variables from the project-level .env file before importing `bee`.
# NOTE(review): presumably so the provider API keys described in the module
# docstring are visible to the `bee` modules — confirm against bee's config.
from dotenv import load_dotenv
load_dotenv(PROJECT_ROOT / ".env")

# Project imports, kept after the sys.path insertion above.
from bee.distillation import DistillationConfig, DistillationPipeline
from bee.teacher_providers import describe_chain, is_any_teacher_configured, resolve_primary

# Root logging: INFO level, each record prefixed with timestamp, logger name
# and level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
logger = logging.getLogger("distill")


def main(argv=None):
    """Run the domain-distillation command-line interface.

    Parses the command-line options, verifies that at least one teacher
    provider is configured, builds a ``DistillationConfig`` and runs a
    ``DistillationPipeline`` over the selected domains. A summary banner is
    printed before the run and a result summary after it. The pipeline is
    always closed, even when the run raises.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 when ``--samples`` is smaller than 1
            (reported through ``parser.error``); with status 1 when no
            teacher provider is configured or ``--domain`` is not one of
            the known domains.
    """
    parser = argparse.ArgumentParser(description="Generate domain training data from teacher API")
    parser.add_argument("--domain", type=str, default=None, help="Single domain to generate (default: all)")
    parser.add_argument("--samples", type=int, default=200, help="Samples per domain")
    parser.add_argument("--output", type=str, default="./data/datasets/distilled", help="Output directory")
    parser.add_argument("--teacher-model", type=str, default=None, help="Override teacher model")
    args = parser.parse_args(argv)

    # Reject a non-positive sample count up front instead of handing a
    # meaningless value to the pipeline.
    if args.samples < 1:
        parser.error(f"--samples must be a positive integer (got {args.samples})")

    if not is_any_teacher_configured():
        # Configuration errors go to stderr so they are not mixed into
        # captured stdout.
        print(
            "ERROR: No teacher provider configured. Set ONE of:\n"
            "  BEE_DEEPSEEK_API_KEY  (recommended — primary; DeepSeek V4 Pro)\n"
            "  BEE_TEACHER_API_KEY   (Anthropic fallback — Haiku 4.5)\n"
            "  BEE_GOOGLE_API_KEY    (Gemini 2.5 Flash fallback)\n"
            "  BEE_OPENAI_API_KEY    (GPT-5 fallback)",
            file=sys.stderr,
        )
        sys.exit(1)

    # CUI Labs domains — aligned to cuilabs.io focus areas. `cui_labs`
    # runs first so identity + portfolio facts are baked into the LoRA
    # adapter early; without that, a fresh model on Bee Cell still
    # hallucinates being from HuggingFace / Berkeley on identity prompts.
    domains = ["cui_labs", "programming", "cybersecurity", "quantum", "fintech", "general"]
    if args.domain:
        if args.domain not in domains:
            print(f"Unknown domain: {args.domain}. Available: {domains}", file=sys.stderr)
            sys.exit(1)
        domains = [args.domain]

    # Defer provider selection to the canonical resolver so DeepSeek V4
    # Pro (primary) gets picked automatically when its key is set, with
    # the Anthropic / Google / OpenAI chain queued for retry on 429/5xx.
    # Leaving teacher_api_key blank tells DistillationPipeline to call
    # ResilientTeacherClient.from_env() rather than build a single-
    # provider client.
    primary = resolve_primary()
    # Precedence for the model name: --teacher-model flag, then the
    # BEE_TEACHER_MODEL environment variable, then the resolved primary
    # provider's model, then a hard-coded default.
    teacher_model_override = args.teacher_model or os.getenv("BEE_TEACHER_MODEL", "")

    config = DistillationConfig(
        teacher_api_url="",
        teacher_api_key="",
        teacher_model=teacher_model_override or (primary.model if primary else "deepseek-v4-pro"),
        output_dir=args.output,
        samples_per_domain=args.samples,
        domains=domains,
        include_reasoning=True,
        include_corrections=True,
    )

    print("=" * 60)
    print("BEE DOMAIN DISTILLATION")
    print("=" * 60)
    print(f"  Provider chain: {describe_chain()}")
    print(f"  Domains:        {', '.join(domains)}")
    print(f"  Samples/domain: {config.samples_per_domain}")
    print(f"  Total samples:  ~{config.samples_per_domain * len(domains)}")
    print(f"  Output:         {config.output_dir}")
    print("=" * 60)

    pipeline = DistillationPipeline(config)

    try:
        results = pipeline.run(domains=domains)
        print("\n" + "=" * 60)
        print("COMPLETE")
        print("=" * 60)
        print(f"  Generated: {results.get('total_generated', 0)} samples")
        print(f"  Errors:    {results.get('total_errors', 0)}")
        print(f"  Output:    {config.output_dir}")
        print("\n  Next step: Train LoRA adapters on this data:")
        print(f"    python scripts/train_lora.py --data {config.output_dir}")
    finally:
        # Release the pipeline's resources whether or not the run succeeded.
        pipeline.close()

if __name__ == "__main__":
    main()