File size: 17,931 Bytes
b85c683
 
 
 
 
b39f0e3
b85c683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b39f0e3
 
b85c683
 
 
b39f0e3
 
 
 
b85c683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b39f0e3
 
 
 
 
 
b85c683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b39f0e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b85c683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b39f0e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b85c683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "runpod>=1.6.0",
#     "requests>=2.28.0",
#     "python-dotenv>=1.0.0",
# ]
# ///
"""
RunPod setup script for Bamboo-1 training.

Usage:
    # Set your RunPod API key
    export RUNPOD_API_KEY="your-api-key"

    # Create a network volume for data
    uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10

    # List volumes
    uv run scripts/runpod_setup.py volume-list

    # Launch training pod with volume
    uv run scripts/runpod_setup.py launch --volume <volume-id>

    # Check pod status
    uv run scripts/runpod_setup.py status

    # Stop pod (pod IDs are printed by `status`)
    uv run scripts/runpod_setup.py stop <pod-id>
"""

import os
from pathlib import Path

import click
import runpod
import requests
from dotenv import load_dotenv

# Load .env file from project root
# (two directory levels above this file, i.e. the parent of the directory
# that contains this script). Runs at import time, before the CLI group reads
# RUNPOD_API_KEY from the environment.
load_dotenv(Path(__file__).parent.parent / ".env")


@click.group()
def cli():
    """RunPod management for Bamboo-1 training."""
    # Group callback: hand the API key from the environment to the RunPod SDK
    # (module-level `runpod.api_key`) so the subcommands can call the API.
    if key := os.environ.get("RUNPOD_API_KEY"):
        runpod.api_key = key
        return

    # Key is missing or empty: abort with a click-formatted error message.
    missing_key_message = (
        "RUNPOD_API_KEY environment variable not set.\n"
        "Get your API key from https://runpod.io/console/user/settings"
    )
    raise click.ClickException(missing_key_message)


def get_ssh_public_key() -> str:
    """Get the user's SSH public key."""
    from pathlib import Path
    for key_file in ["~/.ssh/id_rsa.pub", "~/.ssh/id_ed25519.pub"]:
        path = Path(key_file).expanduser()
        if path.exists():
            return path.read_text().strip()
    return None


# Default images
# DEFAULT_IMAGE is the default value of the --image option on the launch
# commands; BAMBOO1_IMAGE replaces the image when `launch --prebuilt` is given.
DEFAULT_IMAGE = "runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04"
BAMBOO1_IMAGE = "undertheseanlp/bamboo-1:latest"  # Pre-built image with dependencies


@cli.command()
@click.option("--gpu", default="NVIDIA RTX A4000", help="GPU type")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--prebuilt", is_flag=True, help="Use pre-built bamboo-1 image (faster startup)")
@click.option("--disk", default=20, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-training", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
def launch(gpu, image, prebuilt, disk, name, volume, wandb_key, sample, epochs):
    """Launch a RunPod instance for training."""
    # Flow: print the configuration, create the pod through the RunPod SDK,
    # then print the shell command to run on the pod over SSH. The training
    # command is only printed here, never executed by this script.

    # --prebuilt takes precedence over whatever --image was given.
    pod_image = BAMBOO1_IMAGE if prebuilt else image

    for line in (
        "Launching RunPod instance...",
        f"  GPU: {gpu}",
        f"  Image: {pod_image}",
        f"  Disk: {disk}GB",
    ):
        click.echo(line)

    # Training command: optional flags collapse to "" when not requested.
    sample_flag = f" --sample {sample}" if sample > 0 else ""
    wandb_flags = " --wandb --wandb-project bamboo-1" if wandb_key else ""
    train_cmd = "uv run scripts/train.py" + sample_flag + f" --epochs {epochs}" + wandb_flags

    # Environment passed to the pod: W&B credentials and the SSH public key.
    pod_env = {}
    if wandb_key:
        pod_env["WANDB_API_KEY"] = wandb_key

    public_key = get_ssh_public_key()
    if public_key:
        pod_env["PUBLIC_KEY"] = public_key
        click.echo("  SSH key: configured")

    if volume:
        click.echo(f"  Volume: {volume}")

    # NOTE(review): --disk is forwarded as `volume_in_gb`; confirm that this is
    # the intended RunPod disk setting.
    pod = runpod.create_pod(
        name=name,
        image_name=pod_image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=pod_env or None,  # an empty dict is sent as None
        ports="22/tcp",  # expose the SSH port
        network_volume_id=volume,  # None when no network volume was requested
    )

    click.echo("\nPod created!")
    click.echo(f"  ID: {pod['id']}")
    click.echo(f"  Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Print the one-liner the user runs after SSH-ing into the pod.
    rule = "=" * 60
    click.echo("\n" + rule)
    click.echo("SSH into the pod and run this command:")
    click.echo(rule)

    if not prebuilt:
        # Standard image: install uv, clone the repo, sync dependencies, train.
        one_liner = f"""curl -LsSf https://astral.sh/uv/install.sh | sh && source $HOME/.local/bin/env && git clone https://huggingface.co/undertheseanlp/bamboo-1 && cd bamboo-1 && uv sync && {train_cmd}"""
    else:
        # Pre-built image: go straight to the training command.
        one_liner = f"cd /workspace/bamboo-1 && {train_cmd}"

    click.echo(one_liner)
    click.echo(rule)


@cli.command()
def status():
    """Check status of all pods."""
    all_pods = runpod.get_pods()
    if not all_pods:
        click.echo("No active pods.")
        return

    click.echo("Active pods:")
    for entry in all_pods:
        state = entry.get("desiredStatus", "UNKNOWN")
        click.echo(f"\n  {entry['name']} ({entry['id']}): {state}")

        # The `or` fallbacks cover both a missing key and an explicit None.
        port_mappings = (entry.get("runtime") or {}).get("ports") or []
        for mapping in port_mappings:
            # Only the mapping for container port 22 yields an SSH hint.
            if mapping.get("privatePort") != 22:
                continue
            click.echo(f"    SSH: ssh root@{mapping.get('ip')} -p {mapping.get('publicPort')}")


@cli.command()
@click.argument("pod_id")
def stop(pod_id):
    """Stop a pod by ID."""
    announcement = f"Stopping pod {pod_id}..."
    click.echo(announcement)
    # Delegates to the RunPod SDK; if it raises, the confirmation below is
    # never printed.
    runpod.stop_pod(pod_id)
    click.echo("Pod stopped.")


@cli.command()
@click.argument("pod_id")
def terminate(pod_id):
    """Terminate a pod by ID."""
    announcement = f"Terminating pod {pod_id}..."
    click.echo(announcement)
    # Delegates to the RunPod SDK; if it raises, the confirmation below is
    # never printed.
    runpod.terminate_pod(pod_id)
    click.echo("Pod terminated.")


# Named GPU tiers mapped to RunPod GPU type strings (the same kind of value the
# --gpu options accept).
# NOTE(review): not referenced anywhere else in the visible part of this file;
# the VRAM and price notes below are the original author's and are unverified.
GPU_RECOMMENDATIONS = {
    "budget": "NVIDIA RTX A4000",       # 16GB, $0.20/hr - Basic training
    "balanced": "NVIDIA RTX A5000",     # 24GB, $0.30/hr - Good balance (Recommended)
    "fast": "NVIDIA RTX A6000",         # 48GB, $0.50/hr - Larger batches, faster
    "fastest": "NVIDIA A100 80GB PCIe", # 80GB, $1.50/hr - Best for production
}


@cli.command("launch-phobert")
@click.option("--gpu", default="NVIDIA RTX A5000",
              help="GPU type: A4000 (budget), A5000 (balanced), A6000 (fast), A100 (fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB (PhoBERT needs more space)")
@click.option("--name", default="bamboo-1-phobert", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--dataset", type=click.Choice(["udd1", "ud-vtb"]), default="udd1",
              help="Dataset: udd1 or ud-vtb (Trankit benchmark)")
@click.option("--encoder", default="vinai/phobert-base",
              help="Encoder: vinai/phobert-base or vinai/phobert-large")
@click.option("--epochs", default=100, type=int, help="Number of epochs")
@click.option("--sample", default=0, type=int, help="Sample N sentences (0=all)")
@click.option("--batch-size", default=0, type=int, help="Batch size (0=auto based on GPU)")
def launch_phobert(gpu, image, disk, name, volume, wandb_key, dataset, encoder, epochs, sample, batch_size):
    """Launch a RunPod instance for PhoBERT training.

    This launches a pod configured for training the PhoBERT-based dependency parser.
    After the pod starts, SSH in and run the training command printed below.

    GPU Recommendations:
        A4000 (16GB) - Budget option, batch_size=32
        A5000 (24GB) - Recommended balance, batch_size=48-64
        A6000 (48GB) - Fast training, batch_size=64-96
        A100 (80GB) - Fastest, batch_size=128+

    Example:
        uv run scripts/runpod_setup.py launch-phobert
        uv run scripts/runpod_setup.py launch-phobert --gpu "NVIDIA RTX A6000"  # Faster
        uv run scripts/runpod_setup.py launch-phobert --dataset ud-vtb  # Trankit benchmark
        uv run scripts/runpod_setup.py launch-phobert --encoder vinai/phobert-large --gpu "NVIDIA RTX A6000"
    """
    # Flow: resolve the batch size, print the configuration, create the pod
    # through the RunPod SDK, then print the setup and training commands.
    # The commands are only printed; nothing is executed on the pod from here.
    uses_large_encoder = "large" in encoder

    # Auto-select batch size from the GPU name when --batch-size is 0.
    # An explicit --batch-size is used as given, even for a large encoder.
    if batch_size == 0:
        if "A100" in gpu or "H100" in gpu:
            batch_size = 128
        elif "A6000" in gpu:
            batch_size = 64
        elif "A5000" in gpu:
            batch_size = 48
        else:  # A4000 or unknown
            batch_size = 32

        # Halve the automatic choice for a large encoder.
        if uses_large_encoder:
            batch_size = batch_size // 2

    click.echo("Launching RunPod instance for PhoBERT training...")
    click.echo(f"  GPU: {gpu}")
    click.echo(f"  Image: {image}")
    click.echo(f"  Disk: {disk}GB")
    click.echo(f"  Dataset: {dataset}")
    click.echo(f"  Encoder: {encoder}")
    click.echo(f"  Batch size: {batch_size}")

    # Build the training command (always requests --fp16).
    train_cmd = f"uv run scripts/train_phobert.py --encoder {encoder} --dataset {dataset} --epochs {epochs} --batch-size {batch_size} --fp16"
    if sample > 0:
        train_cmd += f" --sample {sample}"
    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Output directory name encodes the dataset and encoder size.
    output_suffix = ""
    if dataset == "ud-vtb":
        output_suffix += "-vtb"
    if uses_large_encoder:
        output_suffix += "-large"
    train_cmd += f" --output models/bamboo-1-phobert{output_suffix}"

    # Environment passed to the pod: W&B credentials and the SSH public key.
    env_vars = {}
    if wandb_key:
        env_vars["WANDB_API_KEY"] = wandb_key

    ssh_key = get_ssh_public_key()
    if ssh_key:
        env_vars["PUBLIC_KEY"] = ssh_key
        click.echo("  SSH key: configured")

    if volume:
        click.echo(f"  Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=env_vars or None,  # an empty dict is sent as None
        ports="22/tcp",  # expose the SSH port
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f"  ID: {pod['id']}")
    click.echo(f"  Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Print the setup and training commands for the user to run over SSH.
    click.echo("\n" + "="*70)
    click.echo("After SSH into the pod, run these commands:")
    click.echo("="*70)

    setup_cmd = """curl -LsSf https://astral.sh/uv/install.sh | sh && \\
source $HOME/.local/bin/env && \\
git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
cd /workspace/bamboo-1 && uv sync"""

    click.echo("\n# 1. Setup (run once):")
    click.echo(setup_cmd)

    click.echo("\n# 2. Train:")
    click.echo(f"cd /workspace/bamboo-1 && {train_cmd}")

    click.echo("\n" + "="*70)

    if dataset == "ud-vtb":
        # Fixed user-facing typo: the heading previously read "Transkit".
        click.echo("\nTrankit benchmark reference:")
        click.echo("  Trankit base:  70.96% UAS / 64.76% LAS")
        click.echo("  Trankit large: 71.07% UAS / 65.37% LAS")
        click.echo("")


# =============================================================================
# Volume Management
# =============================================================================

# RunPod datacenter IDs mapped to human-readable names. The keys are the
# allowed values of `volume-create --datacenter`; `volume-list` uses the names
# for display and falls back to the raw ID for datacenters not listed here.
DATACENTERS = {
    "EU-RO-1": "Europe (Romania)",
    "EU-CZ-1": "Europe (Czech Republic)",
    "EUR-IS-1": "Europe (Iceland)",
    "US-KS-2": "US (Kansas)",
    "US-CA-2": "US (California)",
}


def _graphql_request(query: str, variables: dict | None = None, timeout: float = 30.0) -> dict:
    """Make a GraphQL request to the RunPod API and return the decoded JSON.

    Args:
        query: GraphQL query or mutation document.
        variables: Variables for the document; ``None`` is sent as ``{}``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response body. GraphQL-level failures are returned
        under the ``"errors"`` key for the caller to inspect.

    Raises:
        click.ClickException: If the request fails at the network level
            (connection error, timeout) or the response body is not JSON.
    """
    api_key = os.environ.get("RUNPOD_API_KEY")
    try:
        response = requests.post(
            "https://api.runpod.io/graphql",
            headers={"Authorization": f"Bearer {api_key}"},
            json={"query": query, "variables": variables or {}},
            # Without a timeout, requests can block forever on a stalled connection.
            timeout=timeout,
        )
    except requests.RequestException as exc:
        raise click.ClickException(f"RunPod API request failed: {exc}") from exc

    try:
        return response.json()
    except ValueError as exc:
        # Non-JSON body, e.g. an HTML error page from a proxy or gateway.
        raise click.ClickException(
            f"RunPod API returned a non-JSON response (HTTP {response.status_code})."
        ) from exc


@cli.command("launch-fast")
@click.option("--gpu", default="NVIDIA H100 80GB HBM3", help="GPU type (H100 for fastest)")
@click.option("--image", default=DEFAULT_IMAGE, help="Docker image")
@click.option("--disk", default=30, type=int, help="Disk size in GB")
@click.option("--name", default="bamboo-1-trankit", help="Pod name")
@click.option("--volume", default=None, help="Network volume ID to attach")
@click.option("--wandb-key", envvar="WANDB_API_KEY", help="W&B API key for logging")
@click.option("--encoder", default="vinai/phobert-base", help="Encoder model")
def launch_fast(gpu, image, disk, name, volume, wandb_key, encoder):
    """Launch pod for FAST Trankit reproduction (<5 minutes).

    Trains on UD Vietnamese VTB to reproduce Trankit benchmark:
    - Trankit base:  70.96% UAS / 64.76% LAS
    - Trankit large: 71.07% UAS / 65.37% LAS

    Uses H100 with aggressive settings for <5 min training.

    Example:
        uv run scripts/runpod_setup.py launch-fast
        uv run scripts/runpod_setup.py launch-fast --encoder vinai/phobert-large
    """
    # Flow: pick batch size and epochs from the GPU name, print the
    # configuration, create the pod, then print one setup-and-train command.
    dataset = "ud-vtb"  # fixed: this command always targets UD-VTB
    uses_large_encoder = "large" in encoder

    # Batch size and epoch count are chosen together per GPU family.
    if "H100" in gpu:
        batch_size, epochs = 256, 30
    elif "A100" in gpu:
        batch_size, epochs = 128, 40
    else:
        batch_size, epochs = 64, 50
        click.echo("WARNING: For <5 min training, use H100!")

    # A large encoder gets half the batch size.
    if uses_large_encoder:
        batch_size //= 2

    for line in (
        "Launching FAST Trankit reproduction (<5 minutes)...",
        f"  GPU: {gpu}",
        f"  Batch size: {batch_size}",
        f"  Epochs: {epochs}",
        f"  Dataset: {dataset} (UD Vietnamese VTB)",
        f"  Encoder: {encoder}",
        "",
        "  Target: Trankit base 70.96% UAS / 64.76% LAS",
    ):
        click.echo(line)

    # Output directory name reflects the encoder size.
    output_name = "models/bamboo-1-phobert-vtb" + ("-large" if uses_large_encoder else "")

    # Training command, printed as a backslash-continued multi-line command.
    train_cmd = f"""uv run scripts/train_phobert.py \\
        --encoder {encoder} \\
        --dataset {dataset} \\
        --output {output_name} \\
        --epochs {epochs} \\
        --batch-size {batch_size} \\
        --patience 5 \\
        --warmup-steps 50 \\
        --num-workers 8 \\
        --fp16"""

    if wandb_key:
        train_cmd += " --wandb --wandb-project bamboo-1-phobert"

    # Environment passed to the pod: W&B credentials and the SSH public key.
    pod_env = {}
    if wandb_key:
        pod_env["WANDB_API_KEY"] = wandb_key

    public_key = get_ssh_public_key()
    if public_key:
        pod_env["PUBLIC_KEY"] = public_key
        click.echo("  SSH key: configured")

    if volume:
        click.echo(f"  Volume: {volume}")

    pod = runpod.create_pod(
        name=name,
        image_name=image,
        gpu_type_id=gpu,
        volume_in_gb=disk,
        env=pod_env or None,  # an empty dict is sent as None
        ports="22/tcp",
        network_volume_id=volume,
    )

    click.echo("\nPod created!")
    click.echo(f"  ID: {pod['id']}")
    click.echo(f"  Status: {pod.get('desiredStatus', 'PENDING')}")
    click.echo("\nMonitor at: https://runpod.io/console/pods")

    # Single command that installs uv, clones the repo, syncs and trains.
    rule = "=" * 70
    click.echo("\n" + rule)
    click.echo("SSH in and run this ONE command for <5 min training:")
    click.echo(rule)

    one_liner = f"""curl -LsSf https://astral.sh/uv/install.sh | sh && \\
source $HOME/.local/bin/env && \\
git clone https://huggingface.co/undertheseanlp/bamboo-1 /workspace/bamboo-1 && \\
cd /workspace/bamboo-1 && uv sync && \\
{train_cmd}"""

    click.echo(one_liner)
    click.echo(rule)


@cli.command("volume-list")
def volume_list():
    """List all network volumes."""
    query = """
    query {
        myself {
            networkVolumes {
                id
                name
                size
                dataCenterId
            }
        }
    }
    """
    result = _graphql_request(query)

    # Report API-level failures instead of printing "No network volumes found."
    errors = result.get("errors")
    if errors:
        raise click.ClickException(errors[0].get("message", "Unknown RunPod API error"))

    # Any level may be an explicit null in the response, which a
    # `.get(key, {})` default does not cover; `or` handles both cases.
    account = (result.get("data") or {}).get("myself") or {}
    volumes = account.get("networkVolumes") or []

    if not volumes:
        click.echo("No network volumes found.")
        click.echo("\nCreate one with: uv run scripts/runpod_setup.py volume-create --name bamboo-data --size 10")
        return

    click.echo("Network Volumes:")
    for vol in volumes:
        # Show the friendly datacenter name when known, else the raw ID.
        dc = DATACENTERS.get(vol['dataCenterId'], vol['dataCenterId'])
        click.echo(f"  - {vol['name']} ({vol['id']}): {vol['size']}GB @ {dc}")


@cli.command("volume-create")
@click.option("--name", default="bamboo-data", help="Volume name")
@click.option("--size", default=10, type=int, help="Size in GB")
@click.option("--datacenter", default="EUR-IS-1", type=click.Choice(list(DATACENTERS.keys())), help="Datacenter")
def volume_create(name, size, datacenter):
    """Create a network volume for data storage."""
    click.echo("Creating network volume...")
    click.echo(f"  Name: {name}")
    click.echo(f"  Size: {size}GB")
    click.echo(f"  Datacenter: {DATACENTERS[datacenter]}")

    query = """
    mutation createNetworkVolume($input: CreateNetworkVolumeInput!) {
        createNetworkVolume(input: $input) {
            id
            name
            size
            dataCenterId
        }
    }
    """
    variables = {
        "input": {
            "name": name,
            "size": size,
            "dataCenterId": datacenter
        }
    }

    result = _graphql_request(query, variables)

    # Fail with a non-zero exit status so scripts can detect the error.
    errors = result.get("errors")
    if errors:
        raise click.ClickException(errors[0].get("message", "Unknown RunPod API error"))

    # "data" or the mutation payload may be an explicit null in the response.
    volume = (result.get("data") or {}).get("createNetworkVolume") or {}
    volume_id = volume.get("id")
    if not volume_id:
        raise click.ClickException(
            "RunPod API returned no volume ID; run volume-list to check whether the volume exists."
        )

    click.echo("\nVolume created!")
    click.echo(f"  ID: {volume_id}")
    click.echo(f"\nUse with: uv run scripts/runpod_setup.py launch --volume {volume_id}")


@cli.command("volume-delete")
@click.argument("volume_id")
@click.confirmation_option(prompt="Are you sure you want to delete this volume?")
def volume_delete(volume_id):
    """Delete a network volume."""
    query = """
    mutation deleteNetworkVolume($input: DeleteNetworkVolumeInput!) {
        deleteNetworkVolume(input: $input)
    }
    """
    variables = {"input": {"id": volume_id}}

    result = _graphql_request(query, variables)

    # Fail with a non-zero exit status so scripts can detect that the volume
    # was not deleted.
    errors = result.get("errors")
    if errors:
        raise click.ClickException(errors[0].get("message", "Unknown RunPod API error"))

    click.echo(f"Volume {volume_id} deleted.")


# Script entry point: run the click command group when executed directly.
if __name__ == "__main__":
    cli()