Spaces:

Sizzing
/

aws_rl_env

Running

File size: 9,856 Bytes

c745a99

"""Push the generated SFT JSONL splits to HuggingFace Hub as a proper dataset.

After upload, consumers can load it with:

    from datasets import load_dataset
    ds = load_dataset("<your-user>/aws-rl-sft")
    ds["train"]        # 1500 rows
    ds["validation"]   # 150 rows
    ds["reserve"]      # 200 held-out rows

Prerequisites:
    pip install "datasets>=2.19" huggingface_hub
    export HF_TOKEN=hf_...           # or `huggingface-cli login`

Usage:
    python data/upload_sft_to_hf.py --repo-id <user>/aws-rl-sft
    python data/upload_sft_to_hf.py --repo-id <user>/aws-rl-sft --private
    python data/upload_sft_to_hf.py --repo-id <user>/aws-rl-sft --skip-push    # dry run
    python data/upload_sft_to_hf.py --repo-id Sizzing/aws-rl-sft --private --token hf_**** # upload to an org repo with explicit token
"""

from __future__ import annotations

import argparse
import json
import os
from pathlib import Path

def _find_repo_root(start: Path) -> Path:
    """Walk up from `start` looking for server/services/tasks/ as a sentinel."""
    for p in [start, *start.parents]:
        if (p / "server" / "services" / "tasks").is_dir():
            return p
    return start


# Repository root, found by walking up from this file's directory; falls back to
# that directory itself when no ancestor contains server/services/tasks/.
REPO_ROOT = _find_repo_root(Path(__file__).resolve().parent)
# Default location of the generated JSONL splits (overridable with --sft-dir).
SFT_DIR = REPO_ROOT / "data" / "sft"

# Split name -> JSONL file name inside the SFT directory.
# build_dataset_dict() iterates this mapping and skips files that are missing.
SPLIT_FILES: dict[str, str] = {
    "train": "aws_rl_sft.train.jsonl",
    "validation": "aws_rl_sft.val.jsonl",
    "reserve": "aws_rl_sft.reserve.jsonl",
}


def load_jsonl(path: Path) -> list[dict]:
    """Read a JSON Lines file and return its rows in file order.

    Each non-blank line is parsed as one JSON document. Lines that are empty
    or contain only whitespace are skipped.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The parsed rows, one entry per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """
    rows: list[dict] = []
    # Decode explicitly as UTF-8: open() otherwise uses the locale's preferred
    # encoding, which mis-decodes non-ASCII content on non-UTF-8 platforms.
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Every line is parsed on its own, so the decoder always
                # reports "line 1"; add the real position within the file.
                raise json.JSONDecodeError(
                    f"{exc.msg} (in {path}, line {lineno})", exc.doc, exc.pos
                ) from exc
    return rows


def build_dataset_dict(sft_dir: Path):
    """Assemble a ``DatasetDict`` from whichever split files exist in `sft_dir`.

    Every entry of ``SPLIT_FILES`` is looked up under `sft_dir`. A file that
    exists is read with ``load_jsonl`` and turned into a ``Dataset``; a file
    that does not exist is reported on stdout and left out of the result, so
    the returned mapping may hold fewer splits than ``SPLIT_FILES`` (possibly
    none).

    ``datasets`` is imported inside the function, so importing this module
    does not by itself require that package.
    """
    from datasets import Dataset, DatasetDict

    loaded = {}
    for split, filename in SPLIT_FILES.items():
        path = sft_dir.joinpath(filename)
        if path.exists():
            rows = load_jsonl(path)
            ds = Dataset.from_list(rows)
            loaded[split] = ds
            print(f"  loaded '{split}': {len(rows)} rows, columns={list(ds.column_names)}")
        else:
            print(f"  skip split '{split}' — {path} not found")
    return DatasetDict(loaded)


# Markdown dataset card: YAML front matter followed by the README body.
# main() uploads this text to the dataset repo as README.md.
# The text is static: the figures in its tables (row counts, share percentages)
# are written out by hand and are not computed from the loaded JSONL files.
DATASET_CARD = """---
language:
- en
license: apache-2.0
size_categories:
- 1K<n<10K
task_categories:
- text-generation
tags:
- aws
- aws-cli
- sft
- lora
- agentic
- rl
- grpo
- tool-use
pretty_name: AWS RL Env SFT
---

# AWS RL Env — SFT Dataset

Supervised fine-tuning dataset for training an LLM agent that operates AWS
infrastructure via the CLI. Built for the **aws-rl-env** reinforcement-learning
environment, which emulates 34 AWS services in-container (MiniStack) and rewards
agents for completing cloud-operations tasks via single-command steps.

Designed as the **cold-start phase** of an SFT → GRPO pipeline:
1. **SFT with LoRA** (this dataset) — command-only assistant targets, lock output format
2. **GRPO on curriculum** — refine policy with online env reward, optionally emerge `<think>` reasoning

## Schema

Each row is one `(state → command)` decision, formatted as HuggingFace chat messages.
Directly compatible with `trl.SFTTrainer` (auto-detects `messages` column and
applies the tokenizer's chat template).

```python
{
    "task_id": int,
    "difficulty": "warmup" | "beginner" | "intermediate" | "advanced" | "expert",
    "source": "success_first_step" | "multi_step_continuation" | "failure_recovery" | "verification" | "hint_usage",
    "step_idx": int,
    "messages": [
        {"role": "system",    "content": "<system prompt>"},
        {"role": "user",      "content": "TASK: ... Step: N ..."},
        {"role": "assistant", "content": "aws ..."},
    ],
}
```

## Composition (by source)

| Source | Share | What it teaches |
|---|---:|---|
| `success_first_step` | ~55% | Canonical command at step 0 given empty state |
| `multi_step_continuation` | ~20% | Step N>0 with prior command history |
| `failure_recovery` | ~15% | Correct command after a plausible mistake (wrong-op, missing-arg, s3-vs-s3api, typo, etc.) |
| `verification` | ~5% | Read-only verify command after task completion |
| `hint_usage` | ~5% | Edge case: assistant requests hint via `aws help --task-hint` |

## Composition (by tier)

| Tier | Share |
|---|---:|
| warmup | ~30% |
| beginner | ~25% |
| intermediate | ~44% |
| advanced | 0% — deferred to GRPO (dynamic resource IDs can't be safely synthesized offline) |
| expert | 0% — deferred to GRPO (policy-crafting / security audits benefit from env reward) |

## Splits

| Split | Rows | Purpose |
|---|---:|---|
| `train` | 1500 | LoRA SFT training |
| `validation` | 150 | Eval loss, early stopping |
| `reserve` | 200 | Held-out; use only if train proves insufficient |

## Quickstart

### Load

```python
from datasets import load_dataset

ds = load_dataset("<your-user>/aws-rl-sft")
print(ds)
```

### Filter by source or tier

```python
easy = ds["train"].filter(lambda r: r["difficulty"] in ("warmup", "beginner"))
recovery = ds["train"].filter(lambda r: r["source"] == "failure_recovery")
```

### Train with `trl.SFTTrainer` + LoRA

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig

model_id = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="bfloat16")

ds = load_dataset("<your-user>/aws-rl-sft")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    peft_config=LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        task_type="CAUSAL_LM",
    ),
    args=SFTConfig(
        output_dir="./sft-ckpt",
        max_seq_length=2048,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        eval_strategy="steps",
        eval_steps=100,
        save_steps=200,
        logging_steps=10,
        bf16=True,
        packing=False,
    ),
)
trainer.train()
trainer.save_model("./sft-ckpt/final")
```

## Generation notes

- **Fully synthetic, no teacher LLM required.** Canonical commands were pulled from
  the env's own test suite (`tests_tasks/test_*.py`), where each task's command
  sequence is already verified to pass the grader with reward 1.0.
- **Failure-recovery rows** use a 5-mistake catalog (wrong-op, missing-arg,
  wrong-service, s3-vs-s3api confusion, character-swap typo) paired with realistic
  AWS CLI error messages.
- **Prompt variance** injected via reward jitter (±0.1), history-window trimming,
  and sampled reset-state outputs so dedup-on-exact-prompt still produces enough
  unique rows.

## License

Apache 2.0. AWS CLI commands themselves are public interface; assistant targets
were generated deterministically from the env's grader test suite.
"""


def _resolve_token(explicit: str | None) -> str | None:
    """Return the first HF token found: --token, then HF_TOKEN, then a cached login."""
    token = explicit or os.getenv("HF_TOKEN")
    if token:
        return token
    try:
        from huggingface_hub import get_token
    except ImportError:
        # huggingface_hub is missing or does not expose get_token(), so there
        # is no cached login we can read.
        return None
    # Picks up the credential stored by `huggingface-cli login`, which the
    # help text below advertises as a supported way to authenticate.
    return get_token()


def main() -> None:
    """CLI entry point: load the JSONL splits and publish them to the Hub.

    Steps:
      1. Parse arguments and check that ``datasets`` is importable.
      2. Unless ``--skip-push`` is given, resolve an HF token (``--token``,
         ``HF_TOKEN``, or a cached ``huggingface-cli login``).
      3. Build the ``DatasetDict`` from ``--sft-dir``.
      4. With ``--skip-push``: save it under ``<sft-dir>/hf_dataset_preview``,
         print one sample row and return.
         Otherwise: push the splits, then upload the dataset card as
         ``README.md``.

    Raises:
        SystemExit: If ``datasets`` is not installed, no token can be found
            for an upload, or no split file was loaded.
    """
    ap = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    ap.add_argument("--repo-id", required=True, help="HF repo id, e.g. username/aws-rl-sft")
    ap.add_argument("--private", action="store_true", help="Create as private repo")
    ap.add_argument("--sft-dir", type=Path, default=SFT_DIR)
    ap.add_argument("--token", default=None, help="HF token (falls back to HF_TOKEN env var)")
    ap.add_argument(
        "--skip-push",
        action="store_true",
        help="Build + save locally, don't upload (useful for testing)",
    )
    args = ap.parse_args()

    try:
        from datasets import DatasetDict  # noqa: F401
    except ImportError:
        # The requirement is quoted: unquoted, a shell treats ">" as an output
        # redirect and installs an unpinned `datasets`.
        raise SystemExit(
            "The 'datasets' library is required. Install it with:\n"
            '  pip install "datasets>=2.19"'
        )

    # A token is only needed for the upload; fail before doing any work.
    token = None
    if not args.skip_push:
        token = _resolve_token(args.token)
        if not token:
            raise SystemExit(
                "No HF token found. Either:\n"
                "  export HF_TOKEN=hf_...\n"
                "  # or\n"
                "  huggingface-cli login\n"
                "  # or pass --token explicitly\n"
                "  # or use --skip-push to build locally without uploading"
            )

    print(f"Loading JSONL splits from {args.sft_dir}...")
    ds_dict = build_dataset_dict(args.sft_dir)
    if not ds_dict:
        raise SystemExit(
            f"No splits loaded from {args.sft_dir}. "
            "Run build_sft_dataset.py first."
        )

    if args.skip_push:
        local_path = args.sft_dir / "hf_dataset_preview"
        ds_dict.save_to_disk(str(local_path))
        print(f"\n--skip-push: saved DatasetDict to {local_path}")
        print(f"Inspect with: datasets.load_from_disk('{local_path}')")
        # Missing split files are skipped while loading, so 'train' may be
        # absent; fall back to the first split that was loaded.
        sample_split = "train" if "train" in ds_dict else next(iter(ds_dict))
        print(f"\nOne sample row from '{sample_split}':")
        if len(ds_dict[sample_split]):
            print(json.dumps(ds_dict[sample_split][0], indent=2)[:800])
        else:
            print("  (split is empty)")
        return

    from huggingface_hub import HfApi

    # The token is handed to each Hub call explicitly. login() is deliberately
    # not called: it would store a one-off --token in the local credential
    # store, replacing whichever account is logged in there.
    print(f"\nPushing to https://huggingface.co/datasets/{args.repo_id}")
    print(f"  private={args.private}")
    ds_dict.push_to_hub(args.repo_id, private=args.private, token=token)

    api = HfApi(token=token)
    # Put the real repo id into the card's code snippets so they can be
    # copy-pasted from the published README.
    card = DATASET_CARD.replace("<your-user>/aws-rl-sft", args.repo_id)
    # NOTE(review): push_to_hub() may already have written a README.md with
    # generated metadata; this upload replaces that file wholesale. Confirm
    # that is intended.
    api.upload_file(
        path_or_fileobj=card.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=args.repo_id,
        repo_type="dataset",
        commit_message="Add dataset card",
    )
    print(f"\nDone. https://huggingface.co/datasets/{args.repo_id}")
    print("\nConsumer usage:")
    print("  from datasets import load_dataset")
    print(f"  ds = load_dataset('{args.repo_id}')")


if __name__ == "__main__":
    main()