Upload folder using huggingface_hub

a15ae41 verified 13 days ago

9.92 kB

	"""Upload artifacts to HuggingFace.

	Creates two repos under prometheus04:
	1. microagent-train-v2 (dataset) -- the 26,627-trajectory JSONL + dataset card
	2. qwen3-4b-thinking-microagent (model) -- scripts, docs, README

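	The HF token must be set via the $HF_TOKEN environment variable.

	Run from the project root: all local paths are relative to the current
	working directory. Reads data/microagent_train_v2.jsonl, scripts/ and docs/;
	data/pipeline_v2_log.txt is uploaded only if it exists.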
	"""
	from __future__ import annotations

	import os
	import sys
	from pathlib import Path

	from huggingface_hub import HfApi, create_repo


	# Hub namespace (account) under which both repositories are created.
	USER = "prometheus04"
	# Fully qualified repo ids in "<namespace>/<name>" form.
	DATASET_REPO = "/".join((USER, "microagent-train-v2"))
	MODEL_REPO = "/".join((USER, "qwen3-4b-thinking-microagent"))


	# README.md content for the dataset repo: YAML front matter followed by the
	# Markdown body. Table rows use plain `|` delimiters -- a backslash-escaped
	# pipe is an invalid Python escape sequence and stops Markdown from treating
	# the row as table cells. Doubled backslashes below are intentional: they
	# put a literal backslash into the rendered card.
	DATASET_CARD = """---
	license: other
	license_name: nvidia-open-model
	license_link: https://huggingface.co/datasets/nvidia/Nemotron-Terminal-Corpus
	language: en
	size_categories:
	- 10K<n<100K
	task_categories:
	- text-generation
	tags:
	- agentic
	- terminal-bench
	- sft
	- bash
	- tool-use
	- reasoning
	---

	# microagent-train-v2

	Curated SFT corpus for training a terminal/bash agent. Derived from
	[`nvidia/Nemotron-Terminal-Corpus`](https://huggingface.co/datasets/nvidia/Nemotron-Terminal-Corpus)
	with a custom code-specific filter that recovers parse-error trajectories.

	## Quick numbers

	- 26,627 trajectories
	- ~244M tokens (avg 36.7k chars/trajectory)
	- 94.9% `<finish>` endings (successful completion)
	- 5.1% `<give_up>` endings (Nvidia-style informative failures)
	- 81.7% multi-turn (≥6 turns), avg ~8.5 turns
	- Math-free (math.parquet dropped — 4B base already knows math)

	## Source mix

	| Source | Trajectories | % |
	|---|---|---|
	| code.parquet | 12,875 | 48.4% |
	| skill_based_medium | 4,916 | 18.5% |
	| skill_based_easy | 4,804 | 18.0% |
	| swe.parquet | 3,841 | 14.4% |
	| skill_based_mixed | 191 | 0.7% |

	## Format

	Each line is a JSON object:

	```json
	{
	"conversations": [
	{"role": "system", "content": "You are a terminal agent..."},
	{"role": "user", "content": "TASK:\\n...\\n\\nInitial state:\\n..."},
	{"role": "assistant", "content": "<think>...</think>\\n<bash>...</bash>"},
	{"role": "user", "content": "<observation>"},
	...
	{"role": "assistant", "content": "<think>...</think>\\n<finish>...</finish>"}
	],
	"task": "...",
	"episode": "...",
	"source_config": "code.parquet",
	"n_assistant_turns": 7,
	"ending_mode": "finish"
	}
	```

	`ending_mode` is `"finish"` or `"give_up"`.

	## Key filter innovations

	1. Math dropped entirely — removed 24,718 dilutive shell-as-calculator samples
	2. Code recovery path — kept trajectories where the original Nemotron agent
	emitted malformed JSON once but recovered (yielded 14× more code data than
	strict filter)
	3. Parse-error observation translation — original Terminus-2 "parsing error"
	messages rewritten as generic `[FORMAT ERROR]` messages that reference the
	new XML tag format
	4. `<give_up>` ending tag — failed trajectories with ≥5 turns, ≥3 distinct
	commands, and a real observed error get accepted with retrospective give-up
	summary (e.g., `tried 4 distinct approaches; last failure: ModuleNotFoundError: No module named 'numpy'`)
	5. Per-task cap of 5 — prevents over-represented common tasks from dominating

	See the full filter design in the project repo:
	[prometheus04/qwen3-4b-thinking-microagent](https://huggingface.co/prometheus04/qwen3-4b-thinking-microagent)

	## Reproduction

	```bash
	git clone https://huggingface.co/prometheus04/qwen3-4b-thinking-microagent
	cd qwen3-4b-thinking-microagent
	export HF_TOKEN=hf_xxx
	python scripts/run_pipeline_v2.py --out data/microagent_train_v2.jsonl
	python scripts/rewrite_giveups.py data/microagent_train_v2.jsonl data/_tmp.jsonl
	mv data/_tmp.jsonl data/microagent_train_v2.jsonl
	```

	## Intended use

	Fine-tuning small LMs (4–8B) into terminal agents for benchmarks like
	Terminal-Bench 2.0. Target task: drive a real bash shell to complete coding,
	SWE-style, and general system administration tasks.

	## License

	This corpus is derived from `nvidia/Nemotron-Terminal-Corpus` and inherits
	the upstream NVIDIA Open Model License. The filter/conversion code is MIT.
	"""


	# README.md content for the model/code repo: YAML front matter followed by
	# the Markdown body. Table rows use plain `|` delimiters -- a
	# backslash-escaped pipe is an invalid Python escape sequence and stops
	# Markdown from treating the row as table cells. The doubled backslash in the
	# shell snippet is intentional: it yields a single line-continuation
	# backslash in the rendered card.
	MODEL_CARD = """---
	license: apache-2.0
	base_model: Qwen/Qwen3-4B-Thinking-2507
	language: en
	library_name: peft
	tags:
	- agentic
	- terminal-bench
	- sft
	- lora
	- qwen3
	- tool-use
	- bash
	- reasoning
	datasets:
	- prometheus04/microagent-train-v2
	---

	# qwen3-4b-thinking-microagent

	LoRA SFT pipeline + scripts + docs for fine-tuning
	[`Qwen/Qwen3-4B-Thinking-2507`](https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507)
	into a terminal agent.

	Target: beat 13% on Terminal-Bench 2.0 with a single A100-40GB.

	## What's in this repo

	| Path | What |
	|---|---|
	| `README.md` | top-level overview |
	| `docs/PROJECT_OVERVIEW.md` | project goals + status |
	| `docs/DATA_PIPELINE.md` | how the training corpus is built |
	| `docs/FILTER_DESIGN.md` | filter rules deep dive |
	| `docs/MODEL_SELECTION.md` | why Qwen3-4B-Thinking-2507 vs alternatives |
	| `docs/HPC_PRINCIPLES.md` | single-A100 training optimization playbook |
	| `docs/REPRODUCIBILITY.md` | step-by-step reproduction guide |
	| `docs/VAST_AI_SETUP.md` | running on cheap rental A100s |
	| `docs/CHANGELOG.md` | v1 → v2 changes |
	| `scripts/run_pipeline_v2.py` | builds the training corpus |
	| `scripts/convert_code_v2.py` | code-specific filter (recovery + give_up) |
	| `scripts/rewrite_giveups.py` | retrospective give_up rewriter |
	| `scripts/train_v2.py` | HPC-grade LoRA training (Unsloth + packing + FA2) |
	| `scripts/setup_a100.sh` | one-shot A100 installer |
	| `scripts/merge_lora.py` | adapter → merged model for vLLM serving |
	| `data/pipeline_v2_log.txt` | full v2 pipeline run log |

	## Training corpus

	Lives in a separate repo:
	[`prometheus04/microagent-train-v2`](https://huggingface.co/datasets/prometheus04/microagent-train-v2)
	(26,627 trajectories, ~1 GB).

	## Why this exists

	There's a lot of public commentary about training small agents on terminal-style
	data. There's much less executable code you can run. This repo is the
	end-to-end recipe — corpus build, filter design rationale, HPC-optimized training,
	and the reasoning behind every choice.

	## Headline numbers (corpus)

	- 26,627 trajectories, ~244M training tokens
	- 81.7% multi-turn (≥6 turns), avg ~8.5 assistant turns
	- 5.1% `<give_up>` examples for honest failure handling
	- Math content: 0% (deliberately dropped)
	- Code content: 48.4%

	## Headline numbers (training, projected)

	- A100-40GB single-GPU
	- 4–5 hours wall time for 1 epoch
	- ~$5 cost on Vast.ai
	- ~80MB final LoRA adapter

	## How to run

	See [`docs/REPRODUCIBILITY.md`](https://huggingface.co/prometheus04/qwen3-4b-thinking-microagent/blob/main/docs/REPRODUCIBILITY.md)
	for the full step-by-step.

	Short version:
	```bash
	git clone https://huggingface.co/prometheus04/qwen3-4b-thinking-microagent
	cd qwen3-4b-thinking-microagent
	huggingface-cli download prometheus04/microagent-train-v2 \\
	--repo-type dataset --local-dir data
	bash scripts/setup_a100.sh
	python scripts/train_v2.py --output-dir runs/v1 --epochs 1.0
	```

	## Format the model learns

	```
	<think>brief reasoning</think>
	<bash>shell commands</bash>
	```

	Or to end:
	```
	<think>verification</think>
	<finish>one-line summary</finish>
	```

	Or honest stop:
	```
	<think>three approaches all failed; out of turns</think>
	<give_up>tried 3 distinct approaches; last failure: NameError: name 'x' is not defined</give_up>
	```

	## License

	MIT for code. Base model is Apache 2.0. Training corpus derived from Nvidia's
	Nemotron-Terminal-Corpus (NVIDIA Open Model License).
	"""


	def main() -> None:
	    """Publish the dataset repo and the model/code repo to the Hugging Face Hub.

	    Steps:
	      1. Read the API token from ``$HF_TOKEN``.
	      2. Verify that every required local input exists -- before any network
	         call, so a missing file cannot leave a half-published state.
	      3. Create ``DATASET_REPO`` if missing; upload its card and the JSONL.
	      4. Create ``MODEL_REPO`` if missing; upload its card, ``scripts/``,
	         ``docs/`` and, when present, the pipeline run log.

	    All local paths are relative to the current working directory.

	    Raises:
	        SystemExit: if ``HF_TOKEN`` is unset or empty, or if a required local
	            input (the JSONL file, ``scripts/`` or ``docs/``) is missing.
	    """
	    token = os.environ.get("HF_TOKEN")
	    if not token:
	        sys.exit("HF_TOKEN env var not set")

	    jsonl = Path("data/microagent_train_v2.jsonl")
	    scripts_dir = Path("scripts")
	    docs_dir = Path("docs")
	    log = Path("data/pipeline_v2_log.txt")  # optional

	    # Fail fast on missing inputs: checking these only after the repos have
	    # been created and partially populated would leave the Hub inconsistent.
	    if not jsonl.is_file():
	        sys.exit(f"missing: {jsonl}")
	    for folder in (scripts_dir, docs_dir):
	        if not folder.is_dir():
	            sys.exit(f"missing: {folder}")

	    api = HfApi(token=token)

	    # ---- DATASET REPO ----
	    print(f"[1/2] creating dataset repo {DATASET_REPO} (if missing)...")
	    create_repo(
	        repo_id=DATASET_REPO,
	        repo_type="dataset",
	        exist_ok=True,  # re-running the script must not fail on an existing repo
	        token=token,
	    )

	    print(" uploading README.md (dataset card)...")
	    api.upload_file(
	        path_or_fileobj=DATASET_CARD.encode("utf-8"),
	        path_in_repo="README.md",
	        repo_id=DATASET_REPO,
	        repo_type="dataset",
	    )

	    print(f" uploading {jsonl.name} ({jsonl.stat().st_size/1e9:.2f} GB) ...")
	    api.upload_file(
	        path_or_fileobj=str(jsonl),
	        path_in_repo="microagent_train_v2.jsonl",
	        repo_id=DATASET_REPO,
	        repo_type="dataset",
	    )
	    print(f" done -> https://huggingface.co/datasets/{DATASET_REPO}")

	    # ---- MODEL/CODE REPO ----
	    print(f"\n[2/2] creating model repo {MODEL_REPO} (if missing)...")
	    create_repo(
	        repo_id=MODEL_REPO,
	        repo_type="model",
	        exist_ok=True,
	        token=token,
	    )

	    print(" uploading README.md (model card)...")
	    api.upload_file(
	        path_or_fileobj=MODEL_CARD.encode("utf-8"),
	        path_in_repo="README.md",
	        repo_id=MODEL_REPO,
	        repo_type="model",
	    )

	    # Upload scripts/ folder, skipping Python bytecode caches.
	    print(" uploading scripts/ ...")
	    api.upload_folder(
	        folder_path=str(scripts_dir),
	        path_in_repo="scripts",
	        repo_id=MODEL_REPO,
	        repo_type="model",
	        ignore_patterns=["__pycache__", "*.pyc"],
	    )

	    # Upload docs/ folder
	    print(" uploading docs/ ...")
	    api.upload_folder(
	        folder_path=str(docs_dir),
	        path_in_repo="docs",
	        repo_id=MODEL_REPO,
	        repo_type="model",
	    )

	    # Upload pipeline run log for reproducibility audit (skipped if absent).
	    if log.is_file():
	        print(" uploading pipeline run log...")
	        api.upload_file(
	            path_or_fileobj=str(log),
	            path_in_repo="data/pipeline_v2_log.txt",
	            repo_id=MODEL_REPO,
	            repo_type="model",
	        )

	    print(f" done -> https://huggingface.co/{MODEL_REPO}")


	# Script entry point: perform the upload only when this file is executed
	# directly, never as a side effect of importing it.
	if __name__ == "__main__":
	    main()