Fix push_to_hub to pass token explicitly
train_humaneval_clean.py (+299 -299)
@@ -1,299 +1,299 @@
 # /// script
 # dependencies = [
 #     "trl>=0.15.0",
 #     "peft>=0.14.0",
 #     "transformers>=4.51.0",
 #     "accelerate>=0.30.0",
 #     "datasets",
 #     "torch",
 #     "huggingface_hub",
 #     "human_eval",
 # ]
 # ///
 """
 Fine-tune Qwen3-0.6B on codeforces-cots (Python subset) to beat base on HumanEval.
 Reproduction of Ben Burtenshaw's Claude Code vs Codex challenge.
 """
 
 import os
 import sys
 import time
 import tempfile
 import json
 
 # === PHASE 0: Authentication ===
 print("=" * 60)
 print("PHASE 0: Authentication")
 print("=" * 60)
 
 from huggingface_hub import HfApi
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 if not HF_TOKEN:
     raise ValueError("HF_TOKEN environment variable required")
 
 # Removed login() - using HfApi(token=) instead
 api = HfApi(token=HF_TOKEN)
 user_info = api.whoami()
 print(f"Authenticated as: {user_info['name']}")
 
 MODEL_NAME = "Qwen/Qwen3-0.6B"
 DATASET_NAME = "open-r1/codeforces-cots"
 DATASET_SUBSET = "solutions_py"
 OUTPUT_REPO = f"{user_info['name']}/qwen3-humaneval-sft"
 NUM_EXAMPLES = 500
 MAX_STEPS = 150
 
 print(f"Model: {MODEL_NAME}")
 print(f"Dataset: {DATASET_NAME} ({DATASET_SUBSET} subset)")
 print(f"Output: {OUTPUT_REPO}")
 
 
 # === PHASE 1: Load Base Model and Run Benchmark ===
 print("\n" + "=" * 60)
 print("PHASE 1: Benchmark Base Model on HumanEval")
 print("=" * 60)
 
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 print("Loading base model...")
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
 base_model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True,
 )
 print(f"Model loaded on {base_model.device}")
 
 
 def run_humaneval_benchmark(model, tokenizer, label="model"):
     """Run HumanEval benchmark on model."""
     from human_eval.data import read_problems
     from human_eval.evaluation import evaluate_functional_correctness as check_correctness
 
     problems = read_problems()
     print(f"Testing {label} on {len(problems)} HumanEval problems...")
 
     samples = []
     model.eval()
 
     for i, (task_id, problem) in enumerate(problems.items()):
         prompt = problem["prompt"]
 
         messages = [{"role": "user", "content": f"Complete this Python function:\n\n{prompt}"}]
         text = tokenizer.apply_chat_template(
             messages,
             tokenize=False,
             add_generation_prompt=True,
             enable_thinking=False,
         )
 
         inputs = tokenizer(text, return_tensors="pt").to(model.device)
 
         with torch.no_grad():
             outputs = model.generate(
                 **inputs,
                 max_new_tokens=512,
                 do_sample=False,
                 pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
             )
 
         response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
 
         if "```python" in response:
             code = response.split("```python")[1].split("```")[0].strip()
         elif "```" in response:
             code = response.split("```")[1].split("```")[0].strip()
         else:
             code = response.strip()
 
         completion = prompt + code
         samples.append({"task_id": task_id, "completion": completion})
 
         if (i + 1) % 20 == 0:
             print(f" Progress: {i + 1}/{len(problems)}")
 
     with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
         for s in samples:
             f.write(json.dumps(s) + "\n")
         samples_file = f.name
 
     results = check_correctness(samples_file, k=[1], timeout=10.0)
     os.unlink(samples_file)
 
     score = results["pass@1"] * 100
     passed = int(score * len(problems) / 100)
     print(f"{label} score: {score:.2f}% ({passed}/{len(problems)} passed)")
     return score, passed, len(problems)
 
 
 base_score, base_passed, total = run_humaneval_benchmark(base_model, tokenizer, "BASE")
 
 del base_model
 torch.cuda.empty_cache()
 print(f"\nBase model score: {base_score:.2f}%")
 
 
 # === PHASE 2: Train on codeforces-cots (Python subset) ===
 print("\n" + "=" * 60)
 print("PHASE 2: Fine-tune on codeforces-cots (solutions_py)")
 print("=" * 60)
 
 from datasets import load_dataset, Dataset
 from peft import LoraConfig
 from trl import SFTTrainer, SFTConfig
 
 print("Reloading model for training...")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     torch_dtype=torch.float16,
     device_map="auto",
     trust_remote_code=True,
 )
 
 print(f"Loading {DATASET_NAME} ({DATASET_SUBSET} subset)...")
 ds = load_dataset(DATASET_NAME, DATASET_SUBSET, split="train", streaming=True)
 
 examples = []
 print(f"Preparing {NUM_EXAMPLES} training examples...")
 for i, ex in enumerate(ds):
     if i >= NUM_EXAMPLES:
         break
     text = tokenizer.apply_chat_template(ex["messages"], tokenize=False)
     examples.append({"text": text})
     if (i + 1) % 100 == 0:
         print(f" Prepared {i + 1}/{NUM_EXAMPLES} examples")
 
 train_dataset = Dataset.from_list(examples)
 print(f"Training dataset ready: {len(train_dataset)} examples")
 
 lora_config = LoraConfig(
     r=8,
     lora_alpha=16,
     lora_dropout=0.05,
     target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
     bias="none",
     task_type="CAUSAL_LM",
 )
 
 sft_config = SFTConfig(
     output_dir="./sft_output",
     max_steps=MAX_STEPS,
     learning_rate=5e-6,
     per_device_train_batch_size=2,
     gradient_accumulation_steps=4,
     fp16=True,
     gradient_checkpointing=True,
     logging_steps=10,
     save_steps=50,
     max_length=2048,
     dataset_text_field="text",
 )
 
 trainer = SFTTrainer(
     model=model,
     args=sft_config,
     train_dataset=train_dataset,
     peft_config=lora_config,
     processing_class=tokenizer,
 )
 
 print(f"Starting training for {MAX_STEPS} steps...")
 start_time = time.time()
 trainer.train()
 train_time = time.time() - start_time
 print(f"Training completed in {train_time/60:.1f} minutes")
 
 print("Merging LoRA weights...")
 model = trainer.model.merge_and_unload()
 
 
 # === PHASE 3: Benchmark Fine-tuned Model ===
 print("\n" + "=" * 60)
 print("PHASE 3: Benchmark Fine-tuned Model")
 print("=" * 60)
 
 ft_score, ft_passed, _ = run_humaneval_benchmark(model, tokenizer, "FINE-TUNED")
 
 
 # === PHASE 4: Compare and Upload ===
 print("\n" + "=" * 60)
 print("PHASE 4: Results and Upload")
 print("=" * 60)
 
 improvement = ft_score - base_score
 improved_problems = ft_passed - base_passed
 
 print(f"\n{'='*40}")
 print("RESULTS SUMMARY")
 print(f"{'='*40}")
 print(f"Base model: {base_score:.2f}% ({base_passed}/{total})")
 print(f"Fine-tuned model: {ft_score:.2f}% ({ft_passed}/{total})")
 print(f"Improvement: {improvement:+.2f}% ({improved_problems:+d} problems)")
 print(f"{'='*40}")
 
 if ft_score > base_score:
     print("\n*** SUCCESS: Fine-tuned beats base! ***")
     print(f"Uploading to {OUTPUT_REPO}...")
 
     model_card = f"""---
 tags:
 - fine-tuned
 - qwen3
 - humaneval
 - codeforces
 - lora
 base_model: {MODEL_NAME}
 datasets:
 - {DATASET_NAME}
 ---
 
 # Qwen3-0.6B Fine-tuned on Codeforces-CoTS (Python)
 
 Fine-tuned using SFT on the **solutions_py** subset of `{DATASET_NAME}`.
 
 ## Results on HumanEval
 
 | Model | Score | Problems Passed |
 |-------|-------|-----------------|
 | Base (Qwen3-0.6B) | {base_score:.2f}% | {base_passed}/{total} |
 | **Fine-tuned** | **{ft_score:.2f}%** | **{ft_passed}/{total}** |
 | **Improvement** | **{improvement:+.2f}%** | **{improved_problems:+d} problems** |
 
 ## Training Details
 
 - **Dataset**: {DATASET_NAME} ({DATASET_SUBSET} subset) - {NUM_EXAMPLES} examples
 - **Method**: LoRA (r=8, alpha=16)
 - **Steps**: {MAX_STEPS}
 - **Learning Rate**: 5e-6
 
 ## Usage
 
 ```python
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 model = AutoModelForCausalLM.from_pretrained("{OUTPUT_REPO}")
 tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_REPO}")
 ```
 """
 
-    model.push_to_hub(OUTPUT_REPO, commit_message="Fine-tuned model beating base on HumanEval")
-    tokenizer.push_to_hub(OUTPUT_REPO, commit_message="Add tokenizer")
+    model.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Fine-tuned model beating base on HumanEval")
+    tokenizer.push_to_hub(OUTPUT_REPO, token=HF_TOKEN, commit_message="Add tokenizer")
 
     api.upload_file(
         path_or_fileobj=model_card.encode(),
         path_in_repo="README.md",
         repo_id=OUTPUT_REPO,
         commit_message="Add model card with results",
     )
 
     print(f"\n*** Model uploaded to: https://huggingface.co/{OUTPUT_REPO} ***")
 else:
     print(f"\nFine-tuned ({ft_score:.2f}%) did not beat base ({base_score:.2f}%)")
     print("Consider running another job with different random state.")
 
 print(f"\n{'='*60}")
 print("JOB COMPLETE")
 print(f"{'='*60}")