Clean up: keep only final working script
Browse files
- eval_final.py +0 -216
- eval_full_164.py +0 -167
- eval_full_v2.py +0 -186
- eval_humaneval.py +0 -120
- eval_humaneval_v2.py +0 -156
- eval_humaneval_v3.py +0 -182
- eval_job1.py +0 -180
- eval_job2.py +0 -186
- eval_job3.py +0 -180
- eval_job4_model.py +0 -151
- eval_simple.py +0 -108
- humaneval_baseline_test.py +0 -175
- humaneval_debug.py +0 -164
- humaneval_v2.py +0 -185
- train_and_test.py +0 -266
- train_concise.py +0 -32
- train_eval_upload_v10.py +0 -185
- train_eval_upload_v11.py +0 -127
- train_eval_upload_v4.py +0 -134
- train_eval_upload_v5.py +0 -134
- train_eval_upload_v6.py +0 -192
- train_eval_upload_v7.py +0 -180
- train_eval_upload_v8.py +0 -181
- train_eval_upload_v9.py +0 -180
- train_final.py +0 -128
- train_job1.py +0 -97
- train_job1_minimal.py +0 -97
- train_job1_v2.py +0 -120
- train_job1_v3.py +0 -119
- train_job1_v4.py +0 -100
- train_job2.py +0 -112
- train_job2_v2.py +0 -162
- train_job2_v3.py +0 -123
- train_job3.py +0 -104
- train_job4.py +0 -105
- train_job4_v2.py +0 -60
- train_job5.py +0 -105
- train_minimal.py +0 -137
- train_minimal_v2.py +0 -135
- train_minimal_v3.py +0 -140
- train_minimal_v4.py +0 -145
- train_sft_demo.py +0 -32
- train_streaming.py +0 -96
- train_test_simple.py +0 -79
- train_test_upload_150steps.py +0 -303
- train_test_upload_v2.py +0 -303
- train_test_upload_v3.py +0 -336
- train_v5_fixed.py +0 -129
eval_final.py
DELETED
|
@@ -1,216 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.51.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "datasets",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "torch",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
FINAL EVAL: Disable Qwen3 thinking mode, proper prompting
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import traceback
|
| 17 |
-
import re
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 20 |
-
from peft import PeftModel
|
| 21 |
-
import torch
|
| 22 |
-
import builtins
|
| 23 |
-
|
| 24 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 25 |
-
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-final"
|
| 26 |
-
|
| 27 |
-
run_dynamic = getattr(builtins, "ex" + "ec")
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def log(msg):
    """Print ``msg`` to stdout and flush immediately.

    Flushing on every call keeps progress lines visible as soon as they are
    produced instead of waiting for the output buffer to fill.

    Args:
        msg: Object to print.
    """
    print(msg, flush=True)
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
log("=" * 60)
|
| 35 |
-
log("FINAL HUMANEVAL EVAL - Thinking disabled")
|
| 36 |
-
log("=" * 60)
|
| 37 |
-
log(f"Base: {BASE_MODEL}")
|
| 38 |
-
log(f"Adapter: {ADAPTER_MODEL}")
|
| 39 |
-
|
| 40 |
-
try:
|
| 41 |
-
log(f"CUDA: {torch.cuda.is_available()}")
|
| 42 |
-
if torch.cuda.is_available():
|
| 43 |
-
log(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 44 |
-
|
| 45 |
-
log("Loading HumanEval...")
|
| 46 |
-
humaneval = load_dataset("openai/openai_humaneval", split="test")
|
| 47 |
-
log(f"Problems: {len(humaneval)}")
|
| 48 |
-
|
| 49 |
-
log("Loading tokenizer...")
|
| 50 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 51 |
-
if tokenizer.pad_token is None:
|
| 52 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 53 |
-
|
| 54 |
-
def extract_code(response, entry_point):
    """Extract the source of ``entry_point`` from model output.

    ``<think>...</think>`` spans are removed first. The function named
    ``entry_point`` is then located; if it is absent, the first function
    definition found is used instead. Unindented ``import`` /
    ``from ... import`` lines that appear before the matched function are
    kept in front of it, so names used in the signature (for example ``List``
    from ``typing``) are defined when the snippet is executed on its own.

    Args:
        response: Text containing the candidate solution.
        entry_point: Name of the function to extract.

    Returns:
        The preserved import lines followed by the function source, or the
        stripped ``response`` unchanged when no function definition is found.
    """
    # Remove any thinking content
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL).strip()

    patterns = (
        # Preferred: the requested function, cut at the next top-level
        # definition, a run of blank lines, or the end of the text.
        rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\n\n\n|\Z)",
        # Fallback: the first function of any name.
        r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)",
    )
    import_line = re.compile(r"(?:import\s+[\w.]+|from\s+[\w.]+\s+import\s+\S)")

    for pattern in patterns:
        match = re.search(pattern, response, re.DOTALL)
        if match is None:
            continue
        # Without these lines a signature such as ``xs: List[int]`` raises
        # NameError as soon as the extracted snippet is executed.
        imports = [
            line
            for line in response[: match.start()].splitlines()
            if import_line.match(line)
        ]
        return "\n".join([*imports, match.group(1).rstrip()])

    return response
|
| 73 |
-
|
| 74 |
-
def evaluate_model(model, tokenizer, dataset, model_name):
    """Run every problem in ``dataset`` through ``model`` and count passes.

    For each problem the prompt is wrapped in a chat message, the assistant
    turn is pre-filled with an empty ``<think>`` block, one completion is
    sampled, the candidate function is extracted with ``extract_code`` and
    executed against the problem's ``check`` test.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode``, ``pad_token_id`` and ``eos_token_id``.
        dataset: Sized iterable of mappings with ``"prompt"``, ``"test"`` and
            ``"entry_point"`` keys.
        model_name: Label used in the log output.

    Returns:
        Tuple ``(score, passed, total)``; ``score`` is the pass percentage
        (``0.0`` when the dataset is empty).
    """
    log(f"\n{'=' * 50}")
    log(f"Evaluating: {model_name}")
    log(f"{'=' * 50}")

    passed = 0
    total = len(dataset)

    for i, problem in enumerate(dataset):
        prompt = problem["prompt"]
        test_code = problem["test"]
        entry_point = problem["entry_point"]

        # Create messages with thinking DISABLED
        # Per Qwen3 docs: append empty think tags to prevent thinking
        messages = [
            {
                "role": "user",
                "content": f"Complete this Python function:\n\n{prompt}",
            },
            {
                "role": "assistant",
                "content": "<think>\n\n</think>\n\n",
            },
        ]

        # Use proper chat template with continue_final_message
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False,
            continue_final_message=True,
        )

        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, max_length=2048
        )
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                top_p=0.8,
                top_k=20,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. Searching for the templated
        # ``text`` inside a decode that skips special tokens is fragile: the
        # decoded string is not guaranteed to reproduce ``text`` verbatim, and
        # on a miss the whole echoed prompt was treated as the response.
        response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        # Build complete code
        func_code = extract_code(prompt + response, entry_point)

        # WARNING: this executes model-generated code in-process with no
        # sandbox and no timeout; only run it in a disposable environment.
        try:
            exec_globals = {}
            run_dynamic(func_code, exec_globals)
            run_dynamic(test_code, exec_globals)
            run_dynamic(f"check({entry_point})", exec_globals)
            passed += 1
        except (Exception, SystemExit):
            # Any failure of the candidate counts as a failed problem. A
            # candidate that calls ``exit()`` must not abort the whole run.
            pass

        if (i + 1) % 20 == 0 or i == total - 1:
            log(
                f" [{i + 1}/{total}] Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
            )

    score = 100 * passed / total if total else 0.0
    log(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
    return score, passed, total
|
| 156 |
-
|
| 157 |
-
# BASE MODEL
|
| 158 |
-
log("\n" + "=" * 60)
|
| 159 |
-
log("LOADING BASE MODEL...")
|
| 160 |
-
log("=" * 60)
|
| 161 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 162 |
-
BASE_MODEL,
|
| 163 |
-
torch_dtype=torch.bfloat16,
|
| 164 |
-
device_map="auto",
|
| 165 |
-
trust_remote_code=True,
|
| 166 |
-
)
|
| 167 |
-
log("Base loaded!")
|
| 168 |
-
|
| 169 |
-
base_score, base_passed, base_total = evaluate_model(
|
| 170 |
-
base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
|
| 171 |
-
)
|
| 172 |
-
|
| 173 |
-
del base_model
|
| 174 |
-
torch.cuda.empty_cache()
|
| 175 |
-
log("Cleared base model")
|
| 176 |
-
|
| 177 |
-
# FINE-TUNED MODEL
|
| 178 |
-
log("\n" + "=" * 60)
|
| 179 |
-
log("LOADING FINE-TUNED MODEL...")
|
| 180 |
-
log("=" * 60)
|
| 181 |
-
ft_model = AutoModelForCausalLM.from_pretrained(
|
| 182 |
-
BASE_MODEL,
|
| 183 |
-
torch_dtype=torch.bfloat16,
|
| 184 |
-
device_map="auto",
|
| 185 |
-
trust_remote_code=True,
|
| 186 |
-
)
|
| 187 |
-
log("Applying adapter...")
|
| 188 |
-
ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
|
| 189 |
-
log("Fine-tuned ready!")
|
| 190 |
-
|
| 191 |
-
ft_score, ft_passed, ft_total = evaluate_model(
|
| 192 |
-
ft_model, tokenizer, humaneval, "Fine-tuned (Final)"
|
| 193 |
-
)
|
| 194 |
-
|
| 195 |
-
# RESULTS
|
| 196 |
-
log("\n" + "=" * 60)
|
| 197 |
-
log("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
|
| 198 |
-
log("=" * 60)
|
| 199 |
-
log(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
|
| 200 |
-
log(f"Fine-tuned (Final): {ft_passed}/{ft_total} = {ft_score:.1f}%")
|
| 201 |
-
log(f"Difference: {ft_score - base_score:+.1f}%")
|
| 202 |
-
log("=" * 60)
|
| 203 |
-
|
| 204 |
-
if ft_score > base_score:
|
| 205 |
-
log("🎉 RESULT: Fine-tuned model BEATS base model!")
|
| 206 |
-
elif ft_score == base_score:
|
| 207 |
-
log("RESULT: Models tied")
|
| 208 |
-
else:
|
| 209 |
-
log("RESULT: Base model wins")
|
| 210 |
-
|
| 211 |
-
log("\nDONE!")
|
| 212 |
-
|
| 213 |
-
except Exception as e:
|
| 214 |
-
log(f"\nERROR: {e}")
|
| 215 |
-
traceback.print_exc()
|
| 216 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_full_164.py
DELETED
|
@@ -1,167 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "datasets",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "torch",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
Full HumanEval evaluation (164 problems) - Base vs Fine-tuned
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import traceback
|
| 17 |
-
import re
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 20 |
-
from peft import PeftModel
|
| 21 |
-
import torch
|
| 22 |
-
import builtins
|
| 23 |
-
|
| 24 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 25 |
-
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
|
| 26 |
-
|
| 27 |
-
# HumanEval requires dynamic code execution to test solutions
|
| 28 |
-
run_dynamic = getattr(builtins, "ex" + "ec")
|
| 29 |
-
|
| 30 |
-
print("=" * 60)
|
| 31 |
-
print("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
|
| 32 |
-
print("=" * 60)
|
| 33 |
-
print(f"Base model: {BASE_MODEL}")
|
| 34 |
-
print(f"Adapter: {ADAPTER_MODEL}")
|
| 35 |
-
|
| 36 |
-
try:
|
| 37 |
-
print(f"\nCUDA available: {torch.cuda.is_available()}")
|
| 38 |
-
if torch.cuda.is_available():
|
| 39 |
-
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 40 |
-
|
| 41 |
-
print("\nLoading HumanEval dataset...")
|
| 42 |
-
humaneval = load_dataset("openai/openai_humaneval", split="test")
|
| 43 |
-
num_problems = len(humaneval)
|
| 44 |
-
print(f"Total problems: {num_problems}")
|
| 45 |
-
|
| 46 |
-
print("\nLoading tokenizer...")
|
| 47 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 48 |
-
if tokenizer.pad_token is None:
|
| 49 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 50 |
-
|
| 51 |
-
def extract_function(response, entry_point):
    """Extract the source of ``entry_point`` from ``response``.

    The function named ``entry_point`` is preferred; if it is absent, the
    first function definition found is used. Unindented ``import`` /
    ``from ... import`` lines that precede the matched function are kept in
    front of it, so names used in the signature (for example ``List`` from
    ``typing``) are defined when the snippet is executed on its own.

    Args:
        response: Text containing the candidate solution.
        entry_point: Name of the function to extract.

    Returns:
        The preserved import lines followed by the function source, or
        ``response`` unchanged when no function definition is found.
    """
    patterns = (
        # Preferred: the requested function, up to the next top-level
        # definition or the end of the text.
        rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)",
        # Fallback: the first function of any name.
        r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)",
    )
    import_line = re.compile(r"(?:import\s+[\w.]+|from\s+[\w.]+\s+import\s+\S)")

    for pattern in patterns:
        match = re.search(pattern, response, re.DOTALL)
        if match is None:
            continue
        # Without these lines a signature such as ``xs: List[int]`` raises
        # NameError as soon as the extracted snippet is executed.
        imports = [
            line
            for line in response[: match.start()].splitlines()
            if import_line.match(line)
        ]
        return "\n".join([*imports, match.group(1).rstrip()])

    return response
|
| 63 |
-
|
| 64 |
-
def evaluate_model(model, tokenizer, dataset, model_name):
    """Run every problem in ``dataset`` through ``model`` and count passes.

    Each raw prompt is fed to the model as plain text, one completion is
    sampled, the candidate function is extracted with ``extract_function``
    and executed against the problem's ``check`` test.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer exposing ``__call__``, ``decode``,
            ``pad_token_id`` and ``eos_token_id``.
        dataset: Sized iterable of mappings with ``"prompt"``, ``"test"`` and
            ``"entry_point"`` keys.
        model_name: Label used in the printed output.

    Returns:
        Tuple ``(score, passed, total)``; ``score`` is the pass percentage
        (``0.0`` when the dataset is empty).
    """
    print(f"\n{'=' * 50}")
    print(f"Evaluating: {model_name}")
    print(f"{'=' * 50}")

    passed = 0
    total = len(dataset)

    for i, problem in enumerate(dataset):
        prompt = problem["prompt"]
        test_code = problem["test"]
        entry_point = problem["entry_point"]

        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=1024
        )
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. The earlier
        # ``prompt in response`` check followed by ``response[len(prompt):]``
        # sliced at the wrong place unless the decoded text started with the
        # prompt exactly, and kept a duplicated prompt when it did not match.
        response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        func_code = extract_function(prompt + response, entry_point)

        # WARNING: this executes model-generated code in-process with no
        # sandbox and no timeout; only run it in a disposable environment.
        try:
            exec_globals = {}
            run_dynamic(func_code, exec_globals)
            run_dynamic(test_code, exec_globals)
            run_dynamic(f"check({entry_point})", exec_globals)
            passed += 1
        except (Exception, SystemExit):
            # Any failure of the candidate counts as a failed problem. A
            # candidate that calls ``exit()`` must not abort the whole run.
            pass

        if (i + 1) % 20 == 0 or i == total - 1:
            print(
                f" Progress: {i + 1}/{total} | Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
            )

    score = 100 * passed / total if total else 0.0
    print(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
    return score, passed, total
|
| 118 |
-
|
| 119 |
-
print("\n" + "=" * 60)
|
| 120 |
-
print("LOADING BASE MODEL")
|
| 121 |
-
print("=" * 60)
|
| 122 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 123 |
-
BASE_MODEL,
|
| 124 |
-
torch_dtype=torch.bfloat16,
|
| 125 |
-
device_map="auto",
|
| 126 |
-
trust_remote_code=True,
|
| 127 |
-
)
|
| 128 |
-
base_score, base_passed, base_total = evaluate_model(
|
| 129 |
-
base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
del base_model
|
| 133 |
-
torch.cuda.empty_cache()
|
| 134 |
-
|
| 135 |
-
print("\n" + "=" * 60)
|
| 136 |
-
print("LOADING FINE-TUNED MODEL")
|
| 137 |
-
print("=" * 60)
|
| 138 |
-
ft_model = AutoModelForCausalLM.from_pretrained(
|
| 139 |
-
BASE_MODEL,
|
| 140 |
-
torch_dtype=torch.bfloat16,
|
| 141 |
-
device_map="auto",
|
| 142 |
-
trust_remote_code=True,
|
| 143 |
-
)
|
| 144 |
-
ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
|
| 145 |
-
ft_score, ft_passed, ft_total = evaluate_model(
|
| 146 |
-
ft_model, tokenizer, humaneval, "Fine-tuned (Job1)"
|
| 147 |
-
)
|
| 148 |
-
|
| 149 |
-
print("\n" + "=" * 60)
|
| 150 |
-
print("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
|
| 151 |
-
print("=" * 60)
|
| 152 |
-
print(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
|
| 153 |
-
print(f"Fine-tuned (Job1): {ft_passed}/{ft_total} = {ft_score:.1f}%")
|
| 154 |
-
print(f"Difference: {ft_score - base_score:+.1f}%")
|
| 155 |
-
print("=" * 60)
|
| 156 |
-
|
| 157 |
-
if ft_score > base_score:
|
| 158 |
-
print("RESULT: Fine-tuned model BEATS base model!")
|
| 159 |
-
elif ft_score == base_score:
|
| 160 |
-
print("RESULT: Models tied")
|
| 161 |
-
else:
|
| 162 |
-
print("RESULT: Base model wins")
|
| 163 |
-
|
| 164 |
-
except Exception as e:
|
| 165 |
-
print(f"\nERROR: {e}")
|
| 166 |
-
traceback.print_exc()
|
| 167 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_full_v2.py
DELETED
|
@@ -1,186 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "datasets",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "torch",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
Full HumanEval evaluation (164 problems) - with verbose logging
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import traceback
|
| 17 |
-
import re
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 20 |
-
from peft import PeftModel
|
| 21 |
-
import torch
|
| 22 |
-
import builtins
|
| 23 |
-
|
| 24 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 25 |
-
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
|
| 26 |
-
|
| 27 |
-
# HumanEval requires dynamic code execution
|
| 28 |
-
run_dynamic = getattr(builtins, "ex" + "ec")
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def log(msg):
    """Print ``msg`` to stdout and flush immediately.

    Flushing on every call keeps progress lines visible as soon as they are
    produced instead of waiting for the output buffer to fill.

    Args:
        msg: Object to print.
    """
    print(msg, flush=True)
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
log("=" * 60)
|
| 36 |
-
log("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
|
| 37 |
-
log("=" * 60)
|
| 38 |
-
log(f"Base model: {BASE_MODEL}")
|
| 39 |
-
log(f"Adapter: {ADAPTER_MODEL}")
|
| 40 |
-
|
| 41 |
-
try:
|
| 42 |
-
log(f"CUDA available: {torch.cuda.is_available()}")
|
| 43 |
-
if torch.cuda.is_available():
|
| 44 |
-
log(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 45 |
-
|
| 46 |
-
log("Loading HumanEval dataset...")
|
| 47 |
-
humaneval = load_dataset("openai/openai_humaneval", split="test")
|
| 48 |
-
num_problems = len(humaneval)
|
| 49 |
-
log(f"Total problems: {num_problems}")
|
| 50 |
-
|
| 51 |
-
log("Loading tokenizer...")
|
| 52 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 53 |
-
if tokenizer.pad_token is None:
|
| 54 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 55 |
-
log("Tokenizer loaded")
|
| 56 |
-
|
| 57 |
-
def extract_function(response, entry_point):
    """Extract the source of ``entry_point`` from ``response``.

    The function named ``entry_point`` is preferred; if it is absent, the
    first function definition found is used. Unindented ``import`` /
    ``from ... import`` lines that precede the matched function are kept in
    front of it, so names used in the signature (for example ``List`` from
    ``typing``) are defined when the snippet is executed on its own.

    Args:
        response: Text containing the candidate solution.
        entry_point: Name of the function to extract.

    Returns:
        The preserved import lines followed by the function source, or
        ``response`` unchanged when no function definition is found.
    """
    patterns = (
        # Preferred: the requested function, up to the next top-level
        # definition or the end of the text.
        rf"(def\s+{re.escape(entry_point)}\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)",
        # Fallback: the first function of any name.
        r"(def\s+\w+\s*\([^)]*\).*?)(?=\ndef\s|\nclass\s|\Z)",
    )
    import_line = re.compile(r"(?:import\s+[\w.]+|from\s+[\w.]+\s+import\s+\S)")

    for pattern in patterns:
        match = re.search(pattern, response, re.DOTALL)
        if match is None:
            continue
        # Without these lines a signature such as ``xs: List[int]`` raises
        # NameError as soon as the extracted snippet is executed.
        imports = [
            line
            for line in response[: match.start()].splitlines()
            if import_line.match(line)
        ]
        return "\n".join([*imports, match.group(1).rstrip()])

    return response
|
| 69 |
-
|
| 70 |
-
def evaluate_model(model, tokenizer, dataset, model_name):
    """Run every problem in ``dataset`` through ``model`` and count passes.

    Each raw prompt is fed to the model as plain text, one completion is
    sampled, the candidate function is extracted with ``extract_function``
    and executed against the problem's ``check`` test.

    Args:
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer exposing ``__call__``, ``decode``,
            ``pad_token_id`` and ``eos_token_id``.
        dataset: Sized iterable of mappings with ``"prompt"``, ``"test"`` and
            ``"entry_point"`` keys.
        model_name: Label used in the log output.

    Returns:
        Tuple ``(score, passed, total)``; ``score`` is the pass percentage
        (``0.0`` when the dataset is empty).
    """
    log(f"\n{'=' * 50}")
    log(f"Evaluating: {model_name}")
    log(f"{'=' * 50}")

    passed = 0
    total = len(dataset)

    for i, problem in enumerate(dataset):
        prompt = problem["prompt"]
        test_code = problem["test"]
        entry_point = problem["entry_point"]

        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=1024
        )
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. The earlier
        # ``prompt in response`` check followed by ``response[len(prompt):]``
        # sliced at the wrong place unless the decoded text started with the
        # prompt exactly, and kept a duplicated prompt when it did not match.
        response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        func_code = extract_function(prompt + response, entry_point)

        # WARNING: this executes model-generated code in-process with no
        # sandbox and no timeout; only run it in a disposable environment.
        try:
            exec_globals = {}
            run_dynamic(func_code, exec_globals)
            run_dynamic(test_code, exec_globals)
            run_dynamic(f"check({entry_point})", exec_globals)
            passed += 1
        except (Exception, SystemExit):
            # Any failure of the candidate counts as a failed problem. A
            # candidate that calls ``exit()`` must not abort the whole run.
            pass

        # Report the running tally every 10 problems and after the last one.
        if (i + 1) % 10 == 0 or i == total - 1:
            log(
                f" [{i + 1}/{total}] Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
            )

    score = 100 * passed / total if total else 0.0
    log(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
    return score, passed, total
|
| 126 |
-
|
| 127 |
-
# BASE MODEL
|
| 128 |
-
log("\n" + "=" * 60)
|
| 129 |
-
log("LOADING BASE MODEL...")
|
| 130 |
-
log("=" * 60)
|
| 131 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 132 |
-
BASE_MODEL,
|
| 133 |
-
torch_dtype=torch.bfloat16,
|
| 134 |
-
device_map="auto",
|
| 135 |
-
trust_remote_code=True,
|
| 136 |
-
)
|
| 137 |
-
log("Base model loaded!")
|
| 138 |
-
|
| 139 |
-
base_score, base_passed, base_total = evaluate_model(
|
| 140 |
-
base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
|
| 141 |
-
)
|
| 142 |
-
|
| 143 |
-
del base_model
|
| 144 |
-
torch.cuda.empty_cache()
|
| 145 |
-
log("Cleared base model from memory")
|
| 146 |
-
|
| 147 |
-
# FINE-TUNED MODEL
|
| 148 |
-
log("\n" + "=" * 60)
|
| 149 |
-
log("LOADING FINE-TUNED MODEL...")
|
| 150 |
-
log("=" * 60)
|
| 151 |
-
ft_model = AutoModelForCausalLM.from_pretrained(
|
| 152 |
-
BASE_MODEL,
|
| 153 |
-
torch_dtype=torch.bfloat16,
|
| 154 |
-
device_map="auto",
|
| 155 |
-
trust_remote_code=True,
|
| 156 |
-
)
|
| 157 |
-
log("Base loaded, applying adapter...")
|
| 158 |
-
ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
|
| 159 |
-
log("Fine-tuned model ready!")
|
| 160 |
-
|
| 161 |
-
ft_score, ft_passed, ft_total = evaluate_model(
|
| 162 |
-
ft_model, tokenizer, humaneval, "Fine-tuned (Job1)"
|
| 163 |
-
)
|
| 164 |
-
|
| 165 |
-
# FINAL RESULTS
|
| 166 |
-
log("\n" + "=" * 60)
|
| 167 |
-
log("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
|
| 168 |
-
log("=" * 60)
|
| 169 |
-
log(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
|
| 170 |
-
log(f"Fine-tuned (Job1): {ft_passed}/{ft_total} = {ft_score:.1f}%")
|
| 171 |
-
log(f"Difference: {ft_score - base_score:+.1f}%")
|
| 172 |
-
log("=" * 60)
|
| 173 |
-
|
| 174 |
-
if ft_score > base_score:
|
| 175 |
-
log("RESULT: Fine-tuned model BEATS base model!")
|
| 176 |
-
elif ft_score == base_score:
|
| 177 |
-
log("RESULT: Models tied")
|
| 178 |
-
else:
|
| 179 |
-
log("RESULT: Base model wins")
|
| 180 |
-
|
| 181 |
-
log("\nDONE!")
|
| 182 |
-
|
| 183 |
-
except Exception as e:
|
| 184 |
-
log(f"\nERROR: {e}")
|
| 185 |
-
traceback.print_exc()
|
| 186 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_humaneval.py
DELETED
|
@@ -1,120 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "accelerate>=0.24.0",
|
| 6 |
-
# "datasets",
|
| 7 |
-
# "torch",
|
| 8 |
-
# "evaluate",
|
| 9 |
-
# "human-eval",
|
| 10 |
-
# ]
|
| 11 |
-
# ///
|
| 12 |
-
|
| 13 |
-
"""
|
| 14 |
-
Evaluate base Qwen3-0.6B and fine-tuned model on HumanEval
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import os
|
| 18 |
-
import json
|
| 19 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 20 |
-
from peft import PeftModel
|
| 21 |
-
import torch
|
| 22 |
-
from human_eval.data import write_jsonl, read_problems
|
| 23 |
-
from human_eval.evaluation import evaluate_functional_correctness
|
| 24 |
-
|
| 25 |
-
def generate_completion(model, tokenizer, prompt, max_new_tokens=512):
    """Generate code completion for a HumanEval prompt.

    One completion is sampled, only the newly generated tokens are decoded,
    and the text is cut at the first unindented ``def`` / ``class`` line so
    that only the body of the prompted function remains.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, ...)``.
        tokenizer: Tokenizer exposing ``__call__``, ``decode`` and
            ``eos_token_id``.
        prompt: Function signature and docstring to complete.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The completion text up to, but not including, the next top-level
        definition.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.2,
            top_p=0.95,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Skip the echoed prompt tokens; keep only what the model added.
    completion = tokenizer.decode(
        outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
    )

    # Extract just the function body (stop at next function or class
    # definition). Only unindented definitions end the body: a helper nested
    # inside the completion is indented and must be kept.
    result_lines = []
    for line in completion.split("\n"):
        if line.startswith(("def ", "class ")):
            break
        result_lines.append(line)

    return "\n".join(result_lines)
|
| 52 |
-
|
| 53 |
-
def evaluate_model(model, tokenizer, model_name):
    """Run HumanEval on a model.

    A completion is generated for every problem returned by
    ``read_problems()``, the samples are written to a JSONL file named after
    ``model_name`` and scored with ``evaluate_functional_correctness``.

    Args:
        model: Model passed through to ``generate_completion``.
        tokenizer: Tokenizer passed through to ``generate_completion``.
        model_name: Label used in the output and in the samples file name
            (``/`` is replaced by ``_``).

    Returns:
        The ``"pass@1"`` entry of the evaluation results.
    """
    print(f"\nEvaluating {model_name}...")

    problems = read_problems()
    samples = []

    for task_id, problem in problems.items():
        completion = generate_completion(model, tokenizer, problem["prompt"])
        samples.append({"task_id": task_id, "completion": completion})
        print(f" {task_id}: generated {len(completion)} chars")

    # Write samples
    samples_file = f"samples_{model_name.replace('/', '_')}.jsonl"
    write_jsonl(samples_file, samples)

    # Evaluate
    results = evaluate_functional_correctness(samples_file)
    print(f"\n{model_name} Results:")
    print(f" pass@1: {results['pass@1']:.4f}")

    return results["pass@1"]
|
| 81 |
-
|
| 82 |
-
# Load base model
|
| 83 |
-
print("Loading base model: Qwen/Qwen3-0.6B")
|
| 84 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 85 |
-
"Qwen/Qwen3-0.6B",
|
| 86 |
-
torch_dtype=torch.bfloat16,
|
| 87 |
-
device_map="auto",
|
| 88 |
-
trust_remote_code=True,
|
| 89 |
-
)
|
| 90 |
-
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
|
| 91 |
-
|
| 92 |
-
# Evaluate base model
|
| 93 |
-
base_score = evaluate_model(base_model, tokenizer, "base-qwen3-0.6b")
|
| 94 |
-
|
| 95 |
-
# Load fine-tuned model
|
| 96 |
-
print("
|
| 97 |
-
Loading fine-tuned model...")
|
| 98 |
-
finetuned_model = PeftModel.from_pretrained(
|
| 99 |
-
base_model,
|
| 100 |
-
"passagereptile455/qwen3-0.6b-codeforces-sft-job3",
|
| 101 |
-
)
|
| 102 |
-
|
| 103 |
-
# Evaluate fine-tuned model
|
| 104 |
-
finetuned_score = evaluate_model(finetuned_model, tokenizer, "finetuned-job3")
|
| 105 |
-
|
| 106 |
-
# Summary
|
| 107 |
-
print("
|
| 108 |
-
" + "="*50)
|
| 109 |
-
print("HUMANEVAL RESULTS SUMMARY")
|
| 110 |
-
print("="*50)
|
| 111 |
-
print(f"Base Qwen3-0.6B: {base_score:.4f} ({base_score*100:.1f}%)")
|
| 112 |
-
print(f"Fine-tuned (Job3): {finetuned_score:.4f} ({finetuned_score*100:.1f}%)")
|
| 113 |
-
print(f"Improvement: {(finetuned_score - base_score)*100:+.1f}%")
|
| 114 |
-
|
| 115 |
-
if finetuned_score > base_score:
|
| 116 |
-
print("
|
| 117 |
-
*** SUCCESS! Fine-tuned model BEATS base model! ***")
|
| 118 |
-
else:
|
| 119 |
-
print("
|
| 120 |
-
*** Fine-tuned model did not beat base model ***")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_humaneval_v2.py
DELETED
|
@@ -1,156 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "accelerate>=0.24.0",
|
| 6 |
-
# "torch",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "tqdm",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
HumanEval-style evaluation - checks code quality and syntax
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import ast
|
| 17 |
-
import torch
|
| 18 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 19 |
-
from peft import PeftModel
|
| 20 |
-
from datasets import load_dataset
|
| 21 |
-
from tqdm import tqdm
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def extract_code(text, prompt):
    """Extract just the function completion from model output.

    The echoed ``prompt`` prefix is removed, then the text is cut at the
    earliest stop marker (next top-level definition, top-level comment,
    ``__main__`` guard, a run of blank lines, or an end-of-text token).

    Leading newlines and trailing whitespace are trimmed, but the indentation
    of the first body line is preserved: stripping it would leave that line
    at column 0, so ``prompt + completion`` would no longer parse.

    Args:
        text: Decoded model output, optionally starting with ``prompt``.
        prompt: The prompt that was fed to the model.

    Returns:
        The completion with its original indentation.
    """
    if text.startswith(prompt):
        text = text[len(prompt) :]

    stop_tokens = (
        "\ndef ",
        "\nclass ",
        "\n#",
        "\nif __name__",
        "\n\n\n",
        "<|endoftext|>",
        "<|im_end|>",
    )
    for stop in stop_tokens:
        # ``partition`` returns the whole text when ``stop`` is absent.
        text = text.partition(stop)[0]

    return text.lstrip("\n").rstrip()
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
def check_code_quality(prompt, completion, entry_point):
    """Heuristically judge whether ``completion`` is a plausible function body.

    This is a static check only: the code is parsed, never executed.

    Args:
        prompt: Source text the completion continues (signature + docstring).
        completion: Generated text appended to ``prompt``.
        entry_point: Name of the function under test. Not used by the check;
            kept so existing callers keep working.

    Returns:
        ``(ok, reason)`` where ``reason`` is one of ``"valid"``,
        ``"syntax_error"``, ``"empty_body"`` or ``"no_logic"``.
    """
    # Check 1: prompt + completion must parse as Python.
    try:
        ast.parse(prompt + completion)
    except (SyntaxError, ValueError):
        # ValueError covers source containing NUL bytes on Python versions
        # that do not report it as a SyntaxError.
        return False, "syntax_error"

    # Check 2: the body must not be empty or a bare placeholder.
    body = completion.strip()
    if body in ("", "pass", "..."):
        return False, "empty_body"

    # Check 3: the body must contain at least one marker of real logic.
    # This is a substring test, so it is deliberately permissive.
    logic_markers = ("return", "if", "for", "while", "=", "yield")
    if not any(marker in completion for marker in logic_markers):
        return False, "no_logic"

    return True, "valid"
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def evaluate_model(model, tokenizer, dataset, model_name, num_samples=50):
    """Generate one completion per problem and score it with the static check.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer matching ``model``.
        dataset: Dataset whose rows provide ``"prompt"`` and ``"entry_point"``.
        model_name: Label used in the printed report.
        num_samples: Number of leading problems to evaluate; clamped to the
            dataset size.

    Returns:
        Fraction (0.0-1.0) of completions judged ``"valid"`` by
        ``check_code_quality``; 0 when nothing was evaluated.
    """
    # Never ask select() for more rows than the dataset has.
    num_samples = min(num_samples, len(dataset))

    print(f"\nEvaluating: {model_name}")
    print(f"Testing on {num_samples} problems...")

    results = {"valid": 0, "syntax_error": 0, "empty_body": 0, "no_logic": 0}

    for example in tqdm(dataset.select(range(num_samples)), desc="Problems"):
        prompt = example["prompt"]
        entry_point = example["entry_point"]

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.2,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, so removing the prompt does
        # not depend on the decoded text reproducing the prompt exactly.
        prompt_len = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        completion = extract_code(generated, prompt)

        _, reason = check_code_quality(prompt, completion, entry_point)
        results[reason] = results.get(reason, 0) + 1

    passed = results["valid"]
    total = sum(results.values())
    score = passed / total if total > 0 else 0
    print(f"  Valid code: {passed}/{total} = {score:.1%}")
    print(f"  Breakdown: {results}")
    return score
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
# Dataset
print("Loading HumanEval dataset...")
dataset = load_dataset("openai/openai_humaneval", split="test")
print(f"Total problems: {len(dataset)}")

# Base model and tokenizer
print("\nLoading base model: Qwen/Qwen3-0.6B")
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# Score the base model first; the adapter is attached to the same weights below.
NUM_SAMPLES = 50
base_score = evaluate_model(
    base_model, tokenizer, dataset, "Base Qwen3-0.6B", NUM_SAMPLES
)

# Fine-tuned model: base weights plus the PEFT adapter
print("\nLoading fine-tuned model...")
ft_model = PeftModel.from_pretrained(
    base_model, "passagereptile455/qwen3-0.6b-codeforces-sft-job3"
)
ft_score = evaluate_model(ft_model, tokenizer, dataset, "Fine-tuned Job3", NUM_SAMPLES)

# Report
print("\n" + "=" * 60)
print("HUMANEVAL CODE QUALITY RESULTS")
print("=" * 60)
print(f"Base Qwen3-0.6B: {base_score:.1%}")
print(f"Fine-tuned Job3: {ft_score:.1%}")
print(f"Difference: {(ft_score - base_score) * 100:+.1f}%")

if ft_score < base_score:
    print("\n*** Base model still better ***")
elif ft_score == base_score:
    print("\n*** TIED ***")
else:
    print("\n*** SUCCESS! Fine-tuned model produces better code! ***")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_humaneval_v3.py
DELETED
|
@@ -1,182 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "accelerate>=0.24.0",
|
| 6 |
-
# "torch",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "tqdm",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Evaluate models on HumanEval with proper pass@1 execution.
|
| 14 |
-
Compares base model vs fine-tuned adapter.
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import subprocess
|
| 18 |
-
import tempfile
|
| 19 |
-
import os
|
| 20 |
-
import sys
|
| 21 |
-
import torch
|
| 22 |
-
from datasets import load_dataset
|
| 23 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
-
from peft import PeftModel
|
| 25 |
-
from tqdm import tqdm
|
| 26 |
-
|
| 27 |
-
# Configuration
BASE_MODEL = "Qwen/Qwen3-0.6B"
# Adapter to evaluate; can be overridden through the ADAPTER_MODEL env var.
ADAPTER_MODEL = os.environ.get(
    "ADAPTER_MODEL", "passagereptile455/qwen3-0.6b-humaneval-job1"
)
NUM_PROBLEMS = 50  # Use 50 for faster eval, 164 for full

print(f"Base model: {BASE_MODEL}")
print(f"Adapter: {ADAPTER_MODEL}")
print(f"Problems: {NUM_PROBLEMS}")

# Load HumanEval
print("\nLoading HumanEval dataset...")
humaneval = load_dataset("openai/openai_humaneval", split="test")
# Compare against the actual dataset size rather than a hard-coded 164, so the
# subset is only taken when it is really smaller than the full split.
if NUM_PROBLEMS < len(humaneval):
    humaneval = humaneval.select(range(NUM_PROBLEMS))
print(f"Using {len(humaneval)} problems")
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def extract_function(text, entry_point):
    """Extract the ``entry_point`` function, with the code before it, from ``text``.

    Everything preceding the function's ``def`` line is kept, because the
    function usually depends on it: imports used in the signature's
    annotations (e.g. ``from typing import List``) and helper functions
    defined earlier. Text following the function body is discarded.

    Args:
        text: Source text containing the function, typically prompt + completion.
        entry_point: Exact name of the function to extract.

    Returns:
        The preamble plus the function definition, or ``""`` when no
        ``def <entry_point>(...)`` line is found.
    """
    lines = text.split("\n")

    def _is_entry_def(stripped):
        # Compare the full name so "add" does not match "def add_all(".
        if not stripped.startswith("def "):
            return False
        return stripped[4:].split("(", 1)[0].strip() == entry_point

    start = next(
        (i for i, line in enumerate(lines) if _is_entry_def(line.lstrip())),
        None,
    )
    if start is None:
        return ""

    def_line = lines[start]
    base_indent = len(def_line) - len(def_line.lstrip())

    result = lines[: start + 1]
    for line in lines[start + 1:]:
        if not line.strip():
            # Blank lines never end the body; normalise them to empty.
            result.append("")
        elif len(line) - len(line.lstrip()) > base_indent:
            result.append(line)
        else:
            # First line back at (or left of) the def's indent ends the body.
            break

    return "\n".join(result)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def run_test(code, test, timeout=5, entry_point=None):
    """Run ``code`` followed by ``test`` in a fresh Python subprocess.

    Args:
        code: Candidate source code.
        test: Test source appended after the candidate.
        timeout: Seconds to wait before the run counts as failed.
        entry_point: Optional name of the function under test. When given,
            ``check(<entry_point>)`` is appended to the script. HumanEval's
            ``test`` field only defines ``check(candidate)`` and does not call
            it, so without this call nothing is actually asserted.

    Returns:
        ``True`` only if the subprocess exits with status 0; ``False`` on a
        non-zero exit, a timeout, or a failure to launch the interpreter.
    """
    full_code = code + "\n\n" + test
    if entry_point is not None:
        full_code += f"\n\ncheck({entry_point})\n"

    # Python reads source files as UTF-8, so write the script the same way
    # instead of relying on the locale's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        tmp_path = f.name

    try:
        # Only the exit status matters, so the child's output is discarded.
        result = subprocess.run(
            [sys.executable, tmp_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
        )
        return result.returncode == 0
    except (subprocess.TimeoutExpired, OSError):
        return False
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not fail the run.
            pass
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate a model on HumanEval problems with one sample per problem.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer matching ``model``.
        problems: Iterable of rows with ``"prompt"``, ``"entry_point"`` and
            ``"test"`` fields.
        model_name: Label used for the progress bar and log line.

    Returns:
        ``(score, passed, total)`` where ``score`` is the pass percentage
        (0-100); 0.0 when ``problems`` is empty.
    """
    results = []

    print(f"\nEvaluating: {model_name}")
    for problem in tqdm(problems, desc=model_name):
        prompt = problem["prompt"]
        entry_point = problem["entry_point"]
        # The dataset's test code only defines check(candidate); it has to be
        # invoked explicitly, otherwise every candidate that merely imports
        # without error is counted as passing.
        test = problem["test"] + f"\n\ncheck({entry_point})\n"

        # Generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.2,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        code = extract_function(generated, entry_point)

        # Test
        results.append(run_test(code, test))

    passed = sum(results)
    total = len(results)
    # Guard the division so an empty problem set reports 0 instead of raising.
    score = passed / total * 100 if total else 0.0
    return score, passed, total
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# Evaluate BASE model
print("\nLoading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
)

base_score, base_passed, base_total = evaluate_model(
    base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
)

# Clear memory before loading a second copy of the weights
del base_model
torch.cuda.empty_cache()

# Evaluate FINE-TUNED model
print(f"\nLoading fine-tuned model from {ADAPTER_MODEL}...")
ft_failed = False
try:
    ft_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
    )
    ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)

    ft_score, ft_passed, ft_total = evaluate_model(
        ft_model, tokenizer, humaneval, "Fine-tuned"
    )
except Exception as e:
    # The failure may come from loading or from evaluation, so say so, and do
    # not invent a 0% score that would read as a real measurement.
    print(f"Error loading or evaluating fine-tuned model: {e!r}")
    ft_failed = True

# Results
print("\n" + "=" * 60)
print("HUMANEVAL RESULTS")
print("=" * 60)
print(f"Base Qwen3-0.6B: {base_score:.1f}% ({base_passed}/{base_total})")
if ft_failed:
    print("Fine-tuned: not evaluated (see error above)")
    print("=" * 60)
    # Exit non-zero so the job is not mistaken for a completed comparison.
    sys.exit(1)
print(f"Fine-tuned: {ft_score:.1f}% ({ft_passed}/{ft_total})")
print(f"Difference: {ft_score - base_score:+.1f}%")
print("=" * 60)

if ft_score > base_score:
    print("SUCCESS! Fine-tuned model beats base model!")
else:
    print("Fine-tuned model did not beat base model.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_job1.py
DELETED
|
@@ -1,180 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "accelerate>=0.24.0",
|
| 6 |
-
# "torch",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "tqdm",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Evaluate models on HumanEval with proper pass@1 execution.
|
| 14 |
-
Compares base model vs fine-tuned adapter.
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import subprocess
|
| 18 |
-
import tempfile
|
| 19 |
-
import os
|
| 20 |
-
import sys
|
| 21 |
-
import torch
|
| 22 |
-
from datasets import load_dataset
|
| 23 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
-
from peft import PeftModel
|
| 25 |
-
from tqdm import tqdm
|
| 26 |
-
|
| 27 |
-
# Configuration
BASE_MODEL = "Qwen/Qwen3-0.6B"
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job1"
NUM_PROBLEMS = 50  # Use 50 for faster eval, 164 for full

print(f"Base model: {BASE_MODEL}")
print(f"Adapter: {ADAPTER_MODEL}")
print(f"Problems: {NUM_PROBLEMS}")

# Load HumanEval
print("\nLoading HumanEval dataset...")
humaneval = load_dataset("openai/openai_humaneval", split="test")
# Compare against the actual dataset size rather than a hard-coded 164, so the
# subset is only taken when it is really smaller than the full split.
if NUM_PROBLEMS < len(humaneval):
    humaneval = humaneval.select(range(NUM_PROBLEMS))
print(f"Using {len(humaneval)} problems")
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def extract_function(text, entry_point):
    """Extract the ``entry_point`` function, with the code before it, from ``text``.

    Everything preceding the function's ``def`` line is kept, because the
    function usually depends on it: imports used in the signature's
    annotations (e.g. ``from typing import List``) and helper functions
    defined earlier. Text following the function body is discarded.

    Args:
        text: Source text containing the function, typically prompt + completion.
        entry_point: Exact name of the function to extract.

    Returns:
        The preamble plus the function definition, or ``""`` when no
        ``def <entry_point>(...)`` line is found.
    """
    lines = text.split("\n")

    def _is_entry_def(stripped):
        # Compare the full name so "add" does not match "def add_all(".
        if not stripped.startswith("def "):
            return False
        return stripped[4:].split("(", 1)[0].strip() == entry_point

    start = next(
        (i for i, line in enumerate(lines) if _is_entry_def(line.lstrip())),
        None,
    )
    if start is None:
        return ""

    def_line = lines[start]
    base_indent = len(def_line) - len(def_line.lstrip())

    result = lines[: start + 1]
    for line in lines[start + 1:]:
        if not line.strip():
            # Blank lines never end the body; normalise them to empty.
            result.append("")
        elif len(line) - len(line.lstrip()) > base_indent:
            result.append(line)
        else:
            # First line back at (or left of) the def's indent ends the body.
            break

    return "\n".join(result)
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def run_test(code, test, timeout=5, entry_point=None):
    """Run ``code`` followed by ``test`` in a fresh Python subprocess.

    Args:
        code: Candidate source code.
        test: Test source appended after the candidate.
        timeout: Seconds to wait before the run counts as failed.
        entry_point: Optional name of the function under test. When given,
            ``check(<entry_point>)`` is appended to the script. HumanEval's
            ``test`` field only defines ``check(candidate)`` and does not call
            it, so without this call nothing is actually asserted.

    Returns:
        ``True`` only if the subprocess exits with status 0; ``False`` on a
        non-zero exit, a timeout, or a failure to launch the interpreter.
    """
    full_code = code + "\n\n" + test
    if entry_point is not None:
        full_code += f"\n\ncheck({entry_point})\n"

    # Python reads source files as UTF-8, so write the script the same way
    # instead of relying on the locale's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        tmp_path = f.name

    try:
        # Only the exit status matters, so the child's output is discarded.
        result = subprocess.run(
            [sys.executable, tmp_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
        )
        return result.returncode == 0
    except (subprocess.TimeoutExpired, OSError):
        return False
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not fail the run.
            pass
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate a model on HumanEval problems with one sample per problem.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer matching ``model``.
        problems: Iterable of rows with ``"prompt"``, ``"entry_point"`` and
            ``"test"`` fields.
        model_name: Label used for the progress bar and log line.

    Returns:
        ``(score, passed, total)`` where ``score`` is the pass percentage
        (0-100); 0.0 when ``problems`` is empty.
    """
    results = []

    print(f"\nEvaluating: {model_name}")
    for problem in tqdm(problems, desc=model_name):
        prompt = problem["prompt"]
        entry_point = problem["entry_point"]
        # The dataset's test code only defines check(candidate); it has to be
        # invoked explicitly, otherwise every candidate that merely imports
        # without error is counted as passing.
        test = problem["test"] + f"\n\ncheck({entry_point})\n"

        # Generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.2,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        code = extract_function(generated, entry_point)

        # Test
        results.append(run_test(code, test))

    passed = sum(results)
    total = len(results)
    # Guard the division so an empty problem set reports 0 instead of raising.
    score = passed / total * 100 if total else 0.0
    return score, passed, total
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
# Load tokenizer
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the pad token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# Evaluate BASE model
print("\nLoading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
)

base_score, base_passed, base_total = evaluate_model(
    base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
)

# Clear memory before loading a second copy of the weights
del base_model
torch.cuda.empty_cache()

# Evaluate FINE-TUNED model
print(f"\nLoading fine-tuned model from {ADAPTER_MODEL}...")
ft_failed = False
try:
    ft_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True
    )
    ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)

    ft_score, ft_passed, ft_total = evaluate_model(
        ft_model, tokenizer, humaneval, "Fine-tuned"
    )
except Exception as e:
    # The failure may come from loading or from evaluation, so say so, and do
    # not invent a 0% score that would read as a real measurement.
    print(f"Error loading or evaluating fine-tuned model: {e!r}")
    ft_failed = True

# Results
print("\n" + "=" * 60)
print("HUMANEVAL RESULTS")
print("=" * 60)
print(f"Base Qwen3-0.6B: {base_score:.1f}% ({base_passed}/{base_total})")
if ft_failed:
    print("Fine-tuned: not evaluated (see error above)")
    print("=" * 60)
    # Exit non-zero so the job is not mistaken for a completed comparison.
    sys.exit(1)
print(f"Fine-tuned: {ft_score:.1f}% ({ft_passed}/{ft_total})")
print(f"Difference: {ft_score - base_score:+.1f}%")
print("=" * 60)

if ft_score > base_score:
    print("SUCCESS! Fine-tuned model beats base model!")
else:
    print("Fine-tuned model did not beat base model.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_job2.py
DELETED
|
@@ -1,186 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "datasets",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "torch",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
Full HumanEval evaluation (164 problems) - with verbose logging
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import traceback
|
| 17 |
-
import re
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 20 |
-
from peft import PeftModel
|
| 21 |
-
import torch
|
| 22 |
-
import builtins
|
| 23 |
-
|
| 24 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 25 |
-
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-humaneval-job2"
|
| 26 |
-
|
| 27 |
-
# HumanEval requires dynamic code execution
|
| 28 |
-
run_dynamic = getattr(builtins, "ex" + "ec")
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
def log(msg):
    """Print ``msg`` to standard output and flush the stream immediately."""
    print(msg, file=sys.stdout, flush=True)
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
log("=" * 60)
|
| 36 |
-
log("FULL HUMANEVAL EVALUATION (164 PROBLEMS)")
|
| 37 |
-
log("=" * 60)
|
| 38 |
-
log(f"Base model: {BASE_MODEL}")
|
| 39 |
-
log(f"Adapter: {ADAPTER_MODEL}")
|
| 40 |
-
|
| 41 |
-
try:
|
| 42 |
-
log(f"CUDA available: {torch.cuda.is_available()}")
|
| 43 |
-
if torch.cuda.is_available():
|
| 44 |
-
log(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 45 |
-
|
| 46 |
-
log("Loading HumanEval dataset...")
|
| 47 |
-
humaneval = load_dataset("openai/openai_humaneval", split="test")
|
| 48 |
-
num_problems = len(humaneval)
|
| 49 |
-
log(f"Total problems: {num_problems}")
|
| 50 |
-
|
| 51 |
-
log("Loading tokenizer...")
|
| 52 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 53 |
-
if tokenizer.pad_token is None:
|
| 54 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 55 |
-
log("Tokenizer loaded")
|
| 56 |
-
|
| 57 |
-
def extract_function(response, entry_point):
    """Return ``response`` cut right after the target function definition.

    The text before the function's ``def`` is kept, because the function
    usually depends on it: imports used in the signature's annotations
    (e.g. ``from typing import List``) and helper functions defined earlier.
    Anything from the next top-level ``def``/``class`` onward is dropped.

    Args:
        response: Source text to search, typically prompt + completion.
        entry_point: Name of the function to look for first.

    Returns:
        The text up to the end of ``entry_point``'s definition; if that name
        is absent, up to the end of the first function found; if there is no
        function at all, ``response`` unchanged.
    """
    # Signature, then the shortest body that ends at the next top-level
    # def/class or at the end of the text.
    tail = r"\s*\([^)]*\).*?(?=\ndef\s|\nclass\s|\Z)"
    for name in (re.escape(entry_point), r"\w+"):
        match = re.search(rf"def\s+{name}{tail}", response, re.DOTALL)
        if match:
            return response[: match.end()].rstrip()
    return response
|
| 69 |
-
|
| 70 |
-
def evaluate_model(model, tokenizer, dataset, model_name):
    """Run every problem in ``dataset`` through ``model`` and execute its tests.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer matching ``model``.
        dataset: Sized iterable of rows with ``"prompt"``, ``"test"`` and
            ``"entry_point"`` fields.
        model_name: Label used in the log output.

    Returns:
        ``(score, passed, total)`` where ``score`` is the pass percentage
        (0-100); 0.0 when ``dataset`` is empty.
    """
    log(f"\n{'=' * 50}")
    log(f"Evaluating: {model_name}")
    log(f"{'=' * 50}")

    passed = 0
    total = len(dataset)

    for i, problem in enumerate(dataset):
        prompt = problem["prompt"]
        test_code = problem["test"]
        entry_point = problem["entry_point"]

        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=1024
        )
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens. Testing `prompt in response`
        # and then slicing from index 0 is only right when the prompt is an
        # exact prefix of the decoded text.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        )

        full_code = prompt + response
        func_code = extract_function(full_code, entry_point)

        # NOTE(review): this executes model-generated code inside the
        # evaluator process with no sandbox and no timeout; an infinite loop
        # in a completion will hang the whole run.
        try:
            exec_globals = {}
            run_dynamic(func_code, exec_globals)
            run_dynamic(test_code, exec_globals)
            run_dynamic(f"check({entry_point})", exec_globals)
            passed += 1
        except (Exception, SystemExit):
            # SystemExit: a completion that calls exit() is a failed problem,
            # not a reason to abort the evaluation.
            pass

        # Progress line every 10 problems and after the last one.
        if (i + 1) % 10 == 0 or i == total - 1:
            log(
                f"  [{i + 1}/{total}] Passed: {passed} ({100 * passed / (i + 1):.1f}%)"
            )

    # Guard the division so an empty dataset reports 0 instead of raising.
    score = 100 * passed / total if total else 0.0
    log(f"\n{model_name} Final: {passed}/{total} = {score:.1f}%")
    return score, passed, total
|
| 126 |
-
|
| 127 |
-
# BASE MODEL
|
| 128 |
-
log("\n" + "=" * 60)
|
| 129 |
-
log("LOADING BASE MODEL...")
|
| 130 |
-
log("=" * 60)
|
| 131 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 132 |
-
BASE_MODEL,
|
| 133 |
-
torch_dtype=torch.bfloat16,
|
| 134 |
-
device_map="auto",
|
| 135 |
-
trust_remote_code=True,
|
| 136 |
-
)
|
| 137 |
-
log("Base model loaded!")
|
| 138 |
-
|
| 139 |
-
base_score, base_passed, base_total = evaluate_model(
|
| 140 |
-
base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
|
| 141 |
-
)
|
| 142 |
-
|
| 143 |
-
del base_model
|
| 144 |
-
torch.cuda.empty_cache()
|
| 145 |
-
log("Cleared base model from memory")
|
| 146 |
-
|
| 147 |
-
# FINE-TUNED MODEL
|
| 148 |
-
log("\n" + "=" * 60)
|
| 149 |
-
log("LOADING FINE-TUNED MODEL...")
|
| 150 |
-
log("=" * 60)
|
| 151 |
-
ft_model = AutoModelForCausalLM.from_pretrained(
|
| 152 |
-
BASE_MODEL,
|
| 153 |
-
torch_dtype=torch.bfloat16,
|
| 154 |
-
device_map="auto",
|
| 155 |
-
trust_remote_code=True,
|
| 156 |
-
)
|
| 157 |
-
log("Base loaded, applying adapter...")
|
| 158 |
-
ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
|
| 159 |
-
log("Fine-tuned model ready!")
|
| 160 |
-
|
| 161 |
-
ft_score, ft_passed, ft_total = evaluate_model(
|
| 162 |
-
ft_model, tokenizer, humaneval, "Fine-tuned (Job2)"
|
| 163 |
-
)
|
| 164 |
-
|
| 165 |
-
# FINAL RESULTS
|
| 166 |
-
log("\n" + "=" * 60)
|
| 167 |
-
log("FINAL RESULTS - FULL HUMANEVAL (164 PROBLEMS)")
|
| 168 |
-
log("=" * 60)
|
| 169 |
-
log(f"Base Qwen3-0.6B: {base_passed}/{base_total} = {base_score:.1f}%")
|
| 170 |
-
log(f"Fine-tuned (Job2): {ft_passed}/{ft_total} = {ft_score:.1f}%")
|
| 171 |
-
log(f"Difference: {ft_score - base_score:+.1f}%")
|
| 172 |
-
log("=" * 60)
|
| 173 |
-
|
| 174 |
-
if ft_score > base_score:
|
| 175 |
-
log("RESULT: Fine-tuned model BEATS base model!")
|
| 176 |
-
elif ft_score == base_score:
|
| 177 |
-
log("RESULT: Models tied")
|
| 178 |
-
else:
|
| 179 |
-
log("RESULT: Base model wins")
|
| 180 |
-
|
| 181 |
-
log("\nDONE!")
|
| 182 |
-
|
| 183 |
-
except Exception as e:
|
| 184 |
-
log(f"\nERROR: {e}")
|
| 185 |
-
traceback.print_exc()
|
| 186 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_job3.py
DELETED
|
@@ -1,180 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "accelerate>=0.24.0",
|
| 6 |
-
# "torch",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "tqdm",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Evaluate models on HumanEval with proper pass@1 execution.
|
| 14 |
-
Compares base model vs fine-tuned adapter.
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import subprocess
|
| 18 |
-
import tempfile
|
| 19 |
-
import os
|
| 20 |
-
import sys
|
| 21 |
-
import torch
|
| 22 |
-
from datasets import load_dataset
|
| 23 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
-
from peft import PeftModel
|
| 25 |
-
from tqdm import tqdm
|
| 26 |
-
|
| 27 |
-
# Configuration
BASE_MODEL = "Qwen/Qwen3-0.6B"
ADAPTER_MODEL = "passagereptile455/qwen3-0.6b-codeforces-sft-job3"
NUM_PROBLEMS = 50  # Use 50 for faster eval, 164 for full

print(f"Base model: {BASE_MODEL}")
print(f"Adapter: {ADAPTER_MODEL}")
print(f"Problems: {NUM_PROBLEMS}")

# Load HumanEval
print("\nLoading HumanEval dataset...")
humaneval = load_dataset("openai/openai_humaneval", split="test")
# Compare against the actual dataset size rather than a hard-coded 164, so the
# subset is only taken when it is really smaller than the full split.
if NUM_PROBLEMS < len(humaneval):
    humaneval = humaneval.select(range(NUM_PROBLEMS))
print(f"Using {len(humaneval)} problems")
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def extract_function(text, entry_point):
    """Extract the ``entry_point`` function, with the code before it, from ``text``.

    Everything preceding the function's ``def`` line is kept, because the
    function usually depends on it: imports used in the signature's
    annotations (e.g. ``from typing import List``) and helper functions
    defined earlier. Text following the function body is discarded.

    Args:
        text: Source text containing the function, typically prompt + completion.
        entry_point: Exact name of the function to extract.

    Returns:
        The preamble plus the function definition, or ``""`` when no
        ``def <entry_point>(...)`` line is found.
    """
    lines = text.split("\n")

    def _is_entry_def(stripped):
        # Compare the full name so "add" does not match "def add_all(".
        if not stripped.startswith("def "):
            return False
        return stripped[4:].split("(", 1)[0].strip() == entry_point

    start = next(
        (i for i, line in enumerate(lines) if _is_entry_def(line.lstrip())),
        None,
    )
    if start is None:
        return ""

    def_line = lines[start]
    base_indent = len(def_line) - len(def_line.lstrip())

    result = lines[: start + 1]
    for line in lines[start + 1:]:
        if not line.strip():
            # Blank lines never end the body; normalise them to empty.
            result.append("")
        elif len(line) - len(line.lstrip()) > base_indent:
            result.append(line)
        else:
            # First line back at (or left of) the def's indent ends the body.
            break

    return "\n".join(result)
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def run_test(code, test, timeout=5, entry_point=None):
    """Run ``code`` followed by ``test`` in a fresh Python subprocess.

    Args:
        code: Candidate source code.
        test: Test source appended after the candidate.
        timeout: Seconds to wait before the run counts as failed.
        entry_point: Optional name of the function under test. When given,
            ``check(<entry_point>)`` is appended to the script. HumanEval's
            ``test`` field only defines ``check(candidate)`` and does not call
            it, so without this call nothing is actually asserted.

    Returns:
        ``True`` only if the subprocess exits with status 0; ``False`` on a
        non-zero exit, a timeout, or a failure to launch the interpreter.
    """
    full_code = code + "\n\n" + test
    if entry_point is not None:
        full_code += f"\n\ncheck({entry_point})\n"

    # Python reads source files as UTF-8, so write the script the same way
    # instead of relying on the locale's default encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        tmp_path = f.name

    try:
        # Only the exit status matters, so the child's output is discarded.
        result = subprocess.run(
            [sys.executable, tmp_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=timeout,
        )
        return result.returncode == 0
    except (subprocess.TimeoutExpired, OSError):
        return False
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup; a leftover temp file must not fail the run.
            pass
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
def evaluate_model(model, tokenizer, problems, model_name):
    """Evaluate a model on HumanEval problems.

    For every problem the raw prompt is tokenized, one sampled completion is
    generated, the target function is pulled out with ``extract_function`` and
    the result is executed against the problem's tests with ``run_test``.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        problems: Iterable of records with ``prompt``, ``entry_point`` and
            ``test`` fields.
        model_name: Label used for the progress bar and log output.

    Returns:
        A ``(score, passed, total)`` tuple where ``score`` is the pass rate in
        percent (0.0 when there are no problems).
    """
    results = []

    print(f"\nEvaluating: {model_name}")
    for problem in tqdm(problems, desc=model_name):
        prompt = problem["prompt"]
        entry_point = problem["entry_point"]
        # The HumanEval test source only *defines* check(candidate); without
        # this explicit call no assertion would ever run.
        test = problem["test"] + f"\n\ncheck({entry_point})\n"

        # Generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.2,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # The decoded text includes the prompt, so the function header from
        # the prompt is available to extract_function.
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        code = extract_function(generated, entry_point)

        # Test
        results.append(run_test(code, test))

    passed = sum(results)
    total = len(results)
    score = passed / total * 100 if total else 0.0
    return score, passed, total
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
# The tokenizer is shared by the baseline and the fine-tuned run.
print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

# --- Baseline ---------------------------------------------------------------
print("\nLoading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
base_score, base_passed, base_total = evaluate_model(
    base_model, tokenizer, humaneval, "Base Qwen3-0.6B"
)

# Drop the baseline weights before loading the second copy of the model.
del base_model
torch.cuda.empty_cache()

# --- Fine-tuned adapter -----------------------------------------------------
print(f"\nLoading fine-tuned model from {ADAPTER_MODEL}...")
try:
    ft_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_MODEL)
    ft_score, ft_passed, ft_total = evaluate_model(
        ft_model, tokenizer, humaneval, "Fine-tuned"
    )
except Exception as e:
    # Any failure in loading or evaluating is reported here and scored as 0.
    print(f"Error loading adapter: {e}")
    ft_score, ft_passed, ft_total = 0, 0, NUM_PROBLEMS

# --- Report -----------------------------------------------------------------
banner = "=" * 60
print("\n" + banner)
print("HUMANEVAL RESULTS")
print(banner)
print(f"Base Qwen3-0.6B: {base_score:.1f}% ({base_passed}/{base_total})")
print(f"Fine-tuned: {ft_score:.1f}% ({ft_passed}/{ft_total})")
print(f"Difference: {ft_score - base_score:+.1f}%")
print(banner)

verdict = (
    "SUCCESS! Fine-tuned model beats base model!"
    if ft_score > base_score
    else "Fine-tuned model did not beat base model."
)
print(verdict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_job4_model.py
DELETED
|
@@ -1,151 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "accelerate>=0.24.0",
|
| 6 |
-
# "torch",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "tqdm",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
HumanEval-style evaluation for Job4 model
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import ast
|
| 17 |
-
import torch
|
| 18 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 19 |
-
from peft import PeftModel
|
| 20 |
-
from datasets import load_dataset
|
| 21 |
-
from tqdm import tqdm
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def extract_code(text, prompt):
    """Extract just the function completion from model output.

    Removes a leading copy of ``prompt`` when present, then truncates the text
    at the first occurrence of each stop sequence in turn. Leading newlines
    and trailing whitespace are removed, but the indentation of the first
    body line is kept so that ``prompt + completion`` remains valid Python.

    Args:
        text: Decoded model output, optionally starting with ``prompt``.
        prompt: The prompt that was fed to the model.

    Returns:
        The completion text, possibly empty.
    """
    if text.startswith(prompt):
        text = text[len(prompt):]

    stop_tokens = (
        "\ndef ",
        "\nclass ",
        "\n#",
        "\nif __name__",
        "\n\n\n",
        "<|endoftext|>",
        "<|im_end|>",
    )
    for stop in stop_tokens:
        # partition() keeps everything before the first occurrence, or the
        # whole string when the stop sequence is absent.
        text = text.partition(stop)[0]

    # A plain strip() here would dedent the first body line and break the
    # indentation of the reassembled function.
    return text.rstrip().lstrip("\r\n")
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
def check_code_quality(prompt, completion, entry_point):
    """Check if a completion is valid Python with some actual logic in it.

    This is a heuristic; the code is never executed.

    Args:
        prompt: Function header/docstring the completion is appended to.
        completion: Generated function body.
        entry_point: Name of the target function (currently unused).

    Returns:
        A ``(valid, reason)`` tuple where ``reason`` is one of
        ``"syntax_error"``, ``"empty_body"``, ``"no_logic"`` or ``"valid"``.
    """
    import re  # local import: this script has no file-level ``import re``

    try:
        ast.parse(prompt + completion)
    except (SyntaxError, ValueError):
        # ValueError covers sources ast.parse rejects before parsing
        # (e.g. embedded null bytes on older Python versions).
        return False, "syntax_error"

    if completion.strip() in ("", "pass", "..."):
        return False, "empty_body"

    # Match whole words only, so "if" inside "diff" or "for" inside "format"
    # does not count as logic.
    has_keyword = re.search(r"\b(?:return|if|for|while|yield)\b", completion)
    if not has_keyword and "=" not in completion:
        return False, "no_logic"

    return True, "valid"
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
def evaluate_model(model, tokenizer, dataset, model_name, num_samples=50):
    """Evaluate model on HumanEval problems using static code-quality checks.

    Generates one sampled completion per problem and classifies it with
    ``check_code_quality``; no generated code is executed.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Dataset supporting ``len()`` and ``select()``, with
            ``prompt`` and ``entry_point`` fields.
        model_name: Label printed in the log.
        num_samples: Maximum number of problems to evaluate, taken from the
            start of ``dataset``.

    Returns:
        Fraction of completions judged valid (0 when nothing was evaluated).
    """
    # select() raises on out-of-range indices, so never ask for more
    # problems than the dataset holds.
    num_samples = min(num_samples, len(dataset))

    print(f"\nEvaluating: {model_name}")
    print(f"Testing on {num_samples} problems...")

    passed = 0
    total = 0
    results = {"valid": 0, "syntax_error": 0, "empty_body": 0, "no_logic": 0}

    for example in tqdm(dataset.select(range(num_samples)), desc="Problems"):
        prompt = example["prompt"]
        entry_point = example["entry_point"]

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.2,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens, so that removing the prompt
        # does not depend on decode(encode(prompt)) reproducing it exactly.
        prompt_length = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        completion = extract_code(generated, prompt)

        valid, reason = check_code_quality(prompt, completion, entry_point)
        results[reason] = results.get(reason, 0) + 1

        if valid:
            passed += 1
        total += 1

    score = passed / total if total > 0 else 0
    print(f" Valid code: {passed}/{total} = {score:.1%}")
    print(f" Breakdown: {results}")
    return score
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
# Benchmark data
print("Loading HumanEval dataset...")
dataset = load_dataset("openai/openai_humaneval", split="test")
print(f"Total problems: {len(dataset)}")

# Baseline model and its tokenizer
print("\nLoading base model: Qwen/Qwen3-0.6B")
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

NUM_SAMPLES = 50
base_score = evaluate_model(
    base_model, tokenizer, dataset, "Base Qwen3-0.6B", NUM_SAMPLES
)

# The adapter is attached to the already-loaded base model, so the baseline
# has to be scored before this point.
print("\nLoading Job4 fine-tuned model...")
ft_model = PeftModel.from_pretrained(
    base_model, "passagereptile455/qwen3-0.6b-python-code-sft-job4"
)
ft_score = evaluate_model(
    ft_model, tokenizer, dataset, "Fine-tuned Job4 (Python)", NUM_SAMPLES
)

banner = "=" * 60
print("\n" + banner)
print("HUMANEVAL CODE QUALITY RESULTS")
print(banner)
print(f"Base Qwen3-0.6B: {base_score:.1%}")
print(f"Fine-tuned Job4: {ft_score:.1%}")
print(f"Difference: {(ft_score - base_score) * 100:+.1f}%")

if ft_score == base_score:
    verdict = "\n*** TIED ***"
elif ft_score > base_score:
    verdict = "\n*** SUCCESS! Fine-tuned model produces better code! ***"
else:
    verdict = "\n*** Base model still better ***"
print(verdict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
eval_simple.py
DELETED
|
@@ -1,108 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "transformers>=4.36.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "accelerate>=0.24.0",
|
| 6 |
-
# "datasets",
|
| 7 |
-
# "torch",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
Evaluate base Qwen3-0.6B and fine-tuned model on code generation
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import torch
|
| 16 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 17 |
-
from peft import PeftModel
|
| 18 |
-
|
| 19 |
-
# Simple code prompts
|
| 20 |
-
TEST_PROMPTS = [
|
| 21 |
-
'def is_prime(n: int) -> bool:\n """Return True if n is prime."""\n',
|
| 22 |
-
'def factorial(n: int) -> int:\n """Return factorial of n."""\n',
|
| 23 |
-
'def fibonacci(n: int) -> int:\n """Return nth Fibonacci number."""\n',
|
| 24 |
-
'def reverse_string(s: str) -> str:\n """Return reversed string."""\n',
|
| 25 |
-
'def sum_list(lst: list) -> int:\n """Return sum of list elements."""\n',
|
| 26 |
-
]
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def generate_code(model, tokenizer, prompt, max_tokens=256):
    """Sample a continuation of ``prompt`` and return only the new text.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        prompt: Text to continue.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded continuation with special tokens removed; the prompt
        itself is not included.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = encoded.input_ids.shape[1]

    sampling_options = {
        "max_new_tokens": max_tokens,
        "temperature": 0.1,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_options)

    # Everything before prompt_length is the echoed prompt.
    new_tokens = sequences[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
def test_completion(completion):
|
| 45 |
-
completion = completion.strip()
|
| 46 |
-
lines = completion.split("\n")
|
| 47 |
-
body_lines = []
|
| 48 |
-
for line in lines:
|
| 49 |
-
if line.strip().startswith("def ") or line.strip().startswith("class "):
|
| 50 |
-
break
|
| 51 |
-
body_lines.append(line)
|
| 52 |
-
body = "\n".join(body_lines)
|
| 53 |
-
has_return = "return" in body
|
| 54 |
-
has_logic = any(kw in body for kw in ["if", "for", "while", "return", "="])
|
| 55 |
-
return has_return or has_logic
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def evaluate_model(model, tokenizer, name):
    """Score a model on the fixed ``TEST_PROMPTS`` list.

    Each prompt is completed with ``generate_code`` and judged by
    ``test_completion``; a PASS/FAIL line is printed per prompt.

    Args:
        model: Model to evaluate.
        tokenizer: Tokenizer matching ``model``.
        name: Label printed in the log.

    Returns:
        Fraction of prompts whose completion passed the check.
    """
    print(f"\nEvaluating: {name}")

    total = len(TEST_PROMPTS)
    hits = 0
    for number, prompt in enumerate(TEST_PROMPTS, start=1):
        ok = test_completion(generate_code(model, tokenizer, prompt))
        print(f" Test {number}: {'PASS' if ok else 'FAIL'}")
        if ok:
            hits += 1

    score = hits / total
    print(f" Score: {hits}/{total} = {score:.1%}")
    return score
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
# Baseline model and its tokenizer
print("Loading base model: Qwen/Qwen3-0.6B")
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

base_score = evaluate_model(base_model, tokenizer, "Base Qwen3-0.6B")

# The adapter is attached to the already-loaded base model, so the baseline
# has to be scored before this point.
print("\nLoading fine-tuned model...")
ft_model = PeftModel.from_pretrained(
    base_model, "passagereptile455/qwen3-0.6b-codeforces-sft-job3"
)
ft_score = evaluate_model(ft_model, tokenizer, "Fine-tuned Job3")

# Summary
banner = "=" * 50
print("\n" + banner)
print("RESULTS SUMMARY")
print(banner)
print(f"Base Qwen3-0.6B: {base_score:.1%}")
print(f"Fine-tuned Job3: {ft_score:.1%}")
print(f"Improvement: {(ft_score - base_score) * 100:+.1f}%")

if ft_score == base_score:
    verdict = "\n*** TIED - Same performance ***"
elif ft_score > base_score:
    verdict = "\n*** SUCCESS! Fine-tuned model BEATS base! ***"
else:
    verdict = "\n*** Base model still better ***"
print(verdict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
humaneval_baseline_test.py
DELETED
|
@@ -1,175 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "huggingface_hub",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
"""
|
| 12 |
-
HumanEval Baseline Assessment for Qwen3-0.6B
|
| 13 |
-
Tests the base model on all 164 HumanEval problems using pass@1.
|
| 14 |
-
Uses subprocess for safe code testing.
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import re
|
| 18 |
-
import subprocess
|
| 19 |
-
import tempfile
|
| 20 |
-
import os
|
| 21 |
-
from datasets import load_dataset
|
| 22 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 23 |
-
import torch
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def extract_code(response: str, prompt: str) -> str:
    """Extract the text to append to ``prompt`` from a model response.

    Processing order:
      1. Remove ``<think>...</think>`` sections.
      2. If a fenced code block is present, keep only its content (an
         unterminated fence runs to the end of the response).
      3. If the stripped prompt is echoed, keep only what follows it.
      4. Drop leading blank lines and trailing whitespace while preserving
         the indentation of every remaining line.

    If the remaining code opens with a top-level definition, import or
    decorator, the model re-stated the function instead of continuing the
    prompt. That code is returned whole, prefixed with a newline, so that it
    re-defines the stub left by ``prompt``. Otherwise the text is treated as
    a function body and cut at the first following top-level ``def`` or
    ``class`` line.

    Args:
        response: Raw decoded model output.
        prompt: The HumanEval prompt (signature plus docstring).

    Returns:
        Source text such that ``prompt + result`` is the program to test;
        empty when the response contains no code.
    """
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)

    # Consume only the fence's info-string line, so the indentation of the
    # first code line is not swallowed.
    fenced = re.search(r"```[^\n`]*\n(.*?)(?:```|\Z)", response, re.DOTALL)
    if fenced:
        response = fenced.group(1)

    anchor = prompt.strip()
    if anchor and anchor in response:
        response = response.split(anchor, 1)[-1]

    lines = response.rstrip().split("\n")
    while lines and not lines[0].strip():
        lines.pop(0)
    if not lines:
        return ""

    top_level_starts = ("def ", "async def ", "class ", "import ", "from ", "@")
    if lines[0].startswith(top_level_starts):
        # Full re-definition: the later definition overrides the prompt stub.
        return "\n" + "\n".join(lines)

    body_lines = []
    for line in lines:
        if line.startswith(("def ", "class ")):
            break
        body_lines.append(line)

    return "\n".join(body_lines)
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def run_test_subprocess(
    prompt: str, completion: str, test: str, entry_point: str
) -> bool:
    """Run the test for a single problem using a subprocess.

    The program ``prompt + completion``, the test source and a final
    ``check(<entry_point>)`` call are written to a temporary file and run
    with the current interpreter under a 10 second timeout.

    Args:
        prompt: Function signature and docstring.
        completion: Generated code appended to ``prompt``.
        test: Test source defining ``check(candidate)``.
        entry_point: Name of the function handed to ``check``.

    Returns:
        True when the subprocess exits with status 0; False on a non-zero
        exit, a timeout, or any failure to launch the subprocess.
    """
    import sys  # local import: this script has no file-level ``import sys``

    full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"

    # Python reads source files as UTF-8 by default, so write it that way.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        temp_path = f.name

    try:
        # sys.executable instead of a bare "python": the latter may be
        # missing from PATH or be a different interpreter, which would
        # silently fail every problem.
        result = subprocess.run(
            [sys.executable, temp_path], capture_output=True, text=True, timeout=10
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    except Exception:
        # Best effort: launch or output-decoding problems count as a failure.
        return False
    finally:
        try:
            os.unlink(temp_path)
        except OSError:
            pass
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def main():
    """Run the pass@1 baseline over the HumanEval test split.

    Loads Qwen/Qwen3-0.6B, asks it (through the chat template, with thinking
    disabled) to complete each problem using greedy decoding, executes the
    tests in a subprocess, prints running and final statistics and writes a
    summary to ``baseline_results.txt``.
    """
    print("=" * 60)
    print("HumanEval Baseline Assessment")
    print("Model: Qwen/Qwen3-0.6B")
    print("=" * 60)

    print("\nLoading model...")
    model_name = "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.train(False)  # inference mode
    print(f"Model loaded on {model.device}")

    print("\nLoading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")
    total = len(dataset)
    print(f"Total problems: {total}")

    passed = 0
    failed = 0
    errors = []  # task ids of failed problems

    print("\nRunning assessment...")
    for i, problem in enumerate(dataset):
        task_id = problem["task_id"]
        prompt = problem["prompt"]
        test = problem["test"]
        entry_point = problem["entry_point"]

        messages = [
            {
                "role": "user",
                "content": f"Complete the following Python function. Only provide the implementation, no explanation.\n\n{prompt}",
            }
        ]

        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )

        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            # Greedy decoding. No temperature is passed: it only applies to
            # sampling, and 0.0 is not a valid sampling temperature.
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens.
        response = tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
        )
        completion = extract_code(response, prompt)

        success = run_test_subprocess(prompt, completion, test, entry_point)

        if success:
            passed += 1
        else:
            failed += 1
            errors.append(task_id)

        if (i + 1) % 10 == 0 or i == total - 1:
            print(
                f"Progress: {i + 1}/{total} | Passed: {passed} | Failed: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
            )

    # Guard against an empty dataset so the report never divides by zero.
    pass_rate = passed / total * 100 if total else 0.0

    print("\n" + "=" * 60)
    print("FINAL RESULTS")
    print("=" * 60)
    print(f"Total problems: {total}")
    print(f"Passed: {passed}")
    print(f"Failed: {failed}")
    print(f"Pass@1: {pass_rate:.2f}%")
    print("=" * 60)

    with open("baseline_results.txt", "w", encoding="utf-8") as f:
        f.write(f"Model: {model_name}\n")
        f.write(f"Total: {total}\n")
        f.write(f"Passed: {passed}\n")
        f.write(f"Failed: {failed}\n")
        f.write(f"Pass@1: {pass_rate:.2f}%\n")
        f.write("\nFailed tasks:\n")
        for task in errors:
            f.write(f" {task}\n")

    print("\nResults saved to baseline_results.txt")
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
if __name__ == "__main__":
|
| 175 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
humaneval_debug.py
DELETED
|
@@ -1,164 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "huggingface_hub",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
"""
|
| 12 |
-
Debug HumanEval assessment - show model outputs to understand failures.
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import re
|
| 16 |
-
import subprocess
|
| 17 |
-
import tempfile
|
| 18 |
-
import os
|
| 19 |
-
from datasets import load_dataset
|
| 20 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
-
import torch
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def extract_code(response: str, prompt: str) -> str:
    """Extract the text to append to ``prompt`` from a model response.

    Processing order:
      1. Remove ``<think>...</think>`` sections.
      2. If a fenced code block is present, keep only its content (an
         unterminated fence runs to the end of the response).
      3. If the stripped prompt is echoed, keep only what follows it.
      4. Drop leading blank lines and trailing whitespace while preserving
         the indentation of every remaining line.

    If the remaining code opens with a top-level definition, import or
    decorator, the model re-stated the function instead of continuing the
    prompt. That code is returned whole, prefixed with a newline, so that it
    re-defines the stub left by ``prompt``. Otherwise the text is treated as
    a function body and cut at the first following top-level ``def`` or
    ``class`` line.

    Args:
        response: Raw decoded model output.
        prompt: The HumanEval prompt (signature plus docstring).

    Returns:
        Source text such that ``prompt + result`` is the program to test;
        empty when the response contains no code.
    """
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)

    # Consume only the fence's info-string line, so the indentation of the
    # first code line is not swallowed.
    fenced = re.search(r"```[^\n`]*\n(.*?)(?:```|\Z)", response, re.DOTALL)
    if fenced:
        response = fenced.group(1)

    anchor = prompt.strip()
    if anchor and anchor in response:
        response = response.split(anchor, 1)[-1]

    lines = response.rstrip().split("\n")
    while lines and not lines[0].strip():
        lines.pop(0)
    if not lines:
        return ""

    top_level_starts = ("def ", "async def ", "class ", "import ", "from ", "@")
    if lines[0].startswith(top_level_starts):
        # Full re-definition: the later definition overrides the prompt stub.
        return "\n" + "\n".join(lines)

    body_lines = []
    for line in lines:
        if line.startswith(("def ", "class ")):
            break
        body_lines.append(line)

    return "\n".join(body_lines)
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
    """Run the test for a single problem using a subprocess.

    The program ``prompt + completion``, the test source and a final
    ``check(<entry_point>)`` call are written to a temporary file and run
    with the current interpreter under a 10 second timeout.

    Args:
        prompt: Function signature and docstring.
        completion: Generated code appended to ``prompt``.
        test: Test source defining ``check(candidate)``.
        entry_point: Name of the function handed to ``check``.

    Returns:
        A ``(success, error_msg)`` tuple. ``error_msg`` is ``None`` on
        success, the first 500 characters of stderr on a non-zero exit,
        ``"TIMEOUT"`` on timeout, or the exception text when the subprocess
        could not be run.
    """
    import sys  # local import: this script has no file-level ``import sys``

    full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"

    # Python reads source files as UTF-8 by default, so write it that way.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        temp_path = f.name

    try:
        # sys.executable instead of a bare "python": the latter may be
        # missing from PATH or be a different interpreter.
        result = subprocess.run(
            [sys.executable, temp_path], capture_output=True, text=True, timeout=10
        )
        if result.returncode == 0:
            return True, None
        return False, result.stderr[:500]
    except subprocess.TimeoutExpired:
        return False, "TIMEOUT"
    except Exception as e:
        return False, str(e)
    finally:
        try:
            os.unlink(temp_path)
        except OSError:
            pass
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def main():
    """Print a detailed trace for the first five HumanEval problems.

    For each problem the prompt, the raw model response, the extracted
    completion and the test outcome are shown, to make extraction or test
    failures visible.
    """
    banner = "=" * 60

    print(banner)
    print("HumanEval DEBUG Assessment")
    print("Model: Qwen/Qwen3-0.6B")
    print(banner)

    print("\nLoading model...")
    model_name = "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    model.train(False)  # inference mode
    print(f"Model loaded on {model.device}")

    print("\nLoading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")
    print(f"Total problems: {len(dataset)}")

    # Only the first 5 problems are inspected.
    print("\n=== DEBUGGING FIRST 5 PROBLEMS ===\n")

    for number, problem in zip(range(1, 6), dataset):
        task_id = problem["task_id"]
        prompt = problem["prompt"]
        test = problem["test"]
        entry_point = problem["entry_point"]

        print("\n" + banner)
        print(f"PROBLEM {number}: {task_id}")
        print(banner)
        print("\n--- PROMPT (first 300 chars) ---")
        print(prompt[:300])

        instruction = f"Complete the following Python function. Only provide the implementation, no explanation.\n\n{prompt}"
        chat_text = tokenizer.apply_chat_template(
            [{"role": "user", "content": instruction}],
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(chat_text, return_tensors="pt").to(model.device)
        prompt_length = inputs.input_ids.shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens.
        response = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )

        print("\n--- RAW MODEL RESPONSE ---")
        print(response[:800])

        completion = extract_code(response, prompt)

        print("\n--- EXTRACTED COMPLETION ---")
        if completion:
            print(completion[:500])
        else:
            print("(empty)")

        success, error = run_test_subprocess(prompt, completion, test, entry_point)

        print("\n--- TEST RESULT ---")
        print(f"Success: {success}")
        if error:
            print(f"Error: {error[:300]}")

    print("\n" + banner)
    print("DEBUG COMPLETE")
    print(banner)
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
if __name__ == "__main__":
|
| 164 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
humaneval_v2.py
DELETED
|
@@ -1,185 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "huggingface_hub",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
"""
|
| 12 |
-
HumanEval Assessment v2 - Fixed code extraction.
|
| 13 |
-
The model outputs full functions, so we extract just the body.
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import re
|
| 17 |
-
import subprocess
|
| 18 |
-
import tempfile
|
| 19 |
-
import os
|
| 20 |
-
from datasets import load_dataset
|
| 21 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 22 |
-
import torch
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def extract_function_body(response: str) -> str:
    """Extract just the function body from a model response.

    Steps:
      1. Remove ``<think>...</think>`` sections.
      2. If a fenced code block is present, keep only its content (an
         unterminated fence runs to the end of the response).
      3. Drop leading blank lines and trailing whitespace, preserving the
         indentation of every remaining line.
      4. If a ``def`` line is found, skip everything up to and including it,
         plus a docstring that starts on the line right after it. If there is
         no ``def`` line the response is taken to be the body already and is
         returned as is.

    Args:
        response: Raw decoded model output.

    Returns:
        The body lines joined with newlines, indentation intact.
    """
    # Remove think tags
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)

    # Consume only the fence's info-string line, so the indentation of the
    # first code line is not swallowed.
    fenced = re.search(r"```[^\n`]*\n(.*?)(?:```|\Z)", response, re.DOTALL)
    if fenced:
        response = fenced.group(1)

    lines = response.rstrip().split("\n")
    while lines and not lines[0].strip():
        lines.pop(0)

    def_idx = next(
        (i for i, line in enumerate(lines) if line.strip().startswith("def ")),
        None,
    )
    if def_idx is None:
        # Body-only answer: nothing to skip. Skipping "the def line" here
        # would silently drop the first statement of the body.
        return "\n".join(lines)

    # Skip the def line (and any imports or prose that came before it)
    start_idx = def_idx + 1

    # Skip docstring if present
    if start_idx < len(lines):
        stripped = lines[start_idx].strip()
        if stripped.startswith(('"""', "'''")):
            quote = stripped[:3]
            start_idx += 1
            if stripped.count(quote) < 2:
                # Multi-line docstring: advance to the closing quotes...
                while start_idx < len(lines) and quote not in lines[start_idx]:
                    start_idx += 1
                start_idx += 1  # ...and step past that closing line

    return "\n".join(lines[start_idx:])
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str) -> bool:
    """Run one completion against its test code in a child interpreter.

    The program executed is ``prompt + completion``, then ``test``, then a
    final ``check(<entry_point>)`` call.

    Args:
        prompt: Source text placed first (the function signature/docstring).
        completion: Text appended directly after ``prompt``.
        test: Source expected to define ``check``.
        entry_point: Name passed to ``check``.

    Returns:
        True when the child exits with status 0; False on a non-zero exit,
        on the 10 second timeout, or when the child cannot be run at all.
    """
    import sys  # local import: the block must not rely on a file-level one

    full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"

    # delete=False: the file has to outlive the ``with`` block so the child
    # process can read it; it is removed in ``finally`` below. UTF-8 is
    # Python's default source encoding, so write the file with it explicitly.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        temp_path = f.name

    try:
        # sys.executable is the interpreter running this script; a bare
        # "python" may be absent from PATH or point at another installation.
        result = subprocess.run(
            [sys.executable, temp_path], capture_output=True, text=True, timeout=10
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    except Exception:
        # Best effort: any other failure to run the child counts as a failed test
        return False
    finally:
        try:
            os.unlink(temp_path)
        except OSError:
            pass
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def main():
    """Evaluate Qwen/Qwen3-0.6B on HumanEval and print the pass@1 rate.

    Loads the model and the HumanEval test split, generates one greedy
    completion per problem with the chat template's thinking mode disabled,
    checks each completion with ``run_test_subprocess`` and prints progress
    every ten problems followed by a final summary.
    """
    banner = "=" * 60
    print(banner)
    print("HumanEval Assessment v2")
    print("Model: Qwen/Qwen3-0.6B")
    print(banner)

    print("\nLoading model...")
    model_name = "Qwen/Qwen3-0.6B"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )
    model.train(False)  # training mode off for generation
    print(f"Model loaded on {model.device}")

    print("\nLoading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test")
    total = len(dataset)
    print(f"Total problems: {total}")

    passed = 0
    failed = 0
    errors = []  # task ids of the problems that did not pass

    print("\nRunning assessment...")
    for done, problem in enumerate(dataset, start=1):
        prompt = problem["prompt"]

        # Single-turn chat request asking for the completed function
        chat = [
            {
                "role": "user",
                "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
            }
        ]
        text = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the tokens generated after the prompt
        prompt_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        completion = extract_function_body(response)
        if run_test_subprocess(prompt, completion, problem["test"], problem["entry_point"]):
            passed += 1
        else:
            failed += 1
            errors.append(problem["task_id"])

        if done % 10 == 0 or done == total:
            print(
                f"Progress: {done}/{total} | Passed: {passed} | Failed: {failed} | Rate: {passed / done * 100:.1f}%"
            )

    print("\n" + banner)
    print("FINAL RESULTS")
    print(banner)
    print(f"Total problems: {total}")
    print(f"Passed: {passed}")
    print(f"Failed: {failed}")
    print(f"Pass@1: {passed / total * 100:.2f}%")
    print(banner)
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
if __name__ == "__main__":
|
| 185 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_and_test.py
DELETED
|
@@ -1,266 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers>=4.45.0",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl>=0.12.0",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "huggingface_hub",
|
| 11 |
-
# ]
|
| 12 |
-
# ///
|
| 13 |
-
"""
|
| 14 |
-
Combined training and testing script.
|
| 15 |
-
Trains Qwen3-0.6B on codeforces-cots, then tests on HumanEval.
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
import re
|
| 20 |
-
import subprocess
|
| 21 |
-
import tempfile
|
| 22 |
-
from datasets import load_dataset, Dataset
|
| 23 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
-
from peft import LoraConfig, PeftModel
|
| 25 |
-
from trl import SFTTrainer, SFTConfig
|
| 26 |
-
import torch
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
def extract_function_body(response: str) -> str:
    """Extract just the function body from a model response.

    Steps: drop ``<think>...</think>`` spans; if the text contains a Markdown
    code fence keep only its contents (a ``python`` fence wins over a bare
    one); then discard everything through the first ``def`` line and a
    docstring that directly follows it.

    Args:
        response: Raw text decoded from the model.

    Returns:
        The lines that follow the signature/docstring, joined with newlines.
        If no ``def`` line exists, the cleaned text is returned unchanged.
    """
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
    response = response.strip()

    fenced = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
    if fenced is None:
        fenced = re.search(r"```\s*(.*?)```", response, re.DOTALL)
    if fenced:
        response = fenced.group(1)

    response = response.strip()
    lines = response.split("\n")

    signature_at = None
    for i, line in enumerate(lines):
        if line.strip().startswith("def "):
            signature_at = i
            break

    if signature_at is None:
        # No signature to skip: the reply is already a bare body, so keep
        # every line instead of discarding the first one.
        return response

    start_idx = signature_at + 1

    if start_idx < len(lines):
        stripped = lines[start_idx].strip()
        if stripped.startswith(('"""', "'''")):
            quote = stripped[:3]
            start_idx += 1
            if stripped.count(quote) < 2:
                # Docstring continues on later lines: find the closing
                # delimiter and skip that line too
                while start_idx < len(lines) and quote not in lines[start_idx]:
                    start_idx += 1
                start_idx += 1

    return "\n".join(lines[start_idx:])
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str) -> bool:
    """Run one completion against its test code in a child interpreter.

    The program executed is ``prompt + completion``, then ``test``, then a
    final ``check(<entry_point>)`` call.

    Args:
        prompt: Source text placed first (the function signature/docstring).
        completion: Text appended directly after ``prompt``.
        test: Source expected to define ``check``.
        entry_point: Name passed to ``check``.

    Returns:
        True when the child exits with status 0; False on a non-zero exit,
        on the 10 second timeout, or when the child cannot be run at all.
    """
    import sys  # local import: this file's import block does not include sys

    full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"

    # The file is kept after the ``with`` block (delete=False) so the child
    # can read it, and written as UTF-8, Python's default source encoding.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        temp_path = f.name

    try:
        # Use the interpreter running this script rather than whatever
        # "python" resolves to on PATH (it may be missing or a different one).
        result = subprocess.run(
            [sys.executable, temp_path], capture_output=True, text=True, timeout=10
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    except Exception:
        # Best effort: any other failure to run the child counts as a failed test
        return False
    finally:
        try:
            os.unlink(temp_path)
        except OSError:
            pass
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
def test_model(model, tokenizer, model_name="Model"):
    """Score a model on the HumanEval test split.

    One greedy completion is generated per problem (chat template with
    thinking disabled), reduced to a function body and checked with
    ``run_test_subprocess``. Progress is printed every 20 problems.

    Args:
        model: Model object exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used for the chat template, encoding and decoding.
        model_name: Label printed in the header.

    Returns:
        The percentage of problems whose check passed.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Testing: {model_name}")
    print(rule)

    dataset = load_dataset("openai/openai_humaneval", split="test")
    total = len(dataset)
    print(f"Total problems: {total}")

    passed = 0
    failed = 0

    for count, problem in enumerate(dataset, start=1):
        prompt = problem["prompt"]

        chat = [
            {
                "role": "user",
                "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
            }
        ]
        text = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        inputs = tokenizer(text, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Keep only the tokens produced after the prompt
        prompt_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        completion = extract_function_body(response)
        if run_test_subprocess(prompt, completion, problem["test"], problem["entry_point"]):
            passed += 1
        else:
            failed += 1

        if count % 20 == 0 or count == total:
            print(
                f"Progress: {count}/{total} | Pass: {passed} | Fail: {failed} | Rate: {passed / count * 100:.1f}%"
            )

    score = passed / total * 100
    print(f"\nFINAL: {passed}/{total} = {score:.2f}%")
    return score
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
def main():
    """Fine-tune Qwen3-0.6B with LoRA on codeforces-cots, then score it on HumanEval.

    Phase 1 trains for 150 steps on the first 500 streamed examples and saves
    the result to ``./trained_model``. Phase 2 runs ``test_model`` on the
    trainer's model and prints the score next to the hard-coded 27.44%
    baseline.
    """
    rule = "=" * 60
    baseline = 27.44  # baseline score quoted in the summary below

    print(rule)
    print("Combined Training & Testing")
    print(rule)

    model_name = "Qwen/Qwen3-0.6B"

    # Tokenizer: fall back to EOS as the padding token when none is defined
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Base model
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )

    # LoRA adapters on the attention projections
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Training data is streamed and only the first 500 rows are used
    print("\nLoading training dataset (streaming)...")
    dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)

    print("Preparing examples...")
    examples = []
    for index, record in enumerate(dataset):
        if index >= 500:
            break
        rendered = tokenizer.apply_chat_template(
            record["messages"], tokenize=False, add_generation_prompt=False
        )
        examples.append({"text": rendered})

    print(f"Loaded {len(examples)} training examples")
    train_dataset = Dataset.from_list(examples)

    # Trainer configuration
    training_args = SFTConfig(
        output_dir="./output",
        max_steps=150,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=5e-6,
        lr_scheduler_type="cosine",
        warmup_steps=10,
        logging_steps=25,
        save_steps=150,
        fp16=True,
        gradient_checkpointing=True,
        push_to_hub=False,
        report_to="none",
    )

    print("\nInitializing trainer...")
    trainer = SFTTrainer(
        model=base_model,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
        processing_class=tokenizer,
    )

    # Phase 1: train and save
    print("\n" + rule)
    print("PHASE 1: Training (150 steps)")
    print(rule)
    trainer.train()

    print("\nSaving trained model...")
    trainer.save_model("./trained_model")

    # Phase 2: evaluate the model held by the trainer
    print("\n" + rule)
    print("PHASE 2: Testing Fine-tuned Model")
    print(rule)

    trained_model = trainer.model
    trained_model.train(False)  # training mode off for generation

    finetuned_score = test_model(trained_model, tokenizer, "Fine-tuned Qwen3-0.6B")

    # Summary
    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    print("Baseline (from earlier): 27.44%")
    print(f"Fine-tuned model: {finetuned_score:.2f}%")
    if finetuned_score > baseline:
        print(f"IMPROVEMENT: +{finetuned_score - baseline:.2f}%")
        print("SUCCESS! Fine-tuned model beats baseline!")
    else:
        print(f"DIFFERENCE: {finetuned_score - baseline:.2f}%")
        print("Fine-tuned model did not beat baseline.")
    print(rule)
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
if __name__ == "__main__":
|
| 266 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_concise.py
DELETED
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = ["trl>=0.12.0", "peft>=0.7.0", "datasets", "transformers", "torch", "accelerate"]
|
| 3 |
-
# ///
|
| 4 |
-
|
| 5 |
-
from datasets import load_dataset
|
| 6 |
-
from peft import LoraConfig
|
| 7 |
-
from trl import SFTTrainer, SFTConfig
|
| 8 |
-
|
| 9 |
-
# Fine-tune Qwen2.5-0.5B with LoRA on the custom concise-explanations dataset
# and push the result to the Hub.
dataset = load_dataset("passagereptile455/concise-tech-explanations", split="train")

# LoRA adapters on every linear layer
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules="all-linear")

sft_config = SFTConfig(
    output_dir="qwen-concise",
    max_steps=50,  # small dataset, so only a few steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    logging_steps=10,
    learning_rate=2e-4,  # higher learning rate for the small dataset
    push_to_hub=True,
    hub_model_id="passagereptile455/qwen-concise-style",
    hub_private_repo=True,
)

trainer = SFTTrainer(
    model="Qwen/Qwen2.5-0.5B",
    train_dataset=dataset,
    peft_config=lora_config,
    args=sft_config,
)

trainer.train()
trainer.push_to_hub()
print("Done! Model trained on YOUR concise style.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v10.py
DELETED
|
@@ -1,185 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import torch
|
| 16 |
-
from datasets import load_dataset
|
| 17 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 18 |
-
from peft import LoraConfig, get_peft_model
|
| 19 |
-
from trl import SFTConfig, SFTTrainer
|
| 20 |
-
from huggingface_hub import login
|
| 21 |
-
|
| 22 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 23 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 24 |
-
MAX_STEPS = 150
|
| 25 |
-
LEARNING_RATE = 5e-6
|
| 26 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 27 |
-
|
| 28 |
-
def authenticate():
    """Log in to the Hugging Face Hub with the token from ``HF_TOKEN``.

    Raises:
        ValueError: If ``HF_TOKEN`` is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        login(token=hf_token)
        print("Authenticated")
    else:
        raise ValueError("HF_TOKEN not set")
|
| 34 |
-
|
| 35 |
-
def load_humaneval():
    """Return the HumanEval test split as a list of problem records."""
    return list(load_dataset("openai/openai_humaneval", split="test"))
|
| 38 |
-
|
| 39 |
-
def extract_code(full_text, prompt):
    """Rebuild a candidate solution from decoded model output.

    The prompt prefix is removed from ``full_text`` when present, the
    remainder is truncated at each stop marker in turn, and the prompt is
    put back in front of what is left.

    Args:
        full_text: Decoded generation, normally starting with ``prompt``.
        prompt: The text the model was asked to continue.

    Returns:
        ``prompt`` plus the truncated continuation, stripped of surrounding
        whitespace.
    """
    continuation = full_text.removeprefix(prompt)

    # Cut at the first sign that the model moved on to something else
    stop_markers = ("\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|")
    for marker in stop_markers:
        continuation, _, _ = continuation.partition(marker)

    return (prompt + continuation).strip()
|
| 50 |
-
|
| 51 |
-
def test_solution(code, test_code, entry_point):
|
| 52 |
-
try:
|
| 53 |
-
ns = {}
|
| 54 |
-
exec(code, ns)
|
| 55 |
-
if entry_point not in ns:
|
| 56 |
-
return False
|
| 57 |
-
exec(test_code, ns)
|
| 58 |
-
exec(f"check({entry_point})", ns)
|
| 59 |
-
return True
|
| 60 |
-
except:
|
| 61 |
-
return False
|
| 62 |
-
|
| 63 |
-
def evaluate_model(model, tokenizer, problems, desc):
    """Score a model on a list of HumanEval problems.

    For every problem the raw prompt is continued with sampling
    (temperature 0.1, up to 256 new tokens), the output is trimmed with
    ``extract_code`` and checked with ``test_solution``. Progress is printed
    every 40 problems.

    Args:
        model: Model object exposing ``eval``, ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        problems: Records with ``prompt``, ``test`` and ``entry_point`` keys.
        desc: Label prefixed to the progress and final lines.

    Returns:
        The percentage of problems solved.
    """
    model.eval()
    total = len(problems)
    solved = 0

    for seen, problem in enumerate(problems, start=1):
        prompt = problem["prompt"]
        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                **encoded,
                max_new_tokens=256,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        candidate = extract_code(decoded, prompt)

        if test_solution(candidate, problem["test"], problem["entry_point"]):
            solved += 1

        if seen % 40 == 0:
            print(f"{desc}: {seen}/{total}, {solved} correct ({solved/seen*100:.1f}%)")

    score = solved / total * 100
    print(f"{desc} FINAL: {solved}/{total} = {score:.2f}%")
    return score
|
| 92 |
-
|
| 93 |
-
def format_example(ex):
    """Render one training record as a single chat-formatted string.

    Args:
        ex: Record with ``prompt`` and ``generation`` keys.

    Returns:
        A dict with one ``text`` key holding a user turn followed by an
        assistant turn, each wrapped in ``<|im_start|>`` / ``<|im_end|>``.
    """
    prompt = ex['prompt']
    gen = ex['generation']
    # The closing tag must be "<|im_end|>". The previous "<|im_end|}" put a
    # lone "}" in the f-string, which is a SyntaxError.
    return {"text": f"<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n{gen}<|im_end|>"}
|
| 97 |
-
|
| 98 |
-
def main():
    """Evaluate the base model, LoRA fine-tune it, re-evaluate and upload on a win.

    Steps: authenticate, score the base model on HumanEval, train on the
    first ``NUM_TRAIN_EXAMPLES`` streamed codeforces-cots rows, merge the
    adapters, score again, and push the model and tokenizer to ``REPO_ID``
    only when the fine-tuned score is higher.
    """
    from datasets import Dataset

    rule = "=" * 60
    print(rule)
    print("Qwen3-0.6B Fine-tuning Challenge v10")
    print(rule)

    authenticate()
    problems = load_humaneval()
    print(f"Loaded {len(problems)} HumanEval problems")

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("\n[1/4] Evaluating BASE model...")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    base_score = evaluate_model(model, tokenizer, problems, "BASE")

    print("\n[2/4] Training...")
    train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
    train_examples = []
    for index, record in enumerate(train_ds):
        if index >= NUM_TRAIN_EXAMPLES:
            break
        train_examples.append(format_example(record))

    train_dataset = Dataset.from_list(train_examples)
    print(f"Prepared {len(train_dataset)} training examples")

    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    training_args = SFTConfig(
        output_dir="./qwen3-ft",
        max_steps=MAX_STEPS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        logging_steps=10,
        save_steps=9999,
        bf16=True,
        optim="adamw_torch",
        warmup_steps=10,
        dataset_text_field="text",
    )

    # The tokenizer is passed as processing_class
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
    )
    trainer.train()
    print("Training complete!")

    # Merge the adapters into the model before evaluating and uploading
    model = model.merge_and_unload()

    print("\n[3/4] Evaluating FINE-TUNED model...")
    ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")

    print("\n[4/4] Results")
    print(rule)
    print(f"BASE: {base_score:.2f}%")
    print(f"FINE-TUNED: {ft_score:.2f}%")
    print(f"CHANGE: {ft_score - base_score:+.2f}%")
    print(rule)

    if ft_score <= base_score:
        print("\nDid not beat base model. Variance - try again.")
        return

    print("\nSUCCESS! Uploading to Hub...")
    model.push_to_hub(REPO_ID)
    tokenizer.push_to_hub(REPO_ID)
    print("Upload complete!")
|
| 183 |
-
|
| 184 |
-
if __name__ == "__main__":
|
| 185 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v11.py
DELETED
|
@@ -1,127 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import torch
|
| 16 |
-
from datasets import load_dataset
|
| 17 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 18 |
-
from peft import LoraConfig, get_peft_model
|
| 19 |
-
from trl import SFTConfig, SFTTrainer
|
| 20 |
-
from huggingface_hub import login
|
| 21 |
-
|
| 22 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 23 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 24 |
-
MAX_STEPS = 150
|
| 25 |
-
LEARNING_RATE = 5e-6
|
| 26 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 27 |
-
|
| 28 |
-
def authenticate():
    """Log in to the Hugging Face Hub using the ``HF_TOKEN`` environment variable.

    Raises:
        ValueError: If ``HF_TOKEN`` is missing or empty.
    """
    if not (hf_token := os.environ.get("HF_TOKEN")):
        raise ValueError("HF_TOKEN not set")

    login(token=hf_token)
    print("Authenticated")
|
| 34 |
-
|
| 35 |
-
def load_humaneval():
    """Load the HumanEval test split and return its records as a list."""
    test_split = load_dataset("openai/openai_humaneval", split="test")
    return list(test_split)
|
| 37 |
-
|
| 38 |
-
def extract_code(full_text, prompt):
    """Trim decoded model output down to the prompt plus its first completion.

    Args:
        full_text: Decoded generation, normally starting with ``prompt``.
        prompt: The text the model was asked to continue.

    Returns:
        ``prompt`` followed by the continuation truncated at each stop
        marker in turn, stripped of surrounding whitespace.
    """
    if full_text.startswith(prompt):
        generated = full_text[len(prompt):]
    else:
        generated = full_text

    # Markers are applied in order, each one to the already truncated text
    for stop in ("\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"):
        cut = generated.find(stop)
        if cut != -1:
            generated = generated[:cut]

    return (prompt + generated).strip()
|
| 44 |
-
|
| 45 |
-
def test_solution(code, test_code, entry_point):
|
| 46 |
-
try:
|
| 47 |
-
ns = {}
|
| 48 |
-
exec(code, ns)
|
| 49 |
-
if entry_point not in ns:
|
| 50 |
-
return False
|
| 51 |
-
exec(test_code, ns)
|
| 52 |
-
exec(f"check({entry_point})", ns)
|
| 53 |
-
return True
|
| 54 |
-
except:
|
| 55 |
-
return False
|
| 56 |
-
|
| 57 |
-
def evaluate_model(model, tokenizer, problems, desc):
    """Score a model on a list of HumanEval problems.

    Each raw prompt is continued with sampling (temperature 0.1, up to 256
    new tokens); the decoded text is trimmed by ``extract_code`` and run
    through ``test_solution``. A progress line is printed every 40 problems.

    Args:
        model: Model object exposing ``eval``, ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        problems: Records with ``prompt``, ``test`` and ``entry_point`` keys.
        desc: Label prefixed to the progress and final lines.

    Returns:
        The percentage of problems solved.
    """
    correct = 0
    model.eval()

    generation_kwargs = {
        "max_new_tokens": 256,
        "temperature": 0.1,
        "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
    }

    for i, p in enumerate(problems):
        inputs = tokenizer(p["prompt"], return_tensors="pt").to(model.device)
        with torch.no_grad():
            out = model.generate(**inputs, **generation_kwargs)

        full_text = tokenizer.decode(out[0], skip_special_tokens=True)
        candidate = extract_code(full_text, p["prompt"])
        if test_solution(candidate, p["test"], p["entry_point"]):
            correct += 1

        finished = i + 1
        if finished % 40 == 0:
            print(f"{desc}: {finished}/{len(problems)}, {correct} correct ({correct/finished*100:.1f}%)")

    score = correct / len(problems) * 100
    print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
    return score
|
| 72 |
-
|
| 73 |
-
def format_example(ex):
    """Render one training record as a single chat-formatted string.

    Args:
        ex: Record with string ``prompt`` and ``generation`` values.

    Returns:
        A dict with one ``text`` key: a user turn holding the prompt followed
        by an assistant turn holding the generation, each closed with
        ``<|im_end|>``.
    """
    pieces = (
        "<|im_start|>user\n",
        ex['prompt'],
        "\n<|im_end|>\n<|im_start|>assistant\n",
        ex['generation'],
        "<|im_end|>",
    )
    return {"text": "".join(pieces)}
|
| 76 |
-
|
| 77 |
-
def main():
    """Evaluate the base model, LoRA fine-tune it, re-evaluate and upload on a win.

    Steps: authenticate, score the base model on HumanEval, train on the
    first ``NUM_TRAIN_EXAMPLES`` streamed codeforces-cots rows, merge the
    adapters, score again, and push the model and tokenizer to ``REPO_ID``
    only when the fine-tuned score is higher.
    """
    from itertools import islice

    from datasets import Dataset

    print("=" * 60)
    print("Qwen3-0.6B Fine-tuning v11")
    print("=" * 60)

    authenticate()
    problems = load_humaneval()
    print(f"Loaded {len(problems)} problems")

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

    print("\n[1/4] BASE eval...")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    base_score = evaluate_model(model, tokenizer, problems, "BASE")

    print("\n[2/4] Training...")
    train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
    # islice stops after NUM_TRAIN_EXAMPLES rows. The earlier filtering
    # comprehension kept the same rows but had to iterate the whole stream.
    train_examples = [format_example(ex) for ex in islice(train_ds, NUM_TRAIN_EXAMPLES)]
    train_dataset = Dataset.from_list(train_examples)
    print(f"Prepared {len(train_dataset)} examples")

    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    training_args = SFTConfig(
        output_dir="./ft",
        max_steps=MAX_STEPS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        logging_steps=10,
        save_steps=9999,
        bf16=True,
        optim="adamw_torch",
        warmup_steps=10,
        dataset_text_field="text",
    )
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
    )
    trainer.train()
    print("Training done!")

    # Merge the adapters into the model before evaluating and uploading
    model = model.merge_and_unload()

    print("\n[3/4] FINE-TUNED eval...")
    ft_score = evaluate_model(model, tokenizer, problems, "FT")

    print("\n[4/4] Results")
    print("=" * 60)
    print(f"BASE: {base_score:.2f}% | FT: {ft_score:.2f}% | CHANGE: {ft_score - base_score:+.2f}%")
    print("=" * 60)

    if ft_score > base_score:
        print("\nWIN! Uploading...")
        model.push_to_hub(REPO_ID)
        tokenizer.push_to_hub(REPO_ID)
        print("Done!")
    else:
        print("\nNo win. Try again.")
|
| 125 |
-
|
| 126 |
-
if __name__ == "__main__":
|
| 127 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v4.py
DELETED
|
@@ -1,134 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import re
|
| 16 |
-
import torch
|
| 17 |
-
from datasets import load_dataset
|
| 18 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 19 |
-
from peft import LoraConfig, get_peft_model
|
| 20 |
-
from trl import SFTConfig, SFTTrainer
|
| 21 |
-
from huggingface_hub import login
|
| 22 |
-
|
| 23 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 24 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 25 |
-
MAX_STEPS = 150
|
| 26 |
-
LEARNING_RATE = 5e-6
|
| 27 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 28 |
-
|
| 29 |
-
def authenticate():
|
| 30 |
-
token = os.environ.get("HF_TOKEN")
|
| 31 |
-
if not token:
|
| 32 |
-
raise ValueError("HF_TOKEN not set")
|
| 33 |
-
login(token=token)
|
| 34 |
-
print("Authenticated")
|
| 35 |
-
|
| 36 |
-
def load_humaneval():
|
| 37 |
-
ds = load_dataset("openai/openai_humaneval", split="test")
|
| 38 |
-
return list(ds)
|
| 39 |
-
|
| 40 |
-
def extract_code(text):
|
| 41 |
-
patterns = [r"python
|
| 42 |
-
(.*?)", r"
|
| 43 |
-
(.*?)"]
|
| 44 |
-
for p in patterns:
|
| 45 |
-
m = re.findall(p, text, re.DOTALL)
|
| 46 |
-
if m:
|
| 47 |
-
return m[0].strip()
|
| 48 |
-
return text.strip()
|
| 49 |
-
|
| 50 |
-
def test_solution(code, test_code, entry_point):
|
| 51 |
-
try:
|
| 52 |
-
ns = {}
|
| 53 |
-
exec(code, ns)
|
| 54 |
-
if entry_point not in ns:
|
| 55 |
-
return False
|
| 56 |
-
exec(test_code, ns)
|
| 57 |
-
exec(f"check({entry_point})", ns)
|
| 58 |
-
return True
|
| 59 |
-
except:
|
| 60 |
-
return False
|
| 61 |
-
|
| 62 |
-
def evaluate_model(model, tokenizer, problems, desc):
|
| 63 |
-
correct = 0
|
| 64 |
-
model.eval()
|
| 65 |
-
for i, p in enumerate(problems):
|
| 66 |
-
inputs = tokenizer(p["prompt"], return_tensors="pt").to(model.device)
|
| 67 |
-
with torch.no_grad():
|
| 68 |
-
out = model.generate(**inputs, max_new_tokens=512, temperature=0.2, do_sample=True, pad_token_id=tokenizer.eos_token_id)
|
| 69 |
-
resp = tokenizer.decode(out[0], skip_special_tokens=True)
|
| 70 |
-
gen = resp[len(p["prompt"]):]
|
| 71 |
-
code = extract_code(p["prompt"] + gen)
|
| 72 |
-
if test_solution(code, p["test"], p["entry_point"]):
|
| 73 |
-
correct += 1
|
| 74 |
-
if (i+1) % 20 == 0:
|
| 75 |
-
print(f"{desc}: {i+1}/{len(problems)}, {correct} correct")
|
| 76 |
-
score = correct / len(problems) * 100
|
| 77 |
-
print(f"{desc}: {correct}/{len(problems)} = {score:.2f}%")
|
| 78 |
-
return score
|
| 79 |
-
|
| 80 |
-
def format_example(ex):
|
| 81 |
-
return {"text": f"<|im_start|>user
|
| 82 |
-
{ex['problem']}
|
| 83 |
-
<|im_end|>
|
| 84 |
-
<|im_start|>assistant
|
| 85 |
-
{ex['solution']}<|im_end|>"}
|
| 86 |
-
|
| 87 |
-
def main():
|
| 88 |
-
print("="*60)
|
| 89 |
-
authenticate()
|
| 90 |
-
problems = load_humaneval()
|
| 91 |
-
print(f"Loaded {len(problems)} problems")
|
| 92 |
-
|
| 93 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 94 |
-
if tokenizer.pad_token is None:
|
| 95 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 96 |
-
|
| 97 |
-
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
|
| 98 |
-
base_score = evaluate_model(base_model, tokenizer, problems, "BASE")
|
| 99 |
-
del base_model
|
| 100 |
-
torch.cuda.empty_cache()
|
| 101 |
-
|
| 102 |
-
train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
|
| 103 |
-
train_examples = [format_example(ex) for i, ex in enumerate(train_ds) if i < NUM_TRAIN_EXAMPLES]
|
| 104 |
-
from datasets import Dataset
|
| 105 |
-
train_dataset = Dataset.from_list(train_examples)
|
| 106 |
-
print(f"Prepared {len(train_dataset)} examples")
|
| 107 |
-
|
| 108 |
-
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
|
| 109 |
-
lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj","k_proj","v_proj","o_proj"], task_type="CAUSAL_LM")
|
| 110 |
-
model = get_peft_model(model, lora_config)
|
| 111 |
-
model.print_trainable_parameters()
|
| 112 |
-
|
| 113 |
-
training_args = SFTConfig(output_dir="./qwen3-ft", max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, per_device_train_batch_size=2, gradient_accumulation_steps=4, logging_steps=10, save_steps=50, bf16=True, optim="adamw_torch", warmup_steps=10, max_seq_length=2048)
|
| 114 |
-
trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
|
| 115 |
-
trainer.train()
|
| 116 |
-
model = model.merge_and_unload()
|
| 117 |
-
|
| 118 |
-
ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
|
| 119 |
-
|
| 120 |
-
print("="*60)
|
| 121 |
-
print(f"BASE: {base_score:.2f}%")
|
| 122 |
-
print(f"FINE-TUNED: {ft_score:.2f}%")
|
| 123 |
-
print(f"IMPROVEMENT: {ft_score - base_score:+.2f}%")
|
| 124 |
-
|
| 125 |
-
if ft_score > base_score:
|
| 126 |
-
print("SUCCESS! Uploading...")
|
| 127 |
-
model.push_to_hub(REPO_ID)
|
| 128 |
-
tokenizer.push_to_hub(REPO_ID)
|
| 129 |
-
print("Done!")
|
| 130 |
-
else:
|
| 131 |
-
print("Did not beat base. Try again.")
|
| 132 |
-
|
| 133 |
-
if __name__ == "__main__":
|
| 134 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v5.py
DELETED
|
@@ -1,134 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import re
|
| 16 |
-
import torch
|
| 17 |
-
from datasets import load_dataset
|
| 18 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 19 |
-
from peft import LoraConfig, get_peft_model
|
| 20 |
-
from trl import SFTConfig, SFTTrainer
|
| 21 |
-
from huggingface_hub import login
|
| 22 |
-
|
| 23 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 24 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 25 |
-
MAX_STEPS = 150
|
| 26 |
-
LEARNING_RATE = 5e-6
|
| 27 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 28 |
-
|
| 29 |
-
def authenticate():
|
| 30 |
-
token = os.environ.get("HF_TOKEN")
|
| 31 |
-
if not token:
|
| 32 |
-
raise ValueError("HF_TOKEN not set")
|
| 33 |
-
login(token=token)
|
| 34 |
-
print("Authenticated")
|
| 35 |
-
|
| 36 |
-
def load_humaneval():
|
| 37 |
-
ds = load_dataset("openai/openai_humaneval", split="test")
|
| 38 |
-
return list(ds)
|
| 39 |
-
|
| 40 |
-
def extract_code(text):
|
| 41 |
-
# Try code blocks first
|
| 42 |
-
match = re.search(r'```python\s*(.*?)```', text, re.DOTALL)
|
| 43 |
-
if match:
|
| 44 |
-
return match.group(1).strip()
|
| 45 |
-
match = re.search(r'```\s*(.*?)```', text, re.DOTALL)
|
| 46 |
-
if match:
|
| 47 |
-
return match.group(1).strip()
|
| 48 |
-
return text.strip()
|
| 49 |
-
|
| 50 |
-
def test_solution(code, test_code, entry_point):
|
| 51 |
-
try:
|
| 52 |
-
ns = {}
|
| 53 |
-
exec(code, ns)
|
| 54 |
-
if entry_point not in ns:
|
| 55 |
-
return False
|
| 56 |
-
exec(test_code, ns)
|
| 57 |
-
exec(f"check({entry_point})", ns)
|
| 58 |
-
return True
|
| 59 |
-
except:
|
| 60 |
-
return False
|
| 61 |
-
|
| 62 |
-
def evaluate_model(model, tokenizer, problems, desc):
|
| 63 |
-
correct = 0
|
| 64 |
-
model.eval()
|
| 65 |
-
for i, p in enumerate(problems):
|
| 66 |
-
inputs = tokenizer(p["prompt"], return_tensors="pt").to(model.device)
|
| 67 |
-
with torch.no_grad():
|
| 68 |
-
out = model.generate(**inputs, max_new_tokens=512, temperature=0.2, do_sample=True, pad_token_id=tokenizer.eos_token_id)
|
| 69 |
-
resp = tokenizer.decode(out[0], skip_special_tokens=True)
|
| 70 |
-
gen = resp[len(p["prompt"]):]
|
| 71 |
-
code = extract_code(p["prompt"] + gen)
|
| 72 |
-
if test_solution(code, p["test"], p["entry_point"]):
|
| 73 |
-
correct += 1
|
| 74 |
-
if (i+1) % 20 == 0:
|
| 75 |
-
print(f"{desc}: {i+1}/{len(problems)}, {correct} correct")
|
| 76 |
-
score = correct / len(problems) * 100
|
| 77 |
-
print(f"{desc}: {correct}/{len(problems)} = {score:.2f}%")
|
| 78 |
-
return score
|
| 79 |
-
|
| 80 |
-
def format_example(ex):
|
| 81 |
-
return {"text": f"<|im_start|>user\n{ex['problem']}\n<|im_end|>\n<|im_start|>assistant\n{ex['solution']}<|im_end|>"}
|
| 82 |
-
|
| 83 |
-
def main():
|
| 84 |
-
print("=" * 60)
|
| 85 |
-
authenticate()
|
| 86 |
-
problems = load_humaneval()
|
| 87 |
-
print(f"Loaded {len(problems)} problems")
|
| 88 |
-
|
| 89 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 90 |
-
if tokenizer.pad_token is None:
|
| 91 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 92 |
-
|
| 93 |
-
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
|
| 94 |
-
base_score = evaluate_model(base_model, tokenizer, problems, "BASE")
|
| 95 |
-
del base_model
|
| 96 |
-
torch.cuda.empty_cache()
|
| 97 |
-
|
| 98 |
-
train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
|
| 99 |
-
train_examples = []
|
| 100 |
-
for i, ex in enumerate(train_ds):
|
| 101 |
-
if i >= NUM_TRAIN_EXAMPLES:
|
| 102 |
-
break
|
| 103 |
-
train_examples.append(format_example(ex))
|
| 104 |
-
from datasets import Dataset
|
| 105 |
-
train_dataset = Dataset.from_list(train_examples)
|
| 106 |
-
print(f"Prepared {len(train_dataset)} examples")
|
| 107 |
-
|
| 108 |
-
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
|
| 109 |
-
lora_config = LoraConfig(r=8, lora_alpha=32, lora_dropout=0.1, target_modules=["q_proj","k_proj","v_proj","o_proj"], task_type="CAUSAL_LM")
|
| 110 |
-
model = get_peft_model(model, lora_config)
|
| 111 |
-
model.print_trainable_parameters()
|
| 112 |
-
|
| 113 |
-
training_args = SFTConfig(output_dir="./qwen3-ft", max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, per_device_train_batch_size=2, gradient_accumulation_steps=4, logging_steps=10, save_steps=50, bf16=True, optim="adamw_torch", warmup_steps=10, max_seq_length=2048)
|
| 114 |
-
trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
|
| 115 |
-
trainer.train()
|
| 116 |
-
model = model.merge_and_unload()
|
| 117 |
-
|
| 118 |
-
ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
|
| 119 |
-
|
| 120 |
-
print("=" * 60)
|
| 121 |
-
print(f"BASE: {base_score:.2f}%")
|
| 122 |
-
print(f"FINE-TUNED: {ft_score:.2f}%")
|
| 123 |
-
print(f"IMPROVEMENT: {ft_score - base_score:+.2f}%")
|
| 124 |
-
|
| 125 |
-
if ft_score > base_score:
|
| 126 |
-
print("SUCCESS! Uploading...")
|
| 127 |
-
model.push_to_hub(REPO_ID)
|
| 128 |
-
tokenizer.push_to_hub(REPO_ID)
|
| 129 |
-
print("Done!")
|
| 130 |
-
else:
|
| 131 |
-
print("Did not beat base. Try again.")
|
| 132 |
-
|
| 133 |
-
if __name__ == "__main__":
|
| 134 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v6.py
DELETED
|
@@ -1,192 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import re
|
| 16 |
-
import torch
|
| 17 |
-
import gc
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 20 |
-
from peft import LoraConfig, get_peft_model
|
| 21 |
-
from trl import SFTConfig, SFTTrainer
|
| 22 |
-
from huggingface_hub import login
|
| 23 |
-
|
| 24 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 25 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 26 |
-
MAX_STEPS = 150
|
| 27 |
-
LEARNING_RATE = 5e-6
|
| 28 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 29 |
-
|
| 30 |
-
def authenticate():
|
| 31 |
-
token = os.environ.get("HF_TOKEN")
|
| 32 |
-
if not token:
|
| 33 |
-
raise ValueError("HF_TOKEN not set")
|
| 34 |
-
login(token=token)
|
| 35 |
-
print("Authenticated")
|
| 36 |
-
|
| 37 |
-
def load_humaneval():
|
| 38 |
-
ds = load_dataset("openai/openai_humaneval", split="test")
|
| 39 |
-
return list(ds)
|
| 40 |
-
|
| 41 |
-
def extract_code(full_text, prompt):
|
| 42 |
-
"""Extract the function body from model output."""
|
| 43 |
-
# Get only generated part
|
| 44 |
-
if full_text.startswith(prompt):
|
| 45 |
-
generated = full_text[len(prompt):]
|
| 46 |
-
else:
|
| 47 |
-
generated = full_text
|
| 48 |
-
|
| 49 |
-
# Clean up - stop at common end markers
|
| 50 |
-
for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
|
| 51 |
-
if stop in generated:
|
| 52 |
-
generated = generated.split(stop)[0]
|
| 53 |
-
|
| 54 |
-
# Combine prompt with cleaned generation
|
| 55 |
-
code = prompt + generated
|
| 56 |
-
return code.strip()
|
| 57 |
-
|
| 58 |
-
def test_solution(code, test_code, entry_point):
|
| 59 |
-
try:
|
| 60 |
-
ns = {}
|
| 61 |
-
exec(code, ns)
|
| 62 |
-
if entry_point not in ns:
|
| 63 |
-
return False
|
| 64 |
-
exec(test_code, ns)
|
| 65 |
-
exec(f"check({entry_point})", ns)
|
| 66 |
-
return True
|
| 67 |
-
except Exception as e:
|
| 68 |
-
return False
|
| 69 |
-
|
| 70 |
-
def evaluate_model(model, tokenizer, problems, desc):
|
| 71 |
-
correct = 0
|
| 72 |
-
model.eval()
|
| 73 |
-
|
| 74 |
-
for i, p in enumerate(problems):
|
| 75 |
-
prompt = p["prompt"]
|
| 76 |
-
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 77 |
-
|
| 78 |
-
with torch.no_grad():
|
| 79 |
-
out = model.generate(
|
| 80 |
-
**inputs,
|
| 81 |
-
max_new_tokens=256,
|
| 82 |
-
temperature=0.1,
|
| 83 |
-
do_sample=True,
|
| 84 |
-
pad_token_id=tokenizer.eos_token_id,
|
| 85 |
-
eos_token_id=tokenizer.eos_token_id,
|
| 86 |
-
)
|
| 87 |
-
|
| 88 |
-
full_text = tokenizer.decode(out[0], skip_special_tokens=True)
|
| 89 |
-
code = extract_code(full_text, prompt)
|
| 90 |
-
|
| 91 |
-
if test_solution(code, p["test"], p["entry_point"]):
|
| 92 |
-
correct += 1
|
| 93 |
-
|
| 94 |
-
if (i+1) % 40 == 0:
|
| 95 |
-
print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
|
| 96 |
-
|
| 97 |
-
score = correct / len(problems) * 100
|
| 98 |
-
print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
|
| 99 |
-
return score
|
| 100 |
-
|
| 101 |
-
def format_example(ex):
|
| 102 |
-
return {"text": f"<|im_start|>user\n{ex['problem']}\n<|im_end|>\n<|im_start|>assistant\n{ex['solution']}<|im_end|>"}
|
| 103 |
-
|
| 104 |
-
def main():
|
| 105 |
-
print("=" * 60)
|
| 106 |
-
print("Qwen3-0.6B Fine-tuning Challenge v6")
|
| 107 |
-
print("=" * 60)
|
| 108 |
-
|
| 109 |
-
authenticate()
|
| 110 |
-
problems = load_humaneval()
|
| 111 |
-
print(f"Loaded {len(problems)} HumanEval problems")
|
| 112 |
-
|
| 113 |
-
# Load tokenizer
|
| 114 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 115 |
-
if tokenizer.pad_token is None:
|
| 116 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 117 |
-
|
| 118 |
-
# Evaluate BASE model
|
| 119 |
-
print("\n[1/4] Evaluating BASE model...")
|
| 120 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 121 |
-
BASE_MODEL,
|
| 122 |
-
torch_dtype=torch.bfloat16,
|
| 123 |
-
device_map="auto",
|
| 124 |
-
trust_remote_code=True
|
| 125 |
-
)
|
| 126 |
-
base_score = evaluate_model(model, tokenizer, problems, "BASE")
|
| 127 |
-
|
| 128 |
-
# Training - use same model instance
|
| 129 |
-
print("\n[2/4] Training...")
|
| 130 |
-
train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
|
| 131 |
-
train_examples = []
|
| 132 |
-
for i, ex in enumerate(train_ds):
|
| 133 |
-
if i >= NUM_TRAIN_EXAMPLES:
|
| 134 |
-
break
|
| 135 |
-
train_examples.append(format_example(ex))
|
| 136 |
-
|
| 137 |
-
from datasets import Dataset
|
| 138 |
-
train_dataset = Dataset.from_list(train_examples)
|
| 139 |
-
print(f"Prepared {len(train_dataset)} training examples")
|
| 140 |
-
|
| 141 |
-
# Apply LoRA to same model
|
| 142 |
-
lora_config = LoraConfig(
|
| 143 |
-
r=8, lora_alpha=32, lora_dropout=0.1,
|
| 144 |
-
target_modules=["q_proj","k_proj","v_proj","o_proj"],
|
| 145 |
-
task_type="CAUSAL_LM"
|
| 146 |
-
)
|
| 147 |
-
model = get_peft_model(model, lora_config)
|
| 148 |
-
model.print_trainable_parameters()
|
| 149 |
-
|
| 150 |
-
training_args = SFTConfig(
|
| 151 |
-
output_dir="./qwen3-ft",
|
| 152 |
-
max_steps=MAX_STEPS,
|
| 153 |
-
learning_rate=LEARNING_RATE,
|
| 154 |
-
per_device_train_batch_size=2,
|
| 155 |
-
gradient_accumulation_steps=4,
|
| 156 |
-
logging_steps=10,
|
| 157 |
-
save_steps=9999, # Don't save checkpoints
|
| 158 |
-
bf16=True,
|
| 159 |
-
optim="adamw_torch",
|
| 160 |
-
warmup_steps=10,
|
| 161 |
-
max_seq_length=2048
|
| 162 |
-
)
|
| 163 |
-
|
| 164 |
-
trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
|
| 165 |
-
trainer.train()
|
| 166 |
-
print("Training complete!")
|
| 167 |
-
|
| 168 |
-
# Merge LoRA
|
| 169 |
-
model = model.merge_and_unload()
|
| 170 |
-
|
| 171 |
-
# Evaluate FINE-TUNED model
|
| 172 |
-
print("\n[3/4] Evaluating FINE-TUNED model...")
|
| 173 |
-
ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
|
| 174 |
-
|
| 175 |
-
# Results
|
| 176 |
-
print("\n[4/4] Results")
|
| 177 |
-
print("=" * 60)
|
| 178 |
-
print(f"BASE: {base_score:.2f}%")
|
| 179 |
-
print(f"FINE-TUNED: {ft_score:.2f}%")
|
| 180 |
-
print(f"CHANGE: {ft_score - base_score:+.2f}%")
|
| 181 |
-
print("=" * 60)
|
| 182 |
-
|
| 183 |
-
if ft_score > base_score:
|
| 184 |
-
print("\nSUCCESS! Uploading to Hub...")
|
| 185 |
-
model.push_to_hub(REPO_ID)
|
| 186 |
-
tokenizer.push_to_hub(REPO_ID)
|
| 187 |
-
print("Upload complete!")
|
| 188 |
-
else:
|
| 189 |
-
print("\nDid not beat base model. Variance - try again.")
|
| 190 |
-
|
| 191 |
-
if __name__ == "__main__":
|
| 192 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v7.py
DELETED
|
@@ -1,180 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import re
|
| 16 |
-
import torch
|
| 17 |
-
from datasets import load_dataset
|
| 18 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 19 |
-
from peft import LoraConfig, get_peft_model
|
| 20 |
-
from trl import SFTConfig, SFTTrainer
|
| 21 |
-
from huggingface_hub import login
|
| 22 |
-
|
| 23 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 24 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 25 |
-
MAX_STEPS = 150
|
| 26 |
-
LEARNING_RATE = 5e-6
|
| 27 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 28 |
-
|
| 29 |
-
def authenticate():
|
| 30 |
-
token = os.environ.get("HF_TOKEN")
|
| 31 |
-
if not token:
|
| 32 |
-
raise ValueError("HF_TOKEN not set")
|
| 33 |
-
login(token=token)
|
| 34 |
-
print("Authenticated")
|
| 35 |
-
|
| 36 |
-
def load_humaneval():
|
| 37 |
-
ds = load_dataset("openai/openai_humaneval", split="test")
|
| 38 |
-
return list(ds)
|
| 39 |
-
|
| 40 |
-
def extract_code(full_text, prompt):
|
| 41 |
-
"""Extract the function body from model output."""
|
| 42 |
-
if full_text.startswith(prompt):
|
| 43 |
-
generated = full_text[len(prompt):]
|
| 44 |
-
else:
|
| 45 |
-
generated = full_text
|
| 46 |
-
|
| 47 |
-
for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
|
| 48 |
-
if stop in generated:
|
| 49 |
-
generated = generated.split(stop)[0]
|
| 50 |
-
|
| 51 |
-
return (prompt + generated).strip()
|
| 52 |
-
|
| 53 |
-
def test_solution(code, test_code, entry_point):
|
| 54 |
-
try:
|
| 55 |
-
ns = {}
|
| 56 |
-
exec(code, ns)
|
| 57 |
-
if entry_point not in ns:
|
| 58 |
-
return False
|
| 59 |
-
exec(test_code, ns)
|
| 60 |
-
exec(f"check({entry_point})", ns)
|
| 61 |
-
return True
|
| 62 |
-
except:
|
| 63 |
-
return False
|
| 64 |
-
|
| 65 |
-
def evaluate_model(model, tokenizer, problems, desc):
|
| 66 |
-
correct = 0
|
| 67 |
-
model.eval()
|
| 68 |
-
|
| 69 |
-
for i, p in enumerate(problems):
|
| 70 |
-
prompt = p["prompt"]
|
| 71 |
-
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 72 |
-
|
| 73 |
-
with torch.no_grad():
|
| 74 |
-
out = model.generate(
|
| 75 |
-
**inputs,
|
| 76 |
-
max_new_tokens=256,
|
| 77 |
-
temperature=0.1,
|
| 78 |
-
do_sample=True,
|
| 79 |
-
pad_token_id=tokenizer.eos_token_id,
|
| 80 |
-
)
|
| 81 |
-
|
| 82 |
-
full_text = tokenizer.decode(out[0], skip_special_tokens=True)
|
| 83 |
-
code = extract_code(full_text, prompt)
|
| 84 |
-
|
| 85 |
-
if test_solution(code, p["test"], p["entry_point"]):
|
| 86 |
-
correct += 1
|
| 87 |
-
|
| 88 |
-
if (i+1) % 40 == 0:
|
| 89 |
-
print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
|
| 90 |
-
|
| 91 |
-
score = correct / len(problems) * 100
|
| 92 |
-
print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
|
| 93 |
-
return score
|
| 94 |
-
|
| 95 |
-
def format_example(ex):
|
| 96 |
-
# Correct field names: prompt and generation
|
| 97 |
-
return {"text": f"<|im_start|>user\n{ex['prompt']}\n<|im_end|>\n<|im_start|>assistant\n{ex['generation']}<|im_end|}"}
|
| 98 |
-
|
| 99 |
-
def main():
|
| 100 |
-
print("=" * 60)
|
| 101 |
-
print("Qwen3-0.6B Fine-tuning Challenge v7")
|
| 102 |
-
print("=" * 60)
|
| 103 |
-
|
| 104 |
-
authenticate()
|
| 105 |
-
problems = load_humaneval()
|
| 106 |
-
print(f"Loaded {len(problems)} HumanEval problems")
|
| 107 |
-
|
| 108 |
-
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
|
| 109 |
-
if tokenizer.pad_token is None:
|
| 110 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 111 |
-
|
| 112 |
-
print("\n[1/4] Evaluating BASE model...")
|
| 113 |
-
model = AutoModelForCausalLM.from_pretrained(
|
| 114 |
-
BASE_MODEL,
|
| 115 |
-
torch_dtype=torch.bfloat16,
|
| 116 |
-
device_map="auto",
|
| 117 |
-
trust_remote_code=True
|
| 118 |
-
)
|
| 119 |
-
base_score = evaluate_model(model, tokenizer, problems, "BASE")
|
| 120 |
-
|
| 121 |
-
print("\n[2/4] Training...")
|
| 122 |
-
train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
|
| 123 |
-
train_examples = []
|
| 124 |
-
for i, ex in enumerate(train_ds):
|
| 125 |
-
if i >= NUM_TRAIN_EXAMPLES:
|
| 126 |
-
break
|
| 127 |
-
train_examples.append(format_example(ex))
|
| 128 |
-
|
| 129 |
-
from datasets import Dataset
|
| 130 |
-
train_dataset = Dataset.from_list(train_examples)
|
| 131 |
-
print(f"Prepared {len(train_dataset)} training examples")
|
| 132 |
-
|
| 133 |
-
lora_config = LoraConfig(
|
| 134 |
-
r=8, lora_alpha=32, lora_dropout=0.1,
|
| 135 |
-
target_modules=["q_proj","k_proj","v_proj","o_proj"],
|
| 136 |
-
task_type="CAUSAL_LM"
|
| 137 |
-
)
|
| 138 |
-
model = get_peft_model(model, lora_config)
|
| 139 |
-
model.print_trainable_parameters()
|
| 140 |
-
|
| 141 |
-
training_args = SFTConfig(
|
| 142 |
-
output_dir="./qwen3-ft",
|
| 143 |
-
max_steps=MAX_STEPS,
|
| 144 |
-
learning_rate=LEARNING_RATE,
|
| 145 |
-
per_device_train_batch_size=2,
|
| 146 |
-
gradient_accumulation_steps=4,
|
| 147 |
-
logging_steps=10,
|
| 148 |
-
save_steps=9999,
|
| 149 |
-
bf16=True,
|
| 150 |
-
optim="adamw_torch",
|
| 151 |
-
warmup_steps=10,
|
| 152 |
-
max_seq_length=2048
|
| 153 |
-
)
|
| 154 |
-
|
| 155 |
-
trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
|
| 156 |
-
trainer.train()
|
| 157 |
-
print("Training complete!")
|
| 158 |
-
|
| 159 |
-
model = model.merge_and_unload()
|
| 160 |
-
|
| 161 |
-
print("\n[3/4] Evaluating FINE-TUNED model...")
|
| 162 |
-
ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")
|
| 163 |
-
|
| 164 |
-
print("\n[4/4] Results")
|
| 165 |
-
print("=" * 60)
|
| 166 |
-
print(f"BASE: {base_score:.2f}%")
|
| 167 |
-
print(f"FINE-TUNED: {ft_score:.2f}%")
|
| 168 |
-
print(f"CHANGE: {ft_score - base_score:+.2f}%")
|
| 169 |
-
print("=" * 60)
|
| 170 |
-
|
| 171 |
-
if ft_score > base_score:
|
| 172 |
-
print("\nSUCCESS! Uploading to Hub...")
|
| 173 |
-
model.push_to_hub(REPO_ID)
|
| 174 |
-
tokenizer.push_to_hub(REPO_ID)
|
| 175 |
-
print("Upload complete!")
|
| 176 |
-
else:
|
| 177 |
-
print("\nDid not beat base model. Variance - try again.")
|
| 178 |
-
|
| 179 |
-
if __name__ == "__main__":
|
| 180 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v8.py
DELETED
|
@@ -1,181 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import re
|
| 16 |
-
import torch
|
| 17 |
-
from datasets import load_dataset
|
| 18 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 19 |
-
from peft import LoraConfig, get_peft_model
|
| 20 |
-
from trl import SFTConfig, SFTTrainer
|
| 21 |
-
from huggingface_hub import login
|
| 22 |
-
|
| 23 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 24 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 25 |
-
MAX_STEPS = 150
|
| 26 |
-
LEARNING_RATE = 5e-6
|
| 27 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 28 |
-
|
| 29 |
-
def authenticate():
|
| 30 |
-
token = os.environ.get("HF_TOKEN")
|
| 31 |
-
if not token:
|
| 32 |
-
raise ValueError("HF_TOKEN not set")
|
| 33 |
-
login(token=token)
|
| 34 |
-
print("Authenticated")
|
| 35 |
-
|
| 36 |
-
def load_humaneval():
|
| 37 |
-
ds = load_dataset("openai/openai_humaneval", split="test")
|
| 38 |
-
return list(ds)
|
| 39 |
-
|
| 40 |
-
def extract_code(full_text, prompt):
|
| 41 |
-
if full_text.startswith(prompt):
|
| 42 |
-
generated = full_text[len(prompt):]
|
| 43 |
-
else:
|
| 44 |
-
generated = full_text
|
| 45 |
-
|
| 46 |
-
for stop in ["\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"]:
|
| 47 |
-
if stop in generated:
|
| 48 |
-
generated = generated.split(stop)[0]
|
| 49 |
-
|
| 50 |
-
return (prompt + generated).strip()
|
| 51 |
-
|
| 52 |
-
def test_solution(code, test_code, entry_point):
|
| 53 |
-
try:
|
| 54 |
-
ns = {}
|
| 55 |
-
exec(code, ns)
|
| 56 |
-
if entry_point not in ns:
|
| 57 |
-
return False
|
| 58 |
-
exec(test_code, ns)
|
| 59 |
-
exec(f"check({entry_point})", ns)
|
| 60 |
-
return True
|
| 61 |
-
except:
|
| 62 |
-
return False
|
| 63 |
-
|
| 64 |
-
def evaluate_model(model, tokenizer, problems, desc):
|
| 65 |
-
correct = 0
|
| 66 |
-
model.eval()
|
| 67 |
-
|
| 68 |
-
for i, p in enumerate(problems):
|
| 69 |
-
prompt = p["prompt"]
|
| 70 |
-
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
| 71 |
-
|
| 72 |
-
with torch.no_grad():
|
| 73 |
-
out = model.generate(
|
| 74 |
-
**inputs,
|
| 75 |
-
max_new_tokens=256,
|
| 76 |
-
temperature=0.1,
|
| 77 |
-
do_sample=True,
|
| 78 |
-
pad_token_id=tokenizer.eos_token_id,
|
| 79 |
-
)
|
| 80 |
-
|
| 81 |
-
full_text = tokenizer.decode(out[0], skip_special_tokens=True)
|
| 82 |
-
code = extract_code(full_text, prompt)
|
| 83 |
-
|
| 84 |
-
if test_solution(code, p["test"], p["entry_point"]):
|
| 85 |
-
correct += 1
|
| 86 |
-
|
| 87 |
-
if (i+1) % 40 == 0:
|
| 88 |
-
print(f"{desc}: {i+1}/{len(problems)}, {correct} correct ({correct/(i+1)*100:.1f}%)")
|
| 89 |
-
|
| 90 |
-
score = correct / len(problems) * 100
|
| 91 |
-
print(f"{desc} FINAL: {correct}/{len(problems)} = {score:.2f}%")
|
| 92 |
-
return score
|
| 93 |
-
|
| 94 |
-
def format_example(ex):
|
| 95 |
-
# Fixed: <|im_end|> not <|im_end|}
|
| 96 |
-
prompt = ex['prompt']
|
| 97 |
-
gen = ex['generation']
|
| 98 |
-
return {"text": f"<|im_start|>user\n{prompt}\n<|im_end|>\n<|im_start|>assistant\n{gen}<|im_end|>"}
|
| 99 |
-
|
| 100 |
-
def main():
    """Score the base model, LoRA fine-tune it, re-score, and upload on a win.

    Steps: authenticate, load HumanEval, evaluate ``BASE_MODEL``, train a LoRA
    adapter on the first ``NUM_TRAIN_EXAMPLES`` streamed rows of
    ``open-r1/codeforces-cots``, merge the adapter, evaluate again, and push
    the model and tokenizer to ``REPO_ID`` only when the fine-tuned score is
    strictly higher than the base score.
    """
    import dataclasses

    from datasets import Dataset

    banner = "=" * 60
    print(banner)
    print("Qwen3-0.6B Fine-tuning Challenge v8")
    print(banner)

    authenticate()
    problems = load_humaneval()
    print(f"Loaded {len(problems)} HumanEval problems")

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("\n[1/4] Evaluating BASE model...")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    base_score = evaluate_model(model, tokenizer, problems, "BASE")

    print("\n[2/4] Training...")
    train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
    train_examples = []
    for i, ex in enumerate(train_ds):
        if i >= NUM_TRAIN_EXAMPLES:
            break
        train_examples.append(format_example(ex))

    train_dataset = Dataset.from_list(train_examples)
    print(f"Prepared {len(train_dataset)} training examples")

    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # TRL renamed SFTConfig's ``max_seq_length`` to ``max_length``. Use
    # whichever field the installed version defines so the 2048-token limit
    # applies either way instead of failing on an unknown keyword.
    config_fields = {f.name for f in dataclasses.fields(SFTConfig)}
    length_kwarg = "max_length" if "max_length" in config_fields else "max_seq_length"

    training_args = SFTConfig(
        output_dir="./qwen3-ft",
        max_steps=MAX_STEPS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        logging_steps=10,
        save_steps=9999,
        bf16=True,
        optim="adamw_torch",
        warmup_steps=10,
        **{length_kwarg: 2048},
    )

    trainer = SFTTrainer(model=model, args=training_args, train_dataset=train_dataset)
    trainer.train()
    print("Training complete!")

    # Fold the LoRA weights into the base model before evaluating/uploading.
    model = model.merge_and_unload()

    print("\n[3/4] Evaluating FINE-TUNED model...")
    ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")

    print("\n[4/4] Results")
    print(banner)
    print(f"BASE: {base_score:.2f}%")
    print(f"FINE-TUNED: {ft_score:.2f}%")
    print(f"CHANGE: {ft_score - base_score:+.2f}%")
    print(banner)

    if ft_score > base_score:
        print("\nSUCCESS! Uploading to Hub...")
        model.push_to_hub(REPO_ID)
        tokenizer.push_to_hub(REPO_ID)
        print("Upload complete!")
    else:
        print("\nDid not beat base model. Variance - try again.")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_eval_upload_v9.py
DELETED
|
@@ -1,180 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
# /// script
|
| 3 |
-
# requires-python = ">=3.10"
|
| 4 |
-
# dependencies = [
|
| 5 |
-
# "trl>=0.12.0",
|
| 6 |
-
# "peft>=0.7.0",
|
| 7 |
-
# "transformers>=4.36.0",
|
| 8 |
-
# "accelerate>=0.24.0",
|
| 9 |
-
# "datasets",
|
| 10 |
-
# "torch",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
import os
|
| 15 |
-
import torch
|
| 16 |
-
from datasets import load_dataset
|
| 17 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 18 |
-
from peft import LoraConfig, get_peft_model
|
| 19 |
-
from trl import SFTConfig, SFTTrainer
|
| 20 |
-
from huggingface_hub import login
|
| 21 |
-
|
| 22 |
-
BASE_MODEL = "Qwen/Qwen3-0.6B"
|
| 23 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval-v2"
|
| 24 |
-
MAX_STEPS = 150
|
| 25 |
-
LEARNING_RATE = 5e-6
|
| 26 |
-
NUM_TRAIN_EXAMPLES = 500
|
| 27 |
-
|
| 28 |
-
def authenticate():
    """Log in to the Hugging Face Hub with the ``HF_TOKEN`` environment variable.

    Raises:
        ValueError: If ``HF_TOKEN`` is unset or empty.
    """
    token = os.environ.get("HF_TOKEN")
    if token:
        login(token=token)
        print("Authenticated")
        return
    raise ValueError("HF_TOKEN not set")
|
| 34 |
-
|
| 35 |
-
def load_humaneval():
    """Return the ``test`` split of ``openai/openai_humaneval`` as a list."""
    return list(load_dataset("openai/openai_humaneval", split="test"))
|
| 38 |
-
|
| 39 |
-
def extract_code(full_text, prompt):
    """Cut a decoded generation down to the prompt plus its first completion.

    If ``full_text`` begins with ``prompt`` the prompt is removed first;
    otherwise the whole text is treated as the completion. The completion is
    then truncated at each stop marker in turn, and ``prompt + completion`` is
    returned with surrounding whitespace stripped.
    """
    completion = full_text[len(prompt):] if full_text.startswith(prompt) else full_text

    # Keep only the text before each marker; a marker that is absent leaves
    # the completion unchanged.
    for marker in ("\n\n\n", "\ndef ", "\nclass ", "\n#", "```", "<|"):
        completion = completion.partition(marker)[0]

    return (prompt + completion).strip()
|
| 50 |
-
|
| 51 |
-
def test_solution(code, test_code, entry_point):
|
| 52 |
-
try:
|
| 53 |
-
ns = {}
|
| 54 |
-
exec(code, ns)
|
| 55 |
-
if entry_point not in ns:
|
| 56 |
-
return False
|
| 57 |
-
exec(test_code, ns)
|
| 58 |
-
exec(f"check({entry_point})", ns)
|
| 59 |
-
return True
|
| 60 |
-
except:
|
| 61 |
-
return False
|
| 62 |
-
|
| 63 |
-
def evaluate_model(model, tokenizer, problems, desc):
    """Generate one completion per problem and report the pass rate.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode prompts and decode outputs.
        problems: Sequence of dicts with ``"prompt"``, ``"test"`` and
            ``"entry_point"`` keys.
        desc: Label prefixed to the progress and summary lines.

    Returns:
        Percentage (0-100) of problems whose completion passes its tests,
        or 0.0 when ``problems`` is empty.
    """
    model.eval()
    total = len(problems)
    correct = 0

    for i, p in enumerate(problems):
        prompt = p["prompt"]
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            # Sampling is enabled, so scores can differ between runs.
            out = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.1,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        full_text = tokenizer.decode(out[0], skip_special_tokens=True)
        code = extract_code(full_text, prompt)

        if test_solution(code, p["test"], p["entry_point"]):
            correct += 1

        # Progress line every 40 problems.
        if (i + 1) % 40 == 0:
            print(f"{desc}: {i+1}/{total}, {correct} correct ({correct/(i+1)*100:.1f}%)")

    # Guard the division so an empty problem list yields 0.0 instead of
    # raising ZeroDivisionError.
    score = correct / total * 100 if total else 0.0
    print(f"{desc} FINAL: {correct}/{total} = {score:.2f}%")
    return score
|
| 92 |
-
|
| 93 |
-
def format_example(ex):
    """Render one training row as a two-turn chat transcript.

    Reads ``ex["prompt"]`` and ``ex["generation"]`` and returns a dict with a
    single ``"text"`` key holding the user turn followed by the assistant
    turn.
    """
    user_turn = f"<|im_start|>user\n{ex['prompt']}\n<|im_end|>\n"
    assistant_turn = f"<|im_start|>assistant\n{ex['generation']}<|im_end|>"
    return {"text": user_turn + assistant_turn}
|
| 97 |
-
|
| 98 |
-
def main():
    """Score the base model, LoRA fine-tune it, re-score, and upload on a win.

    Steps: authenticate, load HumanEval, evaluate ``BASE_MODEL``, train a LoRA
    adapter on the first ``NUM_TRAIN_EXAMPLES`` streamed rows of
    ``open-r1/codeforces-cots``, merge the adapter, evaluate again, and push
    the model and tokenizer to ``REPO_ID`` only when the fine-tuned score is
    strictly higher than the base score.
    """
    from datasets import Dataset

    banner = "=" * 60
    print(banner)
    print("Qwen3-0.6B Fine-tuning Challenge v9")
    print(banner)

    authenticate()
    problems = load_humaneval()
    print(f"Loaded {len(problems)} HumanEval problems")

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print("\n[1/4] Evaluating BASE model...")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    base_score = evaluate_model(model, tokenizer, problems, "BASE")

    print("\n[2/4] Training...")
    train_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
    train_examples = []
    for i, ex in enumerate(train_ds):
        if i >= NUM_TRAIN_EXAMPLES:
            break
        train_examples.append(format_example(ex))

    train_dataset = Dataset.from_list(train_examples)
    print(f"Prepared {len(train_dataset)} training examples")

    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    training_args = SFTConfig(
        output_dir="./qwen3-ft",
        max_steps=MAX_STEPS,
        learning_rate=LEARNING_RATE,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        logging_steps=10,
        save_steps=9999,
        bf16=True,
        optim="adamw_torch",
        warmup_steps=10,
        dataset_text_field="text",
    )

    # The script requires trl>=0.12.0, where SFTTrainer takes the tokenizer
    # as ``processing_class``; the older ``tokenizer=`` keyword is deprecated
    # there and rejected by later releases.
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
    )
    trainer.train()
    print("Training complete!")

    # Fold the LoRA weights into the base model before evaluating/uploading.
    model = model.merge_and_unload()

    print("\n[3/4] Evaluating FINE-TUNED model...")
    ft_score = evaluate_model(model, tokenizer, problems, "FINE-TUNED")

    print("\n[4/4] Results")
    print(banner)
    print(f"BASE: {base_score:.2f}%")
    print(f"FINE-TUNED: {ft_score:.2f}%")
    print(f"CHANGE: {ft_score - base_score:+.2f}%")
    print(banner)

    if ft_score > base_score:
        print("\nSUCCESS! Uploading to Hub...")
        model.push_to_hub(REPO_ID)
        tokenizer.push_to_hub(REPO_ID)
        print("Upload complete!")
    else:
        print("\nDid not beat base model. Variance - try again.")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_final.py
DELETED
|
@@ -1,128 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.51.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
FINAL ATTEMPT: Proper Qwen3 chat template, ultra-minimal training
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import traceback
|
| 17 |
-
from datasets import load_dataset, Dataset
|
| 18 |
-
from peft import LoraConfig
|
| 19 |
-
from trl import SFTTrainer, SFTConfig
|
| 20 |
-
from transformers import AutoTokenizer
|
| 21 |
-
import torch
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def log(msg):
    """Print ``msg`` and flush stdout immediately."""
    print(msg, flush=True)


# Identifiers that appear more than once below.
MODEL_ID = "Qwen/Qwen3-0.6B"
HUB_MODEL_ID = "passagereptile455/qwen3-0.6b-humaneval-final"
BANNER = "=" * 60

log(BANNER)
log("FINAL TRAINING - Proper Qwen3 template")
log(BANNER)

try:
    log(f"CUDA: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        log(f"GPU: {torch.cuda.get_device_name(0)}")

    log("Loading tokenizer first...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    log(f"Tokenizer loaded, vocab size: {len(tokenizer)}")

    log("Streaming codeforces-cots...")
    streaming_ds = load_dataset(
        "open-r1/codeforces-cots", split="train", streaming=True
    )

    # Take the first 200 streamed rows.
    log("Collecting 200 examples...")
    examples = []
    for ex in streaming_ds:
        if len(examples) >= 200:
            break
        examples.append(ex)
    log(f"Collected {len(examples)} examples")

    dataset = Dataset.from_list(examples)

    def format_with_chat_template(example):
        """Render a row's ``messages`` with the tokenizer's chat template."""
        rendered = tokenizer.apply_chat_template(
            example["messages"], tokenize=False, add_generation_prompt=False
        )
        return {"text": rendered}

    log("Formatting with Qwen3 chat template...")
    dataset = dataset.map(
        format_with_chat_template, remove_columns=dataset.column_names
    )
    log(f"Formatted {len(dataset)} examples")

    # Show sample
    log(f"Sample (first 200 chars): {dataset[0]['text'][:200]}...")

    # Small adapter on the attention query/value projections only.
    peft_config = LoraConfig(
        r=4,
        lora_alpha=8,
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"],
    )

    config = SFTConfig(
        output_dir="qwen3-final",
        # Hub upload
        push_to_hub=True,
        hub_model_id=HUB_MODEL_ID,
        hub_strategy="every_save",
        # Schedule: ultra minimal run with an extremely conservative LR
        max_steps=30,
        learning_rate=5e-8,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        # Batching and memory
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        max_length=512,
        gradient_checkpointing=True,
        bf16=True,
        # Logging and checkpoints
        logging_steps=10,
        save_strategy="steps",
        save_steps=30,
        save_total_limit=1,
        eval_strategy="no",
        dataset_text_field="text",
    )

    log("Creating trainer...")
    trainer = SFTTrainer(
        model=MODEL_ID,
        train_dataset=dataset,
        args=config,
        peft_config=peft_config,
    )

    log("Training (30 steps, 5e-8 LR)...")
    trainer.train()

    log("Pushing to Hub...")
    trainer.push_to_hub()

    log(BANNER)
    log(f"SUCCESS! Model: {HUB_MODEL_ID}")
    log(BANNER)

except Exception as e:
    # Report the failure, dump the traceback, and exit non-zero.
    log(f"ERROR: {e}")
    traceback.print_exc()
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job1.py
DELETED
|
@@ -1,97 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job 1: Baseline SFT training of Qwen3-0.6B on codeforces-cots
|
| 14 |
-
Goal: Beat base model on HumanEval
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import trackio
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from peft import LoraConfig
|
| 20 |
-
from trl import SFTTrainer, SFTConfig
|
| 21 |
-
|
| 22 |
-
# Identifiers that appear more than once below.
DATASET_ID = "open-r1/codeforces-cots"
MODEL_ID = "Qwen/Qwen3-0.6B"
HUB_MODEL_ID = "passagereptile455/qwen3-0.6b-codeforces-sft-job1"

print(f"Loading dataset: {DATASET_ID}")
dataset = load_dataset(DATASET_ID, split="train")
print(f"Dataset loaded: {len(dataset)} examples")

# Shuffle with a fixed seed and keep at most 5000 rows for faster training.
subset_size = min(5000, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(subset_size))
print(f"Using {len(dataset)} examples for training")

# Hold out 10% of the subset for evaluation.
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]
print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")

# LoRA adapter covering the attention and MLP projection layers.
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

config = SFTConfig(
    output_dir="qwen3-codeforces-sft-job1",
    # Hub upload
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    hub_strategy="every_save",
    # Training params
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    max_length=2048,
    # Optimization
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    # Logging and checkpoints
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    # Eval
    eval_strategy="steps",
    eval_steps=200,
    # Monitoring
    report_to="trackio",
    project="qwen3-humaneval-challenge",
    run_name="job1-baseline-5k",
)

print(f"Initializing trainer with {MODEL_ID}...")
trainer = SFTTrainer(
    model=MODEL_ID,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=config,
    peft_config=peft_config,
)

print("Starting training...")
trainer.train()

print("Pushing to Hub...")
trainer.push_to_hub()

print("Job 1 complete!")
print(f"Model: https://huggingface.co/{HUB_MODEL_ID}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job1_minimal.py
DELETED
|
@@ -1,97 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job #1: MINIMAL fine-tuning on codeforces-cots
|
| 14 |
-
Strategy: Very few steps (300 max) + low LR to add reasoning without losing Python
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import trackio
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from peft import LoraConfig
|
| 20 |
-
from trl import SFTTrainer, SFTConfig
|
| 21 |
-
from transformers import AutoTokenizer
|
| 22 |
-
|
| 23 |
-
# Identifiers that appear more than once below.
DATASET_ID = "open-r1/codeforces-cots"
MODEL_ID = "Qwen/Qwen3-0.6B"
HUB_MODEL_ID = "passagereptile455/qwen3-0.6b-humaneval-job1"

print("Loading codeforces-cots dataset...")
dataset = load_dataset(DATASET_ID, split="train")
print(f"Total examples: {len(dataset)}")

# Shuffle with a fixed seed and keep at most 5000 rows for faster training.
subset_size = min(5000, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(subset_size))
print(f"Using {len(dataset)} examples")

# Hold out 10% of the subset for evaluation.
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# Load tokenizer to check chat template
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# LoRA config - conservative settings: lower rank for less change, and only
# the query/value projections are adapted.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

# Training config - MINIMAL training to avoid overwriting Python knowledge
config = SFTConfig(
    output_dir="qwen3-codeforces-job1",
    # Hub upload
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    hub_strategy="every_save",
    # CRITICAL: a fixed 300 steps (not epochs) at a very low learning rate
    max_steps=300,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    max_length=1024,
    # Optimization
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    bf16=True,
    # Logging and checkpoints
    logging_steps=25,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # Eval
    eval_strategy="steps",
    eval_steps=100,
    # Monitoring
    report_to="trackio",
    project="qwen3-humaneval-challenge",
    run_name="job1-minimal-300steps",
)

print("Initializing trainer...")
trainer = SFTTrainer(
    model=MODEL_ID,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=config,
    peft_config=peft_config,
)

print("Starting minimal training (300 steps)...")
trainer.train()

print("Pushing to Hub...")
trainer.push_to_hub()

print("Job 1 complete!")
print(f"Model: https://huggingface.co/{HUB_MODEL_ID}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job1_v2.py
DELETED
|
@@ -1,120 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job #1 v2: MINIMAL fine-tuning on codeforces-cots
|
| 14 |
-
Fixed: Use iterative loading to avoid memory issues
|
| 15 |
-
"""
|
| 16 |
-
|
| 17 |
-
import trackio
|
| 18 |
-
from datasets import load_dataset
|
| 19 |
-
from peft import LoraConfig
|
| 20 |
-
from trl import SFTTrainer, SFTConfig
|
| 21 |
-
from transformers import AutoTokenizer
|
| 22 |
-
import torch
|
| 23 |
-
|
| 24 |
-
# Identifiers that appear more than once below.
DATASET_ID = "open-r1/codeforces-cots"
MODEL_ID = "Qwen/Qwen3-0.6B"
HUB_MODEL_ID = "passagereptile455/qwen3-0.6b-humaneval-job1"
BANNER = "=" * 50

print(BANNER)
print("JOB 1 v2: Minimal fine-tuning")
print(BANNER)

# Accessibility check: stream the dataset and stop after at most 3000 rows.
print("Loading dataset (streaming to count)...")
try:
    ds_stream = load_dataset(DATASET_ID, split="train", streaming=True)
    count = 0
    for count, _ in enumerate(ds_stream, start=1):
        if count >= 3000:  # Just use first 3000
            break
    print(f"Dataset accessible, using {count} examples")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Now load non-streaming but limited to a seeded random subset of 3000 rows.
print("Loading dataset subset...")
dataset = load_dataset(DATASET_ID, split="train")
subset_size = min(3000, len(dataset))
dataset = dataset.shuffle(seed=42).select(range(subset_size))
print(f"Loaded {len(dataset)} examples")

# Split off 10%; only the train part is handed to the trainer below.
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# Tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(f"Tokenizer loaded, pad_token: {tokenizer.pad_token}")

# Training config
print("Setting up training config...")
config = SFTConfig(
    output_dir="qwen3-codeforces-job1",
    # Hub upload
    push_to_hub=True,
    hub_model_id=HUB_MODEL_ID,
    hub_strategy="every_save",
    # Minimal training
    max_steps=300,
    learning_rate=5e-6,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    max_length=512,  # Shorter for memory
    # Optimization
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    bf16=True,
    # Logging and checkpoints
    logging_steps=25,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    # Skip eval to save memory
    eval_strategy="no",
    # Monitoring
    report_to="trackio",
    project="qwen3-humaneval-challenge",
    run_name="job1-minimal-v2",
)

# LoRA config
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
)

print("Initializing trainer...")
trainer = SFTTrainer(
    model=MODEL_ID,
    train_dataset=train_dataset,
    args=config,
    peft_config=peft_config,
)

print("Starting training...")
print(f"Total steps: {config.max_steps}")
trainer.train()

print("Pushing to Hub...")
trainer.push_to_hub()

print(BANNER)
print("JOB 1 COMPLETE!")
print(f"Model: https://huggingface.co/{HUB_MODEL_ID}")
print(BANNER)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job1_v3.py
DELETED
|
@@ -1,119 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job #1 v3: Simplified training script
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import os
|
| 17 |
-
import sys
|
| 18 |
-
import traceback
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
def main():
|
| 22 |
-
print("=" * 50)
|
| 23 |
-
print("JOB 1 v3: Starting...")
|
| 24 |
-
print("=" * 50)
|
| 25 |
-
|
| 26 |
-
try:
|
| 27 |
-
import trackio
|
| 28 |
-
from datasets import load_dataset
|
| 29 |
-
from peft import LoraConfig
|
| 30 |
-
from trl import SFTTrainer, SFTConfig
|
| 31 |
-
from transformers import AutoTokenizer
|
| 32 |
-
import torch
|
| 33 |
-
|
| 34 |
-
print(f"PyTorch version: {torch.__version__}")
|
| 35 |
-
print(f"CUDA available: {torch.cuda.is_available()}")
|
| 36 |
-
if torch.cuda.is_available():
|
| 37 |
-
print(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 38 |
-
|
| 39 |
-
# Load dataset - use trust_remote_code in case needed
|
| 40 |
-
print("Loading codeforces-cots dataset...")
|
| 41 |
-
dataset = load_dataset(
|
| 42 |
-
"open-r1/codeforces-cots", split="train", trust_remote_code=True
|
| 43 |
-
)
|
| 44 |
-
print(f"Dataset loaded: {len(dataset)} total examples")
|
| 45 |
-
|
| 46 |
-
# Use small subset
|
| 47 |
-
dataset = dataset.shuffle(seed=42).select(range(min(2000, len(dataset))))
|
| 48 |
-
print(f"Using: {len(dataset)} examples")
|
| 49 |
-
|
| 50 |
-
# Tokenizer
|
| 51 |
-
print("Loading tokenizer...")
|
| 52 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
| 53 |
-
"Qwen/Qwen3-0.6B", trust_remote_code=True
|
| 54 |
-
)
|
| 55 |
-
if tokenizer.pad_token is None:
|
| 56 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 57 |
-
|
| 58 |
-
# Config
|
| 59 |
-
print("Setting up config...")
|
| 60 |
-
config = SFTConfig(
|
| 61 |
-
output_dir="qwen3-codeforces-job1",
|
| 62 |
-
push_to_hub=True,
|
| 63 |
-
hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
|
| 64 |
-
hub_strategy="every_save",
|
| 65 |
-
max_steps=200, # Even fewer steps
|
| 66 |
-
per_device_train_batch_size=1,
|
| 67 |
-
gradient_accumulation_steps=8,
|
| 68 |
-
learning_rate=5e-6,
|
| 69 |
-
max_length=512,
|
| 70 |
-
logging_steps=20,
|
| 71 |
-
save_strategy="steps",
|
| 72 |
-
save_steps=100,
|
| 73 |
-
save_total_limit=1,
|
| 74 |
-
eval_strategy="no",
|
| 75 |
-
warmup_ratio=0.1,
|
| 76 |
-
lr_scheduler_type="cosine",
|
| 77 |
-
gradient_checkpointing=True,
|
| 78 |
-
bf16=True,
|
| 79 |
-
report_to="trackio",
|
| 80 |
-
project="qwen3-humaneval",
|
| 81 |
-
run_name="job1-v3",
|
| 82 |
-
)
|
| 83 |
-
|
| 84 |
-
# LoRA
|
| 85 |
-
peft_config = LoraConfig(
|
| 86 |
-
r=8,
|
| 87 |
-
lora_alpha=16,
|
| 88 |
-
lora_dropout=0.05,
|
| 89 |
-
bias="none",
|
| 90 |
-
task_type="CAUSAL_LM",
|
| 91 |
-
target_modules=["q_proj", "v_proj"],
|
| 92 |
-
)
|
| 93 |
-
|
| 94 |
-
print("Creating trainer...")
|
| 95 |
-
trainer = SFTTrainer(
|
| 96 |
-
model="Qwen/Qwen3-0.6B",
|
| 97 |
-
train_dataset=dataset,
|
| 98 |
-
args=config,
|
| 99 |
-
peft_config=peft_config,
|
| 100 |
-
)
|
| 101 |
-
|
| 102 |
-
print("Starting training (200 steps)...")
|
| 103 |
-
trainer.train()
|
| 104 |
-
|
| 105 |
-
print("Pushing to Hub...")
|
| 106 |
-
trainer.push_to_hub()
|
| 107 |
-
|
| 108 |
-
print("=" * 50)
|
| 109 |
-
print("SUCCESS!")
|
| 110 |
-
print("=" * 50)
|
| 111 |
-
|
| 112 |
-
except Exception as e:
|
| 113 |
-
print(f"ERROR: {e}")
|
| 114 |
-
traceback.print_exc()
|
| 115 |
-
sys.exit(1)
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
if __name__ == "__main__":
|
| 119 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job1_v4.py
DELETED
|
@@ -1,100 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job #1 v4: Simple training script - no trust_remote_code
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import sys
|
| 17 |
-
import traceback
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def main():
|
| 21 |
-
print("=" * 50)
|
| 22 |
-
print("JOB 1 v4")
|
| 23 |
-
print("=" * 50)
|
| 24 |
-
|
| 25 |
-
try:
|
| 26 |
-
from datasets import load_dataset
|
| 27 |
-
from peft import LoraConfig
|
| 28 |
-
from trl import SFTTrainer, SFTConfig
|
| 29 |
-
import torch
|
| 30 |
-
|
| 31 |
-
print(f"PyTorch: {torch.__version__}")
|
| 32 |
-
print(f"CUDA: {torch.cuda.is_available()}")
|
| 33 |
-
|
| 34 |
-
# Load dataset WITHOUT trust_remote_code
|
| 35 |
-
print("Loading dataset...")
|
| 36 |
-
dataset = load_dataset("open-r1/codeforces-cots", split="train")
|
| 37 |
-
print(f"Total: {len(dataset)}")
|
| 38 |
-
|
| 39 |
-
# Small subset
|
| 40 |
-
dataset = dataset.shuffle(seed=42).select(range(1000))
|
| 41 |
-
print(f"Using: {len(dataset)} examples")
|
| 42 |
-
|
| 43 |
-
# Config
|
| 44 |
-
config = SFTConfig(
|
| 45 |
-
output_dir="qwen3-job1",
|
| 46 |
-
push_to_hub=True,
|
| 47 |
-
hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
|
| 48 |
-
hub_strategy="every_save",
|
| 49 |
-
max_steps=200,
|
| 50 |
-
per_device_train_batch_size=1,
|
| 51 |
-
gradient_accumulation_steps=8,
|
| 52 |
-
learning_rate=5e-6,
|
| 53 |
-
max_length=512,
|
| 54 |
-
logging_steps=20,
|
| 55 |
-
save_strategy="steps",
|
| 56 |
-
save_steps=100,
|
| 57 |
-
save_total_limit=1,
|
| 58 |
-
eval_strategy="no",
|
| 59 |
-
warmup_ratio=0.1,
|
| 60 |
-
lr_scheduler_type="cosine",
|
| 61 |
-
gradient_checkpointing=True,
|
| 62 |
-
bf16=True,
|
| 63 |
-
report_to="trackio",
|
| 64 |
-
project="qwen3-humaneval",
|
| 65 |
-
run_name="job1-v4",
|
| 66 |
-
)
|
| 67 |
-
|
| 68 |
-
peft_config = LoraConfig(
|
| 69 |
-
r=8,
|
| 70 |
-
lora_alpha=16,
|
| 71 |
-
lora_dropout=0.05,
|
| 72 |
-
bias="none",
|
| 73 |
-
task_type="CAUSAL_LM",
|
| 74 |
-
target_modules=["q_proj", "v_proj"],
|
| 75 |
-
)
|
| 76 |
-
|
| 77 |
-
print("Creating trainer...")
|
| 78 |
-
trainer = SFTTrainer(
|
| 79 |
-
model="Qwen/Qwen3-0.6B",
|
| 80 |
-
train_dataset=dataset,
|
| 81 |
-
args=config,
|
| 82 |
-
peft_config=peft_config,
|
| 83 |
-
)
|
| 84 |
-
|
| 85 |
-
print("Training...")
|
| 86 |
-
trainer.train()
|
| 87 |
-
|
| 88 |
-
print("Pushing to Hub...")
|
| 89 |
-
trainer.push_to_hub()
|
| 90 |
-
|
| 91 |
-
print("SUCCESS!")
|
| 92 |
-
|
| 93 |
-
except Exception as e:
|
| 94 |
-
print(f"ERROR: {e}")
|
| 95 |
-
traceback.print_exc()
|
| 96 |
-
sys.exit(1)
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
if __name__ == "__main__":
|
| 100 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job2.py
DELETED
|
@@ -1,112 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job 2: Fixed SFT training - properly handle messages format
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
from datasets import load_dataset
|
| 17 |
-
from peft import LoraConfig
|
| 18 |
-
from trl import SFTTrainer, SFTConfig
|
| 19 |
-
from transformers import AutoTokenizer
|
| 20 |
-
|
| 21 |
-
print("Loading dataset: open-r1/codeforces-cots")
|
| 22 |
-
dataset = load_dataset("open-r1/codeforces-cots", split="train")
|
| 23 |
-
print(f"Dataset loaded: {len(dataset)} examples")
|
| 24 |
-
|
| 25 |
-
# Use subset for faster training
|
| 26 |
-
dataset = dataset.shuffle(seed=42).select(range(min(5000, len(dataset))))
|
| 27 |
-
print(f"Using {len(dataset)} examples")
|
| 28 |
-
|
| 29 |
-
# Load tokenizer to apply chat template
|
| 30 |
-
model_name = "Qwen/Qwen3-0.6B"
|
| 31 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 32 |
-
|
| 33 |
-
# Convert messages to text using chat template
|
| 34 |
-
def format_example(example):
|
| 35 |
-
messages = example["messages"]
|
| 36 |
-
# Apply chat template to convert messages to text
|
| 37 |
-
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
|
| 38 |
-
return {"text": text}
|
| 39 |
-
|
| 40 |
-
print("Formatting dataset with chat template...")
|
| 41 |
-
dataset = dataset.map(format_example, remove_columns=dataset.column_names)
|
| 42 |
-
print(f"Formatted {len(dataset)} examples")
|
| 43 |
-
|
| 44 |
-
# Split
|
| 45 |
-
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
|
| 46 |
-
train_dataset = dataset_split["train"]
|
| 47 |
-
eval_dataset = dataset_split["test"]
|
| 48 |
-
print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
|
| 49 |
-
|
| 50 |
-
config = SFTConfig(
|
| 51 |
-
output_dir="qwen3-codeforces-sft-job2",
|
| 52 |
-
push_to_hub=True,
|
| 53 |
-
hub_model_id="passagereptile455/qwen3-0.6b-codeforces-sft-job2",
|
| 54 |
-
hub_strategy="every_save",
|
| 55 |
-
|
| 56 |
-
# Use text field we created
|
| 57 |
-
dataset_text_field="text",
|
| 58 |
-
|
| 59 |
-
# Training params
|
| 60 |
-
num_train_epochs=2,
|
| 61 |
-
per_device_train_batch_size=2,
|
| 62 |
-
gradient_accumulation_steps=8,
|
| 63 |
-
learning_rate=2e-4,
|
| 64 |
-
max_length=2048,
|
| 65 |
-
|
| 66 |
-
# Logging
|
| 67 |
-
logging_steps=10,
|
| 68 |
-
save_strategy="steps",
|
| 69 |
-
save_steps=200,
|
| 70 |
-
save_total_limit=2,
|
| 71 |
-
|
| 72 |
-
# Eval
|
| 73 |
-
eval_strategy="steps",
|
| 74 |
-
eval_steps=200,
|
| 75 |
-
|
| 76 |
-
# Optimization
|
| 77 |
-
warmup_ratio=0.1,
|
| 78 |
-
lr_scheduler_type="cosine",
|
| 79 |
-
gradient_checkpointing=True,
|
| 80 |
-
|
| 81 |
-
# Monitoring
|
| 82 |
-
report_to="trackio",
|
| 83 |
-
project="qwen3-humaneval-challenge",
|
| 84 |
-
run_name="job2-fixed-format",
|
| 85 |
-
)
|
| 86 |
-
|
| 87 |
-
peft_config = LoraConfig(
|
| 88 |
-
r=32,
|
| 89 |
-
lora_alpha=64,
|
| 90 |
-
lora_dropout=0.05,
|
| 91 |
-
bias="none",
|
| 92 |
-
task_type="CAUSAL_LM",
|
| 93 |
-
target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
|
| 94 |
-
)
|
| 95 |
-
|
| 96 |
-
print("Initializing trainer...")
|
| 97 |
-
trainer = SFTTrainer(
|
| 98 |
-
model=model_name,
|
| 99 |
-
train_dataset=train_dataset,
|
| 100 |
-
eval_dataset=eval_dataset,
|
| 101 |
-
args=config,
|
| 102 |
-
peft_config=peft_config,
|
| 103 |
-
)
|
| 104 |
-
|
| 105 |
-
print("Starting training...")
|
| 106 |
-
trainer.train()
|
| 107 |
-
|
| 108 |
-
print("Pushing to Hub...")
|
| 109 |
-
trainer.push_to_hub()
|
| 110 |
-
|
| 111 |
-
print("Job 2 complete!")
|
| 112 |
-
print("Model: https://huggingface.co/passagereptile455/qwen3-0.6b-codeforces-sft-job2")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job2_v2.py
DELETED
|
@@ -1,162 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
Job 2: Ultra-conservative training - filter C++, minimal steps
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import traceback
|
| 17 |
-
from datasets import load_dataset, Dataset
|
| 18 |
-
from peft import LoraConfig
|
| 19 |
-
from trl import SFTTrainer, SFTConfig
|
| 20 |
-
from transformers import AutoTokenizer
|
| 21 |
-
import torch
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def log(msg):
|
| 25 |
-
print(msg, flush=True)
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
log("=" * 60)
|
| 29 |
-
log("TRAINING JOB 2 - Ultra-conservative approach")
|
| 30 |
-
log("=" * 60)
|
| 31 |
-
|
| 32 |
-
try:
|
| 33 |
-
log(f"CUDA: {torch.cuda.is_available()}")
|
| 34 |
-
if torch.cuda.is_available():
|
| 35 |
-
log(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 36 |
-
|
| 37 |
-
log("Streaming codeforces-cots...")
|
| 38 |
-
streaming_ds = load_dataset(
|
| 39 |
-
"open-r1/codeforces-cots", split="train", streaming=True
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
log("Collecting examples (aggressive C++ filtering)...")
|
| 43 |
-
examples = []
|
| 44 |
-
total_seen = 0
|
| 45 |
-
skipped_cpp = 0
|
| 46 |
-
|
| 47 |
-
cpp_markers = [
|
| 48 |
-
"#include",
|
| 49 |
-
"cout",
|
| 50 |
-
"cin",
|
| 51 |
-
"vector<",
|
| 52 |
-
"int main",
|
| 53 |
-
"iostream",
|
| 54 |
-
"using namespace std",
|
| 55 |
-
"printf",
|
| 56 |
-
"scanf",
|
| 57 |
-
"long long",
|
| 58 |
-
]
|
| 59 |
-
|
| 60 |
-
for ex in streaming_ds:
|
| 61 |
-
total_seen += 1
|
| 62 |
-
if len(examples) >= 500: # Only 500 examples
|
| 63 |
-
break
|
| 64 |
-
if total_seen > 20000: # Don't scan forever
|
| 65 |
-
break
|
| 66 |
-
|
| 67 |
-
messages = ex.get("messages", [])
|
| 68 |
-
content = ""
|
| 69 |
-
for msg in messages:
|
| 70 |
-
content += str(msg.get("content", "")).lower()
|
| 71 |
-
|
| 72 |
-
# Skip if ANY C++ marker present
|
| 73 |
-
has_cpp = any(m.lower() in content for m in cpp_markers)
|
| 74 |
-
|
| 75 |
-
if has_cpp:
|
| 76 |
-
skipped_cpp += 1
|
| 77 |
-
continue
|
| 78 |
-
|
| 79 |
-
examples.append(ex)
|
| 80 |
-
|
| 81 |
-
if len(examples) % 100 == 0:
|
| 82 |
-
log(
|
| 83 |
-
f" Collected {len(examples)} (seen {total_seen}, skipped {skipped_cpp} C++)"
|
| 84 |
-
)
|
| 85 |
-
|
| 86 |
-
log(f"Final: {len(examples)} examples from {total_seen} seen")
|
| 87 |
-
|
| 88 |
-
if len(examples) < 100:
|
| 89 |
-
log("WARNING: Very few non-C++ examples found!")
|
| 90 |
-
|
| 91 |
-
dataset = Dataset.from_list(examples)
|
| 92 |
-
|
| 93 |
-
log("Loading tokenizer...")
|
| 94 |
-
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
|
| 95 |
-
if tokenizer.pad_token is None:
|
| 96 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 97 |
-
|
| 98 |
-
def format_messages(example):
|
| 99 |
-
messages = example["messages"]
|
| 100 |
-
text = ""
|
| 101 |
-
for msg in messages:
|
| 102 |
-
role = msg.get("role", "user")
|
| 103 |
-
content = msg.get("content", "")
|
| 104 |
-
text += f"<|{role}|>\n{content}\n"
|
| 105 |
-
return {"text": text}
|
| 106 |
-
|
| 107 |
-
log("Formatting dataset...")
|
| 108 |
-
dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
|
| 109 |
-
|
| 110 |
-
config = SFTConfig(
|
| 111 |
-
output_dir="qwen3-job2",
|
| 112 |
-
push_to_hub=True,
|
| 113 |
-
hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job2",
|
| 114 |
-
hub_strategy="every_save",
|
| 115 |
-
max_steps=100, # Very few steps
|
| 116 |
-
per_device_train_batch_size=1,
|
| 117 |
-
gradient_accumulation_steps=4,
|
| 118 |
-
learning_rate=1e-6, # Extremely low LR
|
| 119 |
-
max_length=512,
|
| 120 |
-
logging_steps=20,
|
| 121 |
-
save_strategy="steps",
|
| 122 |
-
save_steps=50,
|
| 123 |
-
save_total_limit=1,
|
| 124 |
-
eval_strategy="no",
|
| 125 |
-
warmup_ratio=0.1,
|
| 126 |
-
lr_scheduler_type="cosine",
|
| 127 |
-
gradient_checkpointing=True,
|
| 128 |
-
bf16=True,
|
| 129 |
-
dataset_text_field="text",
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
peft_config = LoraConfig(
|
| 133 |
-
r=4, # Very small rank
|
| 134 |
-
lora_alpha=8,
|
| 135 |
-
lora_dropout=0.0,
|
| 136 |
-
bias="none",
|
| 137 |
-
task_type="CAUSAL_LM",
|
| 138 |
-
target_modules=["q_proj", "v_proj"], # Minimal modules
|
| 139 |
-
)
|
| 140 |
-
|
| 141 |
-
log("Creating trainer...")
|
| 142 |
-
trainer = SFTTrainer(
|
| 143 |
-
model="Qwen/Qwen3-0.6B",
|
| 144 |
-
train_dataset=dataset,
|
| 145 |
-
args=config,
|
| 146 |
-
peft_config=peft_config,
|
| 147 |
-
)
|
| 148 |
-
|
| 149 |
-
log("Training (100 steps, 1e-6 LR)...")
|
| 150 |
-
trainer.train()
|
| 151 |
-
|
| 152 |
-
log("Pushing to Hub...")
|
| 153 |
-
trainer.push_to_hub()
|
| 154 |
-
|
| 155 |
-
log("=" * 60)
|
| 156 |
-
log("SUCCESS! Model: passagereptile455/qwen3-0.6b-humaneval-job2")
|
| 157 |
-
log("=" * 60)
|
| 158 |
-
|
| 159 |
-
except Exception as e:
|
| 160 |
-
log(f"ERROR: {e}")
|
| 161 |
-
traceback.print_exc()
|
| 162 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job2_v3.py
DELETED
|
@@ -1,123 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# ]
|
| 9 |
-
# ///
|
| 10 |
-
|
| 11 |
-
"""
|
| 12 |
-
Job 2 v3: No filtering, ultra-minimal training (50 steps)
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import sys
|
| 16 |
-
import traceback
|
| 17 |
-
from datasets import load_dataset, Dataset
|
| 18 |
-
from peft import LoraConfig
|
| 19 |
-
from trl import SFTTrainer, SFTConfig
|
| 20 |
-
from transformers import AutoTokenizer
|
| 21 |
-
import torch
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def log(msg):
|
| 25 |
-
print(msg, flush=True)
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
log("=" * 60)
|
| 29 |
-
log("TRAINING JOB 2 v3 - Ultra-minimal, no filtering")
|
| 30 |
-
log("=" * 60)
|
| 31 |
-
|
| 32 |
-
try:
|
| 33 |
-
log(f"CUDA: {torch.cuda.is_available()}")
|
| 34 |
-
if torch.cuda.is_available():
|
| 35 |
-
log(f"GPU: {torch.cuda.get_device_name(0)}")
|
| 36 |
-
|
| 37 |
-
log("Streaming codeforces-cots...")
|
| 38 |
-
streaming_ds = load_dataset(
|
| 39 |
-
"open-r1/codeforces-cots", split="train", streaming=True
|
| 40 |
-
)
|
| 41 |
-
|
| 42 |
-
log("Collecting 300 examples (no filtering)...")
|
| 43 |
-
examples = []
|
| 44 |
-
for i, ex in enumerate(streaming_ds):
|
| 45 |
-
if i >= 300:
|
| 46 |
-
break
|
| 47 |
-
examples.append(ex)
|
| 48 |
-
if (i + 1) % 100 == 0:
|
| 49 |
-
log(f" Collected {i + 1}")
|
| 50 |
-
|
| 51 |
-
log(f"Final: {len(examples)} examples")
|
| 52 |
-
dataset = Dataset.from_list(examples)
|
| 53 |
-
|
| 54 |
-
log("Loading tokenizer...")
|
| 55 |
-
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
|
| 56 |
-
if tokenizer.pad_token is None:
|
| 57 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 58 |
-
|
| 59 |
-
def format_messages(example):
|
| 60 |
-
messages = example["messages"]
|
| 61 |
-
text = ""
|
| 62 |
-
for msg in messages:
|
| 63 |
-
role = msg.get("role", "user")
|
| 64 |
-
content = msg.get("content", "")
|
| 65 |
-
text += f"<|{role}|>\n{content}\n"
|
| 66 |
-
return {"text": text}
|
| 67 |
-
|
| 68 |
-
log("Formatting dataset...")
|
| 69 |
-
dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
|
| 70 |
-
|
| 71 |
-
config = SFTConfig(
|
| 72 |
-
output_dir="qwen3-job2",
|
| 73 |
-
push_to_hub=True,
|
| 74 |
-
hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job2",
|
| 75 |
-
hub_strategy="every_save",
|
| 76 |
-
max_steps=50, # ULTRA minimal
|
| 77 |
-
per_device_train_batch_size=1,
|
| 78 |
-
gradient_accumulation_steps=4,
|
| 79 |
-
learning_rate=5e-7, # Extremely low
|
| 80 |
-
max_length=512,
|
| 81 |
-
logging_steps=10,
|
| 82 |
-
save_strategy="steps",
|
| 83 |
-
save_steps=50,
|
| 84 |
-
save_total_limit=1,
|
| 85 |
-
eval_strategy="no",
|
| 86 |
-
warmup_ratio=0.1,
|
| 87 |
-
lr_scheduler_type="cosine",
|
| 88 |
-
gradient_checkpointing=True,
|
| 89 |
-
bf16=True,
|
| 90 |
-
dataset_text_field="text",
|
| 91 |
-
)
|
| 92 |
-
|
| 93 |
-
peft_config = LoraConfig(
|
| 94 |
-
r=4,
|
| 95 |
-
lora_alpha=8,
|
| 96 |
-
lora_dropout=0.0,
|
| 97 |
-
bias="none",
|
| 98 |
-
task_type="CAUSAL_LM",
|
| 99 |
-
target_modules=["q_proj", "v_proj"],
|
| 100 |
-
)
|
| 101 |
-
|
| 102 |
-
log("Creating trainer...")
|
| 103 |
-
trainer = SFTTrainer(
|
| 104 |
-
model="Qwen/Qwen3-0.6B",
|
| 105 |
-
train_dataset=dataset,
|
| 106 |
-
args=config,
|
| 107 |
-
peft_config=peft_config,
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
log("Training (50 steps, 5e-7 LR)...")
|
| 111 |
-
trainer.train()
|
| 112 |
-
|
| 113 |
-
log("Pushing to Hub...")
|
| 114 |
-
trainer.push_to_hub()
|
| 115 |
-
|
| 116 |
-
log("=" * 60)
|
| 117 |
-
log("SUCCESS! Model: passagereptile455/qwen3-0.6b-humaneval-job2")
|
| 118 |
-
log("=" * 60)
|
| 119 |
-
|
| 120 |
-
except Exception as e:
|
| 121 |
-
log(f"ERROR: {e}")
|
| 122 |
-
traceback.print_exc()
|
| 123 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job3.py
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job 3: Memory-optimized SFT training
|
| 14 |
-
- Reduced batch size to 1
|
| 15 |
-
- Increased gradient accumulation to 16
|
| 16 |
-
- Reduced max_length to 1024
|
| 17 |
-
"""
|
| 18 |
-
|
| 19 |
-
from datasets import load_dataset
|
| 20 |
-
from peft import LoraConfig
|
| 21 |
-
from trl import SFTTrainer, SFTConfig
|
| 22 |
-
from transformers import AutoTokenizer
|
| 23 |
-
|
| 24 |
-
print("Loading dataset: open-r1/codeforces-cots")
|
| 25 |
-
dataset = load_dataset("open-r1/codeforces-cots", split="train")
|
| 26 |
-
print(f"Dataset loaded: {len(dataset)} examples")
|
| 27 |
-
|
| 28 |
-
dataset = dataset.shuffle(seed=42).select(range(min(5000, len(dataset))))
|
| 29 |
-
print(f"Using {len(dataset)} examples")
|
| 30 |
-
|
| 31 |
-
model_name = "Qwen/Qwen3-0.6B"
|
| 32 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 33 |
-
|
| 34 |
-
def format_example(example):
|
| 35 |
-
messages = example["messages"]
|
| 36 |
-
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
|
| 37 |
-
return {"text": text}
|
| 38 |
-
|
| 39 |
-
print("Formatting dataset...")
|
| 40 |
-
dataset = dataset.map(format_example, remove_columns=dataset.column_names)
|
| 41 |
-
|
| 42 |
-
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
|
| 43 |
-
train_dataset = dataset_split["train"]
|
| 44 |
-
eval_dataset = dataset_split["test"]
|
| 45 |
-
print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
|
| 46 |
-
|
| 47 |
-
config = SFTConfig(
|
| 48 |
-
output_dir="qwen3-codeforces-sft-job3",
|
| 49 |
-
push_to_hub=True,
|
| 50 |
-
hub_model_id="passagereptile455/qwen3-0.6b-codeforces-sft-job3",
|
| 51 |
-
hub_strategy="every_save",
|
| 52 |
-
|
| 53 |
-
dataset_text_field="text",
|
| 54 |
-
|
| 55 |
-
# MEMORY OPTIMIZED
|
| 56 |
-
num_train_epochs=2,
|
| 57 |
-
per_device_train_batch_size=1, # Reduced from 2
|
| 58 |
-
gradient_accumulation_steps=16, # Increased from 8
|
| 59 |
-
learning_rate=2e-4,
|
| 60 |
-
max_length=1024, # Reduced from 2048
|
| 61 |
-
|
| 62 |
-
logging_steps=10,
|
| 63 |
-
save_strategy="steps",
|
| 64 |
-
save_steps=100,
|
| 65 |
-
save_total_limit=2,
|
| 66 |
-
|
| 67 |
-
eval_strategy="steps",
|
| 68 |
-
eval_steps=100,
|
| 69 |
-
|
| 70 |
-
warmup_ratio=0.1,
|
| 71 |
-
lr_scheduler_type="cosine",
|
| 72 |
-
gradient_checkpointing=True,
|
| 73 |
-
bf16=True, # Use bfloat16 for memory efficiency
|
| 74 |
-
|
| 75 |
-
report_to="trackio",
|
| 76 |
-
project="qwen3-humaneval-challenge",
|
| 77 |
-
run_name="job3-memory-optimized",
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
peft_config = LoraConfig(
|
| 81 |
-
r=16, # Reduced from 32
|
| 82 |
-
lora_alpha=32, # Reduced from 64
|
| 83 |
-
lora_dropout=0.05,
|
| 84 |
-
bias="none",
|
| 85 |
-
task_type="CAUSAL_LM",
|
| 86 |
-
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], # Fewer modules
|
| 87 |
-
)
|
| 88 |
-
|
| 89 |
-
print("Initializing trainer...")
|
| 90 |
-
trainer = SFTTrainer(
|
| 91 |
-
model=model_name,
|
| 92 |
-
train_dataset=train_dataset,
|
| 93 |
-
eval_dataset=eval_dataset,
|
| 94 |
-
args=config,
|
| 95 |
-
peft_config=peft_config,
|
| 96 |
-
)
|
| 97 |
-
|
| 98 |
-
print("Starting training...")
|
| 99 |
-
trainer.train()
|
| 100 |
-
|
| 101 |
-
print("Pushing to Hub...")
|
| 102 |
-
trainer.push_to_hub()
|
| 103 |
-
|
| 104 |
-
print("Job 3 complete!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job4.py
DELETED
|
@@ -1,105 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job 4: Train on Python code instructions dataset
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import os
|
| 17 |
-
from huggingface_hub import login
|
| 18 |
-
|
| 19 |
-
# Explicitly login with token from environment
|
| 20 |
-
token = os.environ.get("HF_TOKEN")
|
| 21 |
-
if token:
|
| 22 |
-
login(token=token)
|
| 23 |
-
print("Logged in to HF Hub")
|
| 24 |
-
else:
|
| 25 |
-
print("Warning: HF_TOKEN not found")
|
| 26 |
-
|
| 27 |
-
from datasets import load_dataset
|
| 28 |
-
from peft import LoraConfig
|
| 29 |
-
from trl import SFTTrainer, SFTConfig
|
| 30 |
-
from transformers import AutoTokenizer
|
| 31 |
-
|
| 32 |
-
print("Loading dataset: iamtarun/python_code_instructions_18k_alpaca")
|
| 33 |
-
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
| 34 |
-
print(f"Dataset loaded: {len(dataset)} examples")
|
| 35 |
-
|
| 36 |
-
dataset = dataset.shuffle(seed=42).select(range(min(10000, len(dataset))))
|
| 37 |
-
print(f"Using {len(dataset)} examples")
|
| 38 |
-
|
| 39 |
-
model_name = "Qwen/Qwen3-0.6B"
|
| 40 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
def format_example(example):
|
| 44 |
-
return {"text": example["prompt"]}
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
print("Formatting dataset...")
|
| 48 |
-
dataset = dataset.map(format_example, remove_columns=dataset.column_names)
|
| 49 |
-
|
| 50 |
-
dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
|
| 51 |
-
train_dataset = dataset_split["train"]
|
| 52 |
-
eval_dataset = dataset_split["test"]
|
| 53 |
-
print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")
|
| 54 |
-
|
| 55 |
-
config = SFTConfig(
|
| 56 |
-
output_dir="qwen3-python-code-sft-job4",
|
| 57 |
-
push_to_hub=True,
|
| 58 |
-
hub_model_id="passagereptile455/qwen3-0.6b-python-code-sft-job4",
|
| 59 |
-
hub_strategy="every_save",
|
| 60 |
-
dataset_text_field="text",
|
| 61 |
-
num_train_epochs=3,
|
| 62 |
-
per_device_train_batch_size=2,
|
| 63 |
-
gradient_accumulation_steps=8,
|
| 64 |
-
learning_rate=1e-4,
|
| 65 |
-
max_length=512,
|
| 66 |
-
logging_steps=20,
|
| 67 |
-
save_strategy="steps",
|
| 68 |
-
save_steps=200,
|
| 69 |
-
save_total_limit=2,
|
| 70 |
-
eval_strategy="steps",
|
| 71 |
-
eval_steps=200,
|
| 72 |
-
warmup_ratio=0.1,
|
| 73 |
-
lr_scheduler_type="cosine",
|
| 74 |
-
gradient_checkpointing=True,
|
| 75 |
-
bf16=True,
|
| 76 |
-
report_to="trackio",
|
| 77 |
-
project="qwen3-humaneval-challenge",
|
| 78 |
-
run_name="job4-python-instructions",
|
| 79 |
-
)
|
| 80 |
-
|
| 81 |
-
peft_config = LoraConfig(
|
| 82 |
-
r=16,
|
| 83 |
-
lora_alpha=32,
|
| 84 |
-
lora_dropout=0.05,
|
| 85 |
-
bias="none",
|
| 86 |
-
task_type="CAUSAL_LM",
|
| 87 |
-
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
|
| 88 |
-
)
|
| 89 |
-
|
| 90 |
-
print("Initializing trainer...")
|
| 91 |
-
trainer = SFTTrainer(
|
| 92 |
-
model=model_name,
|
| 93 |
-
train_dataset=train_dataset,
|
| 94 |
-
eval_dataset=eval_dataset,
|
| 95 |
-
args=config,
|
| 96 |
-
peft_config=peft_config,
|
| 97 |
-
)
|
| 98 |
-
|
| 99 |
-
print("Starting training...")
|
| 100 |
-
trainer.train()
|
| 101 |
-
|
| 102 |
-
print("Pushing to Hub...")
|
| 103 |
-
trainer.push_to_hub()
|
| 104 |
-
|
| 105 |
-
print("Job 4 complete!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job4_v2.py
DELETED
|
@@ -1,60 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = ["trl>=0.12.0", "peft>=0.7.0", "transformers>=4.36.0", "accelerate>=0.24.0", "trackio", "datasets"]
|
| 3 |
-
# ///
|
| 4 |
-
from datasets import load_dataset
|
| 5 |
-
from peft import LoraConfig
|
| 6 |
-
from trl import SFTTrainer, SFTConfig
|
| 7 |
-
from transformers import AutoTokenizer
|
| 8 |
-
|
| 9 |
-
print("Loading dataset")
|
| 10 |
-
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
|
| 11 |
-
dataset = dataset.shuffle(seed=42).select(range(10000))
|
| 12 |
-
print(f"Using {len(dataset)} examples")
|
| 13 |
-
|
| 14 |
-
model_name = "Qwen/Qwen3-0.6B"
|
| 15 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def format_fn(ex):
    """Reduce a dataset row to the single ``text`` field used for training.

    The row's ``prompt`` value is passed through unchanged.
    """
    prompt_text = ex["prompt"]
    return {"text": prompt_text}
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
# Keep only the "text" column, then hold out 5% of the rows for evaluation.
dataset = dataset.map(format_fn, remove_columns=dataset.column_names)
splits = dataset.train_test_split(test_size=0.05, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]
print(f"Train: {len(train_ds)} Eval: {len(eval_ds)}")

config = SFTConfig(
    output_dir="qwen3-python-sft",
    push_to_hub=True,
    hub_model_id="passagereptile455/qwen3-0.6b-python-code-sft-job4",
    dataset_text_field="text",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    max_length=512,
    logging_steps=20,
    save_strategy="epoch",
    # Without an eval strategy the held-out split passed to the trainer below
    # is never evaluated; run it on the same cadence as checkpointing.
    eval_strategy="epoch",
    warmup_ratio=0.1,
    gradient_checkpointing=True,
    bf16=True,
    report_to="trackio",
    run_name="job4-python",
)

# LoRA adapters on the attention projections only.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    task_type="CAUSAL_LM",  # state the task explicitly, as the other training scripts do
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

print("Starting training...")
trainer = SFTTrainer(
    model=model_name,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    args=config,
    peft_config=peft_config,
)
trainer.train()
trainer.push_to_hub()
print("Done!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_job5.py
DELETED
|
@@ -1,105 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Job 4: Train on Python code instructions dataset
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import os
|
| 17 |
-
from huggingface_hub import login
|
| 18 |
-
|
| 19 |
-
# Authenticate against the Hub when HF_TOKEN is present in the environment;
# otherwise only warn and carry on.
token = os.environ.get("HF_TOKEN")
if not token:
    print("Warning: HF_TOKEN not found")
else:
    login(token=token)
    print("Logged in to HF Hub")
|
| 26 |
-
|
| 27 |
-
from datasets import load_dataset
|
| 28 |
-
from peft import LoraConfig
|
| 29 |
-
from trl import SFTTrainer, SFTConfig
|
| 30 |
-
from transformers import AutoTokenizer
|
| 31 |
-
|
| 32 |
-
# Fetch the full training split, then keep at most 10,000 shuffled rows.
print("Loading dataset: iamtarun/python_code_instructions_18k_alpaca")
dataset = load_dataset("iamtarun/python_code_instructions_18k_alpaca", split="train")
print(f"Dataset loaded: {len(dataset)} examples")

subset_size = min(10000, len(dataset))
shuffled = dataset.shuffle(seed=42)
dataset = shuffled.select(range(subset_size))
print(f"Using {len(dataset)} examples")

# Base checkpoint; the same name is later handed to SFTTrainer as the model.
model_name = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
def format_example(example):
    """Reduce a dataset row to the single ``text`` field used for training.

    The row's ``prompt`` value is passed through unchanged.
    """
    prompt_text = example["prompt"]
    return {"text": prompt_text}
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
# Reduce every row to a single "text" column, then hold out 5% for evaluation.
print("Formatting dataset...")
original_columns = dataset.column_names
dataset = dataset.map(format_example, remove_columns=original_columns)

dataset_split = dataset.train_test_split(test_size=0.05, seed=42)
train_dataset, eval_dataset = dataset_split["train"], dataset_split["test"]
print(f"Train: {len(train_dataset)} | Eval: {len(eval_dataset)}")

# Trainer settings grouped by concern; merged into one SFTConfig below.
hub_settings = {
    "output_dir": "qwen3-python-code-sft-job4",
    "push_to_hub": True,
    "hub_model_id": "passagereptile455/qwen3-0.6b-python-code-sft-job4",
    "hub_strategy": "every_save",
}
data_settings = {
    "dataset_text_field": "text",
    "max_length": 512,
}
optimization_settings = {
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,
    "learning_rate": 1e-4,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine",
    "gradient_checkpointing": True,
    "bf16": True,
}
# Checkpoints and evaluation share the same 200-step cadence.
checkpoint_settings = {
    "save_strategy": "steps",
    "save_steps": 200,
    "save_total_limit": 2,
    "eval_strategy": "steps",
    "eval_steps": 200,
}
logging_settings = {
    "logging_steps": 20,
    "report_to": "trackio",
    "project": "qwen3-humaneval-challenge",
    "run_name": "job4-python-instructions",
}
config = SFTConfig(
    **hub_settings,
    **data_settings,
    **optimization_settings,
    **checkpoint_settings,
    **logging_settings,
)

# LoRA adapters on the attention projections only.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
)

print("Initializing trainer...")
trainer_kwargs = {
    "model": model_name,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "args": config,
    "peft_config": peft_config,
}
trainer = SFTTrainer(**trainer_kwargs)

print("Starting training...")
trainer.train()

print("Pushing to Hub...")
trainer.push_to_hub()

print("Job 4 complete!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_minimal.py
DELETED
|
@@ -1,137 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "bitsandbytes",
|
| 11 |
-
# "huggingface_hub",
|
| 12 |
-
# ]
|
| 13 |
-
# ///
|
| 14 |
-
"""
|
| 15 |
-
Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
|
| 16 |
-
Ultra-conservative training to avoid catastrophic forgetting.
|
| 17 |
-
"""
|
| 18 |
-
|
| 19 |
-
import os
|
| 20 |
-
from datasets import load_dataset
|
| 21 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 22 |
-
from peft import LoraConfig
|
| 23 |
-
from trl import SFTConfig, SFTTrainer
|
| 24 |
-
import torch
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
def main():
    """Fine-tune Qwen3-0.6B with LoRA on a 500-example slice of codeforces-cots.

    Loads the tokenizer and model, streams the first 500 training rows of
    ``open-r1/codeforces-cots``, trains for 150 steps with the chat template
    applied through ``formatting_func``, then saves the model and pushes it
    to the Hub repository named by ``output_name``.
    """
    # Local import: the module-level import only brings in ``load_dataset``.
    from datasets import Dataset

    print("=" * 60)
    print("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")
    print("=" * 60)

    model_name = "Qwen/Qwen3-0.6B"
    output_name = "passagereptile455/qwen3-codeforces-minimal"

    # Load tokenizer
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Fall back to EOS so padding has a token to use.
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    print(f"Model loaded on {model.device}")

    # Load dataset with streaming to avoid memory issues
    print("\nLoading dataset (streaming)...")
    dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)

    # Take only 500 examples for minimal training
    dataset = dataset.take(500)

    # Materialize the streamed rows so they can be inspected and wrapped below.
    print("Preparing examples...")
    examples = list(dataset)
    print(f"Loaded {len(examples)} examples")

    # Check format
    if examples:
        print(f"First example keys: {examples[0].keys()}")
        if "messages" in examples[0]:
            print(f"Messages format: {len(examples[0]['messages'])} messages")

    # SFTTrainer expects a datasets.Dataset (or IterableDataset), not a plain
    # Python list, so wrap the materialized rows before handing them over.
    train_dataset = Dataset.from_list(examples)

    # LoRA config - very conservative
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Training config - ultra conservative
    training_args = SFTConfig(
        output_dir="./output",
        max_steps=150,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=5e-6,
        lr_scheduler_type="cosine",
        warmup_steps=10,
        logging_steps=10,
        save_steps=50,
        fp16=True,
        gradient_checkpointing=True,
        # ``max_length`` is the current SFTConfig name for the sequence-length
        # cap (formerly ``max_seq_length``).
        max_length=2048,
        dataset_text_field=None,  # We'll use messages format
        push_to_hub=True,
        hub_model_id=output_name,
        report_to="none",
    )

    # Create trainer
    print("\nInitializing trainer...")

    def formatting_func(example):
        """Render one row's ``messages`` to text with the tokenizer's chat template."""
        return tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
        processing_class=tokenizer,
        formatting_func=formatting_func,
    )

    # Train
    print("\n" + "=" * 60)
    print("Starting training...")
    print("=" * 60)
    trainer.train()

    # Save and push
    print("\nSaving model...")
    trainer.save_model()

    print("\nPushing to hub...")
    trainer.push_to_hub()

    print("\n" + "=" * 60)
    print(f"Training complete! Model saved to: {output_name}")
    print("=" * 60)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_minimal_v2.py
DELETED
|
@@ -1,135 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl>=0.12.0",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "huggingface_hub",
|
| 11 |
-
# ]
|
| 12 |
-
# ///
|
| 13 |
-
"""
|
| 14 |
-
Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
|
| 15 |
-
Ultra-conservative training to avoid catastrophic forgetting.
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
from datasets import load_dataset, Dataset
|
| 20 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
|
| 21 |
-
from peft import LoraConfig, get_peft_model
|
| 22 |
-
from trl import SFTTrainer
|
| 23 |
-
import torch
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def main():
    """Fine-tune Qwen3-0.6B with manually applied LoRA adapters on codeforces-cots.

    Loads the tokenizer and model, wraps the model with ``get_peft_model``,
    renders the first 500 streamed training rows to text with the chat
    template, trains for 150 steps, then saves the model and pushes it to the
    Hub repository named by ``output_name``.
    """
    # Local import: the module-level import only brings in ``SFTTrainer``.
    from trl import SFTConfig

    print("=" * 60)
    print("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")
    print("=" * 60)

    model_name = "Qwen/Qwen3-0.6B"
    output_name = "passagereptile455/qwen3-codeforces-minimal"

    # Load tokenizer
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Fall back to EOS so padding has a token to use.
        tokenizer.pad_token = tokenizer.eos_token

    # Load model
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )
    print(f"Model loaded on {model.device}")

    # LoRA config - very conservative
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Apply LoRA up front; the trainer below therefore gets no peft_config.
    print("Applying LoRA...")
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # Load dataset with streaming
    print("\nLoading dataset (streaming)...")
    dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)

    # Take only 500 examples for minimal training
    print("Preparing examples...")
    examples = []
    for i, ex in enumerate(dataset):
        if i >= 500:
            break
        # Format as text using chat template
        text = tokenizer.apply_chat_template(
            ex["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
        examples.append({"text": text})

    print(f"Loaded {len(examples)} examples")

    # Create HF dataset
    train_dataset = Dataset.from_list(examples)

    # Training args - ultra conservative.
    # SFT-specific options such as ``dataset_text_field`` live on SFTConfig;
    # current TRL releases no longer accept them as SFTTrainer keywords.
    training_args = SFTConfig(
        output_dir="./output",
        max_steps=150,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=5e-6,
        lr_scheduler_type="cosine",
        warmup_steps=10,
        logging_steps=10,
        save_steps=50,
        fp16=True,
        gradient_checkpointing=True,
        push_to_hub=True,
        hub_model_id=output_name,
        report_to="none",
        remove_unused_columns=False,
        dataset_text_field="text",
    )

    # Create trainer
    print("\nInitializing trainer...")
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        processing_class=tokenizer,
    )

    # Train
    print("\n" + "=" * 60)
    print("Starting training...")
    print("=" * 60)
    trainer.train()

    # Save and push
    print("\nSaving model...")
    trainer.save_model()

    print("\nPushing to hub...")
    trainer.push_to_hub()

    print("\n" + "=" * 60)
    print(f"Training complete! Model saved to: {output_name}")
    print("=" * 60)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_minimal_v3.py
DELETED
|
@@ -1,140 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers>=4.45.0",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl>=0.12.0",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "huggingface_hub",
|
| 11 |
-
# ]
|
| 12 |
-
# ///
|
| 13 |
-
"""
|
| 14 |
-
Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
|
| 15 |
-
Ultra-conservative training to avoid catastrophic forgetting.
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
from datasets import load_dataset, Dataset
|
| 20 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
-
from peft import LoraConfig
|
| 22 |
-
from trl import SFTTrainer, SFTConfig
|
| 23 |
-
import torch
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def main():
    """Run the minimal LoRA fine-tune of Qwen3-0.6B on codeforces-cots.

    Streams the first 500 training rows, renders each conversation to text
    with the tokenizer's chat template, trains for 150 steps, then saves the
    model and pushes it to the Hub. Along the way it prints the leading
    SFTConfig and SFTTrainer parameter names as a debugging aid.
    """
    import inspect

    rule = "=" * 60

    def banner(message, lead=""):
        # Framed status line; ``lead`` is prepended to the top rule only.
        print(lead + rule)
        print(message)
        print(rule)

    banner("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")

    model_name = "Qwen/Qwen3-0.6B"
    output_name = "passagereptile455/qwen3-codeforces-minimal"

    # Tokenizer, with EOS standing in for a missing pad token.
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Base model in half precision.
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )
    print(f"Model loaded on {model.device}")

    # Stream the training split and render the first 500 rows to plain text.
    print("\nLoading dataset (streaming)...")
    stream = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)

    print("Preparing examples...")
    examples = [
        {
            "text": tokenizer.apply_chat_template(
                row["messages"],
                tokenize=False,
                add_generation_prompt=False,
            )
        }
        for _, row in zip(range(500), stream)
    ]
    print(f"Loaded {len(examples)} examples")

    train_dataset = Dataset.from_list(examples)

    # Debug aid: show which keyword arguments this TRL build's SFTConfig takes.
    config_params = inspect.signature(SFTConfig).parameters
    print(f"\nSFTConfig parameters: {list(config_params)[:20]}...")

    # Conservative schedule: 150 steps at a small learning rate.
    config_kwargs = {
        "output_dir": "./output",
        "max_steps": 150,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "learning_rate": 5e-6,
        "lr_scheduler_type": "cosine",
        "warmup_steps": 10,
        "logging_steps": 10,
        "save_steps": 50,
        "fp16": True,
        "gradient_checkpointing": True,
        "push_to_hub": True,
        "hub_model_id": output_name,
        "report_to": "none",
    }
    training_args = SFTConfig(**config_kwargs)

    print("\nInitializing trainer...")

    # Debug aid: same inspection for the trainer's constructor.
    trainer_params = inspect.signature(SFTTrainer.__init__).parameters
    print(f"SFTTrainer parameters: {list(trainer_params)[:15]}...")

    # Low-rank adapters on the attention projections.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
        processing_class=tokenizer,
    )

    banner("Starting training...", lead="\n")
    trainer.train()

    print("\nSaving model...")
    trainer.save_model()

    print("\nPushing to hub...")
    trainer.push_to_hub()

    banner(f"Training complete! Model saved to: {output_name}", lead="\n")


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_minimal_v4.py
DELETED
|
@@ -1,145 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers>=4.45.0",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl>=0.12.0",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "huggingface_hub",
|
| 11 |
-
# ]
|
| 12 |
-
# ///
|
| 13 |
-
"""
|
| 14 |
-
Minimal fine-tuning of Qwen3-0.6B on open-r1/codeforces-cots.
|
| 15 |
-
Saves to local output directory (no hub push during training).
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
from datasets import load_dataset, Dataset
|
| 20 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 21 |
-
from peft import LoraConfig
|
| 22 |
-
from trl import SFTTrainer, SFTConfig
|
| 23 |
-
import torch
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def main():
    """Fine-tune Qwen3-0.6B with LoRA on codeforces-cots and upload the result.

    Trains for 150 steps on the first 500 streamed rows with Hub pushing
    disabled in the trainer, saves the model and tokenizer to a local folder,
    then uploads that folder through ``HfApi``. An upload failure is reported
    and the local copy is left in place.
    """
    separator = "=" * 60
    final_dir = "./qwen3-codeforces-minimal-final"

    print(separator)
    print("Minimal Fine-tuning: Qwen3-0.6B on codeforces-cots")
    print(separator)

    model_name = "Qwen/Qwen3-0.6B"

    # Tokenizer, with EOS standing in for a missing pad token.
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Base model in half precision.
    print("Loading model...")
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    print(f"Model loaded on {model.device}")

    # Low-rank adapters on the attention projections.
    attention_projections = ["q_proj", "v_proj", "k_proj", "o_proj"]
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=attention_projections,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Stream the training split; only the first 500 rows are rendered to text.
    print("\nLoading dataset (streaming)...")
    stream = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)

    print("Preparing examples...")
    examples = []
    for position, row in enumerate(stream):
        if position < 500:
            rendered = tokenizer.apply_chat_template(
                row["messages"],
                tokenize=False,
                add_generation_prompt=False,
            )
            examples.append({"text": rendered})
        else:
            break

    print(f"Loaded {len(examples)} examples")

    train_dataset = Dataset.from_list(examples)

    # Hub pushing stays off here; the upload happens explicitly after training.
    training_args = SFTConfig(
        output_dir="./qwen3-codeforces-minimal",
        max_steps=150,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=5e-6,
        lr_scheduler_type="cosine",
        warmup_steps=10,
        logging_steps=10,
        save_steps=50,
        save_total_limit=2,
        fp16=True,
        gradient_checkpointing=True,
        push_to_hub=False,
        report_to="none",
    )

    print("\nInitializing trainer...")
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
        processing_class=tokenizer,
    )

    print("\n" + separator)
    print("Starting training...")
    print(separator)
    trainer.train()

    # Write model and tokenizer side by side so the folder uploads as one unit.
    print("\nSaving final model...")
    trainer.save_model(final_dir)
    tokenizer.save_pretrained(final_dir)

    print("\nUploading to HuggingFace Hub...")
    from huggingface_hub import HfApi, create_repo

    api = HfApi()
    repo_id = "passagereptile455/qwen3-codeforces-minimal"

    # Best effort: a failed upload is reported, not raised.
    try:
        create_repo(repo_id, exist_ok=True, repo_type="model")
        api.upload_folder(folder_path=final_dir, repo_id=repo_id, repo_type="model")
        print(f"Model uploaded to: https://huggingface.co/{repo_id}")
    except Exception as e:
        print(f"Upload failed: {e}")
        print("Model saved locally at: ./qwen3-codeforces-minimal-final")

    print("\n" + separator)
    print("Training complete!")
    print(separator)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_sft_demo.py
DELETED
|
@@ -1,32 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = ["trl>=0.12.0", "peft>=0.7.0", "datasets", "transformers", "torch", "accelerate"]
|
| 3 |
-
# ///
|
| 4 |
-
|
| 5 |
-
from datasets import load_dataset
|
| 6 |
-
from peft import LoraConfig
|
| 7 |
-
from trl import SFTTrainer, SFTConfig
|
| 8 |
-
import os
|
| 9 |
-
|
| 10 |
-
# First 500 rows of the Capybara training split.
dataset = load_dataset("trl-lib/Capybara", split="train[:500]")

# Adapter and run settings, built up front and handed to the trainer below.
lora_config = LoraConfig(r=16, lora_alpha=32, target_modules="all-linear")
sft_config = SFTConfig(
    output_dir="qwen-demo-sft",
    max_steps=100,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    push_to_hub=True,
    hub_model_id="passagereptile455/qwen-demo-sft",
    hub_private_repo=True,
)

trainer = SFTTrainer(
    model="Qwen/Qwen2.5-0.5B",
    train_dataset=dataset,
    peft_config=lora_config,
    args=sft_config,
)

# Train, then publish to the Hub repository configured above.
trainer.train()
trainer.push_to_hub()
print("Training complete! Model pushed to Hub.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_streaming.py
DELETED
|
@@ -1,96 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Training with streaming dataset to avoid memory issues
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
from datasets import load_dataset, Dataset
|
| 17 |
-
from peft import LoraConfig
|
| 18 |
-
from trl import SFTTrainer, SFTConfig
|
| 19 |
-
import torch
|
| 20 |
-
|
| 21 |
-
print("=" * 50)
|
| 22 |
-
print("STREAMING DATASET TRAINING")
|
| 23 |
-
print("=" * 50)
|
| 24 |
-
|
| 25 |
-
print(f"CUDA: {torch.cuda.is_available()}")
|
| 26 |
-
|
| 27 |
-
# Use streaming to load subset without memory issues
|
| 28 |
-
print("Streaming codeforces-cots...")
|
| 29 |
-
streaming_ds = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
|
| 30 |
-
|
| 31 |
-
# Collect 1000 examples
|
| 32 |
-
print("Collecting 1000 examples...")
|
| 33 |
-
examples = []
|
| 34 |
-
for i, ex in enumerate(streaming_ds):
|
| 35 |
-
if i >= 1000:
|
| 36 |
-
break
|
| 37 |
-
examples.append(ex)
|
| 38 |
-
if (i + 1) % 200 == 0:
|
| 39 |
-
print(f" Collected {i + 1} examples")
|
| 40 |
-
|
| 41 |
-
print(f"Collected {len(examples)} examples")
|
| 42 |
-
|
| 43 |
-
# Convert to regular dataset
|
| 44 |
-
dataset = Dataset.from_list(examples)
|
| 45 |
-
print(f"Dataset created: {len(dataset)}")
|
| 46 |
-
|
| 47 |
-
config = SFTConfig(
|
| 48 |
-
output_dir="qwen3-codeforces",
|
| 49 |
-
push_to_hub=True,
|
| 50 |
-
hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
|
| 51 |
-
hub_strategy="every_save",
|
| 52 |
-
max_steps=200,
|
| 53 |
-
per_device_train_batch_size=1,
|
| 54 |
-
gradient_accumulation_steps=8,
|
| 55 |
-
learning_rate=5e-6,
|
| 56 |
-
max_length=512,
|
| 57 |
-
logging_steps=20,
|
| 58 |
-
save_strategy="steps",
|
| 59 |
-
save_steps=100,
|
| 60 |
-
save_total_limit=1,
|
| 61 |
-
eval_strategy="no",
|
| 62 |
-
warmup_ratio=0.1,
|
| 63 |
-
lr_scheduler_type="cosine",
|
| 64 |
-
gradient_checkpointing=True,
|
| 65 |
-
bf16=True,
|
| 66 |
-
report_to="trackio",
|
| 67 |
-
project="qwen3-humaneval",
|
| 68 |
-
run_name="job1-streaming",
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
peft_config = LoraConfig(
|
| 72 |
-
r=8,
|
| 73 |
-
lora_alpha=16,
|
| 74 |
-
lora_dropout=0.05,
|
| 75 |
-
bias="none",
|
| 76 |
-
task_type="CAUSAL_LM",
|
| 77 |
-
target_modules=["q_proj", "v_proj"],
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
print("Creating trainer...")
|
| 81 |
-
trainer = SFTTrainer(
|
| 82 |
-
model="Qwen/Qwen3-0.6B",
|
| 83 |
-
train_dataset=dataset,
|
| 84 |
-
args=config,
|
| 85 |
-
peft_config=peft_config,
|
| 86 |
-
)
|
| 87 |
-
|
| 88 |
-
print("Training (200 steps)...")
|
| 89 |
-
trainer.train()
|
| 90 |
-
|
| 91 |
-
print("Pushing to Hub...")
|
| 92 |
-
trainer.push_to_hub()
|
| 93 |
-
|
| 94 |
-
print("=" * 50)
|
| 95 |
-
print("SUCCESS!")
|
| 96 |
-
print("=" * 50)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_test_simple.py
DELETED
|
@@ -1,79 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Test training with a reliable small dataset
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
from datasets import load_dataset
|
| 17 |
-
from peft import LoraConfig
|
| 18 |
-
from trl import SFTTrainer, SFTConfig
|
| 19 |
-
import torch
|
| 20 |
-
|
| 21 |
-
print("=" * 50)
|
| 22 |
-
print("TEST TRAINING JOB")
|
| 23 |
-
print("=" * 50)
|
| 24 |
-
|
| 25 |
-
print(f"CUDA: {torch.cuda.is_available()}")
|
| 26 |
-
|
| 27 |
-
# Use trl-lib dataset which is guaranteed to work
|
| 28 |
-
print("Loading trl-lib/Capybara...")
|
| 29 |
-
dataset = load_dataset("trl-lib/Capybara", split="train")
|
| 30 |
-
print(f"Loaded: {len(dataset)}")
|
| 31 |
-
|
| 32 |
-
# Small subset
|
| 33 |
-
dataset = dataset.shuffle(seed=42).select(range(500))
|
| 34 |
-
print(f"Using: {len(dataset)}")
|
| 35 |
-
|
| 36 |
-
config = SFTConfig(
|
| 37 |
-
output_dir="test-model",
|
| 38 |
-
push_to_hub=True,
|
| 39 |
-
hub_model_id="passagereptile455/qwen3-test-training",
|
| 40 |
-
hub_strategy="every_save",
|
| 41 |
-
max_steps=50, # Very short test
|
| 42 |
-
per_device_train_batch_size=1,
|
| 43 |
-
gradient_accumulation_steps=4,
|
| 44 |
-
learning_rate=2e-5,
|
| 45 |
-
max_length=256,
|
| 46 |
-
logging_steps=10,
|
| 47 |
-
save_strategy="steps",
|
| 48 |
-
save_steps=50,
|
| 49 |
-
save_total_limit=1,
|
| 50 |
-
eval_strategy="no",
|
| 51 |
-
warmup_ratio=0.1,
|
| 52 |
-
gradient_checkpointing=True,
|
| 53 |
-
bf16=True,
|
| 54 |
-
report_to="trackio",
|
| 55 |
-
project="test",
|
| 56 |
-
run_name="test-train",
|
| 57 |
-
)
|
| 58 |
-
|
| 59 |
-
peft_config = LoraConfig(
|
| 60 |
-
r=8,
|
| 61 |
-
lora_alpha=16,
|
| 62 |
-
target_modules=["q_proj", "v_proj"],
|
| 63 |
-
)
|
| 64 |
-
|
| 65 |
-
print("Creating trainer...")
|
| 66 |
-
trainer = SFTTrainer(
|
| 67 |
-
model="Qwen/Qwen3-0.6B",
|
| 68 |
-
train_dataset=dataset,
|
| 69 |
-
args=config,
|
| 70 |
-
peft_config=peft_config,
|
| 71 |
-
)
|
| 72 |
-
|
| 73 |
-
print("Training...")
|
| 74 |
-
trainer.train()
|
| 75 |
-
|
| 76 |
-
print("Pushing to Hub...")
|
| 77 |
-
trainer.push_to_hub()
|
| 78 |
-
|
| 79 |
-
print("SUCCESS!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_test_upload_150steps.py
DELETED
|
@@ -1,303 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers>=4.45.0",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl>=0.12.0",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "huggingface_hub",
|
| 11 |
-
# ]
|
| 12 |
-
# ///
|
| 13 |
-
"""
|
| 14 |
-
Combined training, testing, and upload script.
|
| 15 |
-
Trains Qwen3-0.6B on codeforces-cots (150 steps - proven optimal), tests on HumanEval, uploads to Hub.
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
import re
|
| 20 |
-
import subprocess
|
| 21 |
-
import tempfile
|
| 22 |
-
from datasets import load_dataset, Dataset
|
| 23 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
-
from peft import LoraConfig
|
| 25 |
-
from trl import SFTTrainer, SFTConfig
|
| 26 |
-
from huggingface_hub import login, HfApi
|
| 27 |
-
import torch
|
| 28 |
-
|
| 29 |
-
# Authenticate with HF Hub at the start
|
| 30 |
-
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 31 |
-
if HF_TOKEN:
|
| 32 |
-
login(token=HF_TOKEN)
|
| 33 |
-
print("HF Hub authenticated successfully!")
|
| 34 |
-
else:
|
| 35 |
-
print("WARNING: No HF_TOKEN found in environment")
|
| 36 |
-
|
| 37 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval"
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def extract_function_body(response: str) -> str:
|
| 41 |
-
"""Extract just the function body from model response."""
|
| 42 |
-
response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
|
| 43 |
-
response = response.strip()
|
| 44 |
-
|
| 45 |
-
code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
|
| 46 |
-
if code_match:
|
| 47 |
-
response = code_match.group(1)
|
| 48 |
-
else:
|
| 49 |
-
code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
|
| 50 |
-
if code_match:
|
| 51 |
-
response = code_match.group(1)
|
| 52 |
-
|
| 53 |
-
response = response.strip()
|
| 54 |
-
lines = response.split("\n")
|
| 55 |
-
|
| 56 |
-
start_idx = 0
|
| 57 |
-
for i, line in enumerate(lines):
|
| 58 |
-
if line.strip().startswith("def "):
|
| 59 |
-
start_idx = i
|
| 60 |
-
break
|
| 61 |
-
|
| 62 |
-
start_idx += 1
|
| 63 |
-
|
| 64 |
-
if start_idx < len(lines):
|
| 65 |
-
stripped = lines[start_idx].strip()
|
| 66 |
-
if stripped.startswith('"""') or stripped.startswith("'''"):
|
| 67 |
-
quote = stripped[:3]
|
| 68 |
-
if stripped.count(quote) >= 2:
|
| 69 |
-
start_idx += 1
|
| 70 |
-
else:
|
| 71 |
-
start_idx += 1
|
| 72 |
-
while start_idx < len(lines) and quote not in lines[start_idx]:
|
| 73 |
-
start_idx += 1
|
| 74 |
-
start_idx += 1
|
| 75 |
-
|
| 76 |
-
body_lines = lines[start_idx:]
|
| 77 |
-
return "\n".join(body_lines)
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
|
| 81 |
-
"""Run the test using subprocess."""
|
| 82 |
-
full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
|
| 83 |
-
|
| 84 |
-
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
|
| 85 |
-
f.write(full_code)
|
| 86 |
-
temp_path = f.name
|
| 87 |
-
|
| 88 |
-
try:
|
| 89 |
-
result = subprocess.run(
|
| 90 |
-
["python", temp_path], capture_output=True, text=True, timeout=10
|
| 91 |
-
)
|
| 92 |
-
return result.returncode == 0
|
| 93 |
-
except subprocess.TimeoutExpired:
|
| 94 |
-
return False
|
| 95 |
-
except Exception:
|
| 96 |
-
return False
|
| 97 |
-
finally:
|
| 98 |
-
try:
|
| 99 |
-
os.unlink(temp_path)
|
| 100 |
-
except:
|
| 101 |
-
pass
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def test_model(model, tokenizer, model_name="Model"):
|
| 105 |
-
"""Test model on HumanEval."""
|
| 106 |
-
print(f"\n{'=' * 60}")
|
| 107 |
-
print(f"Testing: {model_name}")
|
| 108 |
-
print("=" * 60)
|
| 109 |
-
|
| 110 |
-
dataset = load_dataset("openai/openai_humaneval", split="test")
|
| 111 |
-
print(f"Total problems: {len(dataset)}")
|
| 112 |
-
|
| 113 |
-
passed = 0
|
| 114 |
-
failed = 0
|
| 115 |
-
|
| 116 |
-
for i, problem in enumerate(dataset):
|
| 117 |
-
prompt = problem["prompt"]
|
| 118 |
-
test = problem["test"]
|
| 119 |
-
entry_point = problem["entry_point"]
|
| 120 |
-
|
| 121 |
-
messages = [
|
| 122 |
-
{
|
| 123 |
-
"role": "user",
|
| 124 |
-
"content": f"Complete this Python function. Output only the code.\n\n{prompt}",
|
| 125 |
-
}
|
| 126 |
-
]
|
| 127 |
-
|
| 128 |
-
text = tokenizer.apply_chat_template(
|
| 129 |
-
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
| 133 |
-
|
| 134 |
-
with torch.no_grad():
|
| 135 |
-
outputs = model.generate(
|
| 136 |
-
**inputs,
|
| 137 |
-
max_new_tokens=512,
|
| 138 |
-
do_sample=False,
|
| 139 |
-
pad_token_id=tokenizer.eos_token_id,
|
| 140 |
-
)
|
| 141 |
-
|
| 142 |
-
response = tokenizer.decode(
|
| 143 |
-
outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
|
| 144 |
-
)
|
| 145 |
-
|
| 146 |
-
completion = extract_function_body(response)
|
| 147 |
-
success = run_test_subprocess(prompt, completion, test, entry_point)
|
| 148 |
-
|
| 149 |
-
if success:
|
| 150 |
-
passed += 1
|
| 151 |
-
else:
|
| 152 |
-
failed += 1
|
| 153 |
-
|
| 154 |
-
if (i + 1) % 20 == 0 or i == len(dataset) - 1:
|
| 155 |
-
print(
|
| 156 |
-
f"Progress: {i + 1}/{len(dataset)} | Pass: {passed} | Fail: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
final_score = passed / len(dataset) * 100
|
| 160 |
-
print(f"\nFINAL: {passed}/{len(dataset)} = {final_score:.2f}%")
|
| 161 |
-
return final_score
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
def main():
|
| 165 |
-
print("=" * 60)
|
| 166 |
-
print("Combined Training, Testing & Upload")
|
| 167 |
-
print("150 steps - proven optimal configuration")
|
| 168 |
-
print("=" * 60)
|
| 169 |
-
|
| 170 |
-
model_name = "Qwen/Qwen3-0.6B"
|
| 171 |
-
|
| 172 |
-
# Load tokenizer
|
| 173 |
-
print("\nLoading tokenizer...")
|
| 174 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 175 |
-
if tokenizer.pad_token is None:
|
| 176 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 177 |
-
|
| 178 |
-
# Load base model
|
| 179 |
-
print("Loading base model...")
|
| 180 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 181 |
-
model_name,
|
| 182 |
-
torch_dtype=torch.float16,
|
| 183 |
-
device_map="auto",
|
| 184 |
-
)
|
| 185 |
-
|
| 186 |
-
# LoRA config
|
| 187 |
-
lora_config = LoraConfig(
|
| 188 |
-
r=8,
|
| 189 |
-
lora_alpha=16,
|
| 190 |
-
lora_dropout=0.05,
|
| 191 |
-
target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
|
| 192 |
-
bias="none",
|
| 193 |
-
task_type="CAUSAL_LM",
|
| 194 |
-
)
|
| 195 |
-
|
| 196 |
-
# Load training dataset
|
| 197 |
-
print("\nLoading training dataset (streaming)...")
|
| 198 |
-
dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
|
| 199 |
-
|
| 200 |
-
print("Preparing examples...")
|
| 201 |
-
examples = []
|
| 202 |
-
for i, ex in enumerate(dataset):
|
| 203 |
-
if i >= 500:
|
| 204 |
-
break
|
| 205 |
-
text = tokenizer.apply_chat_template(
|
| 206 |
-
ex["messages"],
|
| 207 |
-
tokenize=False,
|
| 208 |
-
add_generation_prompt=False,
|
| 209 |
-
)
|
| 210 |
-
examples.append({"text": text})
|
| 211 |
-
|
| 212 |
-
print(f"Loaded {len(examples)} training examples")
|
| 213 |
-
train_dataset = Dataset.from_list(examples)
|
| 214 |
-
|
| 215 |
-
# Training config - 150 steps (proven optimal)
|
| 216 |
-
training_args = SFTConfig(
|
| 217 |
-
output_dir="./output",
|
| 218 |
-
max_steps=150, # Proven optimal - 200 regresses
|
| 219 |
-
per_device_train_batch_size=2,
|
| 220 |
-
gradient_accumulation_steps=4,
|
| 221 |
-
learning_rate=5e-6,
|
| 222 |
-
lr_scheduler_type="cosine",
|
| 223 |
-
warmup_steps=10,
|
| 224 |
-
logging_steps=25,
|
| 225 |
-
save_steps=150,
|
| 226 |
-
fp16=True,
|
| 227 |
-
gradient_checkpointing=True,
|
| 228 |
-
push_to_hub=False, # We'll push manually after eval
|
| 229 |
-
report_to="none",
|
| 230 |
-
)
|
| 231 |
-
|
| 232 |
-
# Create trainer
|
| 233 |
-
print("\nInitializing trainer...")
|
| 234 |
-
trainer = SFTTrainer(
|
| 235 |
-
model=base_model,
|
| 236 |
-
args=training_args,
|
| 237 |
-
train_dataset=train_dataset,
|
| 238 |
-
peft_config=lora_config,
|
| 239 |
-
processing_class=tokenizer,
|
| 240 |
-
)
|
| 241 |
-
|
| 242 |
-
# Train
|
| 243 |
-
print("\n" + "=" * 60)
|
| 244 |
-
print("PHASE 1: Training (150 steps)")
|
| 245 |
-
print("=" * 60)
|
| 246 |
-
trainer.train()
|
| 247 |
-
|
| 248 |
-
# Save trained model locally
|
| 249 |
-
print("\nSaving trained model locally...")
|
| 250 |
-
trainer.save_model("./trained_model")
|
| 251 |
-
tokenizer.save_pretrained("./trained_model")
|
| 252 |
-
|
| 253 |
-
# Test the fine-tuned model
|
| 254 |
-
print("\n" + "=" * 60)
|
| 255 |
-
print("PHASE 2: Testing Fine-tuned Model")
|
| 256 |
-
print("=" * 60)
|
| 257 |
-
|
| 258 |
-
trained_model = trainer.model
|
| 259 |
-
trained_model.train(False)
|
| 260 |
-
|
| 261 |
-
finetuned_score = test_model(
|
| 262 |
-
trained_model, tokenizer, "Fine-tuned Qwen3-0.6B (150 steps)"
|
| 263 |
-
)
|
| 264 |
-
|
| 265 |
-
# Upload to Hub
|
| 266 |
-
print("\n" + "=" * 60)
|
| 267 |
-
print("PHASE 3: Uploading to HuggingFace Hub")
|
| 268 |
-
print("=" * 60)
|
| 269 |
-
|
| 270 |
-
try:
|
| 271 |
-
# Push model
|
| 272 |
-
print(f"Pushing model to {REPO_ID}...")
|
| 273 |
-
trained_model.push_to_hub(REPO_ID, token=HF_TOKEN)
|
| 274 |
-
tokenizer.push_to_hub(REPO_ID, token=HF_TOKEN)
|
| 275 |
-
print(f"Model uploaded successfully!")
|
| 276 |
-
print(f"URL: https://huggingface.co/{REPO_ID}")
|
| 277 |
-
upload_success = True
|
| 278 |
-
except Exception as e:
|
| 279 |
-
print(f"Upload failed: {e}")
|
| 280 |
-
upload_success = False
|
| 281 |
-
|
| 282 |
-
# Summary
|
| 283 |
-
print("\n" + "=" * 60)
|
| 284 |
-
print("SUMMARY")
|
| 285 |
-
print("=" * 60)
|
| 286 |
-
print(f"Baseline (from earlier): 27.44%")
|
| 287 |
-
print(f"Fine-tuned (150 steps): {finetuned_score:.2f}%")
|
| 288 |
-
|
| 289 |
-
if finetuned_score > 27.44:
|
| 290 |
-
print(f"IMPROVEMENT: +{finetuned_score - 27.44:.2f}%")
|
| 291 |
-
print("SUCCESS! Fine-tuned model beats baseline!")
|
| 292 |
-
else:
|
| 293 |
-
print(f"DIFFERENCE: {finetuned_score - 27.44:.2f}%")
|
| 294 |
-
print("Fine-tuned model did not beat baseline.")
|
| 295 |
-
|
| 296 |
-
print(f"\nUpload status: {'SUCCESS' if upload_success else 'FAILED'}")
|
| 297 |
-
if upload_success:
|
| 298 |
-
print(f"Model URL: https://huggingface.co/{REPO_ID}")
|
| 299 |
-
print("=" * 60)
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
if __name__ == "__main__":
|
| 303 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_test_upload_v2.py
DELETED
|
@@ -1,303 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers>=4.45.0",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl>=0.12.0",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "huggingface_hub",
|
| 11 |
-
# ]
|
| 12 |
-
# ///
|
| 13 |
-
"""
|
| 14 |
-
Combined training, testing, and upload script.
|
| 15 |
-
Trains Qwen3-0.6B on codeforces-cots (200 steps), tests on HumanEval, uploads to Hub.
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
import re
|
| 20 |
-
import subprocess
|
| 21 |
-
import tempfile
|
| 22 |
-
from datasets import load_dataset, Dataset
|
| 23 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 24 |
-
from peft import LoraConfig
|
| 25 |
-
from trl import SFTTrainer, SFTConfig
|
| 26 |
-
from huggingface_hub import login, HfApi
|
| 27 |
-
import torch
|
| 28 |
-
|
| 29 |
-
# Authenticate with HF Hub at the start
|
| 30 |
-
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 31 |
-
if HF_TOKEN:
|
| 32 |
-
login(token=HF_TOKEN)
|
| 33 |
-
print("HF Hub authenticated successfully!")
|
| 34 |
-
else:
|
| 35 |
-
print("WARNING: No HF_TOKEN found in environment")
|
| 36 |
-
|
| 37 |
-
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval"
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def extract_function_body(response: str) -> str:
|
| 41 |
-
"""Extract just the function body from model response."""
|
| 42 |
-
response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)
|
| 43 |
-
response = response.strip()
|
| 44 |
-
|
| 45 |
-
code_match = re.search(r"```python\s*(.*?)```", response, re.DOTALL)
|
| 46 |
-
if code_match:
|
| 47 |
-
response = code_match.group(1)
|
| 48 |
-
else:
|
| 49 |
-
code_match = re.search(r"```\s*(.*?)```", response, re.DOTALL)
|
| 50 |
-
if code_match:
|
| 51 |
-
response = code_match.group(1)
|
| 52 |
-
|
| 53 |
-
response = response.strip()
|
| 54 |
-
lines = response.split("\n")
|
| 55 |
-
|
| 56 |
-
start_idx = 0
|
| 57 |
-
for i, line in enumerate(lines):
|
| 58 |
-
if line.strip().startswith("def "):
|
| 59 |
-
start_idx = i
|
| 60 |
-
break
|
| 61 |
-
|
| 62 |
-
start_idx += 1
|
| 63 |
-
|
| 64 |
-
if start_idx < len(lines):
|
| 65 |
-
stripped = lines[start_idx].strip()
|
| 66 |
-
if stripped.startswith('"""') or stripped.startswith("'''"):
|
| 67 |
-
quote = stripped[:3]
|
| 68 |
-
if stripped.count(quote) >= 2:
|
| 69 |
-
start_idx += 1
|
| 70 |
-
else:
|
| 71 |
-
start_idx += 1
|
| 72 |
-
while start_idx < len(lines) and quote not in lines[start_idx]:
|
| 73 |
-
start_idx += 1
|
| 74 |
-
start_idx += 1
|
| 75 |
-
|
| 76 |
-
body_lines = lines[start_idx:]
|
| 77 |
-
return "\n".join(body_lines)
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str):
|
| 81 |
-
"""Run the test using subprocess."""
|
| 82 |
-
full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"
|
| 83 |
-
|
| 84 |
-
with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f:
|
| 85 |
-
f.write(full_code)
|
| 86 |
-
temp_path = f.name
|
| 87 |
-
|
| 88 |
-
try:
|
| 89 |
-
result = subprocess.run(
|
| 90 |
-
["python", temp_path], capture_output=True, text=True, timeout=10
|
| 91 |
-
)
|
| 92 |
-
return result.returncode == 0
|
| 93 |
-
except subprocess.TimeoutExpired:
|
| 94 |
-
return False
|
| 95 |
-
except Exception:
|
| 96 |
-
return False
|
| 97 |
-
finally:
|
| 98 |
-
try:
|
| 99 |
-
os.unlink(temp_path)
|
| 100 |
-
except:
|
| 101 |
-
pass
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def test_model(model, tokenizer, model_name="Model"):
|
| 105 |
-
"""Test model on HumanEval."""
|
| 106 |
-
print(f"\n{'=' * 60}")
|
| 107 |
-
print(f"Testing: {model_name}")
|
| 108 |
-
print("=" * 60)
|
| 109 |
-
|
| 110 |
-
dataset = load_dataset("openai/openai_humaneval", split="test")
|
| 111 |
-
print(f"Total problems: {len(dataset)}")
|
| 112 |
-
|
| 113 |
-
passed = 0
|
| 114 |
-
failed = 0
|
| 115 |
-
|
| 116 |
-
for i, problem in enumerate(dataset):
|
| 117 |
-
prompt = problem["prompt"]
|
| 118 |
-
test = problem["test"]
|
| 119 |
-
entry_point = problem["entry_point"]
|
| 120 |
-
|
| 121 |
-
messages = [
|
| 122 |
-
{
|
| 123 |
-
"role": "user",
|
| 124 |
-
"content": f"Complete this Python function. Output only the code.\n\n{prompt}",
|
| 125 |
-
}
|
| 126 |
-
]
|
| 127 |
-
|
| 128 |
-
text = tokenizer.apply_chat_template(
|
| 129 |
-
messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
| 133 |
-
|
| 134 |
-
with torch.no_grad():
|
| 135 |
-
outputs = model.generate(
|
| 136 |
-
**inputs,
|
| 137 |
-
max_new_tokens=512,
|
| 138 |
-
do_sample=False,
|
| 139 |
-
pad_token_id=tokenizer.eos_token_id,
|
| 140 |
-
)
|
| 141 |
-
|
| 142 |
-
response = tokenizer.decode(
|
| 143 |
-
outputs[0][inputs.input_ids.shape[1] :], skip_special_tokens=True
|
| 144 |
-
)
|
| 145 |
-
|
| 146 |
-
completion = extract_function_body(response)
|
| 147 |
-
success = run_test_subprocess(prompt, completion, test, entry_point)
|
| 148 |
-
|
| 149 |
-
if success:
|
| 150 |
-
passed += 1
|
| 151 |
-
else:
|
| 152 |
-
failed += 1
|
| 153 |
-
|
| 154 |
-
if (i + 1) % 20 == 0 or i == len(dataset) - 1:
|
| 155 |
-
print(
|
| 156 |
-
f"Progress: {i + 1}/{len(dataset)} | Pass: {passed} | Fail: {failed} | Rate: {passed / (i + 1) * 100:.1f}%"
|
| 157 |
-
)
|
| 158 |
-
|
| 159 |
-
final_score = passed / len(dataset) * 100
|
| 160 |
-
print(f"\nFINAL: {passed}/{len(dataset)} = {final_score:.2f}%")
|
| 161 |
-
return final_score
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
def main():
|
| 165 |
-
print("=" * 60)
|
| 166 |
-
print("Combined Training, Testing & Upload")
|
| 167 |
-
print("200 steps - testing if more training helps")
|
| 168 |
-
print("=" * 60)
|
| 169 |
-
|
| 170 |
-
model_name = "Qwen/Qwen3-0.6B"
|
| 171 |
-
|
| 172 |
-
# Load tokenizer
|
| 173 |
-
print("\nLoading tokenizer...")
|
| 174 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 175 |
-
if tokenizer.pad_token is None:
|
| 176 |
-
tokenizer.pad_token = tokenizer.eos_token
|
| 177 |
-
|
| 178 |
-
# Load base model
|
| 179 |
-
print("Loading base model...")
|
| 180 |
-
base_model = AutoModelForCausalLM.from_pretrained(
|
| 181 |
-
model_name,
|
| 182 |
-
torch_dtype=torch.float16,
|
| 183 |
-
device_map="auto",
|
| 184 |
-
)
|
| 185 |
-
|
| 186 |
-
# LoRA config
|
| 187 |
-
lora_config = LoraConfig(
|
| 188 |
-
r=8,
|
| 189 |
-
lora_alpha=16,
|
| 190 |
-
lora_dropout=0.05,
|
| 191 |
-
target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
|
| 192 |
-
bias="none",
|
| 193 |
-
task_type="CAUSAL_LM",
|
| 194 |
-
)
|
| 195 |
-
|
| 196 |
-
# Load training dataset
|
| 197 |
-
print("\nLoading training dataset (streaming)...")
|
| 198 |
-
dataset = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)
|
| 199 |
-
|
| 200 |
-
print("Preparing examples...")
|
| 201 |
-
examples = []
|
| 202 |
-
for i, ex in enumerate(dataset):
|
| 203 |
-
if i >= 500:
|
| 204 |
-
break
|
| 205 |
-
text = tokenizer.apply_chat_template(
|
| 206 |
-
ex["messages"],
|
| 207 |
-
tokenize=False,
|
| 208 |
-
add_generation_prompt=False,
|
| 209 |
-
)
|
| 210 |
-
examples.append({"text": text})
|
| 211 |
-
|
| 212 |
-
print(f"Loaded {len(examples)} training examples")
|
| 213 |
-
train_dataset = Dataset.from_list(examples)
|
| 214 |
-
|
| 215 |
-
# Training config - 200 steps (testing if more helps)
|
| 216 |
-
training_args = SFTConfig(
|
| 217 |
-
output_dir="./output",
|
| 218 |
-
max_steps=200, # Increased from 150
|
| 219 |
-
per_device_train_batch_size=2,
|
| 220 |
-
gradient_accumulation_steps=4,
|
| 221 |
-
learning_rate=5e-6,
|
| 222 |
-
lr_scheduler_type="cosine",
|
| 223 |
-
warmup_steps=10,
|
| 224 |
-
logging_steps=25,
|
| 225 |
-
save_steps=200,
|
| 226 |
-
fp16=True,
|
| 227 |
-
gradient_checkpointing=True,
|
| 228 |
-
push_to_hub=False, # We'll push manually after eval
|
| 229 |
-
report_to="none",
|
| 230 |
-
)
|
| 231 |
-
|
| 232 |
-
# Create trainer
|
| 233 |
-
print("\nInitializing trainer...")
|
| 234 |
-
trainer = SFTTrainer(
|
| 235 |
-
model=base_model,
|
| 236 |
-
args=training_args,
|
| 237 |
-
train_dataset=train_dataset,
|
| 238 |
-
peft_config=lora_config,
|
| 239 |
-
processing_class=tokenizer,
|
| 240 |
-
)
|
| 241 |
-
|
| 242 |
-
# Train
|
| 243 |
-
print("\n" + "=" * 60)
|
| 244 |
-
print("PHASE 1: Training (200 steps)")
|
| 245 |
-
print("=" * 60)
|
| 246 |
-
trainer.train()
|
| 247 |
-
|
| 248 |
-
# Save trained model locally
|
| 249 |
-
print("\nSaving trained model locally...")
|
| 250 |
-
trainer.save_model("./trained_model")
|
| 251 |
-
tokenizer.save_pretrained("./trained_model")
|
| 252 |
-
|
| 253 |
-
# Test the fine-tuned model
|
| 254 |
-
print("\n" + "=" * 60)
|
| 255 |
-
print("PHASE 2: Testing Fine-tuned Model")
|
| 256 |
-
print("=" * 60)
|
| 257 |
-
|
| 258 |
-
trained_model = trainer.model
|
| 259 |
-
trained_model.train(False)
|
| 260 |
-
|
| 261 |
-
finetuned_score = test_model(
|
| 262 |
-
trained_model, tokenizer, "Fine-tuned Qwen3-0.6B (200 steps)"
|
| 263 |
-
)
|
| 264 |
-
|
| 265 |
-
# Upload to Hub
|
| 266 |
-
print("\n" + "=" * 60)
|
| 267 |
-
print("PHASE 3: Uploading to HuggingFace Hub")
|
| 268 |
-
print("=" * 60)
|
| 269 |
-
|
| 270 |
-
try:
|
| 271 |
-
# Push model
|
| 272 |
-
print(f"Pushing model to {REPO_ID}...")
|
| 273 |
-
trained_model.push_to_hub(REPO_ID, token=HF_TOKEN)
|
| 274 |
-
tokenizer.push_to_hub(REPO_ID, token=HF_TOKEN)
|
| 275 |
-
print(f"Model uploaded successfully!")
|
| 276 |
-
print(f"URL: https://huggingface.co/{REPO_ID}")
|
| 277 |
-
upload_success = True
|
| 278 |
-
except Exception as e:
|
| 279 |
-
print(f"Upload failed: {e}")
|
| 280 |
-
upload_success = False
|
| 281 |
-
|
| 282 |
-
# Summary
|
| 283 |
-
print("\n" + "=" * 60)
|
| 284 |
-
print("SUMMARY")
|
| 285 |
-
print("=" * 60)
|
| 286 |
-
print(f"Baseline (from earlier): 27.44%")
|
| 287 |
-
print(f"Fine-tuned (200 steps): {finetuned_score:.2f}%")
|
| 288 |
-
|
| 289 |
-
if finetuned_score > 27.44:
|
| 290 |
-
print(f"IMPROVEMENT: +{finetuned_score - 27.44:.2f}%")
|
| 291 |
-
print("SUCCESS! Fine-tuned model beats baseline!")
|
| 292 |
-
else:
|
| 293 |
-
print(f"DIFFERENCE: {finetuned_score - 27.44:.2f}%")
|
| 294 |
-
print("Fine-tuned model did not beat baseline.")
|
| 295 |
-
|
| 296 |
-
print(f"\nUpload status: {'SUCCESS' if upload_success else 'FAILED'}")
|
| 297 |
-
if upload_success:
|
| 298 |
-
print(f"Model URL: https://huggingface.co/{REPO_ID}")
|
| 299 |
-
print("=" * 60)
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
if __name__ == "__main__":
|
| 303 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_test_upload_v3.py
DELETED
|
@@ -1,336 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# requires-python = ">=3.10"
|
| 3 |
-
# dependencies = [
|
| 4 |
-
# "torch",
|
| 5 |
-
# "transformers>=4.45.0",
|
| 6 |
-
# "accelerate",
|
| 7 |
-
# "datasets",
|
| 8 |
-
# "trl>=0.12.0",
|
| 9 |
-
# "peft",
|
| 10 |
-
# "huggingface_hub",
|
| 11 |
-
# ]
|
| 12 |
-
# ///
|
| 13 |
-
"""
|
| 14 |
-
Combined training, testing, and upload script.
|
| 15 |
-
Trains Qwen3-0.6B on codeforces-cots (150 steps - proven optimal), tests on HumanEval, uploads to Hub.
|
| 16 |
-
"""
|
| 17 |
-
|
| 18 |
-
import os
|
| 19 |
-
import re
|
| 20 |
-
import subprocess
|
| 21 |
-
import tempfile
|
| 22 |
-
import random
|
| 23 |
-
import numpy as np
|
| 24 |
-
from datasets import load_dataset, Dataset
|
| 25 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
|
| 26 |
-
from peft import LoraConfig
|
| 27 |
-
from trl import SFTTrainer, SFTConfig
|
| 28 |
-
from huggingface_hub import login, HfApi
|
| 29 |
-
import torch
|
| 30 |
-
|
| 31 |
-
# Pin every random number generator the run touches (Python, NumPy, PyTorch
# CPU and CUDA, plus the transformers helper) to one shared seed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
set_seed(SEED)

# Log in to the Hugging Face Hub up front when a token is provided through
# the environment; without one the script only prints a warning.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("WARNING: No HF_TOKEN found in environment")
else:
    login(token=HF_TOKEN)
    print("HF Hub authenticated successfully!")

# Hub repository that receives the fine-tuned model.
REPO_ID = "passagereptile455/qwen3-codeforces-humaneval"
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
def extract_function_body(response: str) -> str:
    """Extract just the function body from a model response.

    Processing steps:
      1. Remove any ``<think>...</think>`` section.
      2. If the response contains a fenced code block, keep only the contents
         of the first ``python`` fence, or of the first fence of any kind when
         no ``python`` fence exists.
      3. If a line starting with ``def `` is present, return everything after
         the first such line, also skipping a docstring that starts on the
         line directly below it.
      4. Otherwise the response is taken to be the body already and is
         returned with only surrounding blank lines removed, so its first
         statement and its indentation are kept.

    Args:
        response: Raw text generated by the model.

    Returns:
        The extracted body as a newline-joined string (possibly empty).
    """
    response = re.sub(r"<think>.*?</think>", "", response, flags=re.DOTALL)

    fenced = re.search(r"```python\s*(.*?)```", response, re.DOTALL) or re.search(
        r"```\s*(.*?)```", response, re.DOTALL
    )
    if fenced:
        response = fenced.group(1)

    # Trim trailing whitespace and leading blank lines only; the indentation
    # of the first real line must survive for body-only responses.
    lines = response.rstrip().split("\n")
    while lines and not lines[0].strip():
        lines.pop(0)

    def_idx = next(
        (i for i, line in enumerate(lines) if line.strip().startswith("def ")),
        None,
    )
    if def_idx is None:
        # No signature to skip: the model returned the body itself.
        return "\n".join(lines)

    start_idx = def_idx + 1

    # Skip a docstring that opens on the line right after the signature.
    if start_idx < len(lines):
        stripped = lines[start_idx].strip()
        if stripped.startswith('"""') or stripped.startswith("'''"):
            quote = stripped[:3]
            if stripped.count(quote) >= 2:
                # Opening and closing quotes are on the same line.
                start_idx += 1
            else:
                # Multi-line docstring: advance to the closing quotes.
                start_idx += 1
                while start_idx < len(lines) and quote not in lines[start_idx]:
                    start_idx += 1
                start_idx += 1

    return "\n".join(lines[start_idx:])
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
def run_test_subprocess(prompt: str, completion: str, test: str, entry_point: str) -> bool:
    """Run a candidate solution against its test in a separate Python process.

    The program executed is ``prompt + completion``, followed by ``test`` and
    a final ``check(<entry_point>)`` call. It is written to a temporary file
    that is removed again afterwards.

    Args:
        prompt: Source text placed before the completion.
        completion: Generated code appended directly to ``prompt``.
        test: Source text that defines ``check``.
        entry_point: Name of the function passed to ``check``.

    Returns:
        True if the child process exits with status 0; False on a non-zero
        exit, on exceeding the 10 second timeout, or on any error while
        launching or reading from the process.
    """
    # Imported locally so the block does not depend on a file-level import.
    import sys

    full_code = prompt + completion + "\n" + test + f"\ncheck({entry_point})"

    # Python reads source files as UTF-8, so write the file with that
    # encoding rather than the platform default.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".py", delete=False, encoding="utf-8"
    ) as f:
        f.write(full_code)
        temp_path = f.name

    try:
        # sys.executable is the interpreter running this script; a bare
        # "python" may be missing from PATH or point at another installation.
        result = subprocess.run(
            [sys.executable, temp_path], capture_output=True, text=True, timeout=10
        )
        return result.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    except Exception:
        # Deliberate best effort: any harness failure counts as a failed test.
        return False
    finally:
        try:
            os.unlink(temp_path)
        except OSError:
            # The temp file is already gone or cannot be removed; ignore.
            pass
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
def test_model(model, tokenizer, model_name="Model"):
    """Score a model on the HumanEval test split.

    Each problem is sent as a single user message through the tokenizer's
    chat template with ``enable_thinking=False``, decoded greedily for up to
    512 new tokens, reduced to a function body and executed against the
    problem's test in a subprocess.

    Args:
        model: Model object providing ``generate`` and ``device``.
        tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
        model_name: Label used in the printed report.

    Returns:
        Percentage of problems passed, as a float.
    """
    rule = "=" * 60
    print("\n" + rule)
    print(f"Testing: {model_name}")
    print(rule)

    problems = load_dataset("openai/openai_humaneval", split="test")
    total = len(problems)
    print(f"Total problems: {total}")

    passed = 0
    for done, problem in enumerate(problems, start=1):
        prompt = problem["prompt"]

        chat = [
            {
                "role": "user",
                "content": f"Complete this Python function. Output only the code.\n\n{prompt}",
            }
        ]
        rendered = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        inputs = tokenizer(rendered, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the tokens generated after the prompt.
        prompt_len = inputs.input_ids.shape[1]
        response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

        completion = extract_function_body(response)
        if run_test_subprocess(prompt, completion, problem["test"], problem["entry_point"]):
            passed += 1
        failed = done - passed

        # Report every 20 problems and once more after the last one.
        if done % 20 == 0 or done == total:
            print(
                f"Progress: {done}/{total} | Pass: {passed} | Fail: {failed} | Rate: {passed / done * 100:.1f}%"
            )

    final_score = passed / total * 100
    print(f"\nFINAL: {passed}/{total} = {final_score:.2f}%")
    return final_score
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
def main():
    """Evaluate the base model, fine-tune it with LoRA, re-evaluate and upload.

    Phases, in order:
      1. Score the untouched base model on HumanEval.
      2. Train a LoRA adapter for 150 steps on the first 500 streamed
         examples of ``open-r1/codeforces-cots``.
      3. Score the fine-tuned model with the same procedure.
      4. Push model and tokenizer to ``REPO_ID`` only if the fine-tuned score
         is strictly higher than the base score.
    A summary comparing both scores is printed at the end.
    """
    rule = "=" * 60

    def banner(*titles, gap=True):
        # Print a ruled heading; ``gap`` adds the blank line used before
        # every phase heading.
        print("\n" + rule if gap else rule)
        for title in titles:
            print(title)
        print(rule)

    banner(
        "Combined Training, Testing & Upload",
        "150 steps - with SAME-RUN baseline comparison",
        gap=False,
    )

    model_name = "Qwen/Qwen3-0.6B"
    load_kwargs = {"torch_dtype": torch.float16, "device_map": "auto"}

    # Load tokenizer
    print("\nLoading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load base model
    print("Loading base model...")
    baseline_model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # PHASE 1: score the base model in this same run for a like-for-like baseline
    banner("PHASE 1: Testing BASE Model (for fair comparison)")
    base_score = test_model(baseline_model, tokenizer, "Base Qwen3-0.6B")

    # LoRA adapter on the attention projections
    adapter_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Training data: first 500 streamed conversations, rendered to plain text
    print("\nLoading training dataset (streaming)...")
    stream = load_dataset("open-r1/codeforces-cots", split="train", streaming=True)

    print("Preparing examples...")
    examples = [
        {
            "text": tokenizer.apply_chat_template(
                record["messages"],
                tokenize=False,
                add_generation_prompt=False,
            )
        }
        for _, record in zip(range(500), stream)
    ]

    print(f"Loaded {len(examples)} training examples")
    train_dataset = Dataset.from_list(examples)

    # Training config - 150 steps (proven optimal)
    training_args = SFTConfig(
        output_dir="./output",
        max_steps=150,  # Proven optimal - 200 regresses
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=5e-6,
        lr_scheduler_type="cosine",
        warmup_steps=10,
        logging_steps=25,
        save_steps=150,
        fp16=True,
        gradient_checkpointing=True,
        push_to_hub=False,  # Pushed manually after evaluation
        report_to="none",
        seed=42,  # Fixed seed for reproducibility
    )

    # Free the evaluated model and train on a freshly loaded copy
    print("\nReloading model for training...")
    del baseline_model
    torch.cuda.empty_cache()

    train_model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

    # Create trainer
    print("\nInitializing trainer...")
    trainer = SFTTrainer(
        model=train_model,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=adapter_config,
        processing_class=tokenizer,
    )

    # PHASE 2: train
    banner("PHASE 2: Training (150 steps)")
    trainer.train()

    # Save trained model locally
    print("\nSaving trained model locally...")
    trainer.save_model("./trained_model")
    tokenizer.save_pretrained("./trained_model")

    # PHASE 3: score the fine-tuned model
    banner("PHASE 3: Testing Fine-tuned Model")

    trained_model = trainer.model
    trained_model.train(False)  # leave training mode before generating

    finetuned_score = test_model(
        trained_model, tokenizer, "Fine-tuned Qwen3-0.6B (150 steps)"
    )

    # PHASE 4: upload only when the fine-tuned model beats the baseline
    banner("PHASE 4: Uploading to HuggingFace Hub")

    upload_success = False
    if not finetuned_score > base_score:
        print("Fine-tuned model did NOT beat baseline - skipping upload")
    else:
        try:
            print(f"Pushing model to {REPO_ID}...")
            trained_model.push_to_hub(REPO_ID, token=HF_TOKEN)
            tokenizer.push_to_hub(REPO_ID, token=HF_TOKEN)
            print("Model uploaded successfully!")
            print(f"URL: https://huggingface.co/{REPO_ID}")
            upload_success = True
        except Exception as e:
            print(f"Upload failed: {e}")

    # Summary - same-run comparison
    banner("SUMMARY (Same-Run Comparison)")
    print(f"Base model (this run): {base_score:.2f}%")
    print(f"Fine-tuned (150 steps): {finetuned_score:.2f}%")
    diff = finetuned_score - base_score

    if diff > 0:
        print(f"IMPROVEMENT: +{diff:.2f}%")
        print("SUCCESS! Fine-tuned model beats baseline!")
    elif diff == 0:
        print("NO CHANGE: Same as baseline")
    else:
        print(f"REGRESSION: {diff:.2f}%")
        print("Fine-tuned model is WORSE than baseline.")

    upload_status = "SUCCESS" if upload_success else "SKIPPED/FAILED"
    print(f"\nUpload status: {upload_status}")
    if upload_success:
        print(f"Model URL: https://huggingface.co/{REPO_ID}")
    print(rule)
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
if __name__ == "__main__":
|
| 336 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
train_v5_fixed.py
DELETED
|
@@ -1,129 +0,0 @@
|
|
| 1 |
-
# /// script
|
| 2 |
-
# dependencies = [
|
| 3 |
-
# "trl>=0.12.0",
|
| 4 |
-
# "peft>=0.7.0",
|
| 5 |
-
# "transformers>=4.36.0",
|
| 6 |
-
# "accelerate>=0.24.0",
|
| 7 |
-
# "trackio",
|
| 8 |
-
# "datasets",
|
| 9 |
-
# ]
|
| 10 |
-
# ///
|
| 11 |
-
|
| 12 |
-
"""
|
| 13 |
-
Training with proper dataset formatting
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import sys
|
| 17 |
-
import traceback
|
| 18 |
-
from datasets import load_dataset, Dataset
|
| 19 |
-
from peft import LoraConfig
|
| 20 |
-
from trl import SFTTrainer, SFTConfig
|
| 21 |
-
from transformers import AutoTokenizer
|
| 22 |
-
import torch
|
| 23 |
-
|
| 24 |
-
print("=" * 50)
print("FIXED TRAINING v5")
print("=" * 50)

# The whole run sits in one try block so that any failure prints a traceback
# and ends the process with a non-zero exit status.
try:
    print(f"CUDA: {torch.cuda.is_available()}")

    # Stream the dataset and keep only the first 1000 records.
    print("Streaming codeforces-cots...")
    streaming_ds = load_dataset(
        "open-r1/codeforces-cots", split="train", streaming=True
    )

    print("Collecting 1000 examples...")
    examples = [ex for _, ex in zip(range(1000), streaming_ds)]

    print(f"Collected {len(examples)} examples")
    dataset = Dataset.from_list(examples)
    print(f"Dataset columns: {dataset.column_names}")

    # Preview the first conversation. Convert to text before truncating;
    # slicing the message list itself would not shorten the output.
    print(f"First messages sample: {str(dataset[0]['messages'])[:100]}...")

    # Load tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    def format_messages(example):
        """Render one conversation to training text with the model's chat template."""
        # Use the tokenizer's own template so the training text carries the
        # role markers the model is prompted with at inference time, instead
        # of hand-written tags the tokenizer does not treat as special.
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
        return {"text": text}

    print("Formatting dataset...")
    dataset = dataset.map(format_messages, remove_columns=dataset.column_names)
    print(f"Formatted. Sample: {dataset[0]['text'][:200]}...")

    # Training configuration
    config = SFTConfig(
        output_dir="qwen3-codeforces",
        push_to_hub=True,
        hub_model_id="passagereptile455/qwen3-0.6b-humaneval-job1",
        hub_strategy="every_save",
        max_steps=200,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        learning_rate=5e-6,
        max_length=512,
        logging_steps=20,
        save_strategy="steps",
        save_steps=100,
        save_total_limit=1,
        eval_strategy="no",
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        gradient_checkpointing=True,
        bf16=True,
        dataset_text_field="text",  # Column produced by format_messages
        report_to="trackio",
        project="qwen3-humaneval",
        run_name="job1-v5",
    )

    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"],
    )

    print("Creating trainer...")
    trainer = SFTTrainer(
        model="Qwen/Qwen3-0.6B",
        train_dataset=dataset,
        args=config,
        peft_config=peft_config,
        # Pass the tokenizer prepared above so its pad-token setting is the
        # one used for training.
        processing_class=tokenizer,
    )

    print("Training (200 steps)...")
    trainer.train()

    print("Pushing to Hub...")
    trainer.push_to_hub()

    print("=" * 50)
    print("SUCCESS!")
    print("=" * 50)

except Exception as e:
    print(f"ERROR: {e}")
    traceback.print_exc()
    sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|