td-builder
/

td-toolkit

Safetensors

Model card Files Files and versions

xet

Community

td-builder committed on Feb 26

Commit

5f2f755

verified ·

1 Parent(s): 7cf8e19

Upload 137 files

Browse files

Files changed (2) hide show

hugging/td_fuse/validate.py +57 -9
hugging/td_lang/compiler.py +80 -75

hugging/td_fuse/validate.py CHANGED Viewed

@@ -155,6 +155,45 @@ def compute_perplexity(
     return perplexity
 def test_thinking_mode(
     model: AutoModelForCausalLM,
     tokenizer: AutoTokenizer,
@@ -167,16 +206,21 @@ def test_thinking_mode(
     """
     prompt = "Solve step by step: What is 15 × 13?"
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
-            max_new_tokens=200,
-            temperature=0.7,
             do_sample=True,
         )
-    response = tokenizer.decode(outputs[0], skip_special_tokens=False)
     # Check for thinking tags
     has_think_open = "<think>" in response
@@ -185,7 +229,7 @@ def test_thinking_mode(
     print(f"\n[validate] Thinking mode test:")
     print(f"  Prompt:    {prompt}")
-    print(f"  Response:  {response[:200]}...")
     print(f"  <think>:   {'✓ found' if has_think_open else '✗ missing'}")
     print(f"  </think>:  {'✓ found' if has_think_close else '✗ missing'}")
     print(f"  Status:    {'✓ PASS' if passed else '✗ FAIL'}")
@@ -201,26 +245,30 @@ def test_reasoning(
     Quick reasoning sanity check — can the model still do basic math?
     This catches catastrophic failures where the merge produced gibberish.
     """
     prompt = "What is 7 + 8?"
     expected_answer = "15"
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=50,
-            temperature=0.1,
             do_sample=False,
         )
-    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     passed = expected_answer in response
     print(f"\n[validate] Quick reasoning test:")
     print(f"  Prompt:   {prompt}")
     print(f"  Expected: {expected_answer}")
-    print(f"  Got:      {response}")
     print(f"  Status:   {'✓ PASS' if passed else '✗ FAIL'}")
     return passed

     return perplexity
+def _format_chat_prompt(tokenizer, user_message: str, enable_thinking: bool = True) -> dict:
+    """
+    Format a single-turn user prompt in Qwen3's chat format and tokenize it.
+
+    Qwen3 models expect messages in chat format — without it, the model
+    just autocompletes the text instead of answering.
+
+    Args:
+        tokenizer: The tokenizer (or processor.tokenizer for VL models)
+        user_message: The user's question
+        enable_thinking: If True, allow <think> tags. If False, request
+            non-thinking mode from the chat template; the manual fallback
+            pre-fills an empty <think></think> block in the assistant turn,
+            mirroring what Qwen3's own template emits for
+            enable_thinking=False.
+
+    Returns:
+        Dict-like tokenizer output (as returned by calling the tokenizer
+        with return_tensors="pt") ready for model.generate()
+    """
+    messages = [{"role": "user", "content": user_message}]
+    # Try using the chat template (Qwen3 has one built in). Only the template
+    # call is guarded: a tokenization failure would hit the fallback too, so
+    # it is allowed to propagate instead of being masked.
+    try:
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=enable_thinking,
+        )
+    except Exception as exc:
+        # Best-effort fallback: keep validating, but say why the template
+        # was not used so a misconfigured tokenizer is visible in the log.
+        print(f"[validate] Chat template unavailable ({exc}); using manual Qwen3 format")
+        text = f"<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
+        if not enable_thinking:
+            # "/no_think" is a user/system-side soft switch; placing it in the
+            # assistant turn makes the model continue from it as if it were
+            # its own output. An empty think block disables thinking instead.
+            text += "<think>\n\n</think>\n\n"
+    return tokenizer(text, return_tensors="pt")
 def test_thinking_mode(
     model: AutoModelForCausalLM,
     tokenizer: AutoTokenizer,
     """
     prompt = "Solve step by step: What is 15 × 13?"
+    inputs = _format_chat_prompt(tokenizer, prompt, enable_thinking=True)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
+            max_new_tokens=300,
             do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
         )
+    # Decode only the NEW tokens (skip the prompt)
+    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=False)
     # Check for thinking tags
     has_think_open = "<think>" in response
     print(f"\n[validate] Thinking mode test:")
     print(f"  Prompt:    {prompt}")
+    print(f"  Response:  {response[:300]}...")
     print(f"  <think>:   {'✓ found' if has_think_open else '✗ missing'}")
     print(f"  </think>:  {'✓ found' if has_think_close else '✗ missing'}")
     print(f"  Status:    {'✓ PASS' if passed else '✗ FAIL'}")
     Quick reasoning sanity check — can the model still do basic math?
     This catches catastrophic failures where the merge produced gibberish.
+    Uses /no_think mode so the model answers directly without chain-of-thought.
     """
     prompt = "What is 7 + 8?"
     expected_answer = "15"
+    inputs = _format_chat_prompt(tokenizer, prompt, enable_thinking=False)
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
     with torch.no_grad():
         outputs = model.generate(
             **inputs,
             max_new_tokens=50,
             do_sample=False,
         )
+    # Decode only the NEW tokens (skip the prompt)
+    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
     passed = expected_answer in response
     print(f"\n[validate] Quick reasoning test:")
     print(f"  Prompt:   {prompt}")
     print(f"  Expected: {expected_answer}")
+    print(f"  Got:      {response[:200]}")
     print(f"  Status:   {'✓ PASS' if passed else '✗ FAIL'}")
     return passed

hugging/td_lang/compiler.py CHANGED Viewed

@@ -246,17 +246,27 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("]")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16, device_map='auto')")
         self._emit("model.eval()")
         self._emit("scores = []")
         self._emit("for p in prompts:")
         self._indent += 1
         self._emit("inputs = tok(p, return_tensors='pt').to(model.device)")
         self._emit("with torch.no_grad():")
         self._indent += 1
         self._emit("out = model.generate(**inputs, max_new_tokens=32, do_sample=False)")
         self._indent -= 1
-        self._emit("resp = tok.decode(out[0], skip_special_tokens=True)")
         self._emit("scores.append(len(resp))")
         self._indent -= 1
         self._emit("avg_len = sum(scores) / len(scores)")
@@ -266,6 +276,32 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("")
         if program.setup:
             self._emit_setup(program.setup)
@@ -486,14 +522,10 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent += 1
         self._emit(f'checkpoint = models["{cmd.target}"]["model_ref"]')
         self._indent -= 1
-        self._emit("from transformers import AutoModelForCausalLM, AutoTokenizer")
         self._emit("import torch, re, ast")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit('checkpoint, torch_dtype=torch.bfloat16, device_map="auto"')
-        self._indent -= 1
-        self._emit(")")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Mini-benchmark: math, code, reasoning, perplexity")
@@ -702,14 +734,10 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit('print("[td_lang] WARNING: No checkpoint - using model_ref instead.")')
         self._emit(f'checkpoint = models["{cmd.target}"]["model_ref"]')
         self._indent -= 1
-        self._emit("from transformers import AutoModelForCausalLM, AutoTokenizer")
         self._emit("import torch")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit('checkpoint, torch_dtype=torch.bfloat16, device_map="auto"')
-        self._indent -= 1
-        self._emit(")")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Self-diagnosis prompts (from TD interview findings test_12)")
@@ -724,12 +752,23 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("diagnose_results = []")
         self._emit("for prompt in diag_prompts:")
         self._indent += 1
         self._emit('inputs = tok(prompt, return_tensors="pt").to(model.device)')
         self._emit("with torch.no_grad():")
         self._indent += 1
         self._emit("output = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)")
         self._indent -= 1
-        self._emit("response = tok.decode(output[0], skip_special_tokens=True)")
         self._emit('diagnose_results.append({"prompt": prompt, "response": response})')
         self._emit('print(f"  Prompt: {prompt[:50]}...")')
         self._emit('print(f"  Response: {response[:200]}...")')
@@ -926,14 +965,10 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent += 1
         self._emit(f'checkpoint = models["{cmd.target}"]["model_ref"]')
         self._indent -= 1
-        self._emit("from transformers import AutoModelForCausalLM, AutoTokenizer")
         self._emit("import torch, random, re")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit('checkpoint, torch_dtype=torch.bfloat16, device_map="auto"')
-        self._indent -= 1
-        self._emit(")")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Use structured diagnosis if available (upgraded diagnose outputs top_weaknesses)")
@@ -1164,13 +1199,7 @@ DO NOT EDIT - regenerate from the .td file instead.
             self._indent -= 1
             self._emit(")")
             self._emit("")
-            self._emit("model = AutoModelForCausalLM.from_pretrained(")
-            self._indent += 1
-            self._emit("checkpoint,")
-            self._emit("quantization_config=bnb_config,")
-            self._emit('device_map="auto",')
-            self._indent -= 1
-            self._emit(")")
             self._emit("model = prepare_model_for_kbit_training(model)")
             self._emit("")
             self._emit("# LoRA adapters on mid-to-late layers (test_12: layers 16-28 for 32-layer)")
@@ -1395,11 +1424,7 @@ DO NOT EDIT - regenerate from the .td file instead.
             self._emit("bnb_4bit_use_double_quant=True,")
             self._indent -= 1
             self._emit(")")
-            self._emit("model = AutoModelForCausalLM.from_pretrained(")
-            self._indent += 1
-            self._emit("checkpoint, quantization_config=bnb_config, device_map='auto',")
-            self._indent -= 1
-            self._emit(")")
             self._emit("model = prepare_model_for_kbit_training(model)")
             self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
             self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],')
@@ -1500,11 +1525,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("from transformers import AutoModelForCausalLM, AutoTokenizer")
         self._emit("import torch, random, json")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit('checkpoint, torch_dtype=torch.bfloat16, device_map="auto"')
-        self._indent -= 1
-        self._emit(")")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Persona-based debate (test_14: single-model diversity protocol)")
@@ -1643,11 +1664,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit('"bnb_4bit_quant_type": "nf4",')
         self._indent -= 1
         self._emit("}")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit("checkpoint, device_map='auto', **bnb_config")
-        self._indent -= 1
-        self._emit(")")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
         self._emit("")
         # Parse layer spec into layers_to_transform
@@ -1879,7 +1896,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("manifest = json.load(f)")
         self._indent -= 1
         self._emit('base_ref = manifest.get("base_ref", ckpt_path)')
-        self._emit("model = AutoModelForCausalLM.from_pretrained(base_ref, torch_dtype=torch.float16, device_map='cuda')")
         self._emit('if manifest.get("fork_type") == "adapter":')
         self._indent += 1
         self._emit("from peft import PeftModel")
@@ -1889,7 +1906,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("elif os.path.isdir(ckpt_path):")
         self._indent += 1
         self._emit("# Loading from a HF-style directory")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(ckpt_path, torch_dtype=torch.float16, device_map='cuda')")
         self._indent -= 1
         self._emit("else:")
         self._indent += 1
@@ -1898,7 +1915,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("state = load_file(ckpt_path, device='cpu')")
         self._emit("# Need base model architecture - reload from original")
         self._emit(f'base_ref = models.get("__base_ref_{alias}", ckpt_path)')
-        self._emit("model = AutoModelForCausalLM.from_pretrained(base_ref, torch_dtype=torch.float16, device_map='cuda')")
         self._emit("try:")
         self._indent += 1
         self._emit("model.load_state_dict(state, strict=True, assign=True)")
@@ -3207,7 +3224,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         # Test source model
         self._emit(f'print("[td_lang] Loading source model: {source}...")')
         self._emit(f'_src_tok = AutoTokenizer.from_pretrained("{source}")')
-        self._emit(f'_src_model = AutoModelForCausalLM.from_pretrained("{source}", torch_dtype=torch.bfloat16, device_map="auto")')
         self._emit("_src_model.eval()")
         self._emit("")
         self._emit("_src_answers = {}")
@@ -3241,7 +3258,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit(f'_mrg_checkpoint = models["{alias}"]["model_ref"]')
         self._indent -= 1
         self._emit("_mrg_tok = AutoTokenizer.from_pretrained(_mrg_checkpoint)")
-        self._emit('_mrg_model = AutoModelForCausalLM.from_pretrained(_mrg_checkpoint, torch_dtype=torch.bfloat16, device_map="auto")')
         self._emit("_mrg_model.eval()")
         self._emit("")
         self._emit("_mrg_answers = {}")
@@ -3357,7 +3374,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit(f'_vfy_checkpoint = models["{alias}"]["model_ref"]')
         self._indent -= 1
         self._emit("_vfy_tok = AutoTokenizer.from_pretrained(_vfy_checkpoint)")
-        self._emit('_vfy_model = AutoModelForCausalLM.from_pretrained(_vfy_checkpoint, torch_dtype=torch.bfloat16, device_map="auto")')
         self._emit("_vfy_model.eval()")
         self._emit("")
@@ -3670,7 +3687,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,")
         self._indent -= 1
         self._emit(")")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
@@ -3773,7 +3790,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("correct_chains = []")
@@ -3833,7 +3850,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("")
         self._emit("# Step 2: Train on correct reasoning chains")
         self._emit("ds = Dataset.from_dict({'text': correct_chains})")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
@@ -3916,7 +3933,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("def _score_response(resp):")
@@ -3987,7 +4004,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("# Train on the best completions")
         self._emit(f'print(f"[td_lang] Training on {{len(best_completions)}} best-of-{cmd.n} completions...")')
         self._emit("ds = Dataset.from_dict({'text': best_completions})")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
@@ -4066,7 +4083,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# EXPLOIT: Generate MANY diverse solutions with HIGH temperature")
@@ -4170,7 +4187,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("# Train on ALL correct solutions (the controlled hack)")
         self._emit(f'print("[td_lang] Training on {{len(exploit_data)}} diverse correct solutions...")')
         self._emit("ds = Dataset.from_dict({'text': exploit_data})")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
@@ -4366,7 +4383,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         # Episode loop
@@ -4509,7 +4526,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit('print(f"[td_lang] Training on {len(training_texts)} reward-weighted experiences...")')
         self._emit("")
         self._emit("ds = Dataset.from_dict({'text': training_texts})")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
@@ -4957,7 +4974,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         # Build questions for this round
@@ -5080,7 +5097,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit('print(f"[td_lang] Training on {len(training_texts)} reward-weighted experiences...")')
         self._emit("")
         self._emit("ds = Dataset.from_dict({'text': training_texts})")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
@@ -5171,11 +5188,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("from transformers import AutoModelForCausalLM, AutoTokenizer")
         self._emit("import torch")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
-        self._emit("model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit('checkpoint, torch_dtype=torch.bfloat16, device_map="auto"')
-        self._indent -= 1
-        self._emit(")")
         self._emit("model.eval()")
         self._emit(f'question = {repr(cmd.question)}')
         self._emit(f"n_samples = {n}")
@@ -5261,11 +5274,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._emit("import torch")
         self._emit('print("[td_lang] Loading teacher model...")')
         self._emit("teacher_tok = AutoTokenizer.from_pretrained(teacher_checkpoint)")
-        self._emit("teacher_model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit('teacher_checkpoint, torch_dtype=torch.bfloat16, device_map="auto"')
-        self._indent -= 1
-        self._emit(")")
         self._emit("teacher_model.eval()")
         self._emit("")
         self._emit("distill_prompts = [")
@@ -5329,11 +5338,7 @@ DO NOT EDIT - regenerate from the .td file instead.
         self._indent -= 1
         self._emit(")")
         self._emit("student_tok = AutoTokenizer.from_pretrained(student_path)")
-        self._emit("student_model = AutoModelForCausalLM.from_pretrained(")
-        self._indent += 1
-        self._emit("student_path, quantization_config=bnb_config, device_map='auto'")
-        self._indent -= 1
-        self._emit(")")
         self._emit("student_model = prepare_model_for_kbit_training(student_model)")
         self._emit("")
         self._emit("lora_config = LoraConfig(")

         self._indent -= 1
         self._emit("]")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
+        self._emit("model = _load_model_smart(checkpoint, torch_dtype=torch.float16, device_map='auto')")
         self._emit("model.eval()")
         self._emit("scores = []")
         self._emit("for p in prompts:")
         self._indent += 1
+        self._emit("messages = [{'role': 'user', 'content': p}]")
+        self._emit("try:")
+        self._indent += 1
+        self._emit("text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)")
+        self._emit("inputs = tok(text, return_tensors='pt').to(model.device)")
+        self._indent -= 1
+        self._emit("except Exception:")
+        self._indent += 1
         self._emit("inputs = tok(p, return_tensors='pt').to(model.device)")
+        self._indent -= 1
         self._emit("with torch.no_grad():")
         self._indent += 1
         self._emit("out = model.generate(**inputs, max_new_tokens=32, do_sample=False)")
         self._indent -= 1
+        self._emit("new_tokens = out[0][inputs['input_ids'].shape[1]:]")
+        self._emit("resp = tok.decode(new_tokens, skip_special_tokens=True)")
         self._emit("scores.append(len(resp))")
         self._indent -= 1
         self._emit("avg_len = sum(scores) / len(scores)")
         self._indent -= 1
         self._emit("")
+        # Smart model loader that handles Qwen3-VL and other model types
+        self._emit("def _load_model_smart(checkpoint, **kwargs):")
+        self._indent += 1
+        self._emit('"""Load model — auto-detects Qwen3-VL and uses the correct class."""')
+        self._emit("from transformers import AutoConfig")
+        self._emit("try:")
+        self._indent += 1
+        self._emit("config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)")
+        self._emit("model_type = getattr(config, 'model_type', '')")
+        self._emit("config_class = type(config).__name__.lower()")
+        self._emit("if 'qwen3_vl' in model_type or 'qwen3vl' in config_class:")
+        self._indent += 1
+        self._emit("from transformers import Qwen3VLForConditionalGeneration")
+        self._emit("print(f'[td_lang] Loading as Qwen3-VL model: {checkpoint}')")
+        self._emit("return Qwen3VLForConditionalGeneration.from_pretrained(checkpoint, **kwargs)")
+        self._indent -= 1
+        self._indent -= 1
+        self._emit("except Exception as e:")
+        self._indent += 1
+        self._emit("print(f'[td_lang] Auto-detect failed ({e}), using AutoModelForCausalLM')")
+        self._indent -= 1
+        self._emit("from transformers import AutoModelForCausalLM")
+        self._emit("return AutoModelForCausalLM.from_pretrained(checkpoint, **kwargs)")
+        self._indent -= 1
+        self._emit("")
         if program.setup:
             self._emit_setup(program.setup)
         self._indent += 1
         self._emit(f'checkpoint = models["{cmd.target}"]["model_ref"]')
         self._indent -= 1
+        self._emit("from transformers import AutoTokenizer")
         self._emit("import torch, re, ast")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
+        self._emit("model = _load_model_smart(checkpoint, torch_dtype=torch.bfloat16, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Mini-benchmark: math, code, reasoning, perplexity")
         self._emit('print("[td_lang] WARNING: No checkpoint - using model_ref instead.")')
         self._emit(f'checkpoint = models["{cmd.target}"]["model_ref"]')
         self._indent -= 1
+        self._emit("from transformers import AutoTokenizer")
         self._emit("import torch")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
+        self._emit("model = _load_model_smart(checkpoint, torch_dtype=torch.bfloat16, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Self-diagnosis prompts (from TD interview findings test_12)")
         self._emit("diagnose_results = []")
         self._emit("for prompt in diag_prompts:")
         self._indent += 1
+        self._emit("# Use chat template for proper generation (Qwen3 needs this)")
+        self._emit('messages = [{"role": "user", "content": prompt}]')
+        self._emit("try:")
+        self._indent += 1
+        self._emit("text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)")
+        self._emit('inputs = tok(text, return_tensors="pt").to(model.device)')
+        self._indent -= 1
+        self._emit("except Exception:")
+        self._indent += 1
         self._emit('inputs = tok(prompt, return_tensors="pt").to(model.device)')
+        self._indent -= 1
         self._emit("with torch.no_grad():")
         self._indent += 1
         self._emit("output = model.generate(**inputs, max_new_tokens=512, do_sample=True, temperature=0.7)")
         self._indent -= 1
+        self._emit("new_tokens = output[0][inputs['input_ids'].shape[1]:]")
+        self._emit("response = tok.decode(new_tokens, skip_special_tokens=True)")
         self._emit('diagnose_results.append({"prompt": prompt, "response": response})')
         self._emit('print(f"  Prompt: {prompt[:50]}...")')
         self._emit('print(f"  Response: {response[:200]}...")')
         self._indent += 1
         self._emit(f'checkpoint = models["{cmd.target}"]["model_ref"]')
         self._indent -= 1
+        self._emit("from transformers import AutoTokenizer")
         self._emit("import torch, random, re")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
+        self._emit("model = _load_model_smart(checkpoint, torch_dtype=torch.bfloat16, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Use structured diagnosis if available (upgraded diagnose outputs top_weaknesses)")
             self._indent -= 1
             self._emit(")")
             self._emit("")
+            self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
             self._emit("model = prepare_model_for_kbit_training(model)")
             self._emit("")
             self._emit("# LoRA adapters on mid-to-late layers (test_12: layers 16-28 for 32-layer)")
             self._emit("bnb_4bit_use_double_quant=True,")
             self._indent -= 1
             self._emit(")")
+            self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
             self._emit("model = prepare_model_for_kbit_training(model)")
             self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
             self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],')
         self._emit("from transformers import AutoModelForCausalLM, AutoTokenizer")
         self._emit("import torch, random, json")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
+        self._emit("model = _load_model_smart(checkpoint, torch_dtype=torch.bfloat16, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# Persona-based debate (test_14: single-model diversity protocol)")
         self._emit('"bnb_4bit_quant_type": "nf4",')
         self._indent -= 1
         self._emit("}")
+        self._emit("model = _load_model_smart(checkpoint, device_map='auto', **bnb_config)")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
         self._emit("")
         # Parse layer spec into layers_to_transform
         self._emit("manifest = json.load(f)")
         self._indent -= 1
         self._emit('base_ref = manifest.get("base_ref", ckpt_path)')
+        self._emit("model = _load_model_smart(base_ref, torch_dtype=torch.float16, device_map='cuda')")
         self._emit('if manifest.get("fork_type") == "adapter":')
         self._indent += 1
         self._emit("from peft import PeftModel")
         self._emit("elif os.path.isdir(ckpt_path):")
         self._indent += 1
         self._emit("# Loading from a HF-style directory")
+        self._emit("model = _load_model_smart(ckpt_path, torch_dtype=torch.float16, device_map='cuda')")
         self._indent -= 1
         self._emit("else:")
         self._indent += 1
         self._emit("state = load_file(ckpt_path, device='cpu')")
         self._emit("# Need base model architecture - reload from original")
         self._emit(f'base_ref = models.get("__base_ref_{alias}", ckpt_path)')
+        self._emit("model = _load_model_smart(base_ref, torch_dtype=torch.float16, device_map='cuda')")
         self._emit("try:")
         self._indent += 1
         self._emit("model.load_state_dict(state, strict=True, assign=True)")
         # Test source model
         self._emit(f'print("[td_lang] Loading source model: {source}...")')
         self._emit(f'_src_tok = AutoTokenizer.from_pretrained("{source}")')
+        self._emit(f'_src_model = _load_model_smart("{source}", torch_dtype=torch.bfloat16, device_map="auto")')
         self._emit("_src_model.eval()")
         self._emit("")
         self._emit("_src_answers = {}")
         self._emit(f'_mrg_checkpoint = models["{alias}"]["model_ref"]')
         self._indent -= 1
         self._emit("_mrg_tok = AutoTokenizer.from_pretrained(_mrg_checkpoint)")
+        self._emit('_mrg_model = _load_model_smart(_mrg_checkpoint, torch_dtype=torch.bfloat16, device_map="auto")')
         self._emit("_mrg_model.eval()")
         self._emit("")
         self._emit("_mrg_answers = {}")
         self._emit(f'_vfy_checkpoint = models["{alias}"]["model_ref"]')
         self._indent -= 1
         self._emit("_vfy_tok = AutoTokenizer.from_pretrained(_vfy_checkpoint)")
+        self._emit('_vfy_model = _load_model_smart(_vfy_checkpoint, torch_dtype=torch.bfloat16, device_map="auto")')
         self._emit("_vfy_model.eval()")
         self._emit("")
         self._emit("bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True,")
         self._indent -= 1
         self._emit(")")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("correct_chains = []")
         self._emit("")
         self._emit("# Step 2: Train on correct reasoning chains")
         self._emit("ds = Dataset.from_dict({'text': correct_chains})")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("def _score_response(resp):")
         self._emit("# Train on the best completions")
         self._emit(f'print(f"[td_lang] Training on {{len(best_completions)}} best-of-{cmd.n} completions...")')
         self._emit("ds = Dataset.from_dict({'text': best_completions})")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         self._emit("# EXPLOIT: Generate MANY diverse solutions with HIGH temperature")
         self._emit("# Train on ALL correct solutions (the controlled hack)")
         self._emit(f'print("[td_lang] Training on {{len(exploit_data)}} diverse correct solutions...")')
         self._emit("ds = Dataset.from_dict({'text': exploit_data})")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         # Episode loop
         self._emit('print(f"[td_lang] Training on {len(training_texts)} reward-weighted experiences...")')
         self._emit("")
         self._emit("ds = Dataset.from_dict({'text': training_texts})")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
         self._indent -= 1
         self._emit("bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type='nf4',")
         self._emit("    bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_use_double_quant=True)")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model.eval()")
         self._emit("")
         # Build questions for this round
         self._emit('print(f"[td_lang] Training on {len(training_texts)} reward-weighted experiences...")')
         self._emit("")
         self._emit("ds = Dataset.from_dict({'text': training_texts})")
+        self._emit("model = _load_model_smart(checkpoint, quantization_config=bnb_config, device_map='auto')")
         self._emit("model = prepare_model_for_kbit_training(model)")
         self._emit("lora_config = LoraConfig(r=32, lora_alpha=64, lora_dropout=0.05,")
         self._emit('    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], task_type="CAUSAL_LM")')
         self._emit("from transformers import AutoModelForCausalLM, AutoTokenizer")
         self._emit("import torch")
         self._emit("tok = AutoTokenizer.from_pretrained(checkpoint)")
+        self._emit("model = _load_model_smart(checkpoint, torch_dtype=torch.bfloat16, device_map='auto')")
         self._emit("model.eval()")
         self._emit(f'question = {repr(cmd.question)}')
         self._emit(f"n_samples = {n}")
         self._emit("import torch")
         self._emit('print("[td_lang] Loading teacher model...")')
         self._emit("teacher_tok = AutoTokenizer.from_pretrained(teacher_checkpoint)")
+        self._emit("teacher_model = _load_model_smart(teacher_checkpoint, torch_dtype=torch.bfloat16, device_map='auto')")
         self._emit("teacher_model.eval()")
         self._emit("")
         self._emit("distill_prompts = [")
         self._indent -= 1
         self._emit(")")
         self._emit("student_tok = AutoTokenizer.from_pretrained(student_path)")
+        self._emit("student_model = _load_model_smart(student_path, quantization_config=bnb_config, device_map='auto')")
         self._emit("student_model = prepare_model_for_kbit_training(student_model)")
         self._emit("")
         self._emit("lora_config = LoraConfig(")