Instructions to use prometheus04/qwen3-4b-thinking-microagent with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use prometheus04/qwen3-4b-thinking-microagent with PEFT:
Task type is invalid.
- Notebooks
- Google Colab
- Kaggle
| """Cheap eval: does the trained model emit MicroAgent-compliant output? | |
| This is the FIRST eval to run after training. It catches the most common | |
| failure mode (model emits malformed responses) without needing TB2 Docker setup. | |
| Procedure: | |
| 1. Hold out a small slice of converted trajectories | |
| 2. For each, take the prefix up through some turn, generate the next assistant turn | |
| 3. Check: does the output contain a valid <think>...</think> block? A valid | |
| <bash>...</bash> or <finish>...</finish>? Anything outside the tags? | |
| Pass rate >95% means the model has learned the format. <80% means broken training. | |
| Usage: | |
| python scripts/eval_format_compliance.py \\ | |
| --model runs/hunyuan-4b-microagent-v1/final \\ | |
| --base-model tencent-hunyuan/Hunyuan-4B-Instruct \\ | |
| --data data/microagent_train.jsonl \\ | |
| --n 50 | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import random | |
| import re | |
| import sys | |
| from pathlib import Path | |
# Patterns for the three tags a MicroAgent-style response may contain.
# DOTALL lets "." cross newlines so multi-line tag bodies are captured, and the
# lazy ".*?" stops at the first matching closing tag.
_THINK_RE = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL)
_BASH_RE = re.compile(r"<bash>(.*?)</bash>", flags=re.DOTALL)
_FINISH_RE = re.compile(r"<finish>(.*?)</finish>", flags=re.DOTALL)
def parse_args():
    """Parse the command-line options for the format-compliance eval.

    Returns:
        The ``argparse.Namespace`` built from ``sys.argv``. ``--model`` is
        mandatory; every other option falls back to the default listed below.
    """
    parser = argparse.ArgumentParser()

    # (flag, add_argument keyword arguments), registered in declaration order.
    option_specs = (
        ("--model", {
            "required": True,
            "help": "Path to LoRA adapter dir, or full merged model",
        }),
        ("--base-model", {
            "default": None,
            "help": "Base model id if --model points to an adapter",
        }),
        ("--data", {"default": "data/microagent_train.jsonl"}),
        ("--n", {
            "type": int,
            "default": 50,
            "help": "Number of held-out prompts to test",
        }),
        ("--max-new-tokens", {"type": int, "default": 512}),
        ("--temperature", {"type": float, "default": 0.1}),
        ("--seed", {"type": int, "default": 123}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
def classify(text: str) -> tuple[str, list[str]]:
    """Classify one generated assistant turn and list its format problems.

    Args:
        text: The decoded model output for a single turn.

    Returns:
        A ``(kind, problems)`` pair. ``kind`` is ``"bash"`` when a
        ``<bash>...</bash>`` block is present (it wins when both action tags
        appear), ``"finish"`` when only ``<finish>...</finish>`` is present,
        and ``"invalid"`` when neither is. ``problems`` holds a short message
        for every violation found; an empty list means the turn is fully
        compliant. A turn can have a valid ``kind`` and still carry problems.
    """
    problems: list[str] = []

    if _THINK_RE.search(text) is None:
        problems.append("missing <think>")

    bash_m = _BASH_RE.search(text)
    finish_m = _FINISH_RE.search(text)

    if bash_m and finish_m:
        problems.append("both <bash> and <finish>")

    if bash_m:
        kind = "bash"
        if not bash_m.group(1).strip():
            problems.append("empty <bash>")
    elif finish_m:
        kind = "finish"
        if not finish_m.group(1).strip():
            problems.append("empty <finish>")
    else:
        kind = "invalid"
        problems.append("no <bash> or <finish>")

    # The module docstring lists "anything outside the tags" as a check: remove
    # every well-formed tag block and flag whatever non-whitespace text is left
    # (chatter around the tags, or an unclosed/truncated tag).
    leftover = text
    for pattern in (_THINK_RE, _BASH_RE, _FINISH_RE):
        leftover = pattern.sub("", leftover)
    if leftover.strip():
        problems.append("text outside tags")

    return kind, problems
def _load_model_and_tokenizer(args):
    """Load the model under test and its tokenizer.

    When ``args.base_model`` is set, ``args.model`` is treated as a PEFT
    adapter applied on top of that base and the tokenizer comes from the base;
    otherwise ``args.model`` is loaded directly as a full model.

    Returns:
        ``(model, tokenizer)`` with the model switched to eval mode.
    """
    # Heavy third-party imports stay local so `--help` works without them.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    load_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    if args.base_model:
        from peft import PeftModel

        base = AutoModelForCausalLM.from_pretrained(args.base_model, **load_kwargs)
        model = PeftModel.from_pretrained(base, args.model)
        tokenizer_source = args.base_model
    else:
        model = AutoModelForCausalLM.from_pretrained(args.model, **load_kwargs)
        tokenizer_source = args.model
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Over-long prompts must lose their oldest tokens, not their tail:
    # truncating on the right would cut off the generation prompt that tells
    # the model to start the next assistant turn.
    tokenizer.truncation_side = "left"

    model.eval()
    return model, tokenizer


def _read_jsonl(path):
    """Return the JSON objects stored one per line in ``path``, skipping blank lines."""
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # a trailing/blank line would crash json.loads
                rows.append(json.loads(line))
    return rows


def main():
    """Run the format-compliance eval and print a summary.

    Samples up to ``--n`` conversations from ``--data``, regenerates one
    randomly chosen assistant turn per conversation from its preceding turns,
    classifies each generation with ``classify`` and prints per-kind counts,
    a tally of problems and the overall compliance rate. A generation only
    counts as compliant when ``classify`` reports no problems for it.
    """
    args = parse_args()

    import torch
    from collections import Counter

    print(f"[eval] loading {args.model}")
    model, tokenizer = _load_model_and_tokenizer(args)

    # Load held-out samples
    # NOTE(review): rows are drawn from --data itself (default is the training
    # file), so they are only held out if that file was not trained on - confirm.
    rng = random.Random(args.seed)
    rows = _read_jsonl(args.data)
    rng.shuffle(rows)
    rows = rows[: args.n]
    print(f"[eval] using {len(rows)} held-out prompts")

    # Only pass `temperature` when sampling; with greedy decoding it is unused
    # and makes transformers warn about an ignored generation flag.
    sampling = args.temperature > 0
    gen_kwargs = {
        "max_new_tokens": args.max_new_tokens,
        "do_sample": sampling,
        "pad_token_id": tokenizer.pad_token_id,
    }
    if sampling:
        gen_kwargs["temperature"] = args.temperature

    kind_counts = Counter()
    problem_tally = Counter()
    compliant_count = 0

    for i, row in enumerate(rows):
        conv = row["conversations"]
        # Pick a random assistant turn position (must have at least one turn
        # before it, otherwise the prompt prefix would be empty).
        a_positions = [
            j for j, t in enumerate(conv) if j > 0 and t["role"] == "assistant"
        ]
        if not a_positions:
            continue
        target_pos = rng.choice(a_positions)
        prefix = conv[:target_pos]
        gold = conv[target_pos]["content"]

        # Render prefix via the chat template
        prompt = tokenizer.apply_chat_template(
            prefix, tokenize=False, add_generation_prompt=True
        )
        # The rendered template already carries its own special tokens, so do
        # not let the tokenizer prepend/append another set.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=8192,
            add_special_tokens=False,
        ).to(model.device)

        with torch.no_grad():
            out = model.generate(**inputs, **gen_kwargs)

        # Decode only the newly generated tokens, not the echoed prompt.
        prompt_len = inputs.input_ids.shape[1]
        gen = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

        kind, problems = classify(gen)
        kind_counts[kind] += 1
        problem_tally.update(problems)
        if not problems:
            compliant_count += 1

        if i < 5:
            print(f"\n=== Sample {i} (kind={kind}) ===")
            print(f"GOLD (first 200): {gold[:200]}")
            print(f"GEN (first 200): {gen[:200]}")
            if problems:
                print(f"PROBLEMS: {problems}")

    bash_count = kind_counts["bash"]
    finish_count = kind_counts["finish"]
    invalid_count = kind_counts["invalid"]
    total = bash_count + finish_count + invalid_count

    print(f"\n========== Summary ({total} samples) ==========")
    if total == 0:
        # Without this, an empty run would be reported as "low compliance".
        print("WARN: no samples were evaluated; check --data and --n")
        return

    print(f" bash : {bash_count} ({100*bash_count/total:.1f}%)")
    print(f" finish : {finish_count} ({100*finish_count/total:.1f}%)")
    print(f" invalid : {invalid_count} ({100*invalid_count/total:.1f}%)")
    print(f" compliant : {compliant_count} ({100*compliant_count/total:.1f}%)")

    print(f"\nProblems:")
    for k, v in problem_tally.most_common():
        print(f" {k}: {v}")

    # A turn with a recognised action tag can still be malformed (missing
    # <think>, both action tags, empty tag), so the pass rate is based on
    # problem-free generations rather than on the bash/finish counts.
    pass_rate = compliant_count / total
    print(f"\nFormat compliance: {100*pass_rate:.1f}%")
    if pass_rate < 0.80:
        print("WARN: low compliance, check training")
    elif pass_rate < 0.95:
        print("OK but not great; consider another epoch")
    else:
        print("Good. Ready for TB2 eval.")


if __name__ == "__main__":
    main()