Instructions to use nraptisss/tmf921-intent-training with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use nraptisss/tmf921-intent-training with PEFT:
Task type is invalid.
- Notebooks
- Google Colab
- Kaggle
| #!/usr/bin/env python3 | |
| """Correct evaluation of GRPO adapter on top of SFT-merged base. | |
| The GRPO adapter was trained on: SFT-merged Qwen3-8B (not raw Qwen3-8B). | |
| Previous evaluations incorrectly loaded: raw Qwen3-8B + GRPO adapter. | |
| This script loads the correct stack: raw Qwen3-8B + SFT adapter (merged) + GRPO adapter. | |
| Usage: | |
| export PYTHONPATH="$PWD/src" | |
| python scripts/evaluate_grpo_correct.py | |
| """ | |
| import os | |
| import gc | |
| import torch | |
| from peft import PeftModel | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
# ---- Configuration ----
# Hub id of the raw base checkpoint that is loaded first (in 4-bit).
BASE_MODEL = "Qwen/Qwen3-8B"

# Hub id of the SFT LoRA adapter that is merged into the base model.
SFT_ADAPTER = "nraptisss/Qwen3-8B-TMF921-Intent-QLoRA-qwen3-8b-qlora-20260501-083834"

# Local path (produced by training) of the GRPO adapter applied on top of
# the SFT-merged model.
GRPO_ADAPTER = "outputs/qwen3-8b-tmf921-grpo-v3"

# Hub id of the dataset whose test splits are evaluated.
DATASET = "nraptisss/TMF921-intent-to-config-research-sota"

# Directory that receives per-split predictions/metrics and all_metrics.json.
OUTPUT_DIR = "outputs/qwen3-8b-tmf921-grpo-v3/eval_correct"

# Upper bound on examples evaluated per split; a falsy value disables the cap.
MAX_SAMPLES = 100

# Number of prompts sent to model.generate() at once.
BATCH_SIZE = 4
def _report_gpu_memory(stage):
    """Print the CUDA memory currently allocated by torch, labelled with *stage*."""
    print(f" GPU memory after {stage}: {torch.cuda.memory_allocated()/1e9:.1f} GB")


def _load_model_stack():
    """Build the evaluated model: 4-bit base + merged SFT adapter + GRPO adapter.

    Returns:
        A ``(model, tokenizer)`` pair. The model is in eval mode; the
        tokenizer is guaranteed to have a pad token.
    """
    # Step 1: Load base model in 4-bit
    print("\nStep 1: Loading base model in 4-bit...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map={"": 0},
        torch_dtype=torch.bfloat16,
    )
    _report_gpu_memory("base")

    # Step 2: Load and merge SFT adapter (recreate the GRPO training base)
    # NOTE(review): the merge happens into 4-bit quantized weights; confirm
    # this matches how the GRPO training base was actually produced.
    print("\nStep 2: Loading SFT adapter and merging...")
    model = PeftModel.from_pretrained(model, SFT_ADAPTER)
    model = model.merge_and_unload()
    # Release whatever the merge left behind before the memory report.
    gc.collect()
    torch.cuda.empty_cache()
    _report_gpu_memory("SFT merge")

    # Step 3: Load GRPO adapter on top of merged SFT
    print("\nStep 3: Loading GRPO adapter on top of SFT-merged base...")
    model = PeftModel.from_pretrained(model, GRPO_ADAPTER)
    _report_gpu_memory("GRPO adapter")
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer


def _generate_batch(model, tokenizer, batch, max_new_tokens=1536):
    """Greedy-decode one completion per example in *batch*.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching *model*.
        batch: Examples, each with a ``"messages"`` list of chat turns.
        max_new_tokens: Generation budget per example.

    Returns:
        Decoded completions (prompt tokens removed), in the order of *batch*.
    """
    # Build prompts (system + user, no assistant)
    texts = [
        tokenizer.apply_chat_template(
            [m for m in ex["messages"] if m["role"] != "assistant"],
            tokenize=False,
            add_generation_prompt=True,
        )
        for ex in batch
    ]

    # Left padding keeps every prompt flush against its generated tokens so
    # the completion can be sliced off at a single column index below.
    old_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        inputs = tokenizer(texts, return_tensors="pt", padding=True).to(model.device)
    finally:
        # Restore even if tokenisation fails, so the tokenizer is never
        # left in a mutated state.
        tokenizer.padding_side = old_side

    # Explicit None check: a pad token id of 0 is valid and must not fall
    # through to the EOS id.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=None,
            pad_token_id=pad_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only new tokens
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)


def _score_example(ex, pred):
    """Score one prediction against the gold assistant message of *ex*.

    Returns:
        A result row with identifying metadata, the raw prediction and gold
        text, JSON-parse flags, exact match, field-level metrics and the
        metadata constraint flags. When either side fails to parse, all
        metrics are zero / False.
    """
    from tmf921_train.utils import (
        field_f1,
        get_message,
        json_exact_match,
        metadata_constraint_pass,
        parse_json,
    )

    pred = pred.strip()
    gold = get_message(ex["messages"], "assistant")
    pred_obj, _ = parse_json(pred)
    gold_obj, _ = parse_json(gold)

    row = {
        "id": ex.get("id"),
        "target_layer": ex.get("target_layer"),
        "slice_type": ex.get("slice_type"),
        "lifecycle_operation": ex.get("lifecycle_operation"),
        "parse_json": pred_obj is not None,
        "gold_parse_json": gold_obj is not None,
        "exact_match": False,
        "prediction": pred,
        "gold": gold,
    }
    if pred_obj is not None and gold_obj is not None:
        row["exact_match"] = json_exact_match(pred_obj, gold_obj)
        row.update(field_f1(pred_obj, gold_obj))
        row.update(metadata_constraint_pass(ex, pred, pred_obj))
    else:
        row.update({"field_precision": 0.0, "field_recall": 0.0, "field_f1": 0.0,
                    "field_tp": 0, "field_fp": 0, "field_fn": 0})
        row.update({"slice_sst_pass": False, "kpi_text_presence_pass": False,
                    "adversarial_status_pass": False})
    return row


def _evaluate_split(model, tokenizer, split, split_ds, split_dir):
    """Generate and score every example of one split and persist the results.

    Writes ``predictions.json`` and ``metrics.json`` into *split_dir*.

    Returns:
        The aggregated metrics for the split, including a
        ``"by_target_layer"`` breakdown.
    """
    from collections import defaultdict

    from tmf921_train.utils import aggregate_metrics, write_json
    from tqdm import tqdm

    total = len(split_ds)
    rows = []
    print(f"Evaluating {split}: {total} examples, batch_size={BATCH_SIZE}")
    for start in tqdm(range(0, total, BATCH_SIZE), desc=split):
        batch = [split_ds[i] for i in range(start, min(start + BATCH_SIZE, total))]
        predictions = _generate_batch(model, tokenizer, batch)
        rows.extend(_score_example(ex, pred) for ex, pred in zip(batch, predictions))

    # Save split results
    write_json(split_dir / "predictions.json", rows)
    summary = aggregate_metrics(rows)

    # Per-layer breakdown
    groups = defaultdict(list)
    for row in rows:
        groups[str(row.get("target_layer"))].append(row)
    summary["by_target_layer"] = {g: aggregate_metrics(v) for g, v in sorted(groups.items())}
    write_json(split_dir / "metrics.json", summary)
    return summary


def main():
    """Evaluate the GRPO adapter on top of the SFT-merged base model.

    Loads the model stack, runs greedy generation over each test split of
    DATASET (capped at MAX_SAMPLES examples per split), writes per-split
    predictions and metrics plus a combined ``all_metrics.json`` under
    OUTPUT_DIR, and prints a summary table.
    """
    # Resolve evaluation dependencies before the expensive model load so a
    # missing package or an unset PYTHONPATH fails immediately.
    from pathlib import Path

    from datasets import load_dataset
    from tmf921_train.utils import write_json
    import tqdm  # noqa: F401  (availability check only; used by _evaluate_split)

    print("=" * 60)
    print("GRPO Correct Evaluation")
    print("Model stack: Qwen3-8B + SFT adapter (merged) + GRPO adapter")
    print("=" * 60)

    model, tokenizer = _load_model_stack()
    print("\nModel stack loaded correctly!")
    print("Now running evaluation via evaluate_model.py logic...\n")

    # Step 4: Run evaluation using the same logic as evaluate_model.py
    ds = load_dataset(DATASET)
    splits = ["test_in_distribution", "test_template_ood", "test_use_case_ood", "test_sector_ood", "test_adversarial"]
    out_dir = Path(OUTPUT_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    all_summary = {}
    for split in splits:
        split_ds = ds[split]
        if MAX_SAMPLES and MAX_SAMPLES < len(split_ds):
            split_ds = split_ds.select(range(MAX_SAMPLES))
        split_dir = out_dir / split
        split_dir.mkdir(parents=True, exist_ok=True)

        summary = _evaluate_split(model, tokenizer, split, split_ds, split_dir)
        all_summary[split] = summary
        # Print key metrics
        print(f" {split}: parse={summary.get('parse_json', 0):.3f}, field_f1={summary.get('field_f1', 0):.4f}")

    write_json(out_dir / "all_metrics.json", all_summary)

    # Print final summary
    print("\n" + "=" * 60)
    print("GRPO v3 CORRECT Evaluation Results")
    print("=" * 60)
    print(f"{'Split':<25} {'Parse':>8} {'Field F1':>10} {'Exact':>8}")
    print("-" * 55)
    for split, s in all_summary.items():
        print(f"{split:<25} {s.get('parse_json',0):>8.3f} {s.get('field_f1',0):>10.4f} {s.get('exact_match',0):>8.4f}")
    print("=" * 60)
    print(f"\nResults saved to: {OUTPUT_DIR}")
    print("Compare with SFT Stage 1: parse=1.0, field_f1=0.687")
# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()