"""Evaluate the fine-tuned Qwen3-8B subclaim-support verifier on its held-out test split."""

import json
import os
import re

import torch
from datasets import Dataset
from unsloth import FastLanguageModel

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

DATA_PATH = "/home/mshahidul/readctrl/data/finetuning_data/finetune_dataset_subclaim_support_v2_sft_prompt.json"
MODEL_PATH = "/home/mshahidul/readctrl_model/qwen3-8B_subclaims-verifier_lora_nonreasoning"
OUTPUT_PATH = "/home/mshahidul/readctrl/results/qwen3-8B_subclaims_verifier_test_predictions.jsonl"
SUMMARY_PATH = "/home/mshahidul/readctrl/results/qwen3-8B_subclaims_verifier_test_summary.json"


def normalize_label(text: str) -> str:
    """Map a raw model/gold string onto {"supported", "not_supported", "unknown"}."""
    if text is None:
        return "unknown"
    cleaned = text.strip().lower()
    cleaned = cleaned.replace("\n", " ").strip()
    # Check the negative class first so "not supported" never matches "supported".
    if "not_supported" in cleaned:
        return "not_supported"
    if "not supported" in cleaned:
        return "not_supported"
    first = re.split(r"\s+", cleaned)[0].strip(".,:;")
    if first in {"supported", "not_supported"}:
        return first
    if "supported" in cleaned:
        return "supported"
    return "unknown"


def get_turn(conversations, role: str) -> str:
    """Return the content of the first turn from the given role, or "" if absent."""
    for turn in conversations:
        if turn.get("from") == role:
            return turn.get("content", "")
    return ""


def main() -> None:
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available. Please run on a GPU.")

    with open(DATA_PATH, "r") as f:
        data = json.load(f)

    # Deterministic 80/20 split; the fixed seed keeps the held-out test set
    # reproducible across runs.
    dataset = Dataset.from_list(data)
    split_dataset = dataset.train_test_split(test_size=0.2, seed=3407, shuffle=True)
    test_data = split_dataset["test"]

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_PATH,
        max_seq_length=8192,
        load_in_4bit=False,
    )
    FastLanguageModel.for_inference(model)

    total = len(test_data)
    correct = 0

    with open(OUTPUT_PATH, "w") as out_f:
        for idx, item in enumerate(test_data):
            user_text = get_turn(item["conversations"], "user")
            gold_text = get_turn(item["conversations"], "assistant")
            gold_label = normalize_label(gold_text)

            messages = [{"role": "user", "content": user_text}]
            input_text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,
            )
            inputs = tokenizer([input_text], return_tensors="pt").to("cuda")

            # Greedy decoding; the expected output is a single short label.
            with torch.no_grad():
                generated = model.generate(
                    **inputs,
                    max_new_tokens=20,
                    do_sample=False,
                    use_cache=True,
                    pad_token_id=tokenizer.eos_token_id,
                )

            # Decode only the newly generated tokens, skipping the prompt.
            gen_text = tokenizer.decode(
                generated[0][inputs["input_ids"].shape[-1]:],
                skip_special_tokens=True,
            )
            pred_label = normalize_label(gen_text)

            is_correct = pred_label == gold_label
            correct += int(is_correct)

            record = {
                "index": idx,
                "label": gold_label,
                "prediction": pred_label,
                "correct": is_correct,
                "raw_output": gen_text.strip(),
            }
            out_f.write(json.dumps(record, ensure_ascii=False) + "\n")

            if (idx + 1) % 100 == 0:
                print(f"Processed {idx + 1}/{total}")

    accuracy = correct / total if total else 0.0
    summary = {
        "total": total,
        "correct": correct,
        "accuracy": accuracy,
    }
    with open(SUMMARY_PATH, "w") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Saved predictions: {OUTPUT_PATH}")
    print(f"Saved summary: {SUMMARY_PATH}")


if __name__ == "__main__":
    main()