import argparse
import os
import time
import logging
|
|
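# Backend overrides must be set before torch / unsloth are imported.
# HSA_OVERRIDE_GFX_VERSION targets the gfx942 (MI300-series) ROCm architecture;
# adjust for your GPU. Disabling the Xet backend falls back to plain Hub
# downloads, which can be more reliable on some setups.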
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "9.4.2"
os.environ["HF_HUB_DISABLE_XET"] = "1"
|
|
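# Load secrets such as HF_TOKEN from a local .env file before any Hub access.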
from dotenv import load_dotenv
load_dotenv()
|
|
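# unsloth is imported before trl on purpose: it patches transformers/TRL
# internals at import time.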
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
|
|
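# Reuse the tier configs and dataset loader from the training script so that
# evaluation runs with the same settings the model was fine-tuned with.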
from train_specialist import TIER_CONFIGS, load_jsonl_dataset, EVAL_FILE
|
|
logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"), format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
|
|
def evaluate(tier: int):
    """
    Evaluate the fine-tuned OncoAgent model (Tier 1 or 2) on the evaluation dataset.
    Reports cross-entropy loss and perplexity.
    """
    config = TIER_CONFIGS.get(tier)
    if not config:
        raise ValueError(f"Invalid tier: {tier}. Valid tiers: {sorted(TIER_CONFIGS)}")
|
|
    adapter_path = os.path.join("models", "oncoagent_adapters", f"tier{tier}", "final")

    if not os.path.exists(adapter_path):
        logger.error(f"Adapter path not found: {adapter_path}. Please run training first.")
        return

    logger.info("=" * 60)
    logger.info(f"🔍 Starting Post-Training Evaluation for Tier {tier}")
    logger.info(f" Adapter path: {adapter_path}")
    logger.info("=" * 60)
|
|
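    # Loading from the adapter directory lets Unsloth resolve the base model from
    # the saved adapter config; 4-bit loading keeps memory use in line with training.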
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=adapter_path,
        max_seq_length=config.max_seq_length,
        load_in_4bit=True,
    )

    try:
        eval_dataset = load_jsonl_dataset(EVAL_FILE, "evaluation")
    except FileNotFoundError:
        logger.error(f"Eval file not found at {EVAL_FILE}. Cannot perform evaluation.")
        return
|
|
    logger.info("Running quantitative evaluation (Loss & Perplexity)...")

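    # Unsloth can hand back a processor wrapper rather than a bare tokenizer;
    # unwrap it and make sure a pad token exists before batched evaluation.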
    actual_tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
    if actual_tokenizer.pad_token is None:
        actual_tokenizer.pad_token = actual_tokenizer.eos_token
|
|
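    # Mirror the training-time sequence length and precision settings. With
    # packing enabled, eval_loss is a mean per-token cross-entropy over packed blocks.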
    sft_config = SFTConfig(
        output_dir=os.path.join("models", "oncoagent_adapters", f"tier{tier}", "eval_results"),
        per_device_eval_batch_size=config.batch_size,
        max_length=config.max_seq_length,
        packing=True,
        dataset_text_field="text",
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        report_to="none",
        eos_token=None,
    )
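
    # Only eval_dataset is supplied; the trainer exists solely to run evaluate().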
    trainer = SFTTrainer(
        model=model,
        processing_class=actual_tokenizer,
        eval_dataset=eval_dataset,
        args=sft_config,
    )

    t0 = time.time()
    metrics = trainer.evaluate()
    duration = time.time() - t0

    logger.info("=" * 60)
    logger.info(f"✅ EVALUATION COMPLETE FOR TIER {tier}")
    logger.info(f" Eval duration: {time.strftime('%Hh %Mm %Ss', time.gmtime(duration))}")
    for k, v in metrics.items():
        if isinstance(v, float):
            logger.info(f" {k}: {v:.4f}")
        else:
            logger.info(f" {k}: {v}")
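
    # Perplexity is exp(mean cross-entropy); with packing enabled this is a
    # token-level perplexity over the packed evaluation blocks.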
    if "eval_loss" in metrics:
        perplexity = torch.exp(torch.tensor(metrics["eval_loss"])).item()
        logger.info(f" Perplexity: {perplexity:.4f}")
    logger.info("=" * 60)
|
|
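# Example invocation (assuming this file is saved as evaluate_specialist.py):
#   python evaluate_specialist.py --tier 1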
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate Fine-Tuned OncoAgent Models")
    parser.add_argument("--tier", type=int, choices=[1, 2], required=True,
                        help="Select the architectural tier to evaluate (1 = 9B, 2 = 27B)")
    args = parser.parse_args()
    evaluate(args.tier)
|
|