OncoAgent / scripts /evaluate_specialist.py
MaximoLopezChenlo's picture
Upload folder using huggingface_hub
e1624f5 verified
import argparse
import os
import time
import logging
os.environ["HSA_OVERRIDE_GFX_VERSION"] = "9.4.2"
os.environ["HF_HUB_DISABLE_XET"] = "1"
from dotenv import load_dotenv
load_dotenv()
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from train_specialist import TIER_CONFIGS, load_jsonl_dataset, EVAL_FILE, SEED
logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"), format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)
def evaluate(tier: int):
"""
Evaluate the fine-tuned OncoAgent model (Tier 1 or 2) on the evaluation dataset.
Reports cross-entropy loss and perplexity.
"""
config = TIER_CONFIGS.get(tier)
if not config:
raise ValueError(f"Invalid tier: {tier}")
adapter_path = os.path.join("models", "oncoagent_adapters", f"tier{tier}", "final")
if not os.path.exists(adapter_path):
logger.error(f"Adapter path not found: {adapter_path}. Please run training first.")
return
logger.info("=" * 60)
logger.info(f"🔍 Starting Post-Training Evaluation for Tier {tier}")
logger.info(f" Adapter path: {adapter_path}")
logger.info("=" * 60)
# Load the model with Unsloth's optimizations
model, tokenizer = FastLanguageModel.from_pretrained(
model_name=adapter_path,
max_seq_length=config.max_seq_length,
load_in_4bit=True,
)
try:
eval_dataset = load_jsonl_dataset(EVAL_FILE, "evaluation")
except FileNotFoundError:
logger.error(f"Eval file not found at {EVAL_FILE}. Cannot perform evaluation.")
return
logger.info("Running quantitative evaluation (Loss & Perplexity)...")
actual_tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
if actual_tokenizer.pad_token is None:
actual_tokenizer.pad_token = actual_tokenizer.eos_token
sft_config = SFTConfig(
output_dir=os.path.join("models", "oncoagent_adapters", f"tier{tier}", "eval_results"),
per_device_eval_batch_size=config.batch_size,
max_length=config.max_seq_length,
packing=True,
dataset_text_field="text",
fp16=not torch.cuda.is_bf16_supported(),
bf16=torch.cuda.is_bf16_supported(),
report_to="none",
eos_token=None,
)
trainer = SFTTrainer(
model=model,
processing_class=actual_tokenizer,
eval_dataset=eval_dataset,
args=sft_config,
)
t0 = time.time()
metrics = trainer.evaluate()
duration = time.time() - t0
logger.info("=" * 60)
logger.info(f"✅ EVALUATION COMPLETE FOR TIER {tier}")
logger.info(f" Eval duration: {time.strftime('%Hh %Mm %Ss', time.gmtime(duration))}")
for k, v in metrics.items():
if isinstance(v, float):
logger.info(f" {k}: {v:.4f}")
else:
logger.info(f" {k}: {v}")
try:
perplexity = torch.exp(torch.tensor(metrics["eval_loss"])).item()
logger.info(f" Perplexity: {perplexity:.4f}")
except Exception:
pass
logger.info("=" * 60)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Evaluate Fine-Tuned OncoAgent Models")
parser.add_argument("--tier", type=int, choices=[1, 2], required=True,
help="Select the architectural tier to evaluate (1 = 9B, 2 = 27B)")
args = parser.parse_args()
evaluate(args.tier)