# /// script
# dependencies = ["transformers>=4.40.0", "peft>=0.7.0", "bitsandbytes>=0.41.0", "accelerate>=0.28.0"]
# ///
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch
# Banner: announce the evaluation run and identify the model under test.
SEPARATOR = "=" * 80
print(SEPARATOR)
print("COMPREHENSIVE BIOMEDICAL MODEL EVALUATION")
print(SEPARATOR)
print("\nModel: panikos/llama-biomedical-production-qlora")
print("Training: 17,008 examples, 1 epoch, QLoRA")
print(SEPARATOR)
# --- Step 1: load the base model in 4-bit --------------------------------
print("\n[1/3] Loading base model with 4-bit quantization...")

BASE_MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

# NF4 + double quantization with bfloat16 compute: the QLoRA configuration
# this adapter was trained against.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=quant_config,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
# Llama has no dedicated pad token; reuse EOS so generation can pad.
tokenizer.pad_token = tokenizer.eos_token
print(" Base model loaded")
# --- Step 2: attach the fine-tuned LoRA adapters -------------------------
print("\n[2/3] Loading production LoRA adapters...")
ADAPTER_ID = "panikos/llama-biomedical-production-qlora"
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
print(" LoRA adapters loaded")

# --- Step 3: run the evaluation suite ------------------------------------
print("\n[3/3] Running comprehensive evaluation...")
print("=" * 80)
# Test cases covering various biomedical scenarios
# Each case carries:
#   name              - human-readable label printed in the report
#   prompt            - text sent verbatim to the model as a user message
#   expected_keywords - terms scored below by case-insensitive SUBSTRING
#                       matching against the model's response
# NOTE(review): short codes like "AE" or "LB" may match inside longer,
# unrelated words because matching is substring-based — confirm this is
# acceptable for scoring.
test_cases = [
    # LOINC -> SDTM domain, single code.
    {
        "name": "Simple LOINC to SDTM Mapping",
        "prompt": "Map the following LOINC code to CDISC SDTM domain:\n\nLOINC: 2339-0 (Glucose [Mass/volume] in Blood)",
        "expected_keywords": ["LB", "Laboratory"]
    },
    # Multi-code panel classification plus clinical-significance reasoning.
    {
        "name": "Complex LOINC Panel Classification",
        "prompt": """Analyze the following LOINC codes and classify each into the appropriate CDISC SDTM domain:
1. 2339-0: Glucose [Mass/volume] in Blood
2. 4548-4: Hemoglobin A1c/Hemoglobin.total in Blood
3. 2160-0: Creatinine [Mass/volume] in Serum or Plasma
Also identify if they form a panel and specify clinical significance.""",
        "expected_keywords": ["LB", "Laboratory", "metabolic", "diabetes", "renal"]
    },
    # Controlled-terminology recall.
    {
        "name": "CDISC Terminology Query",
        "prompt": "What is the CDISC SDTM terminology for patient-reported adverse event severity?",
        "expected_keywords": ["AESEV", "severity", "adverse event"]
    },
    # Free-text observation -> SDTM domain.
    {
        "name": "Adverse Event Classification",
        "prompt": "Classify the following observation into the appropriate SDTM domain:\n\nPatient reported experiencing headache with severity of moderate, lasting 2 hours after taking study medication.",
        "expected_keywords": ["AE", "Adverse", "Event"]
    },
    # Conceptual standards knowledge.
    {
        "name": "SDTM vs ADaM Knowledge",
        "prompt": "Explain the difference between SDTM and ADaM in CDISC standards.",
        "expected_keywords": ["SDTM", "ADaM", "source", "analysis"]
    },
    # LOINC -> SDTM domain, vital signs rather than labs.
    {
        "name": "Vital Signs Mapping",
        "prompt": "Map the following LOINC code to CDISC SDTM domain:\n\nLOINC: 8867-4 (Heart rate)",
        "expected_keywords": ["VS", "Vital", "Signs"]
    }
]
# Run every test case: format the prompt, generate a response, and score it
# by case-insensitive substring matching against the expected keywords.
# Appends one {'name', 'score', 'found', 'total'} dict per case to `results`.
results = []
for i, test in enumerate(test_cases, 1):
    # Hoist per-case invariants (the original recomputed these repeatedly).
    expected = test['expected_keywords']

    print(f"\n{'='*80}")
    print(f"TEST {i}/{len(test_cases)}: {test['name']}")
    print(f"{'='*80}")
    print(f"\nPrompt: {test['prompt'][:100]}...")

    # Build the chat-formatted input expected by the instruct model.
    messages = [{"role": "user", "content": test['prompt']}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Generate. Sampling is enabled (do_sample=True, no seed), so responses
    # — and therefore scores — vary between runs.
    print("\n>>> Model Response:")
    print("-" * 80)
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=250,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id  # Llama has no pad token
        )

    # Decode only the newly generated tokens, skipping the echoed prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    print(response)
    print("-" * 80)

    # Score = fraction of expected keywords present in the response.
    # NOTE(review): substring matching — short codes like "AE" can match
    # inside unrelated words; use word-boundary matching if precision matters.
    response_lower = response.lower()
    found_keywords = [kw for kw in expected if kw.lower() in response_lower]
    score = (len(found_keywords) / len(expected)) * 100
    print("\n>>> Evaluation:")  # fixed: was an f-string with no placeholders
    print(f"Expected keywords: {', '.join(expected)}")
    print(f"Found: {', '.join(found_keywords) if found_keywords else 'None'}")
    print(f"Score: {score:.0f}% ({len(found_keywords)}/{len(expected)})")
    results.append({
        'name': test['name'],
        'score': score,
        'found': len(found_keywords),
        'total': len(expected)
    })
# Overall evaluation: aggregate per-test scores into an average and a
# keyword-hit total, then print a verdict based on fixed thresholds.
print("\n" + "=" * 80)
print("OVERALL EVALUATION SUMMARY")
print("=" * 80)

avg_score = sum(r['score'] for r in results) / len(results)
total_found = sum(r['found'] for r in results)
total_expected = sum(r['total'] for r in results)

print(f"\nAverage Score: {avg_score:.1f}%")
print(f"Total Keywords Found: {total_found}/{total_expected}")
print("\nIndividual Test Results:")  # fixed: was an f-string with no placeholders
for r in results:
    print(f" {r['name']}: {r['score']:.0f}% ({r['found']}/{r['total']})")
print("\n" + "=" * 80)

# Verdict thresholds: >= 70% strong, >= 50% solid, otherwise retrain.
if avg_score >= 70:
    print("RESULT: EXCELLENT - Model shows strong biomedical understanding")
    print("RECOMMENDATION: Model is ready for production use!")
elif avg_score >= 50:
    print("RESULT: GOOD - Model has solid biomedical knowledge")
    print("RECOMMENDATION: Consider additional training for edge cases")
else:
    print("RESULT: NEEDS IMPROVEMENT - Consider additional training epochs")
print("=" * 80)