File size: 5,496 Bytes
00350d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# /// script
# dependencies = ["transformers>=4.40.0", "peft>=0.7.0", "bitsandbytes>=0.41.0", "accelerate>=0.28.0"]
# ///

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Startup banner: report title, the model name, and a one-line training
# summary, separated by 80-column rules.
print(
    "=" * 80,
    "COMPREHENSIVE BIOMEDICAL MODEL EVALUATION",
    "=" * 80,
    "\nModel: panikos/llama-biomedical-production-qlora",
    "Training: 17,008 examples, 1 epoch, QLoRA",
    "=" * 80,
    sep="\n",
)

print("\n[1/3] Loading base model with 4-bit quantization...")

# Checkpoint that supplies both the model weights and the tokenizer.
base_model_id = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit NF4 quantization with double quantization enabled; the compute
# dtype is bfloat16.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

# device_map="auto" leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# Use the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
print("  Base model loaded")

print("\n[2/3] Loading production LoRA adapters...")
# Wrap the already-loaded base model with the adapter weights from the hub.
model = PeftModel.from_pretrained(
    model=base_model,
    model_id="panikos/llama-biomedical-production-qlora",
)
print("  LoRA adapters loaded")

print("\n[3/3] Running comprehensive evaluation...")
print("=" * 80)

# Evaluation prompts for a range of biomedical scenarios. Each case is written
# as (name, prompt, expected keywords) and expanded into the dict layout
# consumed by the evaluation loop: "name", "prompt", "expected_keywords".
test_cases = [
    {"name": title, "prompt": prompt_text, "expected_keywords": list(keywords)}
    for title, prompt_text, keywords in (
        (
            "Simple LOINC to SDTM Mapping",
            "Map the following LOINC code to CDISC SDTM domain:\n\n"
            "LOINC: 2339-0 (Glucose [Mass/volume] in Blood)",
            ("LB", "Laboratory"),
        ),
        (
            "Complex LOINC Panel Classification",
            "Analyze the following LOINC codes and classify each into the "
            "appropriate CDISC SDTM domain:\n"
            "\n"
            "1. 2339-0: Glucose [Mass/volume] in Blood\n"
            "2. 4548-4: Hemoglobin A1c/Hemoglobin.total in Blood\n"
            "3. 2160-0: Creatinine [Mass/volume] in Serum or Plasma\n"
            "\n"
            "Also identify if they form a panel and specify clinical significance.",
            ("LB", "Laboratory", "metabolic", "diabetes", "renal"),
        ),
        (
            "CDISC Terminology Query",
            "What is the CDISC SDTM terminology for patient-reported adverse "
            "event severity?",
            ("AESEV", "severity", "adverse event"),
        ),
        (
            "Adverse Event Classification",
            "Classify the following observation into the appropriate SDTM "
            "domain:\n\n"
            "Patient reported experiencing headache with severity of moderate, "
            "lasting 2 hours after taking study medication.",
            ("AE", "Adverse", "Event"),
        ),
        (
            "SDTM vs ADaM Knowledge",
            "Explain the difference between SDTM and ADaM in CDISC standards.",
            ("SDTM", "ADaM", "source", "analysis"),
        ),
        (
            "Vital Signs Mapping",
            "Map the following LOINC code to CDISC SDTM domain:\n\n"
            "LOINC: 8867-4 (Heart rate)",
            ("VS", "Vital", "Signs"),
        ),
    )
]

# Run each test prompt through the model and score the reply by how many of
# the expected keywords appear in it (case-insensitive substring match).
# `results` collects one dict per test: its name, the percentage score, and
# the found/total keyword counts.
results = []
for i, test in enumerate(test_cases, 1):
    print(f"\n{'='*80}")
    print(f"TEST {i}/{len(test_cases)}: {test['name']}")
    print(f"{'='*80}")
    print(f"\nPrompt: {test['prompt'][:100]}...")

    # Format the prompt as a single user turn.
    messages = [{"role": "user", "content": test['prompt']}]

    # Tokenize through the chat template. return_dict=True also returns the
    # attention mask, which is passed to generate() explicitly below: with
    # pad_token_id set to the EOS id, the mask cannot be reliably inferred
    # from the input ids alone.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    input_ids = inputs["input_ids"]

    # Generate (sampling is enabled, so replies can differ between runs).
    print("\n>>> Model Response:")
    print("-" * 80)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"],
            max_new_tokens=250,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the tokens produced after the prompt.
    response = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)
    print(response)
    print("-" * 80)

    # Evaluate. The reply is lower-cased once rather than once per keyword.
    # NOTE: this is a plain substring test, so a short keyword such as "AE"
    # also matches when it occurs inside a longer word.
    expected = test['expected_keywords']
    response_lower = response.lower()
    found_keywords = [kw for kw in expected if kw.lower() in response_lower]
    # A test with no expected keywords scores 0 instead of dividing by zero.
    score = (len(found_keywords) / len(expected)) * 100 if expected else 0.0

    print("\n>>> Evaluation:")
    print(f"Expected keywords: {', '.join(expected)}")
    print(f"Found: {', '.join(found_keywords) if found_keywords else 'None'}")
    print(f"Score: {score:.0f}% ({len(found_keywords)}/{len(expected)})")

    results.append({
        'name': test['name'],
        'score': score,
        'found': len(found_keywords),
        'total': len(expected)
    })

# Overall evaluation: aggregate the per-test results and print a verdict.
print("\n" + "=" * 80)
print("OVERALL EVALUATION SUMMARY")
print("=" * 80)

# The average is the mean of the per-test percentages, not the pooled
# found/expected ratio. Guarded so an empty result list reports 0% instead
# of raising ZeroDivisionError.
avg_score = sum(r['score'] for r in results) / len(results) if results else 0.0
total_found = sum(r['found'] for r in results)
total_expected = sum(r['total'] for r in results)

print(f"\nAverage Score: {avg_score:.1f}%")
print(f"Total Keywords Found: {total_found}/{total_expected}")
print("\nIndividual Test Results:")
for r in results:
    print(f"  {r['name']}: {r['score']:.0f}% ({r['found']}/{r['total']})")

print("\n" + "=" * 80)
if not results:
    # No tests ran, so there is no basis for a quality verdict.
    print("RESULT: NO TESTS RUN - nothing to evaluate")
elif avg_score >= 70:
    print("RESULT: EXCELLENT - Model shows strong biomedical understanding")
    print("RECOMMENDATION: Model is ready for production use!")
elif avg_score >= 50:
    print("RESULT: GOOD - Model has solid biomedical knowledge")
    print("RECOMMENDATION: Consider additional training for edge cases")
else:
    print("RESULT: NEEDS IMPROVEMENT - Consider additional training epochs")

print("=" * 80)