# Hugging Face Space: melmoheb/boardllm
# Space status: No application file
# File size: 5,124 bytes
# 2247e66 (presumably the commit hash of this file revision)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class AnswerEvaluator:
    """Evaluates user answers against expected answers using an LLM.

    The model and tokenizer are loaded once in ``__init__``;
    ``evaluate_answer`` can then be called repeatedly.
    """

    # Cap on the number of prompt tokens passed to the model.
    _MAX_PROMPT_TOKENS = 1024
    # Cap on the number of tokens generated as feedback.
    _MAX_NEW_TOKENS = 150
    # Deterministic feedback for a missing/blank answer (no model call needed).
    _NO_ANSWER_FEEDBACK = (
        "ASSESSMENT: Incorrect\n"
        "No answer was provided, so none of the expected key points were addressed."
    )
    # Returned whenever prompt encoding or generation fails.
    _GENERATION_ERROR = "Error: Could not generate feedback."

    def __init__(self, model_id="meta-llama/Llama-3.2-3B-Instruct"):
        """Load the tokenizer and causal LM identified by ``model_id``.

        Args:
            model_id: Model identifier passed to ``from_pretrained``.

        Raises:
            Exception: Any error raised while loading is printed and re-raised.
        """
        print(f"Initializing AnswerEvaluator with model: {model_id}")
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
            # generate() needs a pad token id; reuse EOS when none is defined.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
                print("Set pad_token to eos_token")

            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="auto"
            )
            self.model.eval()
            self.device = self.model.device
            print(f"AnswerEvaluator model loaded successfully on device: {self.device}")

        except Exception as e:
            print(f"Error initializing AnswerEvaluator model {model_id}: {e}")
            raise

    @staticmethod
    def _build_prompt(user_answer, expected_answer, clinical_context=None):
        """Return the examiner instructions with the answers filled in.

        The text carries no model-specific chat markup; that is added later
        by ``_encode_prompt``.
        """
        context_str = f"Clinical context: {clinical_context}\n\n" if clinical_context else ""

        return f"""You are acting as an expert examiner for the American Board of Surgery (ABS) oral board exam. You are evaluating a general surgery resident’s answer to a clinical question. \n
        Compare the answer provided by the residents to the correct expected answer, which I will provide you with. \n
        Use the grading rubric below to assess their response:
        
        [RUBRIC]
        - Correct: Resident includes all major points and clinical reasoning aligns closely with the expected answer.
        - Partially Correct: Resident includes some key points but omits others, or reasoning is partially flawed.
        - Incorrect: Resident misses most key points or demonstrates incorrect reasoning.

        {context_str}Here is the model answer that contains the key points expected from the resident:
        {expected_answer}
        
        Now, here is the resident’s actual response:
        {user_answer}
        
        Evaluate the resident’s response based **only** on the expected answer above. Do not rely on external knowledge or previous responses.
        
        Focus your evaluation on:
        1. Which key points were mentioned vs. missed
        2. The accuracy and clarity of the clinical reasoning
        3. Any major omissions or misunderstandings
        
        Start your output with:  
        ASSESSMENT: [Correct / Partially Correct / Incorrect]  
        Then write 1–2 clear, specific sentences explaining how the resident’s response compares to the expected answer.
        
        [EXAMPLE 1]
        Expected answer:
        "The differential diagnosis includes acute appendicitis, mesenteric adenitis, gastroenteritis, UTI, and testicular torsion."
        
        Resident’s response:
        "My top concern is appendicitis, but I’d also consider things like gastroenteritis or maybe even kidney stones."
        
        ASSESSMENT: Partially Correct
        The resident mentioned appendicitis and gastroenteritis but missed several other expected differentials like UTI, testicular torsion, and mesenteric adenitis.
        
        [EXAMPLE 2]
        Expected answer:
        "Initial labs should include CBC, CMP, lipase, and abdominal ultrasound to assess for gallstones."
        
        Resident’s response:
        "I’d start with a full workup including CBC, liver enzymes, lipase, and an abdominal ultrasound."
        
        ASSESSMENT: Correct
        The resident included all key labs and the correct imaging modality. Their reasoning aligns well with the expected answer."""

    def _encode_prompt(self, prompt):
        """Tokenize ``prompt`` in the chat format of the loaded tokenizer.

        Returns the tokenizer output moved to ``self.device``.
        """
        if getattr(self.tokenizer, "chat_template", None):
            # Let the tokenizer add the model's own turn markers and the
            # assistant header instead of hard-coding one model family's tags.
            text = self.tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                tokenize=False,
                add_generation_prompt=True,
            )
            # The rendered template already includes the special tokens, so
            # adding them again would duplicate BOS.
            add_special_tokens = False
        else:
            # No chat template available: send the plain instruction text.
            text = prompt
            add_special_tokens = True

        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self._MAX_PROMPT_TOKENS,
            add_special_tokens=add_special_tokens,
        )
        # NOTE(review): with right-side truncation (the usual tokenizer
        # default - confirm for this model) an over-long prompt loses its
        # tail, including the generation header. Surface it rather than
        # degrading silently.
        if inputs["input_ids"].shape[1] >= self._MAX_PROMPT_TOKENS:
            print(
                f"Warning: prompt reached the {self._MAX_PROMPT_TOKENS}-token cap "
                "and may have been truncated."
            )
        return inputs.to(self.device)

    def evaluate_answer(self, user_answer, expected_answer, clinical_context=None):
        """
        Compare user answer to expected answer and provide feedback

        Args:
            user_answer: Examinee's response
            expected_answer: Model answer from the dataset
            clinical_context: Optional clinical context to consider

        Returns:
            Feedback string. A missing or blank ``user_answer`` yields a fixed
            "ASSESSMENT: Incorrect" message without calling the model. If
            encoding or generation fails, the error is printed and
            "Error: Could not generate feedback." is returned.
        """
        # A blank answer cannot contain any key point; skip the model call.
        if user_answer is None or not str(user_answer).strip():
            return self._NO_ANSWER_FEEDBACK

        try:
            prompt = self._build_prompt(user_answer, expected_answer, clinical_context)
            inputs = self._encode_prompt(prompt)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=self._MAX_NEW_TOKENS,
                    do_sample=True,
                    temperature=0.2,
                    pad_token_id=self.tokenizer.eos_token_id,  # Ensure pad token ID is set
                )

            # generate() returns prompt + continuation; keep the continuation only.
            prompt_length_tokens = inputs["input_ids"].shape[1]
            generated_ids = outputs[0][prompt_length_tokens:]

            return self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        except Exception as e:
            # Best-effort contract: callers always get a string back.
            print(f"Error during LLM evaluation: {e}")
            return self._GENERATION_ERROR