# File size: 6,538 Bytes
#
# c7a6fe6 (presumably the revision hash shown by the hosting page)

import json
import concurrent.futures
from openai import OpenAI

class FactualityBenchmarker:
    """Benchmark how well a chat model judges whether a context supports a claim.

    The model is queried through an OpenAI-compatible chat-completions client
    and asked for a one-word verdict ('supported' / 'not_supported') per claim.
    """

    # Canonical verdict strings; test cases use the same values in "expected".
    SUPPORTED = "supported"
    NOT_SUPPORTED = "not_supported"

    def __init__(self, api_url="http://172.16.34.29:8004/v1", model="qwen3-32b-readctrl"):
        """Create the API client.

        Args:
            api_url: Base URL of the OpenAI-compatible endpoint.
            model: Model name sent with every completion request.
        """
        self.client = OpenAI(base_url=api_url, api_key="EMPTY")
        self.model = model

    @staticmethod
    def _parse_verdict(raw):
        """Map a raw model reply onto 'supported' or 'not_supported'.

        Negative spellings are checked first and separators are normalized,
        so replies such as "not supported", "Not-Supported" or "unsupported"
        are not mistaken for a positive verdict just because they contain
        the substring "supported". Empty or unrecognized replies count as
        'not_supported'.
        """
        if not raw:
            return FactualityBenchmarker.NOT_SUPPORTED
        # Collapse spaces/hyphens/newlines into single underscores.
        normalized = "_".join(raw.lower().replace("-", " ").split())
        if "not_supported" in normalized or "unsupported" in normalized:
            return FactualityBenchmarker.NOT_SUPPORTED
        if "supported" in normalized:
            return FactualityBenchmarker.SUPPORTED
        return FactualityBenchmarker.NOT_SUPPORTED

    def verify_claim(self, context, claim):
        """Ask the model whether the context supports the claim.

        Args:
            context: Source passage the claim is checked against.
            claim: Statement to verify.

        Returns:
            'supported' or 'not_supported'. Any failure while calling the
            API or reading the reply is printed and counted as
            'not_supported' so a single bad request does not abort a run.
        """
        prompt = f"""
        CONTEXT:
        {context}

        CLAIM TO VERIFY:
        {claim}

        INSTRUCTION:
        Does the CONTEXT above provide enough evidence to support the CLAIM? 
        - Answer 'supported' if the claim is explicitly stated or logically followable.
        - Answer 'not_supported' if the claim is missing, contradicts the text, or requires outside info.

        Output only one word: 'supported' or 'not_supported'.
        """

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,  # Zero temp for consistency in benchmarks
                max_tokens=10,
            )
            # content may be None; _parse_verdict treats that as a negative.
            content = response.choices[0].message.content
        except Exception as e:
            # Deliberately broad: this is the network boundary and the
            # benchmark should keep going after a failed request.
            print(f"Error: {e}")
            return self.NOT_SUPPORTED

        return self._parse_verdict(content)

    def run_evaluation(self, test_cases):
        """Run the benchmark over a list of test cases and print a report.

        Each test case has the shape
        {"context": "...", "claims": [{"text": "...", "expected": "supported" | "not_supported"}]}.

        Args:
            test_cases: Iterable of test-case dicts as described above.

        Returns:
            A dict with "accuracy" (percentage, 0.0 when there are no
            claims), "correct" and "total" counts.
        """
        total_claims = 0
        correct_predictions = 0

        print(f"--- Starting Evaluation on {self.model} ---")

        for i, case in enumerate(test_cases):
            context = case["context"]
            print(f"\nTest Case {i+1}:")

            for claim_data in case["claims"]:
                claim_text = claim_data["text"]
                expected = claim_data["expected"]

                # Model prediction
                prediction = self.verify_claim(context, claim_text)

                is_correct = prediction == expected
                if is_correct:
                    correct_predictions += 1
                total_claims += 1

                status = "PASS" if is_correct else "FAIL"
                print(f"  [{status}] Claim: {claim_text[:60]}... (Expected: {expected}, Got: {prediction})")

        # Guard against an empty benchmark to avoid dividing by zero.
        accuracy = (correct_predictions / total_claims) * 100 if total_claims > 0 else 0.0
        print("\n" + "=" * 30)
        print(f"FINAL ACCURACY: {accuracy:.2f}% ({correct_predictions}/{total_claims})")
        print("=" * 30)

        return {
            "accuracy": accuracy,
            "correct": correct_predictions,
            "total": total_claims,
        }

# --- Benchmark fixtures ---
# Each entry pairs a "context" passage with the "claims" checked against it.
# Every claim has its "text" and the "expected" verdict string
# ("supported" or "not_supported").
test_data = [
    # Case 1: clinical case report.
    {
        # Written line by line so the trailing space that ends most lines of
        # the passage is explicit rather than hidden at the end of a line.
        "context": (
            "CASE PRESENTATION:\n"
            "A 64-year-old male with a 15-year history of Type 2 Diabetes Mellitus and stage 3 chronic kidney disease (CKD) \n"
            "presented to the emergency department with acute shortness of breath and peripheral edema. On physical \n"
            "examination, the patient was hypertensive (175/95 mmHg) and tachycardic (110 bpm). Lung auscultation revealed \n"
            "bilateral crackles in the lower lobes, consistent with pulmonary congestion. Notable laboratory findings \n"
            "included a Serum Creatinine of 2.8 mg/dL (baseline 1.9 mg/dL) and a Brain Natriuretic Peptide (BNP) of 1,250 pg/mL. \n"
            "\n"
            "Crucially, the patient reported no history of tobacco use and denied any chest pain or radiating pain to the \n"
            "left arm. An EKG showed sinus tachycardia but no ST-segment elevation or T-wave inversion. The medical team \n"
            "initiated a regimen of intravenous furosemide (40mg bolus) and transitioned the patient from his home \n"
            "medication (Metformin) to insulin glargine to manage blood glucose during the acute episode, citing concerns \n"
            "over lactic acidosis risk given the acute kidney injury. After 48 hours, the patient's oxygen saturation \n"
            "improved from 89% on room air to 95%, and his weight decreased by 3.2 kg due to successful diuresis. \n"
            "The discharge summary noted that despite the respiratory distress, there were no signs of systemic infection \n"
            "or fever during the entire 4-day hospital stay."
        ),
        "claims": [
            # 1. Literal extraction
            {
                "text": "The patient has had Type 2 Diabetes for 15 years.",
                "expected": "supported",
            },
            # 2. Medical paraphrasing: 'bilateral crackles' / 'pulmonary congestion'
            {
                "text": "The patient showed signs of fluid buildup in the lungs.",
                "expected": "supported",
            },
            # 3. Negative constraint: the text says 'no history of tobacco use'
            {
                "text": "The patient has a history of smoking.",
                "expected": "not_supported",
            },
            # 4. Mathematical inference: 2.8 - 1.9 = 0.9
            {
                "text": "The patient's Serum Creatinine increased by 0.9 mg/dL from his baseline.",
                "expected": "supported",
            },
            # 5. Cause and effect
            {
                "text": "The doctors stopped Metformin because of the risk of lactic acidosis.",
                "expected": "supported",
            },
            # 6. Negative finding: the patient explicitly denied this symptom
            {
                "text": "The patient complained of pain moving down his left arm.",
                "expected": "not_supported",
            },
            # 7. Vital sign interpretation: 175/95 mmHg and 110 bpm
            {
                "text": "The patient was experiencing high blood pressure and a fast heart rate upon arrival.",
                "expected": "supported",
            },
            # 8. Numerical recovery: 3.2 kg after 48 hours
            {
                "text": "The patient lost over 3 kilograms during the first two days of treatment.",
                "expected": "supported",
            },
            # 9. Complex inference: the EKG showed no ST-segment elevation
            {
                "text": "The EKG provided clear evidence of an active heart attack.",
                "expected": "not_supported",
            },
            # 10. Systemic health status: 'no fever' means afebrile
            {
                "text": "The patient remained afebrile throughout the hospitalization.",
                "expected": "supported",
            },
        ],
    },
    # Case 2: short financial summary.
    {
        "context": "The company reported a 15% increase in revenue, reaching $2 billion this quarter. However, net profit dropped due to high R&D costs.",
        "claims": [
            {"text": "Revenue reached $2 billion.", "expected": "supported"},
            {"text": "Net profit increased this quarter.", "expected": "not_supported"},
            {"text": "Spending on Research and Development impacted profits.", "expected": "supported"},
        ],
    },
]

if __name__ == "__main__":
    # Script entry point: evaluate the default model/endpoint on the
    # fixtures defined in this file.
    FactualityBenchmarker().run_evaluation(test_data)