| import json |
| import concurrent.futures |
| from openai import OpenAI |
|
|
class FactualityBenchmarker:
    """Benchmark a chat model on context-grounded claim verification.

    The model is reached through an OpenAI-compatible endpoint. For each
    (context, claim) pair it is asked for a one-word verdict, which is
    compared with the expected verdict of the test case.
    """

    def __init__(self, api_url="http://172.16.34.29:8004/v1", model="qwen3-32b-readctrl"):
        """Create the API client.

        Args:
            api_url: Base URL of the OpenAI-compatible server.
            model: Model name sent with every completion request.
        """
        # The client is constructed with the placeholder key "EMPTY".
        self.client = OpenAI(base_url=api_url, api_key="EMPTY")
        self.model = model

    @staticmethod
    def _parse_verdict(raw):
        """Map raw model output to 'supported' or 'not_supported'.

        Negative spellings ("not supported", "not-supported",
        "unsupported", "not_supported") are checked first, because each of
        them contains the substring "supported". Empty, missing or
        unrecognised output is treated as 'not_supported'.
        """
        if not raw:
            return "not_supported"

        text = raw.strip().lower()
        # NOTE(review): presumably some reasoning models prepend a
        # <think>...</think> block; only the text after the last closing
        # tag is treated as the answer. Without such a tag the text is
        # used unchanged.
        text = text.rpartition("</think>")[2]

        # Collapse whitespace and hyphens to underscores so every spelling
        # of the negative verdict normalises to "not_supported".
        normalized = "_".join(text.replace("-", " ").split())

        if "not_supported" in normalized or "unsupported" in normalized:
            return "not_supported"
        return "supported" if "supported" in normalized else "not_supported"

    def verify_claim(self, context, claim):
        """Ask the model whether ``context`` supports ``claim``.

        Args:
            context: Source text the claim is checked against.
            claim: Statement to verify.

        Returns:
            'supported' or 'not_supported'. Any failure while calling the
            API or reading the response is printed and reported as
            'not_supported'.
        """
        # Built from explicit lines so that code indentation can never
        # leak into the prompt text.
        prompt = (
            "\n"
            "CONTEXT:\n"
            f"{context}\n"
            "\n"
            "CLAIM TO VERIFY:\n"
            f"{claim}\n"
            "\n"
            "INSTRUCTION:\n"
            "Does the CONTEXT above provide enough evidence to support the CLAIM?\n"
            "- Answer 'supported' if the claim is explicitly stated or logically followable.\n"
            "- Answer 'not_supported' if the claim is missing, contradicts the text, or requires outside info.\n"
            "\n"
            "Output only one word: 'supported' or 'not_supported'.\n"
        )

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.0,
                max_tokens=10,
            )
            return self._parse_verdict(response.choices[0].message.content)
        except Exception as e:
            # Best-effort boundary: one failed request must not abort the
            # whole benchmark run.
            print(f"Error: {e}")
            return "not_supported"

    def run_evaluation(self, test_cases):
        """Run the benchmark over ``test_cases`` and print a report.

        Each test case has the shape::

            {"context": "...",
             "claims": [{"text": "...",
                         "expected": "supported" | "not_supported"}]}

        Args:
            test_cases: Iterable of test-case dicts as described above.

        Returns:
            Accuracy as a percentage in [0, 100]; 0.0 when there are no
            claims.
        """
        total_claims = 0
        correct_predictions = 0

        print(f"--- Starting Evaluation on {self.model} ---")

        for i, case in enumerate(test_cases):
            context = case["context"]
            print(f"\nTest Case {i+1}:")

            for claim_data in case["claims"]:
                claim_text = claim_data["text"]
                expected = claim_data["expected"]

                prediction = self.verify_claim(context, claim_text)

                is_correct = prediction == expected
                if is_correct:
                    correct_predictions += 1
                total_claims += 1

                status = "PASS" if is_correct else "FAIL"
                print(f"  [{status}] Claim: {claim_text[:60]}... (Expected: {expected}, Got: {prediction})")

        # Guard against division by zero when no claims were supplied.
        accuracy = (correct_predictions / total_claims) * 100 if total_claims > 0 else 0.0
        print("\n" + "=" * 30)
        print(f"FINAL ACCURACY: {accuracy:.2f}% ({correct_predictions}/{total_claims})")
        print("=" * 30)
        return accuracy
|
|
| |
# Sample benchmark cases.
# Each case is {"context": <source text>, "claims": [...]}, and each claim is
# {"text": <statement>, "expected": "supported" | "not_supported"}.
test_data = [
    {
        # Clinical case report: mixes directly stated facts, simple
        # inferences (e.g. a creatinine delta) and explicitly negated findings.
        "context": """CASE PRESENTATION:
A 64-year-old male with a 15-year history of Type 2 Diabetes Mellitus and stage 3 chronic kidney disease (CKD)
presented to the emergency department with acute shortness of breath and peripheral edema. On physical
examination, the patient was hypertensive (175/95 mmHg) and tachycardic (110 bpm). Lung auscultation revealed
bilateral crackles in the lower lobes, consistent with pulmonary congestion. Notable laboratory findings
included a Serum Creatinine of 2.8 mg/dL (baseline 1.9 mg/dL) and a Brain Natriuretic Peptide (BNP) of 1,250 pg/mL.

Crucially, the patient reported no history of tobacco use and denied any chest pain or radiating pain to the
left arm. An EKG showed sinus tachycardia but no ST-segment elevation or T-wave inversion. The medical team
initiated a regimen of intravenous furosemide (40mg bolus) and transitioned the patient from his home
medication (Metformin) to insulin glargine to manage blood glucose during the acute episode, citing concerns
over lactic acidosis risk given the acute kidney injury. After 48 hours, the patient's oxygen saturation
improved from 89% on room air to 95%, and his weight decreased by 3.2 kg due to successful diuresis.
The discharge summary noted that despite the respiratory distress, there were no signs of systemic infection
or fever during the entire 4-day hospital stay.""",
        "claims": [
            {"text": "The patient has had Type 2 Diabetes for 15 years.", "expected": "supported"},
            {"text": "The patient showed signs of fluid buildup in the lungs.", "expected": "supported"},
            {"text": "The patient has a history of smoking.", "expected": "not_supported"},
            {"text": "The patient's Serum Creatinine increased by 0.9 mg/dL from his baseline.", "expected": "supported"},
            {"text": "The doctors stopped Metformin because of the risk of lactic acidosis.", "expected": "supported"},
            {"text": "The patient complained of pain moving down his left arm.", "expected": "not_supported"},
            {"text": "The patient was experiencing high blood pressure and a fast heart rate upon arrival.", "expected": "supported"},
            {"text": "The patient lost over 3 kilograms during the first two days of treatment.", "expected": "supported"},
            {"text": "The EKG provided clear evidence of an active heart attack.", "expected": "not_supported"},
            {"text": "The patient remained afebrile throughout the hospitalization.", "expected": "supported"},
        ],
    },
    {
        # Short financial snippet: one claim contradicts the text.
        "context": "The company reported a 15% increase in revenue, reaching $2 billion this quarter. However, net profit dropped due to high R&D costs.",
        "claims": [
            {"text": "Revenue reached $2 billion.", "expected": "supported"},
            {"text": "Net profit increased this quarter.", "expected": "not_supported"},
            {"text": "Spending on Research and Development impacted profits.", "expected": "supported"},
        ],
    },
]
|
|
if __name__ == "__main__":
    # Script entry point: evaluate the module-level sample cases using the
    # benchmarker's default endpoint and model.
    benchmarker = FactualityBenchmarker()
    benchmarker.run_evaluation(test_data)