import os
import json
import re
import concurrent.futures
from openai import OpenAI


class MedicalClaimVerifier:
    """LLM-as-judge verifier for medical subclaims.

    Sends (context, claim) pairs to an OpenAI chat model and scores whether
    the context supports each claim. Used to compute completeness (against
    gold summary subclaims) and coverage (against full-text subclaims).
    """

    def __init__(self):
        # Implementation remains similar, but with safer error handling.
        # NOTE(review): credentials are read from a hard-coded local path at
        # construction time — this raises immediately if the file is absent.
        api_file = "/home/mshahidul/api_new.json"
        with open(api_file, "r") as f:
            api_keys = json.load(f)
        self.api_key = api_keys["openai"]
        # Note: Ensure gpt-5-nano is actually available in your tier
        self.model_name = "gpt-5-nano"
        self.client = OpenAI(api_key=self.api_key)
        # Per-level minimum targets: "comp" = completeness vs. gold summary
        # subclaims, "cov" = coverage of full-text subclaims. compute_score
        # rewards (actual - threshold), so these act as soft baselines.
        self.thresholds = {
            "low": {"comp": 1.0, "cov": 0.3226},
            "intermediate": {"comp": 1.0, "cov": 0.4091},
            "proficient": {"comp": 1.0, "cov": 0.9347},
        }

    def get_prompt(self, context, claim):
        """Build the single-word entailment prompt for one (context, claim) pair."""
        prompt = f"""
CONTEXT:
{context}

CLAIM TO VERIFY:
{claim}

INSTRUCTION:
Does the CONTEXT above provide enough evidence to support the CLAIM?
- Answer 'supported' if the claim is explicitly stated or logically followable.
- Answer 'not_supported' if the claim is missing, contradicts the text, or requires outside info.
Output only one word: 'supported' or 'not_supported'.
"""
        return prompt

    def check_support_api(self, prompt):
        """Query the judge model; return 1.0 if it answers 'supported', else 0.0.

        Any API failure is treated as 'not supported' (best-effort scoring
        rather than crashing the whole evaluation batch).
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
            )
            res = response.choices[0].message.content.strip().lower()
            # Normalize separators first: 'supported' is a substring of
            # 'not supported' / 'not-supported', so without normalization
            # those variants would be scored as a false positive 1.0.
            normalized = res.replace("-", "_").replace(" ", "_")
            return 1.0 if "supported" in normalized and "not_supported" not in normalized else 0.0
        except Exception:
            return 0.0

    def evaluate_level(self, gen_text, gold_subs, full_subs):
        """Score one generated text against both subclaim sets.

        Returns (completeness, coverage): the fraction of gold summary
        subclaims and of full-text subclaims, respectively, that the judge
        marks as supported by gen_text. Returns (0.0, 0.0) on empty input.
        """
        if not gen_text or not gold_subs or not full_subs:
            return 0.0, 0.0
        # Combining both claim sets into one batch to reduce overhead;
        # results are split back by position afterwards.
        all_claims = gold_subs + full_subs
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            results = list(
                executor.map(
                    self.check_support_api,
                    [self.get_prompt(gen_text, s) for s in all_claims],
                )
            )
        comp_results = results[:len(gold_subs)]
        cov_results = results[len(gold_subs):]
        comp_score = sum(comp_results) / len(gold_subs)
        cov_score = sum(cov_results) / len(full_subs)
        return comp_score, cov_score


# Module-level singleton so compute_score can be called as a plain function
# (e.g. as an RL reward hook). Construction performs file I/O — see __init__.
verifier = MedicalClaimVerifier()


def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """Reward function for multi-level health-literacy summaries.

    Parses solution_str as JSON with keys '{low,intermediate,proficient}_health_literacy',
    judges each level against the ground-truth subclaims, and returns a shaped
    scalar reward. Returns -5.0 on unparseable output and 0.0 when the
    ground truth is missing either subclaim list.
    """
    gold_subs = ground_truth.get('summary_subclaims', [])
    full_subs = ground_truth.get('fulltext_subclaims', [])
    if not gold_subs or not full_subs:
        return 0.0

    # 1. Parsing with fallback: strip markdown code fences before json.loads.
    try:
        cleaned_str = solution_str.strip()
        if "```json" in cleaned_str:
            cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip()
        elif "```" in cleaned_str:
            cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip()
        data = json.loads(cleaned_str)
    except Exception:
        # Hard penalty for malformed output so the policy learns valid JSON.
        return -5.0

    levels = ["low", "intermediate", "proficient"]
    scores = {}

    # 2. Score Calculation
    for lvl in levels:
        gen_text = data.get(f"{lvl}_health_literacy", "")
        if not gen_text:
            scores[lvl] = {"comp": 0.0, "cov": 0.0, "missing": True}
        else:
            comp, cov = verifier.evaluate_level(gen_text, gold_subs, full_subs)
            scores[lvl] = {"comp": comp, "cov": cov, "missing": False}

    # 3. Reward Shaping Logic
    total_reward = 0.0
    low_cov = scores["low"]["cov"]
    int_cov = scores["intermediate"]["cov"]
    pro_cov = scores["proficient"]["cov"]

    # Soft Hierarchy Check: Reward progression, penalize stagnation.
    # Instead of a -2.0 early exit, we subtract if the order is wrong:
    # coverage should be non-decreasing from low -> intermediate -> proficient.
    hierarchy_penalty = 0.0
    if not (low_cov <= int_cov <= pro_cov):
        hierarchy_penalty = -2.0

    for lvl in levels:
        if scores[lvl]["missing"]:
            total_reward -= 1.0  # Penalty per missing field
            continue
        comp_s = scores[lvl]["comp"]
        cov_s = scores[lvl]["cov"]
        thresh = verifier.thresholds[lvl]
        # Continuous Reward: (Actual - Threshold).
        # This tells the model "You're 10% away" vs "You failed".
        total_reward += (comp_s - thresh["comp"])
        total_reward += (cov_s - thresh["cov"])

    return total_reward + hierarchy_penalty