"""Judge whether unsupported subclaims in generated summaries are reasonable.

For every record in the input JSON file, each subclaim labelled as not
supported by the reference text is sent to a local vLLM server (through the
OpenAI-compatible client) and the model's verdict is stored.  Results are
checkpointed to disk so an interrupted run can be resumed.
"""

import ast
import json
import os
import sys

from openai import OpenAI

# ===========================
# CONFIGURATION
# ===========================
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged"
VLLM_API_URL = "http://localhost:8004/v1"
VLLM_API_KEY = "EMPTY"

# Directory where the per-input-file result JSON is written.
_RESULT_DIR = '/home/mshahidul/readctrl/data/attribution_reasoning_result/'

# Marker that closes the model's reasoning section; only the text after it is
# parsed.  NOTE(review): presumed to be Qwen3's "</think>" tag -- the previous
# code split on an empty string, which always raises ValueError -- confirm
# against the deployed chat template.
_THINK_END = "</think>"

# Initialize Client
client = OpenAI(
    base_url=VLLM_API_URL,
    api_key=VLLM_API_KEY,
)


# ===========================
# INFERENCE FUNCTION
# ===========================
def _build_prompt(fulltext, generated_summary, readability_level, subclaim_text, result):
    """Return the evaluation prompt with the five inputs substituted in."""
    # NOTE(review): the wording below is unchanged; the line breaks were
    # reconstructed as conventional Markdown -- confirm they match the template
    # the model was fine-tuned on.
    return f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will assess whether the **unsupported subclaim** in a generated summary (when `"result": 0"`) is a *reasonable addition* given the readability level (*easy / intermediate / hard*).

The goal is to decide whether this **extra piece of information** is an acceptable simplification or a *hallucination* that reduces factual faithfulness.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Linguistic & Stylistic Profile | Content Goal | Allowable Additions |
| :-- | :-- | :-- | :-- | :-- |
| **Easy (FH 70–100, grade 5–7)** | General public; early secondary readers | Short, direct sentences using common vocabulary and concrete ideas. Avoid subordinate clauses and technical terms. Tone should be explanatory, lively, and highly accessible. | Simplify and clarify events and outcomes without introducing technical or diagnostic details. | General background context or plain-language explanations are acceptable; **no new facts, data, or inferred medical claims.** |
| **Intermediate (FH 50–69, grade 8–12)** | Educated layperson / medical student | Moderate sentence length and complexity. Vocabulary suitable for high-school or introductory science readers. May include limited domain terms with brief clarification. | Present essential medical content with clear logic and limited detail, ensuring readability for non-experts. | Brief clarifications, definitions, or causal links consistent with the source are allowed; **avoid speculative or unconfirmed data.** |
| **Hard (FH 0–49, university / professional)** | Medical professionals / technical audience | Long, multi-clause sentences; formal academic tone. Incorporate precise domain vocabulary, causal and analytical connectors (e.g., *por consiguiente*, *sin embargo*, *en virtud de*, *dado que*), at least one definition, one process description, and one statement of implications or challenges. | Preserve full factual accuracy, diagnostic precision, and interpretive nuance expected in professional discourse. | Additions are **not permitted**; every statement must be directly supported by the reference text. Parenthetical clarifications or relative clauses may be used for cohesion, not new content. |

---

### **Input**

```
Readability Level: {readability_level}

Reference Full Text:
{fulltext}

Generated Summary:
{generated_summary}

Subclaim: "{subclaim_text}"

Result: {result}  # 1 = supported (included), 0 = unsupported
```

---

### **TASK INSTRUCTIONS**

If `"result": 0"`, judge whether including this subclaim is **reasonable** for the given readability level.
Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
Provide a **1–2 sentence justification** describing your reasoning.

---

### **Output Format**

Return structured JSON:

```json
{{
  "evaluation": {{
    "reasonableness": "",
    "justification": ""
  }}
}}
```
""".strip()


def _parse_model_output(output_text):
    """Turn the raw completion into a Python object, or return the text as-is.

    The reasoning section (everything up to the last ``</think>``) and any
    Markdown code fences are removed first.  The remainder is parsed as JSON,
    with ``ast.literal_eval`` as a fallback for Python-style dict output
    (single quotes).  When neither parser accepts it, the text that follows
    the reasoning section is returned unchanged so the caller can still store
    and inspect it.
    """
    if not isinstance(output_text, str):
        # The server returned no text content; hand back whatever we received.
        return output_text

    # rpartition yields the whole string when the marker is absent.
    answer_text = output_text.rpartition(_THINK_END)[2]
    clean_text = answer_text.strip().replace("```json", "").replace("```", "").strip()

    try:
        return json.loads(clean_text)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(clean_text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best effort: unparseable answers are kept as raw text, not dropped.
        return answer_text


def infer_reasonableness(
    fulltext: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """
    Predict reasonableness of a subclaim using the local vLLM server.

    Args:
        fulltext: Reference full text the summary was generated from.
        generated_summary: Summary that contains the subclaim.
        readability_level: One of 'easy', 'intermediate', 'hard'.
        subclaim_text: The subclaim to evaluate.
        result: 1 if the subclaim is supported by the reference, 0 otherwise.

    Returns:
        The parsed model answer (normally a dict shaped like
        ``{"evaluation": {"reasonableness": ..., "justification": ...}}``),
        or the raw answer text when it cannot be parsed.

    Raises:
        Connection and API errors from the OpenAI client are not caught.
    """
    # ---- Build inference prompt ----
    prompt = _build_prompt(
        fulltext, generated_summary, readability_level, subclaim_text, result
    )
    messages = [{"role": "user", "content": prompt}]

    # ---- Call vLLM Server ----
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )
    output_text = response.choices[0].message.content

    # ---- Clean Output (Handle Thinking & Markdown) and parse ----
    return _parse_model_output(output_text)


# ===========================
# MAIN EXECUTION
# ===========================
def _save_results(save_path, results):
    """Write *results* as UTF-8 JSON, replacing the old file in one step."""
    # Dump to a sibling temp file first so an interrupted run cannot leave a
    # truncated checkpoint that would break the next resume.
    tmp_path = f'{save_path}.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, save_path)


def _evaluate_item(item):
    """Evaluate every subclaim of one dataset record, for all three levels."""
    fulltext = item['fulltext']
    per_level = {}
    for label in ['easy', 'intermediate', 'hard']:
        generated_summary = item[f'{label}_text']
        subclaim_list = item['metrics'][f'{label}']['attribution']['details']
        level_results = []
        for subclaim in subclaim_list:
            # Each entry is a dict with a 'label' status and the 'subclaim' text.
            result = 1 if subclaim['label'] == 'supported' else 0
            if result == 0:
                output = infer_reasonableness(
                    fulltext=fulltext,
                    generated_summary=generated_summary,
                    readability_level=label,
                    subclaim_text=subclaim['subclaim'],
                    result=result,
                )
            else:
                # Supported subclaims need no model call.
                output = {
                    'reasonableness': 'reasonable',
                    'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                }
            level_results.append({
                'subclaim': subclaim['subclaim'],
                'output': output
            })
        per_level[label] = {
            'results': level_results
        }
    return {
        'id': item['id'],
        'completeness': per_level
    }


def main():
    """Run the evaluation over ``--data_path`` with checkpoint and resume."""
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    args = parser.parse_args()
    data_path = args.data_path
    file_name = os.path.basename(data_path)

    # Open file directly (Will raise FileNotFoundError if missing)
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    save_path = f'{_RESULT_DIR}{file_name}'
    os.makedirs(_RESULT_DIR, exist_ok=True)

    # Resume from an earlier run when a result file already exists.
    full_results = []
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            full_results = json.load(f)
    # Set lookup replaces a linear scan of full_results for every item.
    done_ids = {entry['id'] for entry in full_results}

    import tqdm
    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue

        full_results.append(_evaluate_item(item))
        done_ids.add(item['id'])

        # Checkpoint every 10 accumulated records.
        if len(full_results) % 10 == 0:
            _save_results(save_path, full_results)

    _save_results(save_path, full_results)


if __name__ == "__main__":
    main()