| | import json |
| | import sys |
| | from openai import OpenAI |
| | import ast,os |
| | |
| | |
| | |
# Path of the locally merged fine-tuned Qwen3-32B checkpoint; vLLM serves it
# under this path string as the model name.
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged"
# OpenAI-compatible endpoint exposed by the local vLLM server.
VLLM_API_URL = "http://localhost:8004/v1"
# Placeholder key ("EMPTY") — presumably the local server performs no auth; confirm.
VLLM_API_KEY = "EMPTY"


# Module-level client, reused by every call to infer_reasonableness().
client = OpenAI(
    base_url=VLLM_API_URL,
    api_key=VLLM_API_KEY,
)
| |
|
| | |
| | |
| | |
def infer_reasonableness(
    fulltext: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """
    Ask the local vLLM server whether a subclaim is a reasonable addition
    for the given readability level.

    Parameters
    ----------
    fulltext : str
        Reference source document.
    generated_summary : str
        Readability-controlled summary containing the subclaim.
    readability_level : str
        One of 'easy', 'intermediate', 'hard'.
    subclaim_text : str
        The subclaim being judged.
    result : int
        1 = supported by the source, 0 = unsupported (echoed into the prompt).

    Returns
    -------
    dict | str
        The parsed JSON evaluation on success; the raw model output string
        when parsing fails (best-effort — this function never raises for
        malformed model output; connection errors still propagate).
    """
    # NOTE(review): the prompt text below is reproduced verbatim. It contains a
    # stray quote in `"result": 0"` and its TASK labels ("reasonable addition" /
    # "unnecessary but harmless" / "misleading / hallucinated") differ from the
    # Output Format schema ("reasonable | partially_reasonable | unreasonable").
    # Changing the prompt would change model behavior — confirm intent first.
    prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will assess whether the **unsupported subclaim** in a generated summary (when `"result": 0"`) is a *reasonable addition* given the readability level (*easy / intermediate / hard*).

The goal is to decide whether this **extra piece of information** is an acceptable simplification or a *hallucination* that reduces factual faithfulness.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Linguistic & Stylistic Profile | Content Goal | Allowable Additions |
| :-- | :-- | :-- | :-- | :-- |
| **Easy (FH 70–100, grade 5–7)** | General public; early secondary readers | Short, direct sentences using common vocabulary and concrete ideas. Avoid subordinate clauses and technical terms. Tone should be explanatory, lively, and highly accessible. | Simplify and clarify events and outcomes without introducing technical or diagnostic details. | General background context or plain-language explanations are acceptable; **no new facts, data, or inferred medical claims.** |
| **Intermediate (FH 50–69, grade 8–12)** | Educated layperson / medical student | Moderate sentence length and complexity. Vocabulary suitable for high-school or introductory science readers. May include limited domain terms with brief clarification. | Present essential medical content with clear logic and limited detail, ensuring readability for non-experts. | Brief clarifications, definitions, or causal links consistent with the source are allowed; **avoid speculative or unconfirmed data.** |
| **Hard (FH 0–49, university / professional)** | Medical professionals / technical audience | Long, multi-clause sentences; formal academic tone. Incorporate precise domain vocabulary, causal and analytical connectors (e.g., *por consiguiente*, *sin embargo*, *en virtud de*, *dado que*), at least one definition, one process description, and one statement of implications or challenges. | Preserve full factual accuracy, diagnostic precision, and interpretive nuance expected in professional discourse. | Additions are **not permitted**; every statement must be directly supported by the reference text. Parenthetical clarifications or relative clauses may be used for cohesion, not new content. |

---

### **Input**

```
Readability Level: {readability_level}

Reference Full Text:
{fulltext}

Generated Summary:
{generated_summary}

Subclaim: "{subclaim_text}"
Result: {result} # 1 = supported (included), 0 = unsupported
```

---

### **TASK INSTRUCTIONS**

If `"result": 0"`, judge whether including this subclaim is **reasonable** for the given readability level.
Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
Provide a **1–2 sentence justification** describing your reasoning.

---

### **Output Format**

Return structured JSON:

```json
{{
"evaluation": {{
"reasonableness": "<reasonable | partially_reasonable | unreasonable>",
"justification": "<short explanation>"
}}
}}
```
""".strip()

    messages = [{"role": "user", "content": prompt}]

    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )

    output_text = response.choices[0].message.content

    try:
        # Drop any chain-of-thought the model emits before a closing </think> tag.
        if "</think>" in output_text:
            output_text = output_text.split("</think>")[1]

        # Strip markdown code fences around the JSON payload.
        clean_text = output_text.strip().replace("```json", "").replace("```", "").strip()

        # BUG FIX: the model is asked for JSON, so parse with json.loads first —
        # ast.literal_eval fails on JSON literals such as true/false/null.
        # Fall back to literal_eval for Python-style (single-quoted) dicts.
        try:
            return json.loads(clean_text)
        except json.JSONDecodeError:
            return ast.literal_eval(clean_text)
    except Exception:
        # Best-effort contract (unchanged): surface the raw text instead of raising.
        return output_text
| |
|
| |
|
| | |
| | |
| | |
if __name__ == "__main__":
    import argparse

    import tqdm

    parser = argparse.ArgumentParser(
        description="Score reasonableness of unsupported subclaims with a local vLLM model."
    )
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    # Backward-compatible generalization: the output directory used to be hard-coded;
    # it is now an argument whose default preserves the old behavior.
    parser.add_argument("--save_dir", type=str,
                        default='/home/mshahidul/readctrl/data/attribution_reasoning_result/',
                        help="Directory where the per-file result JSON is written.")
    args = parser.parse_args()
    data_path = args.data_path

    # Results are saved under the same basename as the input file.
    file_name = os.path.basename(data_path)

    with open(data_path, 'r') as f:
        dataset = json.load(f)

    os.makedirs(args.save_dir, exist_ok=True)
    save_path = os.path.join(args.save_dir, file_name)

    # Resume support: reload any previous results and skip already-scored items.
    full_results = []
    if os.path.exists(save_path):
        with open(save_path, 'r') as f:
            full_results = json.load(f)
    # PERF: set membership instead of scanning full_results once per item (was O(n^2)).
    done_ids = {d['id'] for d in full_results}

    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue
        fulltext = item['fulltext']
        per_level = {}
        for label in ['easy', 'intermediate', 'hard']:
            generated_summary = item[f'{label}_text']
            subclaim_list = item['metrics'][label]['attribution']['details']
            judged = []
            for subclaim in subclaim_list:
                result = 1 if subclaim['label'] == 'supported' else 0
                if result == 0:
                    # Only unsupported subclaims need a model judgement.
                    output = infer_reasonableness(
                        fulltext=fulltext,
                        generated_summary=generated_summary,
                        readability_level=label,
                        subclaim_text=subclaim['subclaim'],
                        result=result,
                    )
                else:
                    # Supported subclaims are reasonable by definition; skip the model call.
                    output = {
                        'reasonableness': 'reasonable',
                        'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                    }
                judged.append({
                    'subclaim': subclaim['subclaim'],
                    'output': output
                })

            per_level[label] = {
                'results': judged
            }

        full_results.append({
            'id': item['id'],
            'completeness': per_level
        })
        done_ids.add(item['id'])
        # Checkpoint every 10 items so a crash loses at most 10 items of work.
        if len(full_results) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(full_results, f, indent=2, ensure_ascii=False)

    # Final write covers any tail not caught by the periodic checkpoint.
    with open(save_path, 'w') as f:
        json.dump(full_results, f, indent=2, ensure_ascii=False)
| |
|
| | |
| | |