| import json |
| import sys |
| from openai import OpenAI |
| import ast,os |
| |
| |
| |
# Path (on the local filesystem) of the merged, fine-tuned Qwen3-32B model
# that the vLLM server was launched with; vLLM identifies models by this path.
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3_BF16_merged"
# OpenAI-compatible endpoint exposed by the local vLLM server.
VLLM_API_URL = "http://localhost:8004/v1"
# vLLM does not validate API keys; any placeholder value is accepted.
VLLM_API_KEY = "EMPTY"



# Module-level client reused by every inference call below.
client = OpenAI(
    base_url=VLLM_API_URL,
    api_key=VLLM_API_KEY,
)
|
|
| |
| |
| |
| def infer_reasonableness( |
| reference_summary: str, |
| generated_summary: str, |
| readability_level: str, |
| subclaim_text: str, |
| result: int, |
| ): |
| """ |
| Predict reasonableness using the local vLLM server. |
| No error handling: validation or connection errors will raise exceptions. |
| """ |
|
|
| |
| prompt = f""" |
| You are an impartial medical summarization evaluator. |
| |
| Goal: |
| Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is *reasonable*, given the readability level of the generated summary. |
| |
| Readability Criteria: |
| - Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details. |
| - Intermediate: for general educated readers; keep main findings but simplify phrasing. |
| - Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content. |
| |
| Judging rules: |
| * Base your decision strictly on what appears in the generated summary. |
| * If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable". |
| * If result = 0 and the subclaim is essential to the main story, choose "unreasonable". |
| * Stay consistent between `result`, justification, and readability level. |
| |
| ### Inputs |
| Readability Level: {readability_level} |
| Reference Summary: {reference_summary} |
| Generated Summary: {generated_summary} |
| Subclaim: "{subclaim_text}" |
| Result: {result} # 1 = supported (included), 0 = omitted |
| |
| ### Task |
| Respond **only** with the following JSON object: |
| |
| {{ |
| "reasonableness": "<reasonable | partially_reasonable | unreasonable>", |
| "justification": "<short clear explanation>" |
| }} |
| """.strip() |
|
|
| messages = [{"role": "user", "content": prompt}] |
|
|
| |
| response = client.chat.completions.create( |
| model=MODEL_NAME, |
| messages=messages, |
| temperature=0.2, |
| max_tokens=200, |
| top_p=0.8, |
| ) |
|
|
| output_text = response.choices[0].message.content |
|
|
| |
| try: |
| if "</think>" in output_text: |
| output_text = output_text.split("</think>")[1] |
| |
| clean_text = output_text.strip().replace("```json", "").replace("```", "").strip() |
| |
| t=ast.literal_eval(clean_text) |
|
|
| |
| return t |
| except Exception as e: |
| return output_text |
|
|
|
|
| |
| |
| |
| if __name__ == "__main__": |
| import argparse |
| parser = argparse.ArgumentParser() |
| parser.add_argument("--data_path", type=str, required=True, |
| help="Path to the JSON file containing evaluation data.") |
| args = parser.parse_args() |
| data_path = args.data_path |
| |
| file_name=os.path.basename(data_path) |
| |
| |
| with open(data_path, 'r') as f: |
| dataset = json.load(f) |
| |
| |
| save_path = f'/home/mshahidul/readctrl/data/completeness_resoning_result/{file_name}' |
| full_results = [] |
| if os.path.exists(save_path): |
| with open(save_path, 'r') as f: |
| full_results = json.load(f) |
|
|
| import tqdm |
| for item in tqdm.tqdm(dataset): |
| if any(d['id'] == item['id'] for d in full_results): |
| continue |
| reference_summary = item['summary'] |
| temp2={} |
| for label in ['easy', 'intermediate', 'hard']: |
| generated_summary = item[f'{label}_text'] |
| subclaim_list = item['metrics'][f'{label}']['completeness']['details'] |
| temp=[] |
| for idx, subclaim in enumerate(subclaim_list): |
| |
| |
| result = 1 if subclaim['label'] == 'supported' else 0 |
| |
| if result ==0: |
| output = infer_reasonableness( |
| reference_summary=reference_summary, |
| generated_summary=generated_summary, |
| readability_level=label, |
| subclaim_text=subclaim['subclaim'], |
| result=result, |
| ) |
| |
| temp.append({ |
| 'subclaim': subclaim['subclaim'], |
| 'output': output |
| }) |
| else: |
| temp.append({ |
| 'subclaim': subclaim['subclaim'], |
| 'output': { |
| 'reasonableness': 'reasonable', |
| 'justification': 'The subclaim is included in the generated summary, hence it is reasonable.' |
| } |
| }) |
|
|
| temp2[label] = { |
| 'results': temp |
| } |
| full_results.append({ |
| 'id': item['id'], |
| 'completeness': temp2 |
| }) |
| if len(full_results) % 10 == 0: |
| with open(save_path, 'w') as f: |
| json.dump(full_results, f, indent=2, ensure_ascii=False) |
|
|
| with open(save_path, 'w') as f: |
| json.dump(full_results, f, indent=2, ensure_ascii=False) |
|
|
| |
| |