# File size: 6,196 Bytes

# 9c6961c

import json
import sys
from openai import OpenAI
import ast,os
# ===========================
# CONFIGURATION
# ===========================
# Each setting may be overridden with an environment variable so the script
# can target another checkpoint or server without editing this file; when the
# variable is unset, the original hard-coded value is used.
MODEL_NAME = os.environ.get(
    "VLLM_MODEL_NAME",
    "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-completeness_resonability_check_8kCtx_v3_BF16_merged",
)
VLLM_API_URL = os.environ.get("VLLM_API_URL", "http://localhost:8004/v1")
VLLM_API_KEY = os.environ.get("VLLM_API_KEY", "EMPTY")

# Module-wide OpenAI-compatible client bound to the vLLM endpoint and key
# defined in the configuration constants.
client = OpenAI(api_key=VLLM_API_KEY, base_url=VLLM_API_URL)

# ===========================
# INFERENCE FUNCTION
# ===========================
def _parse_model_output(output_text):
    """Turn the raw completion text into a Python object when possible.

    Returns the parsed object on success, the (reasoning-stripped) text when
    it cannot be parsed, and None when the server returned no content.
    """
    if output_text is None:
        return None

    # Keep only what follows the LAST closing tag: the answer comes after the
    # reasoning, and the reasoning itself may mention the tag.
    if "</think>" in output_text:
        output_text = output_text.rsplit("</think>", 1)[-1]

    # Drop Markdown code fences the model may wrap around the JSON.
    clean_text = output_text.strip().replace("```json", "").replace("```", "").strip()

    # The prompt asks for JSON, so parse it as JSON first: this handles
    # null/true/false and decodes \uXXXX surrogate pairs correctly.
    try:
        return json.loads(clean_text)
    except (ValueError, RecursionError):
        pass  # not strict JSON; try the Python-literal form below

    # Fallback for replies written as a Python literal (e.g. single quotes).
    try:
        return ast.literal_eval(clean_text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Best effort: hand the unparsed text back so the caller can store it.
        return output_text


def infer_reasonableness(
    reference_summary: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """
    Ask the local vLLM server whether handling of one subclaim is reasonable.

    Args:
        reference_summary: Reference summary inserted into the prompt.
        generated_summary: Generated summary inserted into the prompt.
        readability_level: Readability level inserted into the prompt.
        subclaim_text: The single subclaim being judged.
        result: Inserted into the prompt; the prompt describes it as
            1 = supported (included), 0 = omitted.

    Returns:
        The parsed model reply (the prompt requests an object with
        "reasonableness" and "justification" keys). If the reply cannot be
        parsed, the reply text after any "</think>" marker is returned
        instead; if the server returned no content, None is returned.

    Raises:
        Whatever the client call raises (e.g. connection errors); only
        parsing failures are handled here.
    """

    # ---- Build inference prompt ----
    prompt = f"""
You are an impartial medical summarization evaluator.

Goal:
Decide whether the inclusion or omission of ONE specific subclaim from the reference summary is *reasonable*, given the readability level of the generated summary.

Readability Criteria:
- Easy: for non-medical readers; emphasize main story and outcomes; omit numerical data, anatomy, and test details.
- Intermediate: for general educated readers; keep main findings but simplify phrasing.
- Hard: for clinical or technical readers; maintain diagnostic accuracy and essential quantitative or anatomic content.

Judging rules:
* Base your decision strictly on what appears in the generated summary.
* If result = 0 (subclaim omitted) and the omitted detail is clearly technical or numerical for the given level, choose "reasonable".
* If result = 0 and the subclaim is essential to the main story, choose "unreasonable".
* Stay consistent between `result`, justification, and readability level.

### Inputs
Readability Level: {readability_level}
Reference Summary: {reference_summary}
Generated Summary: {generated_summary}
Subclaim: "{subclaim_text}"
Result: {result}   # 1 = supported (included), 0 = omitted

### Task
Respond **only** with the following JSON object:

{{
  "reasonableness": "<reasonable | partially_reasonable | unreasonable>",
  "justification": "<short clear explanation>"
}}
""".strip()

    # ---- Call vLLM Server ----
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )

    # ---- Clean and parse output (handles thinking tags & Markdown) ----
    return _parse_model_output(response.choices[0].message.content)


# ===========================
# MAIN EXECUTION
# ===========================
def _write_json_atomic(path, payload):
    """Dump payload as UTF-8 JSON to path without leaving a half-written file."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
    # Swap the finished file into place so an interrupted run cannot
    # truncate the checkpoint that the resume logic reads.
    os.replace(tmp_path, path)


def _evaluate_item(item):
    """Build the per-readability-level completeness record for one dataset item."""
    reference_summary = item['summary']
    completeness = {}
    for label in ['easy', 'intermediate', 'hard']:
        generated_summary = item[f'{label}_text']
        subclaim_list = item['metrics'][f'{label}']['completeness']['details']
        results = []
        for subclaim in subclaim_list:
            # Each entry is a dict; its 'label' field holds the support status.
            if subclaim['label'] == 'supported':
                # Included subclaims need no model call.
                output = {
                    'reasonableness': 'reasonable',
                    'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                }
            else:
                # Anything not marked 'supported' is judged as an omission.
                output = infer_reasonableness(
                    reference_summary=reference_summary,
                    generated_summary=generated_summary,
                    readability_level=label,
                    subclaim_text=subclaim['subclaim'],
                    result=0,
                )
            results.append({
                'subclaim': subclaim['subclaim'],
                'output': output
            })
        completeness[label] = {
            'results': results
        }
    return {
        'id': item['id'],
        'completeness': completeness
    }


def main():
    """Evaluate every item of the input JSON file, resuming from saved results.

    Results are written to <save_dir>/<basename of --data_path>; items whose
    id is already present in that file are skipped.
    """
    import argparse
    import tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    parser.add_argument("--save_dir", type=str,
                        default='/home/mshahidul/readctrl/data/completeness_resoning_result',
                        help="Directory where the result JSON is written.")
    args = parser.parse_args()
    data_path = args.data_path
    file_name = os.path.basename(data_path)

    # Open file directly (will raise FileNotFoundError if missing).
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    os.makedirs(args.save_dir, exist_ok=True)
    save_path = os.path.join(args.save_dir, file_name)
    full_results = []
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            full_results = json.load(f)

    # Set of finished ids: constant-time resume check instead of rescanning
    # the whole result list for every item.
    done_ids = {entry['id'] for entry in full_results}

    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue
        full_results.append(_evaluate_item(item))
        done_ids.add(item['id'])
        # Checkpoint every 10 accumulated results.
        if len(full_results) % 10 == 0:
            _write_json_atomic(save_path, full_results)

    _write_json_atomic(save_path, full_results)


# ===========================
# MAIN EXECUTION
# ===========================
if __name__ == "__main__":
    main()