# File size: 7,961 Bytes

# 9c6961c

import json
import sys
from openai import OpenAI
import ast,os
# ===========================
# CONFIGURATION
# ===========================
# Filesystem path of the merged checkpoint; passed verbatim as the `model`
# field of every chat-completion request.
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged"

# Base URL of the OpenAI-compatible endpoint (a vLLM server on this machine).
VLLM_API_URL = "http://localhost:8004/v1"

# NOTE(review): presumably a placeholder because the local server does not
# check credentials -- confirm against the server's launch flags.
VLLM_API_KEY = "EMPTY"

# One module-level client, reused by every inference call below.
client = OpenAI(api_key=VLLM_API_KEY, base_url=VLLM_API_URL)

# ===========================
# INFERENCE FUNCTION
# ===========================
def _parse_model_output(raw_text):
    """Turn the raw completion text into a Python object when possible.

    Steps:
      1. Drop everything up to and including the last ``</think>`` marker.
      2. Strip Markdown code fences (```` ```json ```` / ```` ``` ````).
      3. Try ``json.loads`` and then ``ast.literal_eval`` on the cleaned
         text, and -- if that fails -- on the outermost ``{...}`` slice.

    Returns the parsed object on success. On failure returns the text that
    followed the reasoning block, unparsed, so the caller can still store
    it. Returns ``None`` when ``raw_text`` is ``None``.
    """
    if raw_text is None:
        return None

    # Keep only the final answer: text after the *last* reasoning marker.
    if "</think>" in raw_text:
        raw_text = raw_text.rpartition("</think>")[2]

    cleaned = raw_text.strip().replace("```json", "").replace("```", "").strip()

    candidates = [cleaned]
    # Second chance: the model may wrap the JSON object in extra prose.
    first_brace = cleaned.find("{")
    last_brace = cleaned.rfind("}")
    if 0 <= first_brace < last_brace:
        sliced = cleaned[first_brace:last_brace + 1]
        if sliced != cleaned:
            candidates.append(sliced)

    for candidate in candidates:
        # Strict JSON first: handles true/false/null, which literal_eval
        # rejects.
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            pass
        # Fallback for Python-literal style output (e.g. single quotes).
        try:
            return ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass

    # Best effort: hand back the unparsed text rather than raising.
    return raw_text


def infer_reasonableness(
    fulltext: str,
    generated_summary: str,
    readability_level: str,
    subclaim_text: str,
    result: int,
):
    """
    Ask the model served at ``VLLM_API_URL`` whether a subclaim is a
    reasonable addition for the given readability level.

    Args:
        fulltext: Reference text inserted under "Reference Full Text".
        generated_summary: Summary inserted under "Generated Summary".
        readability_level: Level name inserted into the prompt
            (the caller passes 'easy', 'intermediate' or 'hard').
        subclaim_text: The subclaim to judge.
        result: Support flag inserted into the prompt
            (1 = supported, 0 = unsupported).

    Returns:
        The parsed model answer (a dict when the model follows the requested
        JSON format). If the answer cannot be parsed, the raw text following
        any ``</think>`` block is returned instead. ``None`` is returned
        when the response carries no text content.

    Raises:
        Errors raised by the OpenAI client (connection/API failures) are not
        caught here. Parsing failures never raise.
    """

    # ---- Build inference prompt ----
    # NOTE(review): the task instructions list the labels "reasonable
    # addition" / "unnecessary but harmless" / "misleading / hallucinated",
    # while the output format asks for reasonable | partially_reasonable |
    # unreasonable. The text is kept byte-identical on purpose: the model
    # name suggests it was fine-tuned on this prompt -- confirm before editing.
    prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will assess whether the **unsupported subclaim** in a generated summary (when `"result": 0"`) is a *reasonable addition* given the readability level (*easy / intermediate / hard*).

The goal is to decide whether this **extra piece of information** is an acceptable simplification or a *hallucination* that reduces factual faithfulness.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Linguistic & Stylistic Profile | Content Goal | Allowable Additions |
| :-- | :-- | :-- | :-- | :-- |
| **Easy (FH 70–100, grade 5–7)** | General public; early secondary readers | Short, direct sentences using common vocabulary and concrete ideas. Avoid subordinate clauses and technical terms. Tone should be explanatory, lively, and highly accessible. | Simplify and clarify events and outcomes without introducing technical or diagnostic details. | General background context or plain-language explanations are acceptable; **no new facts, data, or inferred medical claims.** |
| **Intermediate (FH 50–69, grade 8–12)** | Educated layperson / medical student | Moderate sentence length and complexity. Vocabulary suitable for high-school or introductory science readers. May include limited domain terms with brief clarification. | Present essential medical content with clear logic and limited detail, ensuring readability for non-experts. | Brief clarifications, definitions, or causal links consistent with the source are allowed; **avoid speculative or unconfirmed data.** |
| **Hard (FH 0–49, university / professional)** | Medical professionals / technical audience | Long, multi-clause sentences; formal academic tone. Incorporate precise domain vocabulary, causal and analytical connectors (e.g., *por consiguiente*, *sin embargo*, *en virtud de*, *dado que*), at least one definition, one process description, and one statement of implications or challenges. | Preserve full factual accuracy, diagnostic precision, and interpretive nuance expected in professional discourse. | Additions are **not permitted**; every statement must be directly supported by the reference text. Parenthetical clarifications or relative clauses may be used for cohesion, not new content. |

---

### **Input**

```
Readability Level: {readability_level}

Reference Full Text:
{fulltext}

Generated Summary:
{generated_summary}

Subclaim: "{subclaim_text}"
Result: {result}   # 1 = supported (included), 0 = unsupported
```

---

### **TASK INSTRUCTIONS**

If `"result": 0"`, judge whether including this subclaim is **reasonable** for the given readability level.  
Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.  
Provide a **1–2 sentence justification** describing your reasoning.

---

### **Output Format**

Return structured JSON:

```json
{{
  "evaluation": {{
      "reasonableness": "<reasonable | partially_reasonable | unreasonable>",
      "justification": "<short explanation>"
  }}
}}
```
""".strip()

    messages = [{"role": "user", "content": prompt}]

    # ---- Call vLLM Server ----
    # NOTE(review): max_tokens=200 also has to cover any <think> block the
    # model emits; a truncated answer will come back as unparsed text.
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.2,
        max_tokens=200,
        top_p=0.8,
    )

    # ---- Clean and parse output (thinking block, Markdown fences, JSON) ----
    return _parse_model_output(response.choices[0].message.content)


# ===========================
# MAIN EXECUTION
# ===========================
def _save_results(path, results):
    """Write ``results`` to ``path`` as UTF-8 JSON without corrupting it.

    The data is written to ``<path>.tmp`` first and then moved over the
    target with ``os.replace``, so an interrupted run leaves the previous
    checkpoint intact instead of a half-written file.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


def main():
    """Run the reasonableness check over a dataset file, resumably.

    Reads the JSON list given by ``--data_path``. For every item and every
    readability level it judges each attribution subclaim, and stores the
    results under ``--output_dir`` in a file with the same base name as the
    input. Items whose ``id`` already appears in an existing output file are
    skipped, so an interrupted run can be restarted.
    """
    import argparse
    import tqdm

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True,
                        help="Path to the JSON file containing evaluation data.")
    parser.add_argument("--output_dir", type=str,
                        default='/home/mshahidul/readctrl/data/attribution_reasoning_result/',
                        help="Directory where the result file is written.")
    args = parser.parse_args()
    data_path = args.data_path
    file_name = os.path.basename(data_path)

    # Raises FileNotFoundError if the input is missing. UTF-8 is explicit so
    # reading does not depend on the locale's default encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    os.makedirs(args.output_dir, exist_ok=True)
    save_path = os.path.join(args.output_dir, file_name)

    # Resume support: load earlier results and remember which ids are done.
    full_results = []
    if os.path.exists(save_path):
        with open(save_path, 'r', encoding='utf-8') as f:
            full_results = json.load(f)
    # Set lookup replaces a scan of all previous results for every item.
    # Assumes ids are hashable (str/int) -- TODO confirm against the data.
    done_ids = {entry['id'] for entry in full_results}

    for item in tqdm.tqdm(dataset):
        if item['id'] in done_ids:
            continue
        fulltext = item['fulltext']
        per_level = {}
        for label in ['easy', 'intermediate', 'hard']:
            generated_summary = item[f'{label}_text']
            subclaim_list = item['metrics'][f'{label}']['attribution']['details']
            rows = []
            for subclaim in subclaim_list:
                if subclaim['label'] == 'supported':
                    # Supported subclaims need no model call.
                    # NOTE(review): this dict is flat, whereas the prompt
                    # asks the model to nest the same two keys under
                    # "evaluation" -- downstream readers must handle both.
                    output = {
                        'reasonableness': 'reasonable',
                        'justification': 'The subclaim is included in the generated summary, hence it is reasonable.'
                    }
                else:
                    # Any label other than 'supported' counts as
                    # unsupported (result=0).
                    output = infer_reasonableness(
                        fulltext=fulltext,
                        generated_summary=generated_summary,
                        readability_level=label,
                        subclaim_text=subclaim['subclaim'],
                        result=0,
                    )
                rows.append({
                    'subclaim': subclaim['subclaim'],
                    'output': output
                })

            per_level[label] = {
                'results': rows
            }
        # NOTE(review): the key is 'completeness' although the data comes
        # from the 'attribution' metric; kept as-is for compatibility with
        # existing result files.
        full_results.append({
            'id': item['id'],
            'completeness': per_level
        })
        done_ids.add(item['id'])
        # Checkpoint every 10 accumulated results.
        if len(full_results) % 10 == 0:
            _save_results(save_path, full_results)

    _save_results(save_path, full_results)


if __name__ == "__main__":
    main()