File size: 6,983 Bytes

1db7196

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

import json
import torch
from unsloth import FastLanguageModel
import tqdm


_model_cache = {"model": None, "tokenizer": None}

def load_finetuned_model(model_path: str):
    """Load and cache the fine-tuned model + tokenizer."""
    if _model_cache["model"] is not None:
        return _model_cache["model"], _model_cache["tokenizer"]

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"], _model_cache["tokenizer"] = model, tokenizer
    return model, tokenizer


def build_inference_prompt(
    reference_full_text,
    generated_summary,
    subclaim_id,
    subclaim_text,
    subclaim_result,
    difficulty_level
):
    """
    Build a standardized inference prompt for single‑subclaim evaluation.
    Use after fine‑tuning to assess new examples consistently.
    """

    inference_prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will analyze one subclaim from a generated medical summary.

Each subclaim includes a `"result"` flag:
- `1` → Supported by the reference text (no reasonableness check required)
- `0` → Unsupported by the reference text (evaluate scope and validity)

Your task is to decide, for unsupported subclaims, whether the new information
is a *reasonable addition* given the specified readability level:
**easy**, **intermediate**, or **hard**.

---

### **READABILITY GUIDELINES**

| Level | Audience | Style | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Simple, concrete | Broad clarifications only; no factual innovations |
| **Intermediate (FH 50–69)** | Educated nonspecialist | Moderate precision | Limited clarifications consistent with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical | Must be strictly supported by evidence |

---

### **INPUT**

Readability Level: {difficulty_level}

Reference Full Text:
{reference_full_text}

Generated Summary:
{generated_summary}

Subclaim Info:
{{
  "subclaim_id": {subclaim_id},
  "subclaim": "{subclaim_text}",
  "result": {subclaim_result}
}}

---

### **TASK INSTRUCTIONS**

- If `"result": 1"`, respond with `"not_applicable"` and justify briefly
  (e.g., *"supported, no evaluation required"*).
- If `"result": 0"`, classify reasonableness:
  - `"reasonable"` → legitimate simplification consistent with the readability level
  - `"partially_reasonable"` → benign rephrasing
  - `"unreasonable"` → misleading, speculative, or contradicted by the source

Provide a **short 1–2 sentence justification**.

---

### **EXPECTED OUTPUT (JSON ONLY)**

```json
{{
  "evaluation": {{
    "subclaim_id": {subclaim_id},
    "subclaim": "{subclaim_text}",
    "result": {subclaim_result},
    "reasonableness": "<reasonable | partially_reasonable | unreasonable | not_applicable>",
    "justification": "<brief justification>"
  }}
}}
""".strip()

    return inference_prompt
def infer_attribution_reasonableness(prompt: str, model_path: str):
    """Run inference using the fine-tuned model with attribution prompt."""
    model, tokenizer = load_finetuned_model(model_path)

    messages = [{"role": "user", "content": prompt + "\n"}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.2,
            top_p=0.8,
            top_k=5,
            do_sample=False,
        )

    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1].strip().replace("```json", "").replace("```", "")

    try:
        parsed = json.loads(output_text)
    except Exception:
        parsed = output_text
    return parsed


file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/attribution_resonability_results_100_qwen3-32B_v2.json"

with open(file_synth, 'r') as f:
    synthetic_data = json.load(f)
with open(file_qwen_results, 'r') as f:
    qwen3_32B_results = json.load(f) 
dict1={}
for item in qwen3_32B_results:
    version=item['version']
    dict1[(item['id'], version)] = item['attribution']['results']
 
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
print(f"🔁 Resuming from {len(res)} entries")

existing = set((e["id"], e["difficulty_level"]) for e in res)

for ind in tqdm.tqdm(range(0, 100)):
    entry = synthetic_data[ind]
    
    for level in ["easy", "intermediate", "hard"]:
        subclaims_results = dict1[(entry["id"], level)]
        if (entry["id"], level) in existing:
            print(f"⏭️ Skipping {entry['id']} ({level})")
            continue

        ref_full_text = entry["full_text"]
        generated_summary = entry["readability_versions"][level]["text"]
        temp=[]
        for subclaim in subclaims_results:
            subclaim_id = subclaim['subclaim']['id']
            subclaim_text = subclaim['subclaim']['subclaim']
            subclaim_result = subclaim['result']
            prompt = build_inference_prompt(
                ref_full_text,
                generated_summary,
                subclaim_id,
                subclaim_text,
                subclaim_result,
                level
            )
            if subclaim_result=="1":
                temp.append({
                "subclaim_id": subclaim_id,
                "subclaim_text": subclaim_text,
                "response": "not_applicable"
            })
                continue
            response = infer_attribution_reasonableness(prompt,"/home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1")
            temp.append({
                "subclaim_id": subclaim_id,
                "subclaim_text": subclaim_text,
                "response": response
            })
        res.append({
            "id": entry["id"],
            "difficulty_level": level,
            "results": temp
        })
        if len(res) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(res, f, indent=2, ensure_ascii=False)
            print(f"💾 Saved after {len(res)} entries")

with open(save_path, 'w') as f:
    json.dump(res, f, indent=2, ensure_ascii=False)