import os

# GPU selection must happen before torch/unsloth are imported (CUDA reads these
# at initialization): PCI_BUS_ID makes indices match nvidia-smi, and only
# physical GPU 4 is exposed (seen inside the process as cuda:0 / "cuda").
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"


import json
import torch
from unsloth import FastLanguageModel
import tqdm




# Process-wide singleton holding the loaded model/tokenizer pair; populated
# lazily by load_finetuned_model() so the weights are read from disk only once.
_model_cache = {"model": None, "tokenizer": None}
|
|
def load_finetuned_model(model_path: str):
    """Return the fine-tuned model and tokenizer, loading them at most once.

    The pair is memoized in the module-level ``_model_cache``; subsequent
    calls reuse the already-loaded weights instead of reloading from disk.
    """
    cached = _model_cache["model"]
    if cached is not None:
        return cached, _model_cache["tokenizer"]

    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=8192,
        load_in_4bit=False,
        load_in_8bit=False,
        full_finetuning=False,
    )
    _model_cache["model"] = loaded_model
    _model_cache["tokenizer"] = loaded_tokenizer
    return loaded_model, loaded_tokenizer
|
|
|
|
def build_inference_prompt(
    reference_full_text,
    generated_summary,
    subclaim_id,
    subclaim_text,
    subclaim_result,
    difficulty_level
):
    """Build the standardized single-subclaim evaluation prompt.

    Fixes vs. the original template: the task-instruction examples had
    unbalanced quotes (`"result": 1"`), and the ```json output fence was
    never closed — both could confuse the evaluated model.

    Args:
        reference_full_text: Source document the summary was generated from.
        generated_summary: The summary whose subclaim is being judged.
        subclaim_id: Identifier of the subclaim within the summary.
        subclaim_text: The subclaim sentence itself. It is interpolated into
            a JSON-like snippet, so it should not contain unescaped double
            quotes (not validated here).
        subclaim_result: Verifier flag — 1 means supported, 0 unsupported.
        difficulty_level: One of "easy", "intermediate", "hard".

    Returns:
        The fully rendered prompt string, stripped of surrounding whitespace.
    """
    inference_prompt = f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will analyze one subclaim from a generated medical summary.

Each subclaim includes a `"result"` flag:
- `1` → Supported by the reference text (no reasonableness check required)
- `0` → Unsupported by the reference text (evaluate scope and validity)

Your task is to decide, for unsupported subclaims, whether the new information
is a *reasonable addition* given the specified readability level:
**easy**, **intermediate**, or **hard**.

---

### **READABILITY GUIDELINES**

| Level | Audience | Style | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Simple, concrete | Broad clarifications only; no factual innovations |
| **Intermediate (FH 50–69)** | Educated nonspecialist | Moderate precision | Limited clarifications consistent with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical | Must be strictly supported by evidence |

---

### **INPUT**

Readability Level: {difficulty_level}

Reference Full Text:
{reference_full_text}

Generated Summary:
{generated_summary}

Subclaim Info:
{{
  "subclaim_id": {subclaim_id},
  "subclaim": "{subclaim_text}",
  "result": {subclaim_result}
}}

---

### **TASK INSTRUCTIONS**

- If `"result": 1`, respond with `"not_applicable"` and justify briefly
  (e.g., *"supported, no evaluation required"*).
- If `"result": 0`, classify reasonableness:
  - `"reasonable"` → legitimate simplification consistent with the readability level
  - `"partially_reasonable"` → benign rephrasing
  - `"unreasonable"` → misleading, speculative, or contradicted by the source

Provide a **short 1–2 sentence justification**.

---

### **EXPECTED OUTPUT (JSON ONLY)**

```json
{{
  "evaluation": {{
    "subclaim_id": {subclaim_id},
    "subclaim": "{subclaim_text}",
    "result": {subclaim_result},
    "reasonableness": "<reasonable | partially_reasonable | unreasonable | not_applicable>",
    "justification": "<brief justification>"
  }}
}}
```
""".strip()

    return inference_prompt
def infer_attribution_reasonableness(prompt: str, model_path: str):
    """Run the fine-tuned model on *prompt* and return its verdict.

    Args:
        prompt: Fully rendered evaluation prompt (see build_inference_prompt).
        model_path: Path of the fine-tuned checkpoint to load (cached after
            the first call via load_finetuned_model).

    Returns:
        The parsed JSON object when the model emits valid JSON, otherwise
        the raw decoded string so callers can still persist the response.
    """
    model, tokenizer = load_finetuned_model(model_path)

    messages = [{"role": "user", "content": prompt + "\n"}]

    chat_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(chat_text, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            # Greedy decoding; temperature/top_p/top_k are ignored (and warn)
            # when do_sample=False, so they are deliberately omitted.
            do_sample=False,
        )

    # Decode only the newly generated tokens — the original decoded the full
    # sequence including the echoed prompt and relied on the "</think>" split
    # to discard it.
    prompt_len = inputs["input_ids"].shape[1]
    output_text = tokenizer.decode(
        output_ids[0][prompt_len:], skip_special_tokens=True
    ).strip()

    # Drop any residual reasoning block, then strip markdown code fences
    # unconditionally (previously fences were only removed when a "</think>"
    # tag happened to be present) so the remainder is bare JSON.
    if "</think>" in output_text:
        output_text = output_text.split("</think>")[-1]
    output_text = output_text.replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(output_text)
    except Exception:
        parsed = output_text
    return parsed
|
|
|
|
# Inputs: synthetic dataset + prior subclaim-verifier results.
# Output: per-subclaim attribution reasonableness verdicts.
# NOTE(review): "resonability" in the paths is a long-standing typo; renaming
# it would orphan previously saved results that the resume logic below reads.
file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/attribution_resonability_results_100_qwen3-32B_v2.json"


with open(file_synth, 'r') as f:
    synthetic_data = json.load(f)
with open(file_qwen_results, 'r') as f:
    qwen3_32B_results = json.load(f)
# Index verifier results by (entry id, readability version) for O(1) lookup.
dict1={}
for item in qwen3_32B_results:
    version=item['version']
    dict1[(item['id'], version)] = item['attribution']['results']

# Resume support: reload any previously saved output so re-runs skip
# (id, level) pairs that were already completed.
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"🔁 Resuming from {len(res)} entries")


existing = set((e["id"], e["difficulty_level"]) for e in res)


# Evaluate the first 100 synthetic entries at every readability level.
for ind in tqdm.tqdm(range(0, 100)):
    entry = synthetic_data[ind]

    for level in ["easy", "intermediate", "hard"]:
        # NOTE(review): this lookup runs before the skip check, so a missing
        # (id, level) key raises KeyError even for already-completed entries.
        subclaims_results = dict1[(entry["id"], level)]
        if (entry["id"], level) in existing:
            print(f"⏭️ Skipping {entry['id']} ({level})")
            continue


        ref_full_text = entry["full_text"]
        generated_summary = entry["readability_versions"][level]["text"]
        temp=[]
        for subclaim in subclaims_results:
            subclaim_id = subclaim['subclaim']['id']
            subclaim_text = subclaim['subclaim']['subclaim']
            subclaim_result = subclaim['result']
            prompt = build_inference_prompt(
                ref_full_text,
                generated_summary,
                subclaim_id,
                subclaim_text,
                subclaim_result,
                level
            )
            # Supported subclaims need no model call; record them directly.
            # NOTE(review): assumes 'result' is the STRING "1" — an integer 1
            # would fall through to the model; confirm against the data file.
            if subclaim_result=="1":
                temp.append({
                    "subclaim_id": subclaim_id,
                    "subclaim_text": subclaim_text,
                    "response": "not_applicable"
                })
                continue
            response = infer_attribution_reasonableness(prompt,"/home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1")
            temp.append({
                "subclaim_id": subclaim_id,
                "subclaim_text": subclaim_text,
                "response": response
            })
        res.append({
            "id": entry["id"],
            "difficulty_level": level,
            "results": temp
        })
        # Checkpoint every 10 completed (entry, level) pairs so a crash
        # loses at most 10 results.
        if len(res) % 10 == 0:
            with open(save_path, 'w') as f:
                json.dump(res, f, indent=2, ensure_ascii=False)
            print(f"💾 Saved after {len(res)} entries")


# Final write so the last partial batch (len(res) % 10 != 0) is persisted.
with open(save_path, 'w') as f:
    json.dump(res, f, indent=2, ensure_ascii=False)
|
|
|
|
|
|