{ "cells": [ { "cell_type": "markdown", "id": "17dc3d7c", "metadata": {}, "source": [ "# subclaim completeness calculation and reasoning combine" ] }, { "cell_type": "code", "execution_count": null, "id": "aa44fafa", "metadata": {}, "outputs": [], "source": [ "import json\n", "with open('/home/mshahidul/readctrl/results/dataset_quality_check/completeness_resonability_check_100_qwen3-32B_v3.json', 'r') as f2:\n", " data2 = json.load(f2)\n", " print(data2[0].keys())" ] }, { "cell_type": "code", "execution_count": 4, "id": "5a7286ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['id', 'fulltext', 'summary'])\n" ] } ], "source": [ "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_80_full.json\n", "import json\n", "with open('/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_en.json', 'r') as f1:\n", " data1 = json.load(f1)\n", " print(data1[0].keys())\n", "dat={}\n", "for idx,x in enumerate(data1):\n", " dat[idx]=x['summary']" ] }, { "cell_type": "code", "execution_count": 6, "id": "e462205b", "metadata": {}, "outputs": [], "source": [ "# /home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/code/correction_evaluation_full_text.json\n", "with open('/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/code/correction_evaluation_full_text.json', 'r') as f3:\n", " data3 = json.load(f3)\n", "full_data=[]\n", "for item in data3:\n", " item['summary']=dat[item['doc_id']]\n", " full_data.append(item)" ] }, { "cell_type": "code", "execution_count": 8, "id": "051025fa", "metadata": {}, "outputs": [], "source": [ "with open('/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/code/correction_evaluation_full_text_with_gs.json', 'w') as f4:\n", " json.dump(full_data, f4, indent=4)" ] }, { "cell_type": "code", "execution_count": null, "id": "db70aadb", "metadata": {}, "outputs": [], "source": [ "reason_info = {}\n", "another_info = {}\n", "for item in data2:\n", " id = item['id']\n", " difficulty_level = item['version']\n", " data_temp = item['completeness']\n", " another_info[(id, difficulty_level)] = item['completeness']['results']\n", " for _data in data_temp['results']:\n", " reasonableness = _data['reasonableness']\n", " \n", " # Step 1: Try to parse as JSON\n", " if isinstance(reasonableness, str):\n", " parsed = None\n", " try:\n", " parsed = json.loads(reasonableness)\n", " except Exception:\n", " try:\n", " parsed = ast.literal_eval(reasonableness)\n", " except Exception:\n", " # Not JSON or dict — treat as plain text\n", " if \"'reasonable'\" in reasonableness:\n", " parsed = {\"reasonableness\": \"reasonable\", \"justification\": reasonableness}\n", " elif \"'unreasonable'\" in reasonableness:\n", " parsed = {\"reasonableness\": \"unreasonable\", \"justification\": reasonableness}\n", " else:\n", " parsed = {\"reasonableness\": \"unknown\", \"justification\": reasonableness}\n", " reasonableness = parsed\n", "\n", " # Step 2: Skip if \"reasonable\"\n", " key = (id, difficulty_level,_data['id'])\n", "\n", " if reasonableness.get('reasonableness') in [\"reasonable\"]:\n", " reason_info[key] = 1 \n" ] }, { "cell_type": "code", "execution_count": null, "id": "bed762d5", "metadata": {}, "outputs": [], "source": [ "import json\n", "full_results = []\n", "with open('/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json', 'r') as f:\n", " data = json.load(f)\n", " print(data[0].keys())\n", "success = 0\n", "accuracy_info={}\n", "for entry in data:\n", " id= entry['id']\n", " difficulty_level = entry['version']\n", " success = 0\n", " temp=[]\n", " for item in entry['completeness']['results']:\n", " flag=0 \n", " sub_claim_id = item['subclaim']['id']\n", " sub_claim=item['subclaim']['subclaim']\n", " if item['result']==\"1\":\n", " flag=1\n", " success+=1\n", " elif item['result']==\"0\":\n", " key = (id, difficulty_level, sub_claim_id)\n", " if key in reason_info and reason_info[key]==1:\n", " success+=reason_info[key]\n", " flag=1\n", " if flag==1:\n", " temp.append({\n", " \"subclaim_id\": sub_claim_id,\n", " \"subclaim\": sub_claim,\n", " \"supported\": True,\n", " })\n", " else:\n", " temp.append({\n", " \"subclaim_id\": sub_claim_id,\n", " \"subclaim\": sub_claim,\n", " \"supported\": False,\n", " })\n", " full_results.append({\n", " \"id\": id,\n", " \"version\": difficulty_level,\n", " \"completeness\": temp,\n", " \"accuracy\": success/len(entry['completeness']['results'])\n", " })\n", " accuracy_info[(id,difficulty_level)] = success/len(entry['completeness']['results'])" ] }, { "cell_type": "code", "execution_count": null, "id": "af8bd071", "metadata": {}, "outputs": [], "source": [ "# full_results\n", "with open('/home/mshahidul/readctrl/results/dataset_quality_check/completeness_final_subclaim_verifier_results_100_v1.json', 'w') as f:\n", " json.dump(full_results, f, indent=4)" ] }, { "cell_type": "code", "execution_count": null, "id": "95f0c872", "metadata": {}, "outputs": [], "source": [ "accuracy_calcs = {}\n", "item_num={}\n", "for version in ['easy','intermediate','hard']:\n", " for key, value in accuracy_info.items():\n", " if key[1]==version:\n", " accuracy_calcs[version] = accuracy_calcs.get(version, 0) + value\n", " item_num[version] = item_num.get(version, 0) + 1\n", " accuracy_calcs[version] = accuracy_calcs[version]/item_num[version]\n", "print(accuracy_calcs)" ] }, { "cell_type": "code", "execution_count": null, "id": "3ffeac9c", "metadata": {}, "outputs": [], "source": [ "res={\"easy\":[],\"intermediate\":[],\"hard\":[]}\n", "\n", "for entry in full_results:\n", " difficulty = entry['version']\n", " for item in entry['completeness']:\n", " res[difficulty].append(int(item['supported']))" ] }, { "cell_type": "code", "execution_count": null, "id": "36a1dda6", "metadata": {}, "outputs": [], "source": [ "print(f\"easy: {sum(res['easy'])/len(res['easy']):.4f}\")\n", "print(f\"intermediate: {sum(res['intermediate'])/len(res['intermediate']):.4f}\")\n", "print(f\"hard: {sum(res['hard'])/len(res['hard']):.4f}\")" ] }, { "cell_type": "markdown", "id": "2a7f857c", "metadata": {}, "source": [ "## reasonability model performance check using chatgpt" ] }, { "cell_type": "code", "execution_count": null, "id": "90c4aee1", "metadata": {}, "outputs": [], "source": [ "prompt='''\n", "You will act as a judge. I received an answer from my model using the prompt below. some subclaims were omitted in the generated summary compared to the reference summary based on readability label. I already calculated reasoning behind the omission of each subclaim. Now please evaluate whether the reasoning is good or not.\n", "\"\n", "def return_prompts(reference_summary, generated_summary, subclaims_json, difficulty_level):\n", " prompt=f\n", "You are a **medical summarization quality evaluator**.\n", "Your goal is to decide whether the inclusion or omission of each subclaim in the generated summary is *reasonable*, given the target readability level.\n", "\n", "---\n", "\n", "### **Input**\n", "\n", "```\n", "Readability Level: {difficulty_level}\n", "\n", "Reference Summary:\n", "{reference_summary}\n", "\n", "Generated Summary:\n", "{generated_summary}\n", "\n", "Subclaims with Support Results:\n", "{subclaims_json}\n", "```\n", "\n", "---\n", "\n", "### **Task**\n", "\n", "For each subclaim:\n", "\n", "1. Read `result`:\n", "\n", " * `1` = the subclaim is supported or clearly mentioned in the generated summary.\n", " * `0` = the subclaim is missing or not supported.\n", "\n", "2. Based on readability level and medical relevance, decide whether this inclusion/omission is **reasonable**, **partially reasonable**, or **unreasonable**.\n", "\n", "3. Provide a short justification (1–2 sentences) explaining your reasoning.\n", "\n", "---\n", "\n", "### **Output Format**\n", "\n", "Return structured JSON:\n", "\n", "```json\n", "{{\n", " \"readability_level\": \"\",\n", " \"evaluations\": [\n", " {{\n", " \"subclaim_id\": ,\n", " \"subclaim_text\": \"\",\n", " \"result\": <0 or 1>,\n", " \"reasonableness\": \"\",\n", " \"justification\": \"\"\n", " }},\n", " ...\n", " ]\n", "}}\n", "```\n", "\n", "---\n", "\n", "### **Evaluation Guidelines**\n", "\n", "| Readability Level | Reasonable Omission | Unreasonable Omission |\n", "| ----------------- | ------------------------------------------------------------ | ------------------------------------------------- |\n", "| **Easy** | Technical, anatomical, quantitative, or procedural details. | Key clinical findings, diagnoses, or outcomes. |\n", "| **Intermediate** | Minor imaging details or measurements. | Any main diagnostic finding or cause–effect link. |\n", "| **Hard** | Very few omissions acceptable; mostly stylistic compression. | Any missing clinical or diagnostic information. |\n", "\n", "\n", "\"\n", "\n", "Please evaluate how good my model’s performance is and whether it performed well or not.\n", "'''" ] }, { "cell_type": "code", "execution_count": null, "id": "569d50f1", "metadata": {}, "outputs": [], "source": [ "import json\n", "file_path = \"/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json\"\n", "\n", "with open(file_path, 'r') as f:\n", " synthetic_data = json.load(f)\n", "\n", "file_path_qwen3_32B = \"/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json\"\n", "\n", "with open(file_path_qwen3_32B, 'r') as f:\n", " qwen3_32B_results = json.load(f)\n", "\n", "\n", "ind=1\n", "version='hard'\n", "ref_summary = (f\"{synthetic_data[ind]['ref_summary']['text']}\")\n", "generated_summary = (f\"{synthetic_data[ind]['readability_versions'][version]['text']}\")\n", "subclaims_results = (f\"{qwen3_32B_results[ind]['completeness']['results']}\")\n", "print(f\"Version: {version}\")\n", "print(f\"Reference Summary: {ref_summary}\")\n", "print(f\"Generated Summary: {generated_summary}\")\n", "print(f\"Subclaims reasoning Results: {another_info[(synthetic_data[ind]['id'],version)]}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "a470c099", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "cb78bbee", "metadata": {}, "source": [ "## Token length cal" ] }, { "cell_type": "code", "execution_count": null, "id": "fcb7163d", "metadata": {}, "outputs": [], "source": [ "\n", "def return_prompts_attribution(reference_full_text, generated_summary, subclaims_json, difficulty_level):\n", " return f'''\n", "### **SYSTEM / ROLE INSTRUCTION**\n", "\n", "You are a **medical factuality and attribution evaluator**.\n", "You will assess whether **unsupported subclaims** in a generated summary (those with `\"result\": 0\"`) are *reasonable additions* based on the readability level (*easy / intermediate / hard*).\n", "\n", "The goal is to determine whether these **extra pieces of information** are acceptable simplifications or *hallucinations* that reduce factual faithfulness.\n", "\n", "---\n", "\n", "### **READABILITY & ATTRIBUTION GUIDELINES**\n", "\n", "| Level | Audience | Linguistic & Stylistic Profile | Content Goal | Allowable Additions |\n", "| :-- | :-- | :-- | :-- | :-- |\n", "| **Easy (FH 70–100, grade 5–7)** | General public; early secondary readers | Short, direct sentences using common vocabulary and concrete ideas. Avoid subordinate clauses and technical terms. Tone should be explanatory, lively, and highly accessible. | Simplify and clarify events and outcomes without introducing technical or diagnostic details. | General background context or plain-language explanations are acceptable; **no new facts, data, or inferred medical claims.** |\n", "| **Intermediate (FH 50–69, grade 8–12)** | Educated layperson / medical student | Moderate sentence length and complexity. Vocabulary suitable for high-school or introductory science readers. May include limited domain terms with brief clarification. | Present essential medical content with clear logic and limited detail, ensuring readability for non-experts. | Brief clarifications, definitions, or causal links consistent with the source are allowed; **avoid speculative or unconfirmed data.** |\n", "| **Hard (FH 0–49, university / professional)** | Medical professionals / technical audience | Long, multi-clause sentences; formal academic tone. Incorporate precise domain vocabulary, causal and analytical connectors (e.g., *por consiguiente*, *sin embargo*, *en virtud de*, *dado que*), at least one definition, one process description, and one statement of implications or challenges. | Preserve full factual accuracy, diagnostic precision, and interpretive nuance expected in professional discourse. | Additions are **not permitted**; every statement must be directly supported by the reference text. Parenthetical clarifications or relative clauses may be used for cohesion, not new content. |\n", "\n", "---\n", "\n", "### **INPUTS**\n", "\n", "Readability Level: {difficulty_level} \n", "Reference Full Text: {reference_full_text} \n", "Generated Summary: {generated_summary} \n", "Subclaims: {subclaims_json}\n", "\n", "---\n", "\n", "### **TASK INSTRUCTIONS**\n", "\n", "1. Focus only on subclaims with `\"result\": 0\"` (not supported by the input text). \n", "2. For each unsupported subclaim:\n", " * Judge whether adding it is **reasonable** for the given readability level. \n", " * Choose one of: `\"reasonable addition\"`, `\"unnecessary but harmless\"`, `\"misleading / hallucinated\"`. \n", " * Provide a **1–2 sentence justification** explaining your reasoning.\n", "\n", "---\n", "\n", "### **OUTPUT FORMAT (strict JSON)**\n", "\n", "```json\n", "{{\n", " \"reasonableness\": \"\",\n", " \"justification\": \"\"\n", "}}\n", "\n", "'''\n", "import os, json, tqdm\n", "file_path = \"/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json\"\n", "file_path_qwen3_32B = \"/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json\"\n", "save_path = \"/home/mshahidul/readctrl/results/dataset_quality_check/attribution_resonability_check_100_qwen3-32B.json\"\n", "\n", "with open(file_path, 'r') as f:\n", " synthetic_data = json.load(f)\n", "with open(file_path_qwen3_32B, 'r') as f:\n", " qwen3_32B_results = json.load(f)\n", "\n", "\n", "import tiktoken\n", "\n", "def count_tokens_qwen(text: str):\n", " \n", " # fallback: use a generic encoding (not exact)\n", " encoding = tiktoken.get_encoding(\"cl100k_base\")\n", "\n", " token_ids = encoding.encode(text)\n", " return len(token_ids)\n", "\n", "length=0\n", "all_token_lengths = []\n", "for ind in (range(0, 100)):\n", " for version in [\"easy\",\"intermediate\" ,\"hard\"]:\n", "\n", " ref_full_text_summary = synthetic_data[ind]['full_text']\n", " generated_summary = synthetic_data[ind]['readability_versions'][version]['text']\n", " subclaims_results = qwen3_32B_results[ind]['attribution']['results']\n", "\n", " # Convert subclaims JSON nicely\n", " subclaims_json = json.dumps(subclaims_results, indent=2, ensure_ascii=False)\n", "\n", " prompt = return_prompts_attribution(\n", " ref_full_text_summary,\n", " generated_summary,\n", " subclaims_json,\n", " version\n", " )\n", " length=max(length,count_tokens_qwen(prompt))\n", " all_token_lengths.append(length)" ] }, { "cell_type": "code", "execution_count": null, "id": "d67bd288", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(8, 5))\n", "plt.hist(all_token_lengths, bins=30, color='skyblue', edgecolor='black')\n", "plt.title('Distribution of all_token_lengths')\n", "plt.xlabel('Token Length')\n", "plt.ylabel('Frequency')\n", "plt.grid(True, linestyle='--', alpha=0.6)\n", "plt.show()\n" ] }, { "cell_type": "code", "execution_count": null, "id": "f758d755", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(6, 4))\n", "plt.boxplot(all_token_lengths, vert=True, patch_artist=True, boxprops=dict(facecolor='skyblue'))\n", "plt.title('Boxplot of all_token_lengths')\n", "plt.ylabel('Token Length')\n", "plt.grid(axis='y', linestyle='--', alpha=0.6)\n", "plt.show()" ] }, { "cell_type": "markdown", "id": "e3d31e79", "metadata": {}, "source": [ "## attribution accuracy check" ] }, { "cell_type": "code", "execution_count": null, "id": "1eb679e5", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "with open('/home/mshahidul/readctrl/results/dataset_quality_check/attribution_resonability_results_100_qwen3-32B_v2.json', 'r') as f:\n", " attribution_resonability_results = json.load(f)\n", "\n", "print(attribution_resonability_results[0].keys())" ] }, { "cell_type": "code", "execution_count": null, "id": "4ec7bab1", "metadata": {}, "outputs": [], "source": [ "full_data=[]\n", "for item in attribution_resonability_results:\n", " success=0\n", " for eval in item['results']:\n", " if eval['response']==\"not_applicable\" or eval['response']['reasonableness'] in [\"reasonable\",\"partially_reasonable\"]:\n", " success+=1\n", " full_data.append({\n", " \"id\": item['id'],\n", " \"difficulty_level\": item['difficulty_level'],\n", " \"total_subclaims\": len(item['results']),\n", " \"reasonable_subclaims\": success,\n", " \"unreasonable_subclaims\": len(item['results']) - success,\n", " \"accuracy\": success/len(item['results']) if item['results'] else 0,\n", " \"subclaim_list\": item['results']\n", " })\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a5a206dd", "metadata": {}, "outputs": [], "source": [ "accuracy_calcs = {\"easy\":[],\"intermediate\":[],\"hard\":[]}\n", "for item in full_data:\n", " accuracy_calcs[item['difficulty_level']].append(item['accuracy'])\n", "accuracy_calcs2={}\n", "for level in accuracy_calcs:\n", " for item in accuracy_calcs[level]:\n", " acc_100+=1\n", " accuracy_calcs2[level] = sum(accuracy_calcs[level])/len(accuracy_calcs[level]) if accuracy_calcs[level] else 0\n", "print(accuracy_calcs2)" ] }, { "cell_type": "code", "execution_count": null, "id": "3c47e0ee", "metadata": {}, "outputs": [], "source": [ "# accuracy_calcs = {\"easy\":[],\"intermediate\":[],\"hard\":[]}\n", "# def temp1_func(num):\n", "# uc={\"easy\":0,\"intermediate\":0,\"hard\":0}\n", "# for item in full_data:\n", "# if item['unreasonable_subclaims']<=num:\n", "# uc[item['difficulty_level']] += 1\n", "# accuracy_calcs[item['difficulty_level']].append(item['accuracy'])\n", "# return uc\n", "# for num in range(1,10):\n", "# uc=temp1_func(num)\n", "# print(f\"Unreasonable subclaims threshold: {num}, Count: {uc}\")\n", "\n", "# print(uc)\n", "def temp2_func(num):\n", " accuracy_calcs2={}\n", " acc_100=0\n", " for level in accuracy_calcs:\n", " for item in accuracy_calcs[level]:\n", " if item>=num/10:\n", " acc_100+=1\n", " accuracy_calcs2[level] = sum(accuracy_calcs[level])/len(accuracy_calcs[level]) if accuracy_calcs[level] else 0\n", " temp=0\n", " for k,v in accuracy_calcs2.items():\n", " temp+=v\n", " print(f\"Threshold(>=): {num/10}, Overall Accuracy: {temp/3:.4f}\")\n", " # print(f\"Level: {k}, Accuracy: {v}\")\n", " # print(\"Threshold(>=):\", num/10, \"Accuracy:\", {k: v for k, v in accuracy_calcs2.items() if v >= num/10})\n", "print(\"Accuracy threshold results:\")\n", "for num in range(1,10):\n", " temp2_func(num)" ] }, { "cell_type": "code", "execution_count": null, "id": "d7b1364c", "metadata": {}, "outputs": [], "source": [ "def temp_result(list_res):\n", " cnt=0\n", " for res in list_res:\n", " if res['result']==\"1\":\n", " cnt+=1\n", " return len(list_res),cnt,cnt/len(list_res) if len(list_res) > 0 else 0\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "4f484774", "metadata": {}, "outputs": [], "source": [ "# full_data.append({\n", "# \"id\": item['id'],\n", "# \"difficulty_level\": item['difficulty_level'],\n", "# \"total_subclaims\": len(item['results']),\n", "# \"reasonable_subclaims\": success,\n", "# \"accuracy\": success/len(item['results']) if item['results'] else 0\n", "# })" ] }, { "cell_type": "code", "execution_count": null, "id": "90369a55", "metadata": {}, "outputs": [], "source": [ "import json\n", "full_data2={}\n", "with open('/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json', 'r') as f:\n", " subclaim_verifier_results = json.load(f)\n", "acc_list={\"easy\":[],\"intermediate\":[],\"hard\":[]}\n", "for item in subclaim_verifier_results:\n", " for level in [\"easy\",\"intermediate\",\"hard\"]:\n", " if item['version']==level:\n", " total, cnt, acc = temp_result(item['attribution']['results'])\n", " acc_list[level].append(acc)\n", " full_data2[(item['id'], level)] = {\n", " \"id\": item['id'],\n", " \"difficulty_level\": level,\n", " \"total_subclaims\": total,\n", " \"reasonable_subclaims\": cnt,\n", " \"accuracy\": acc,\n", " \"subclaim_list\": item['attribution']['results']\n", " }\n", "print({k: sum(v)/len(v) if v else 0 for k, v in acc_list.items()})" ] }, { "cell_type": "code", "execution_count": null, "id": "dbe194a8", "metadata": {}, "outputs": [], "source": [ "for (k1,v1), (k2,v2) in zip(full_data.items(), full_data2.items()):\n", " assert k1==k2\n", " if k1[0]==k2[0] and k1[1]==k2[1] and v1['accuracy']