{
  "cells": [
    {
      "cell_type": "markdown",
      "id": "f869b176",
      "metadata": {},
      "source": [
        "## LLM guard Qwen3-32B model data formatting"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9f4ff22b",
      "metadata": {},
      "outputs": [],
      "source": [
        "def training_prompt(sub_questions, sub_answers, evaluation):\n",
        "    system_prompt = f\"\"\"\n",
        "You are an impartial evaluator. A set of sub‑questions and sub‑answers was created by separate models.  \n",
        "Determine whether, when combined, these sub‑answers form one meaningful, coherent, and reasonable overall answer to an implied main question.\n",
        "\n",
        "Sub‑questions: {sub_questions}\n",
        "Sub‑answers: {sub_answers}\n",
        "\n",
        "Respond only with:\n",
        "1 – if the combined sub‑answers form a coherent and meaningful overall answer  \n",
        "0 – if they do not (incoherent, contradictory, incomplete, or nonsensical)\n",
        "\"\"\"\n",
        "    \n",
        "    conversation = {}\n",
        "    conversation['conversations'] = (\n",
        "        {'from': \"user\", 'content': system_prompt},\n",
        "        {'from': \"assistant\", 'content': str(evaluation)},\n",
        "    )\n",
        "    return conversation"
      ]
    },
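    {
      "cell_type": "markdown",
      "id": "a1f20c3d",
      "metadata": {},
      "source": [
        "A quick sanity check of `training_prompt` with toy inputs (the sub-questions and sub-answers below are made up) to confirm the conversation layout before formatting the full dataset."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b7d41e92",
      "metadata": {},
      "outputs": [],
      "source": [
        "# Toy example (hypothetical data), only to inspect the rendered structure.\n",
        "demo = training_prompt(\n",
        "    [\"What is the diagnosis?\", \"What was the treatment?\"],\n",
        "    [\"Community-acquired pneumonia.\", \"Oral amoxicillin-clavulanate.\"],\n",
        "    1,\n",
        ")\n",
        "print(demo['conversations'][0]['content'])\n",
        "print(demo['conversations'][1]['content'])"
      ]
    },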
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "afd89af1",
      "metadata": {},
      "outputs": [],
      "source": [
        "# /home/mshahidul/readctrl/data/data_annotator_data/manual_selections_en.json\n",
        "with open('/home/mshahidul/readctrl/data/data_annotator_data/manual_selections_en.json', 'r') as f:\n",
        "    import json\n",
        "    data = json.load(f)\n",
        "print(len(data))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "595815eb",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os\n",
        "import json\n",
        "\n",
        "data_dir = \"/home/mshahidul/LLM_guard/data/training_data_combined_ans_check\"\n",
        "json_files = [f for f in os.listdir(data_dir) if f.endswith('.json')]\n",
        "\n",
        "all_data = []\n",
        "for file in json_files:\n",
        "    with open(os.path.join(data_dir, file), 'r') as f:\n",
        "        data = json.load(f)\n",
        "        for item in data:\n",
        "            training_prompt_data = training_prompt(\n",
        "                item['sub_questions'], \n",
        "                item['sub_answers'], \n",
        "                str(item['evaluation'])\n",
        "            )\n",
        "            all_data.append(training_prompt_data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7c1531e5",
      "metadata": {},
      "outputs": [],
      "source": [
        "with open('/home/mshahidul/LLM_guard/data/training_data_checking_sub_ques_ans.json', 'w') as outfile:\n",
        "    json.dump(all_data, outfile, indent=2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e6f87187",
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "300\n",
            "{'conversations': ({'from': 'user', 'content': \"\\nYou are an expert medical adjudicator. Determine if the 'Medical Passage' contains the core factual information of each 'Subclaim', even if the passage uses simpler language or layperson terms.\\nRules:\\n- Label 'supported' if the essential meaning is present.\\n- Label 'not_supported' only if the information is missing or contradicted.\\nOutput: JSON array of strings ['supported', 'not_supported', ...]\\n\\nMedical text:\\nA 62-year-old man has had cough and fever for three days. He feels short of breath and has chest pain when he breathes. His temperature is 38.5 C and he breathes fast. His oxygen level is 92% on room air. He has high blood pressure and no drug allergies. A chest x-ray shows a new spot in the right lower lung, with no fluid. A nose swab test for COVID is negative. The doctor says he has community pneumonia and treats him at home. He gets mouth pills: amoxicillin-clavulanate and azithromycin. After two days, his fever goes down and oxygen is 95%.\\n\\nSubclaims:\\n1. Chest x-ray showed right lower lobe consolidation.\\n2. The patient was treated as an outpatient.\\n3. He received amoxicillin-clavulanate plus azithromycin.\\n4. The patient was breathing fast.\\n5. The patient was admitted to the intensive care unit.\\n6. A pleural effusion was present on imaging.\\n7. Blood cultures grew Streptococcus pneumoniae.\\n8. The patient has a penicillin allergy.\\n\"}, {'from': 'assistant', 'content': '[\"supported\", \"supported\", \"supported\", \"supported\", \"not_supported\", \"not_supported\", \"not_supported\", \"not_supported\"]'})}\n"
          ]
        }
      ],
      "source": [
        "import json\n",
        "from pathlib import Path\n",
        "\n",
        "# from qwen3-8b.py\n",
        "DATA_PATH = Path(\"/home/mshahidul/readctrl/data/finetuning_data/finetune_dataset_subclaim_support_v2.json\")\n",
        "TEXT_LEVEL = \"hard_text\"  # easy_text, intermediate_text, hard_text\n",
        "\n",
        "\n",
        "def training_prompt(medical_text, subclaims, labels):\n",
        "    numbered_subclaims = \"\\n\".join(\n",
        "        [f\"{idx + 1}. {claim}\" for idx, claim in enumerate(subclaims)]\n",
        "    )\n",
        "    \n",
        "    system_prompt = f\"\"\"\n",
        "You are an expert medical adjudicator. Determine if the 'Medical Passage' contains the core factual information of each 'Subclaim', even if the passage uses simpler language or layperson terms.\n",
        "Rules:\n",
        "- Label 'supported' if the essential meaning is present.\n",
        "- Label 'not_supported' only if the information is missing or contradicted.\n",
        "Output: JSON array of strings ['supported', 'not_supported', ...]\n",
        "\n",
        "Medical text:\n",
        "{medical_text}\n",
        "\n",
        "Subclaims:\n",
        "{numbered_subclaims}\n",
        "\"\"\"\n",
        "\n",
        "    conversation = {}\n",
        "    conversation[\"conversations\"] = (\n",
        "        {\"from\": \"user\", \"content\": system_prompt},\n",
        "        {\"from\": \"assistant\", \"content\": json.dumps(labels, ensure_ascii=False)},\n",
        "    )\n",
        "    return conversation\n",
        "\n",
        "\n",
        "def load_conversation_dataset(data_path=DATA_PATH, text_levels=(\"easy_text\", \"intermediate_text\", \"hard_text\")):\n",
        "    with Path(data_path).open(\"r\", encoding=\"utf-8\") as f:\n",
        "        raw_data = json.load(f)\n",
        "\n",
        "    formatted_data = []\n",
        "    for group in raw_data:\n",
        "        for item in group.get(\"items\", []):\n",
        "            subclaims = [x.get(\"subclaim\", \"\") for x in item.get(\"subclaims\", [])]\n",
        "            labels = [x.get(\"label\", \"not_supported\") for x in item.get(\"subclaims\", [])]\n",
        "\n",
        "            if not subclaims:\n",
        "                continue\n",
        "\n",
        "            for level in text_levels:\n",
        "                medical_text = item.get(level)\n",
        "                if not medical_text:\n",
        "                    continue\n",
        "                formatted_data.append(training_prompt(medical_text, subclaims, labels))\n",
        "\n",
        "    return formatted_data\n",
        "\n",
        "\n",
        "# Example usage:\n",
        "dataset_for_sft = load_conversation_dataset()\n",
        "import json\n",
        "\n",
        "with open(\"/home/mshahidul/readctrl/data/finetuning_data/dataset_for_sft_support_check_list.json\", \"w\", encoding=\"utf-8\") as f:\n",
        "    json.dump(dataset_for_sft, f, ensure_ascii=False, indent=2)\n",
        "\n",
        "print(len(dataset_for_sft))\n",
        "print(dataset_for_sft[0])"
      ]
    },
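    {
      "cell_type": "markdown",
      "id": "c3e8f1a0",
      "metadata": {},
      "source": [
        "Before fine-tuning it can help to hold out a validation slice. A minimal sketch, assuming a 90/10 split is acceptable (the ratio and seed are arbitrary choices, not part of the original pipeline):"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d9a02b47",
      "metadata": {},
      "outputs": [],
      "source": [
        "import random\n",
        "\n",
        "# Hedged sketch: shuffle and split the SFT dataset 90/10.\n",
        "random.seed(42)  # arbitrary seed, for reproducibility only\n",
        "shuffled = dataset_for_sft[:]\n",
        "random.shuffle(shuffled)\n",
        "split = int(0.9 * len(shuffled))\n",
        "train_split, val_split = shuffled[:split], shuffled[split:]\n",
        "print(len(train_split), len(val_split))"
      ]
    },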
    {
      "cell_type": "markdown",
      "id": "fe5218ed",
      "metadata": {},
      "source": [
        "## Training prompt creation (readability reasoning)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9c3e4329",
      "metadata": {},
      "outputs": [],
      "source": [
        "def readability_judgment_single_prompt_old(reference_summary, generated_summary, readability_level, subclaim_text, result, evaluation):\n",
        "    system_prompt = f\"\"\"\n",
        "You are an impartial medical summarization evaluator.\n",
        "\n",
        "Your goal is to decide whether the inclusion or omission of ONE specific subclaim \n",
        "from the reference summary is *reasonable*, given the readability level of the generated summary.\n",
        "\n",
        "### Inputs\n",
        "Readability Level: {readability_level}\n",
        "\n",
        "Reference Summary:\n",
        "{reference_summary}\n",
        "\n",
        "Generated Summary:\n",
        "{generated_summary}\n",
        "\n",
        "Subclaim:\n",
        "\"{subclaim_text}\"\n",
        "\n",
        "Result:\n",
        "{result}   # 1 = supported (included in generated summary), 0 = omitted (not included)\n",
        "\n",
        "### Task\n",
        "Judge whether this inclusion or omission is:\n",
        "- \"reasonable\" → appropriate for this readability level\n",
        "- \"partially_reasonable\" → oversimplified but acceptable\n",
        "- \"unreasonable\" → harms completeness or clinical meaning\n",
        "\n",
        "Respond only with a JSON object:\n",
        "{{\n",
        "  \"reasonableness\": \"<reasonable | partially_reasonable | unreasonable>\",\n",
        "  \"justification\": \"<short explanation>\"\n",
        "}}\n",
        "\"\"\"\n",
        "\n",
        "    conversation = {}\n",
        "    conversation['conversations'] = (\n",
        "        {'from': \"user\", 'content': system_prompt},\n",
        "        {'from': \"assistant\", 'content': str(evaluation)},\n",
        "    )\n",
        "    return conversation\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "276e1b47",
      "metadata": {},
      "outputs": [],
      "source": [
        "def readability_judgment_single_prompt(reference_summary, generated_summary, readability_level, subclaim_text, result, evaluation):\n",
        "    system_prompt = f\"\"\"\n",
        "You are an impartial medical summarization evaluator.\n",
        "\n",
        "Your goal is to decide whether the inclusion or omission of ONE specific subclaim \n",
        "from the reference summary is *reasonable*, given the readability level of the generated summary.\n",
        "\n",
        "Readability guidelines:\n",
        "- Easy: for general readers; omit detailed numbers, anatomy, or diagnostic test specifics.\n",
        "- Intermediate: maintain main medical ideas and reasoning; simplify complex phrasing only.\n",
        "- Hard: preserve nearly all technical and diagnostic detail, except redundant measurements.\n",
        "\n",
        "### Inputs\n",
        "Readability Level: {readability_level}\n",
        "\n",
        "Reference Summary:\n",
        "{reference_summary}\n",
        "\n",
        "Generated Summary:\n",
        "{generated_summary}\n",
        "\n",
        "Subclaim:\n",
        "\"{subclaim_text}\"\n",
        "\n",
        "Result:\n",
        "{result}   # 1 = supported (included in generated summary), 0 = omitted (not included)\n",
        "\n",
        "### Consistency rules:\n",
        "* If result = 0 (omitted) and the subclaim is purely technical or numerical for this readability level, likely \"reasonable\".\n",
        "* If result = 0 and the subclaim expresses a central event, diagnosis, or reason for treatment outcome, mark \"unreasonable\".\n",
        "\n",
        "### Task\n",
        "Judge whether this inclusion or omission is:\n",
        "- \"reasonable\" → appropriate for this readability level\n",
        "- \"partially_reasonable\" → oversimplified but acceptable\n",
        "- \"unreasonable\" → harms completeness or clinical meaning\n",
        "\n",
        "Output format rule: produce exactly the JSON object below, no extra commentary.\n",
        "\n",
        "{{\n",
        "  \"reasonableness\": \"<reasonable | partially_reasonable | unreasonable>\",\n",
        "  \"justification\": \"<short explanation>\"\n",
        "}}\n",
        "\"\"\"\n",
        "\n",
        "    conversation = {}\n",
        "    conversation['conversations'] = (\n",
        "        {'from': \"user\", 'content': system_prompt},\n",
        "        {'from': \"assistant\", 'content': str(evaluation)},\n",
        "    )\n",
        "    return conversation"
      ]
    },
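    {
      "cell_type": "markdown",
      "id": "e5b3c7d1",
      "metadata": {},
      "source": [
        "A toy invocation of `readability_judgment_single_prompt` (all inputs below are placeholders) to inspect the rendered prompt before building the full training set."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f0a6d8e2",
      "metadata": {},
      "outputs": [],
      "source": [
        "# Placeholder inputs, only for checking the prompt template.\n",
        "demo = readability_judgment_single_prompt(\n",
        "    \"Reference summary text.\",\n",
        "    \"Generated summary text.\",\n",
        "    \"easy\",\n",
        "    \"The patient was treated as an outpatient.\",\n",
        "    0,\n",
        "    str({\"reasonableness\": \"reasonable\", \"justification\": \"Technical detail omitted at easy level.\"}),\n",
        ")\n",
        "print(demo['conversations'][0]['content'])"
      ]
    },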
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "a3306898",
      "metadata": {},
      "outputs": [],
      "source": [
        "import json\n",
        "with open('/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json', 'r') as f:\n",
        "    multiclinsum_gs_train_es_data = json.load(f)\n",
        "ref_summaries={}\n",
        "fulltexts={}\n",
        "for item in multiclinsum_gs_train_es_data:\n",
        "    ref_summaries[item['id']]=item['summary']\n",
        "    fulltexts[item['id']]=item['fulltext']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d5aeca22",
      "metadata": {},
      "outputs": [],
      "source": [
        "generated_summaries = {}\n",
        "with open('/home/mshahidul/readctrl/data/hand_create_gpt5_other_model/synthetic_data_es_raw_592.json', 'r') as f:\n",
        "    synthetic_data_es_raw_592 = json.load(f)\n",
        "for item in synthetic_data_es_raw_592:\n",
        "    for version in ['easy', 'intermediate', 'hard']:\n",
        "        generated_summaries[(item['id'], version)] = item['readability_versions'][version]['text']"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "da42c192",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "training_data=[]\n",
        "with open('/home/mshahidul/readctrl/results/dataset_quality_check/syn_data_resonability_check_20_gpt5.json', 'r') as f:\n",
        "    syn_data_resonability_20 = json.load(f)\n",
        "for item in syn_data_resonability_20:\n",
        "    ref_summary = ref_summaries[item['id']]\n",
        "    fulltext = fulltexts[item['id']]\n",
        "    generated_summary = generated_summaries[(item['id'], item['difficulty_level'])]\n",
        "    results=item['reasonableness']['evaluations']\n",
        "    for eval_item in results:\n",
        "        training_prompt_data = readability_judgment_single_prompt(\n",
        "            ref_summary,\n",
        "            generated_summary,\n",
        "            item['difficulty_level'],\n",
        "            eval_item['subclaim_text'],\n",
        "            eval_item['result'],\n",
        "            str({\n",
        "                \"reasonableness\": eval_item['reasonableness'],\n",
        "                \"justification\": eval_item['justification']\n",
        "            })\n",
        "        )\n",
        "        training_data.append(training_prompt_data)\n",
        "with open('/home/mshahidul/readctrl/data/training_data/syn_data_resonability_check_20_gpt5_training_data.json', 'w') as f:\n",
        "    json.dump(training_data, f, indent=2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "09f6c6e4",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "python '/home/mshahidul/readctrl/code/finetune-inference/inference_resoning_check.py'\n",
        "python '/home/mshahidul/readctrl/code/readability_control.py'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "78187940",
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "dict_keys(['id', 'fulltext', 'fulltext_subclaims', 'summary', 'summary_subclaims'])\n"
          ]
        }
      ],
      "source": [
        "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en.json\n",
        "import json\n",
        "with open('/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_0_500.json', 'r') as f:\n",
        "    synthetic_data_with_gs_summary_en = json.load(f)\n",
        "print((synthetic_data_with_gs_summary_en)[0].keys())"
      ]
    },
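    {
      "cell_type": "markdown",
      "id": "a8c4e2f6",
      "metadata": {},
      "source": [
        "`readability_generation` is called in the next cell but never defined in this notebook. A minimal sketch of what it is assumed to do, mirroring the other conversation builders here; the exact wording of the production prompt may differ."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b2d9f3a7",
      "metadata": {},
      "outputs": [],
      "source": [
        "def readability_generation(gold_summary, fulltext, evaluation):\n",
        "    # Hedged sketch: builds a chat-style training example that asks the model\n",
        "    # to rewrite the gold summary at three readability levels. The prompt\n",
        "    # wording is an assumption; the real definition lives elsewhere.\n",
        "    system_prompt = f\"\"\"\n",
        "You are a medical summarization assistant.\n",
        "Given the full clinical text and its gold summary, rewrite the summary at three\n",
        "readability levels: easy, intermediate, and hard.\n",
        "\n",
        "Full text:\n",
        "{fulltext}\n",
        "\n",
        "Gold summary:\n",
        "{gold_summary}\n",
        "\n",
        "Respond with a JSON object mapping \"easy\", \"intermediate\", and \"hard\" to the rewritten texts.\n",
        "\"\"\"\n",
        "    return {\n",
        "        \"conversations\": [\n",
        "            {\"from\": \"user\", \"content\": system_prompt},\n",
        "            {\"from\": \"assistant\", \"content\": evaluation},\n",
        "        ]\n",
        "    }"
      ]
    },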
    {
      "cell_type": "code",
      "execution_count": 8,
      "id": "ebd39c1c",
      "metadata": {},
      "outputs": [],
      "source": [
        "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en.json\n",
        "import json\n",
        "full_data=[]\n",
        "with open('/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en.json', 'r') as f:\n",
        "    synthetic_data_with_gs_summary_en = json.load(f)\n",
        "for item in synthetic_data_with_gs_summary_en:\n",
        "    gold_summary = item['summary']\n",
        "    fulltext = item['fulltext']\n",
        "    evaluation = json.dumps(item['diff_label_texts'], ensure_ascii=False)\n",
        "    readability_generation_prompt_data = readability_generation(\n",
        "        gold_summary,\n",
        "        fulltext,\n",
        "        evaluation\n",
        "    )\n",
        "    full_data.append(readability_generation_prompt_data)\n",
        "with open('/home/mshahidul/readctrl/data/finetuning_data/training_data_readability_data_generation.json', 'w') as outfile:\n",
        "    json.dump(full_data, outfile, indent=2,ensure_ascii=False)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e71801a1",
      "metadata": {},
      "source": [
        "# Training prompt for attribution training "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "db70f9cd",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "import json\n",
        "def build_single_subclaim_conversation(\n",
        "    reference_full_text,\n",
        "    generated_summary,\n",
        "    subclaim_id,\n",
        "    subclaim_text,\n",
        "    subclaim_result,\n",
        "    difficulty_level,\n",
        "    evaluation\n",
        "):\n",
        "    \"\"\"\n",
        "    Create a fine‑tuning conversation entry for a single subclaim.\n",
        "\n",
        "    Args:\n",
        "        reference_full_text (str): Source article/reference text.\n",
        "        generated_summary (str): Summary generated for evaluation.\n",
        "        subclaim_id (int or str): Unique identifier of this subclaim.\n",
        "        subclaim_text (str): Subclaim content.\n",
        "        subclaim_result (int): 1 (supported) or 0 (unsupported).\n",
        "        difficulty_level (str): 'easy', 'intermediate', or 'hard'.\n",
        "        evaluation (dict): Target labeled response (reasonableness + justification).\n",
        "\n",
        "    Returns:\n",
        "        dict: One training example formatted for chat‑style fine‑tuning.\n",
        "    \"\"\"\n",
        "\n",
        "    system_prompt = f\"\"\"\n",
        "### **SYSTEM / ROLE INSTRUCTION**\n",
        "\n",
        "You are a **medical factuality and attribution evaluator**.\n",
        "You will assess the following subclaim from a generated summary.\n",
        "\n",
        "The `\"result\"` attribute indicates factual support:\n",
        "- `1` → Supported by the reference text (no evaluation required)\n",
        "- `0` → Unsupported; requires assessing reasonableness based on the readability level (*easy / intermediate / hard*).\n",
        "\n",
        "Your goal: decide whether the **unsupported subclaim (result=0)** is a reasonable simplification or an inaccurate addition.\n",
        "\n",
        "---\n",
        "\n",
        "### **READABILITY & ATTRIBUTION GUIDELINES**\n",
        "\n",
        "| Level | Audience | Linguistic & Stylistic Profile | Allowable Additions |\n",
        "| :-- | :-- | :-- | :-- |\n",
        "| **Easy (FH 70–100)** | General public | Very simple and concrete | Only broad clarifications; no new medical facts |\n",
        "| **Intermediate (FH 50–69)** | Educated layperson | Moderate complexity | Limited explanatory additions consistent with text |\n",
        "| **Hard (FH 0–49)** | Professionals | Formal, technical | Must stay fully evidence‑grounded |\n",
        "\n",
        "---\n",
        "\n",
        "### **Input**\n",
        "Readability Level: {difficulty_level}\n",
        "\n",
        "Reference Full Text:\n",
        "{reference_full_text}\n",
        "\n",
        "Generated Summary:\n",
        "{generated_summary}\n",
        "\n",
        "Subclaim Info:\n",
        "{{\n",
        "  \"subclaim_id\": {subclaim_id},\n",
        "  \"subclaim\": \"{subclaim_text}\",\n",
        "  \"result\": {subclaim_result}\n",
        "}}\n",
        "\n",
        "---\n",
        "\n",
        "### **TASK INSTRUCTIONS**\n",
        "\n",
        "- If `\"result\": 1\"`, respond with **\"not_applicable\"** and a short note like *\"supported, no evaluation required.\"*\n",
        "- If `\"result\": 0\"`, classify as:\n",
        "  - `\"reasonable\"` – legitimate simplification consistent with readability\n",
        "  - `\"partially_reasonable\"` – neutral or harmless addition\n",
        "  - `\"unreasonable\"` – misleading or speculative content\n",
        "\n",
        "Always include a brief justification (1–2 sentences).\n",
        "\n",
        "---\n",
        "\n",
        "### **Output JSON Format**\n",
        "\n",
        "```json\n",
        "{{\n",
        "  \"evaluation\": {{\n",
        "    \"subclaim_id\": {subclaim_id},\n",
        "    \"subclaim\": \"{subclaim_text}\",\n",
        "    \"result\": {subclaim_result},\n",
        "    \"reasonableness\": \"<reasonable | partially_reasonable | unreasonable | not_applicable>\",\n",
        "    \"justification\": \"<short explanation or 'supported, no evaluation required'>\"\n",
        "  }}\n",
        "}}\n",
        "\"\"\".strip()\n",
        "\n",
        "# ---- format the example as a conversation pair ----\n",
        "    conversation = {\n",
        "        \"conversations\": [\n",
        "            {\"from\": \"user\", \"content\": system_prompt},\n",
        "            {\"from\": \"assistant\", \"content\": json.dumps(evaluation, ensure_ascii=False, indent=2)}\n",
        "        ]\n",
        "    }\n",
        "\n",
        "    return conversation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "id": "f92974f0",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_20.json\n",
        "full_data=[]\n",
        "import json\n",
        "with open('/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_20.json', 'r') as f:\n",
        "    data = json.load(f)\n",
        "    full_data.extend(data)\n",
        "with open('/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_20_67.json', 'r') as f:\n",
        "    data = json.load(f)\n",
        "    full_data.extend(data)\n",
        "with open('/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_67_80.json', 'r') as f:\n",
        "    data = json.load(f)\n",
        "    full_data.extend(data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "id": "37e21c6f",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "with open('/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_80_full.json', 'w') as f:\n",
        "    json.dump(full_data, f, indent=2,ensure_ascii=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "ddd2d6f2",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "# def build_single_subclaim_conversation(\n",
        "#     reference_full_text,\n",
        "#     generated_summary,\n",
        "#     subclaim_id,\n",
        "#     subclaim_text,\n",
        "#     subclaim_result,\n",
        "#     difficulty_level,\n",
        "#     evaluation\n",
        "# )\n",
        "# demo testing\n",
        "p=build_single_subclaim_conversation(\n",
        "    \"This is the full text of the reference article.\",\n",
        "    \"This is the generated summary.\",\n",
        "    1234,\n",
        "    \"This is the subclaim being evaluated.\",\n",
        "    1,\n",
        "    \"easy\",\n",
        "    {\n",
        "        \"reasonableness\": \"reasonable\",\n",
        "        \"justification\": \"The subclaim is a permissible simplification.\"\n",
        "    }\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "89951e90",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "print(p['conversations'][0]['content'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8918f214",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "file_synth = \"/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json\"\n",
        "file_qwen_results = \"/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json\"\n",
        "main_dataset=\"/home/mshahidul/readctrl/results/dataset_quality_check/syn_attribution_resonability_check_100_gpt5_train_v2.json\"\n",
        "save_path = \"/home/mshahidul/readctrl/results/dataset_quality_check/syn_attribution_resonability_check_30_gpt5_train_prompt.json\"\n",
        "\n",
        "with open(file_synth, 'r') as f:\n",
        "    synthetic_data = json.load(f)\n",
        "with open(file_qwen_results, 'r') as f:\n",
        "    qwen3_32B_results = json.load(f) \n",
        "with open(main_dataset, 'r') as f:\n",
        "    main_data = json.load(f)\n",
        "ref_summaries={}\n",
        "fulltexts={}\n",
        "generated_summaries={}\n",
        "for item in synthetic_data:\n",
        "    reference_summary = item['ref_summary']['text']\n",
        "    ref_summaries[item['id']] = reference_summary\n",
        "    full_text = item['full_text']\n",
        "    fulltexts[item['id']] = full_text\n",
        "    for version in ['easy', 'intermediate', 'hard']:\n",
        "        gen_summary = item['readability_versions'][version]['text']\n",
        "        generated_summaries[(item['id'], version)] = gen_summary\n",
        "full_training_data=[]\n",
        "for item in main_data:\n",
        "    ref_summary = ref_summaries[item['id']]\n",
        "    fulltext = fulltexts[item['id']]\n",
        "    generated_summary = generated_summaries[(item['id'], item['difficulty_level'])]\n",
        "    results=item['response']['evaluations']\n",
        "    for eval_item in results:\n",
        "        training_prompt_data = build_single_subclaim_conversation(\n",
        "            ref_summary,\n",
        "            generated_summary,\n",
        "            eval_item['subclaim_id'],\n",
        "            eval_item['subclaim'],\n",
        "            eval_item['result'],\n",
        "            item['difficulty_level'],\n",
        "            {\n",
        "                \"reasonableness\": eval_item['reasonableness'],\n",
        "                \"justification\": eval_item['justification']\n",
        "            }\n",
        "        )\n",
        "        full_training_data.append(training_prompt_data)\n",
        "with open(save_path, 'w') as f:\n",
        "    json.dump(full_training_data, f, indent=2)\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "06be4f7a",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "print(full_training_data[0]['conversations'][0]['content'])"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "e62306ed",
      "metadata": {},
      "source": [
        "# data cleaning"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "3d26ce59",
      "metadata": {},
      "outputs": [],
      "source": [
        "import os, json, re\n",
        "\n",
        "results_dir = \"/home/mshahidul/LLM_guard/results/sub_questions_answers/sub_questions_answers_llama31_8B\"\n",
        "results_dir_mod = \"/home/mshahidul/LLM_guard/results/sub_questions_answersV2/sub_questions_answers_llama31_8B\"\n",
        "os.makedirs(results_dir_mod, exist_ok=True)\n",
        "\n",
        "results_json_files = [f for f in os.listdir(results_dir) if f.endswith('.json')]\n",
        "\n",
        "results_data = []\n",
        "\n",
        "def safe_json_loads(text):\n",
        "    \"\"\"Try multiple ways to parse a possibly broken JSON string.\"\"\"\n",
        "    if not isinstance(text, str):\n",
        "        return text\n",
        "\n",
        "    # 1️⃣ Remove control characters\n",
        "    cleaned = re.sub(r'[\\x00-\\x1F\\x7F]', '', text)\n",
        "\n",
        "    # 2️⃣ Escape newlines and ensure proper quotes\n",
        "    cleaned = cleaned.replace('\\n', '\\\\n').replace('\\r', '\\\\r')\n",
        "\n",
        "    # 3️⃣ Try direct JSON parsing\n",
        "    try:\n",
        "        return json.loads(cleaned)\n",
        "    except json.JSONDecodeError:\n",
        "        pass\n",
        "\n",
        "    # 4️⃣ Try stripping outer braces/spaces and retry\n",
        "    try:\n",
        "        cleaned2 = cleaned.strip()\n",
        "        if cleaned2.startswith(\"{\") and cleaned2.endswith(\"}\"):\n",
        "            inner = cleaned2[1:-1].strip()\n",
        "            if inner.startswith('\"answer\":'):\n",
        "                inner = '{' + inner + '}'\n",
        "            return json.loads(inner)\n",
        "    except json.JSONDecodeError:\n",
        "        pass\n",
        "\n",
        "    # 5️⃣ Last fallback: wrap it as plain text JSON\n",
        "    return {\"answer\": cleaned.strip()}\n",
        "\n",
        "\n",
        "for file in results_json_files:\n",
        "    path = os.path.join(results_dir, file)\n",
        "    with open(path, 'r') as f:\n",
        "        data = json.load(f)\n",
        "\n",
        "    sub_questions_answers = data['sub_questions_answers']\n",
        "    new_data = []\n",
        "\n",
        "    for item in sub_questions_answers:\n",
        "        sub_q = item.get('sub_question', '')\n",
        "        sub_a_raw = item.get('sub_answer', '')\n",
        "\n",
        "        try:\n",
        "            parsed = safe_json_loads(sub_a_raw)\n",
        "        except Exception as e:\n",
        "            print(f\"⚠️ Still bad entry in {file}: {e}\")\n",
        "            print(f\"  Sub-question: {sub_q[:100]}\")\n",
        "            print(f\"  Raw answer preview: {sub_a_raw[:200]}\")\n",
        "            continue\n",
        "\n",
        "        new_data.append({\n",
        "            \"sub_question\": sub_q,\n",
        "            \"sub_answer\": parsed,\n",
        "        })\n",
        "\n",
        "    results_data.append({\n",
        "        \"id\": data['id'],\n",
        "        \"sub_questions_answers\": new_data,\n",
        "    })\n",
        "\n",
        "# Optionally save the cleaned output\n",
        "output_path = os.path.join(results_dir_mod, \"sub_questions_answers_llama31_8B.json\")\n",
        "with open(output_path, 'w') as f:\n",
        "    json.dump(results_data, f, indent=2)\n",
        "\n",
        "print(f\"✅ Cleaned data saved to: {output_path}\")\n"
      ]
    },
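    {
      "cell_type": "markdown",
      "id": "c6e1a9b3",
      "metadata": {},
      "source": [
        "A quick check that `safe_json_loads` survives the kinds of breakage seen in the raw model output; the sample strings are invented for this test."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "d4f7b2c8",
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hypothetical broken strings, only for exercising the fallbacks.\n",
        "print(safe_json_loads('{ \"answer\": \"yes\" }'))                 # valid JSON, parsed directly\n",
        "print(safe_json_loads('just a plain text answer'))             # non-JSON, wrapped as {\"answer\": ...}\n",
        "print(safe_json_loads('{\"answer\": \"line one\\nline two\"}'))  # raw newline stripped, then parsed"
      ]
    },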
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "493028e3",
      "metadata": {},
      "outputs": [],
      "source": [
        "phi4_results_dir = \"/home/mshahidul/LLM_guard/results/sub_questions_answers/sub_questions_answers_phi4\"\n",
        "phi4_json_files = [f for f in os.listdir(phi4_results_dir) if f.endswith('.json')]\n",
        "\n",
        "phi4_results_data = []\n",
        "for file in phi4_json_files:\n",
        "    with open(os.path.join(phi4_results_dir, file), 'r') as f:\n",
        "        data = json.load(f)\n",
        "        new_data=[]\n",
        "        for item in data['sub_questions_answers']:\n",
        "            sub_answer=item.get('sub_answer', {}).split(\"assistant\")[2].strip()\n",
        "            new_data.append({\n",
        "                \"sub_question\": item.get('sub_question', ''),\n",
        "                \"sub_answer\": sub_answer,\n",
        "            })\n",
        "        phi4_results_data.append({\n",
        "            \"id\": data['id'],\n",
        "            \"sub_questions_answers\": new_data,\n",
        "        })\n",
        "output_path = os.path.join(results_dir_mod, \"sub_questions_answers_phi4.json\")\n",
        "with open(output_path, 'w') as outfile:\n",
        "    json.dump(phi4_results_data, outfile, indent=2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "814707ed",
      "metadata": {},
      "outputs": [],
      "source": [
        "qwen3_14B_results_dir = \"/home/mshahidul/LLM_guard/results/sub_questions_answers/sub_questions_answers_qwen3_14B\"\n",
        "results_dir_mod = \"/home/mshahidul/LLM_guard/results/sub_questions_answersV2\"\n",
        "qwen3_14B_json_files = [f for f in os.listdir(qwen3_14B_results_dir) if f.endswith('.json')]\n",
        "\n",
        "qwen3_14B_results_data = []\n",
        "for file in qwen3_14B_json_files:\n",
        "    with open(os.path.join(qwen3_14B_results_dir, file), 'r') as f:\n",
        "        data = json.load(f)\n",
        "        new_data=[]\n",
        "        for item in data['sub_questions_answers']:\n",
        "            sub_answer=item.get('sub_answer', {})\n",
        "            new_data.append({\n",
        "                \"sub_question\": (item.get('sub_question', '')),\n",
        "                \"sub_answer\": json.loads(item.get('sub_answer', ''))['answer'],\n",
        "            })\n",
        "        qwen3_14B_results_data.append({\n",
        "            \"id\": data['id'],\n",
        "            \"sub_questions_answers\": new_data,\n",
        "        })\n",
        "output_path = os.path.join(results_dir_mod, \"sub_questions_answers_qwen3_14B.json\")\n",
        "with open(output_path, 'w') as outfile:\n",
        "    json.dump(qwen3_14B_results_data, outfile, indent=2)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2acf245e",
      "metadata": {},
      "outputs": [],
      "source": [
        "with open('/home/mshahidul/LLM_guard/results/sub_questions_answersV2/sub_questions_answers_qwen3_14B.json', 'r') as f:\n",
        "    qwen3_14B_data = json.load(f)\n",
        "qwen3_14B_data[4]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "1d787193",
      "metadata": {},
      "outputs": [],
      "source": [
        "llama=\"/home/mshahidul/LLM_guard/results/sub_questions_answersV2/sub_questions_answers_llama31_8B.json\"\n",
        "phi=\"/home/mshahidul/LLM_guard/results/sub_questions_answersV2/sub_questions_answers_phi4.json\"\n",
        "qwen=\"/home/mshahidul/LLM_guard/results/sub_questions_answersV2/sub_questions_answers_qwen3_14B.json\"\n",
        "with open(llama, 'r') as f:\n",
        "    llama_data = json.load(f)\n",
        "with open(phi, 'r') as f:\n",
        "    phi_data = json.load(f)\n",
        "with open(qwen, 'r') as f:\n",
        "    qwen_data = json.load(f)\n"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "3e1df02c",
      "metadata": {},
      "source": [
        "- Answer each subquestion individually.\n",
        "\n",
        "- Combine the individual answers into a single, consolidated response.\n",
        "\n",
        "- Ask the LLM to evaluate whether the consolidated response is reasonable.\n",
        "\n",
        "- Receive the LLM's judgment: “Use one LLM answer for those subquestions.”"
      ]
    },
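    {
      "cell_type": "markdown",
      "id": "e9b5d0a4",
      "metadata": {},
      "source": [
        "A minimal sketch of the combine-and-judge step described above. `judge_fn` is a placeholder for whichever LLM call performs the 0/1 evaluation, and the prompt wording here is an assumption, not the production prompt."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "f1c8e6b9",
      "metadata": {},
      "outputs": [],
      "source": [
        "def combine_and_judge(entry, judge_fn):\n",
        "    # Hedged sketch: consolidate one record's sub-answers and ask an LLM\n",
        "    # whether they form a coherent overall answer (1) or not (0).\n",
        "    sub_qs = [x[\"sub_question\"] for x in entry[\"sub_questions_answers\"]]\n",
        "    sub_as = [x[\"sub_answer\"] for x in entry[\"sub_questions_answers\"]]\n",
        "    prompt = (\n",
        "        \"Do these sub-answers combine into one coherent overall answer?\\n\"\n",
        "        f\"Sub-questions: {sub_qs}\\n\"\n",
        "        f\"Sub-answers: {sub_as}\\n\"\n",
        "        \"Respond only with 1 (coherent) or 0 (not coherent).\"\n",
        "    )\n",
        "    return judge_fn(prompt)\n",
        "\n",
        "# Example (judge_fn could be any chat-completion wrapper):\n",
        "# combine_and_judge(llama_data[0], judge_fn=some_llm_call)"
      ]
    },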
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "af66abbe",
      "metadata": {},
      "outputs": [],
      "source": [
        "with open('/home/mshahidul/LLM_guard/results/attach_success_failure/attach_success_failure_phi4_14B.json', 'r') as f:\n",
        "    attach_phi4_14B_data = json.load(f)\n",
        "successful_cases = 0\n",
        "for item in attach_phi4_14B_data:\n",
        "    if item[\"evaluation\"] == \"1\":\n",
        "        successful_cases += 1\n",
        "print(f\"Total successful cases in attach_success_failure_phi4_14B: {successful_cases} out of {len(attach_phi4_14B_data)}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "14b88565",
      "metadata": {},
      "source": [
        "# reasonability check"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6617aa9e",
      "metadata": {},
      "outputs": [],
      "source": [
        "def return_promptst(reference_summary, generated_summary, subclaims_json, difficulty_level):\n",
        "    prompt=f'''\n",
        "            **SYSTEM / ROLE INSTRUCTION:**\n",
        "            You are a **medical readability evaluator**.\n",
        "            Your task is to judge whether omitted subclaims (those with `\"result\": 0\"`) from a generated summary are *reasonably omitted* based on the intended **readability level**: *easy*, *intermediate*, or *hard*.\n",
        "            You evaluate this from the standpoint of clarity, faithfulness, and readability goals.\n",
        "\n",
        "            ---\n",
        "\n",
        "            ### **READABILITY GUIDELINES**\n",
        "\n",
        "            | Level            | Target Audience                          | Content Expectation                                             | Technical Detail Allowed                                         |\n",
        "            | :--------------- | :--------------------------------------- | :-------------------------------------------------------------- | :--------------------------------------------------------------- |\n",
        "            | **Easy**         | General public                           | Focus on main events, outcomes, and diagnoses in plain Spanish. | Minimal — avoid measurements, anatomy, and test results.         |\n",
        "            | **Intermediate** | Educated lay readers or medical students | Include key findings and procedures in simplified form.         | Moderate — basic terms and causes allowed.                       |\n",
        "            | **Hard**         | Medical professionals                    | Retain most technical information and precision.                | High — measurements, anatomy, and test interpretations expected. |\n",
        "\n",
        "            ---\n",
        "\n",
        "            ### **INPUT FIELDS**\n",
        "\n",
        "            **Reference summary:**\n",
        "            {reference_summary}\n",
        "\n",
        "            **Generated summary ({difficulty_level}):**\n",
        "            {generated_summary}\n",
        "\n",
        "            **Subclaims and results:**\n",
        "            {subclaims_json}\n",
        "\n",
        "            ---\n",
        "\n",
        "            ### **TASK INSTRUCTIONS**\n",
        "\n",
        "            1. Focus on subclaims with `\"result\": 0\"` (not supported by the generated summary).\n",
        "            2. For each omitted subclaim:\n",
        "\n",
        "            * Decide whether omission is **reasonable** given the readability level.\n",
        "            * Label as: `\"yes\"`, `\"no\"`, or `\"borderline\"`.\n",
        "            * Write a brief justification (1–2 sentences).\n",
        "            3. After individual evaluations, assign a **reasonableness score (0–5)** using this scale:\n",
        "\n",
        "            * **5** = All omissions appropriate for target readability.\n",
        "            * **4** = Minor omissions could improve completeness.\n",
        "            * **3** = Some omissions reduce understanding or medical clarity.\n",
        "            * **2** = Many important omissions harm faithfulness.\n",
        "            * **1** = Major omissions misrepresent case.\n",
        "            * **0** = Summary fails to reflect key medical information.\n",
        "            4. End with an **overall explanation (3–5 sentences)** describing:\n",
        "\n",
        "            * The main reasoning behind the score.\n",
        "            * Whether the summary fits its intended readability level.\n",
        "            * Suggestions for improvement if needed.\n",
        "\n",
        "            ---\n",
        "\n",
        "            ### **OUTPUT FORMAT (strict JSON)**\n",
        "\n",
        "            ```json\n",
        "            {{\n",
        "            \"evaluation_table\": [\n",
        "                {{\n",
        "                \"id\": <subclaim_id>,\n",
        "                \"subclaim\": \"<text>\",\n",
        "                \"reasonable_omission\": \"<yes | no | borderline>\",\n",
        "                \"explanation\": \"<short reason>\"\n",
        "                }}\n",
        "            ],\n",
        "            \"reasonableness_score\": <0-5>,\n",
        "            \"overall_explanation\": \"<concise paragraph>\"\n",
        "            }}\n",
        "            ```\n",
        "            '''\n",
        "    return prompt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e0157715",
      "metadata": {},
      "outputs": [],
      "source": [
        "from openai import OpenAI\n",
        "\n",
        "file_path = \"/home/mshahidul/api_new.json\"\n",
        "with open(file_path, \"r\") as file:\n",
        "    api_keys = json.load(file)\n",
        "\n",
        "openai_api_key = api_keys.get(\"openai\")\n",
        "\n",
        "client = OpenAI(api_key=openai_api_key)\n",
        "def openai_return(prompt):\n",
        "    response = client.chat.completions.create(\n",
        "        model=\"gpt-5-mini\",\n",
        "        messages=[\n",
        "            {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
        "            {\"role\": \"user\", \"content\": prompt}\n",
        "        ]\n",
        "    )\n",
        "    cleaned_response = response.choices[0].message.content.strip().replace(\"```json\", \"\").replace(\"```\", \"\")\n",
        "    return json.loads(cleaned_response)"
      ]
    },
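    {
      "cell_type": "markdown",
      "id": "a3d7f4c2",
      "metadata": {},
      "source": [
        "`openai_return` assumes the model always emits parseable JSON; a single `json.JSONDecodeError` would crash the long annotation loop below. A small retry wrapper, assuming a fixed retry count is acceptable (this is an addition, not part of the original pipeline):"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b8e2a5d6",
      "metadata": {},
      "outputs": [],
      "source": [
        "def openai_return_safe(prompt, retries=3):\n",
        "    # Hedged addition: retry on malformed JSON instead of crashing the loop.\n",
        "    for attempt in range(retries):\n",
        "        try:\n",
        "            return openai_return(prompt)\n",
        "        except json.JSONDecodeError:\n",
        "            print(f\"Malformed JSON from model (attempt {attempt + 1}/{retries}), retrying...\")\n",
        "    return None  # callers should skip entries that never parsed"
      ]
    },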
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8469089e",
      "metadata": {},
      "outputs": [],
      "source": [
        "import json\n",
        "file_path = \"/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json\"\n",
        "\n",
        "with open(file_path, 'r') as f:\n",
        "    synthetic_data = json.load(f)\n",
        "\n",
        "synthetic_data[0].keys()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e878c58e",
      "metadata": {},
      "outputs": [],
      "source": [
        "file_path_qwen3_32B = \"/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json\"\n",
        "\n",
        "with open(file_path_qwen3_32B, 'r') as f:\n",
        "    qwen3_32B_results = json.load(f)\n",
        "\n",
        "# print(qwen3_32B_results[0]['completeness']['results'])\n",
        "print(qwen3_32B_results[0].keys())\n",
        "print(qwen3_32B_results[0]['completeness']['results'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c7306023",
      "metadata": {},
      "outputs": [],
      "source": [
        "# dict_keys(['id', 'full_text', 'ref_summary', 'readability_versions'])\n",
        "# print(f\"Full text: {synthetic_data[0]['full_text']}\")\n",
        "res=[]\n",
        "save_path = \"/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5.json\"\n",
        "if os.path.exists(save_path):\n",
        "    with open(save_path, 'r') as f:\n",
        "        res = json.load(f)\n",
        "print(f\"Resuming from {len(res)} entries\")\n",
        "import tqdm\n",
        "for ind in tqdm.tqdm(range(0,100)):\n",
        "    for version in [\"easy\", \"intermediate\", \"hard\"]:\n",
        "        ref_summary = (f\"{synthetic_data[ind]['ref_summary']['text']}\")\n",
        "        generated_summary = (f\"{synthetic_data[ind]['readability_versions'][version]['text']}\")\n",
        "        subclaims_results = (f\"{qwen3_32B_results[ind]['completeness']['results']}\")\n",
        "        prompt = return_promptst(ref_summary, generated_summary, subclaims_results, version)\n",
        "        res.append({\n",
        "            \"id\": synthetic_data[ind]['id'],\n",
        "            \"difficulty_level\": version,\n",
        "            \"prompt\": openai_return(prompt)\n",
        "        })\n",
        "        if len(res)%2==0:\n",
        "            print(f\"Completed {len(res)} out of 300\")\n",
        "            with open(save_path, 'w') as outfile:\n",
        "                json.dump(res, outfile, indent=2)\n",
        "        # print(prompt)\n",
        "        # assert False\n",
        "with open(save_path, 'w') as outfile:\n",
        "    json.dump(res, outfile, indent=2)"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "62975fd6",
      "metadata": {},
      "source": [
        "# updated statistics"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "c837c69c",
      "metadata": {},
      "outputs": [],
      "source": [
        "resonability_data[0].keys()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "9a1e45ee",
      "metadata": {},
      "outputs": [],
      "source": [
        "resonability_data[0]['prompt'].keys()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "b152d3d6",
      "metadata": {},
      "outputs": [],
      "source": [
        "import json\n",
        "with open('/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5.json', 'r') as f:\n",
        "    resonability_data = json.load(f)\n",
        "dict1={}\n",
        "for item in resonability_data:\n",
        "    for eval in item['prompt']['evaluation_table']:\n",
        "        dict1[(item['id'], item['difficulty_level'], eval['id'])]= 0 if eval['reasonable_omission']==\"no\" else 1"
      ]
    },
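    {
      "cell_type": "markdown",
      "id": "5e6f7a8b",
      "metadata": {},
      "source": [
        "A quick tabulation of how often an omission was judged reasonable at each difficulty level, using the `reasonable_omission` map built above:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6f7a8b9c",
      "metadata": {},
      "outputs": [],
      "source": [
        "from collections import defaultdict\n",
        "\n",
        "by_level = defaultdict(list)\n",
        "for (_doc_id, level, _subclaim_id), flag in reasonable_omission.items():\n",
        "    by_level[level].append(flag)\n",
        "for level, flags in by_level.items():\n",
        "    print(f\"{level}: {sum(flags)}/{len(flags)} omissions judged reasonable ({100 * sum(flags) / len(flags):.1f}%)\")"
      ]
    },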
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "360e5539",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "file_path_qwen3_32B = \"/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json\"\n",
        "\n",
        "with open(file_path_qwen3_32B, 'r') as f:\n",
        "    qwen3_32B_results = json.load(f)\n",
        "success=0\n",
        "acc=0\n",
        "success_full=[]\n",
        "for item in qwen3_32B_results:\n",
        "    success=0\n",
        "    total=0\n",
        "    for eval in item['completeness']['results']:\n",
        "        key = (item['id'], item['version'], eval['subclaim']['id'])\n",
        "        if eval.get('result')!=None:\n",
        "            total+=1\n",
        "            if eval['result']==\"1\":\n",
        "                success+=1\n",
        "            elif dict1.get(key)!=None:\n",
        "                success+=dict1.get(key)\n",
        "    success_full.append({\n",
        "        \"id\": item['id'],\n",
        "        \"version\": item['version'],\n",
        "        \"total_subclaims\": len(item['completeness']['results']),\n",
        "        \"successful_subclaims\": success/total\n",
        "    })"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7ee44884",
      "metadata": {
        "vscode": {
          "languageId": "ruby"
        }
      },
      "outputs": [],
      "source": [
        "success_full"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "93019187",
      "metadata": {},
      "outputs": [],
      "source": [
        "label_accuracy = {}\n",
        "for version in [\"easy\", \"intermediate\", \"hard\"]:\n",
        "    for item in success_full:\n",
        "        if item['version'] == version:\n",
        "            label_accuracy[version] = label_accuracy.get(version, 0) + item['successful_subclaims']\n",
        "for version in label_accuracy:\n",
        "    label_accuracy[version] = label_accuracy[version] / (100) \n",
        "    print(f\"{version}: {label_accuracy[version]*100:.2f}%\") \n"
      ]
    },
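    {
      "cell_type": "markdown",
      "id": "7a8b9c0d",
      "metadata": {},
      "source": [
        "Beyond the per-version mean, a dispersion check helps spot documents that drag a version down. A pandas sketch over `success_full` from above:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8b9c0d1e",
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "df_success = pd.DataFrame(success_full)\n",
        "summary = df_success.groupby('version')['successful_subclaims'].describe()\n",
        "print(summary[['count', 'mean', 'std', 'min', 'max']])"
      ]
    },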
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6f4e15a0",
      "metadata": {},
      "outputs": [],
      "source": [
        "import json\n",
        "\n",
        "file_path = \"/home/mshahidul/LLM_guard/data/synthetic_best_ans_selection_qwen25-32B.json\"\n",
        "\n",
        "with open(file_path, 'r') as f:\n",
        "    synthetic_best_ans_data = json.load(f)\n",
        "\n",
        "print(synthetic_best_ans_data[3])  # Print the first entry for inspection"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "947b453d",
      "metadata": {},
      "outputs": [],
      "source": [
        "# /home/mshahidul/readctrl/data/raw_data/en_test/multiclinsum_test_en/fulltext read\n",
        "import os\n",
        "all_data = []\n",
        "lang=\"pt\"\n",
        "for path in os.listdir(f'/home/mshahidul/readctrl/data/raw_data/{lang}_test/multiclinsum_test_{lang}/fulltext'):\n",
        "        with open(os.path.join(f'/home/mshahidul/readctrl/data/raw_data/{lang}_test/multiclinsum_test_{lang}/fulltext', path), 'r') as f:\n",
        "            fulltext = f.read()\n",
        "        path2=path.replace(f\"_{lang}\", f\"_{lang}_sum\")\n",
        "        with open(os.path.join(f'/home/mshahidul/readctrl/data/raw_data/{lang}_test/multiclinsum_test_{lang}/summaries', path2), 'r') as f:\n",
        "            summary = f.read()\n",
        "        all_data.append({\n",
        "            \"id\": path,\n",
        "            \"fulltext\": fulltext,\n",
        "            \"summary\": summary\n",
        "        })       \n",
        "with open(f'/home/mshahidul/readctrl/data/processed_raw_data/multiclinsum_test_{lang}.json', 'w') as outfile:\n",
        "    json.dump(all_data, outfile, indent=2) \n"
      ]
    },
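    {
      "cell_type": "markdown",
      "id": "9c0d1e2f",
      "metadata": {},
      "source": [
        "Since the summary filename is derived by string replacement, a single mismatch raises FileNotFoundError partway through the loop. Checking that the two directories line up first is cheap (a sketch; assumes `base` and `lang` from the cell above):"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0d1e2f3a",
      "metadata": {},
      "outputs": [],
      "source": [
        "fulltext_files = set(os.listdir(os.path.join(base, 'fulltext')))\n",
        "summary_files = set(os.listdir(os.path.join(base, 'summaries')))\n",
        "expected = {name.replace(f\"_{lang}\", f\"_{lang}_sum\") for name in fulltext_files}\n",
        "missing = expected - summary_files\n",
        "extra = summary_files - expected\n",
        "print(f\"{len(fulltext_files)} fulltexts, {len(summary_files)} summaries, {len(missing)} missing, {len(extra)} unmatched\")"
      ]
    },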
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "bb375fa2",
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import json\n",
        "\n",
        "# Load your data\n",
        "with open('/home/mshahidul/readctrl/data/classified_readability/classified_multiclinsum_test_en.json', 'r') as f:\n",
        "    data = json.load(f)\n",
        "\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# Define the bins and labels for Option 1\n",
        "# Bins: 0-2 (Easy), 2-3 (Medium), 3-5 (Hard)\n",
        "bins = [0, 2, 3, 5]\n",
        "labels = ['Easy', 'Medium', 'Hard']\n",
        "\n",
        "df['readability_level'] = pd.cut(df['readability_score'], bins=bins, labels=labels)\n",
        "\n",
        "print(df[['readability_score', 'readability_level']].head())"
      ]
    },
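    {
      "cell_type": "markdown",
      "id": "1e2f3a4b",
      "metadata": {},
      "source": [
        "The interval edges matter here: with right-closed bins a score of exactly 2 lands in Easy and 3 in Medium. A tiny demonstration on synthetic scores:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "2f3a4b5c",
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "\n",
        "demo = pd.Series([0, 1, 2, 3, 4, 5])\n",
        "# (0, 2] -> Easy, (2, 3] -> Medium, (3, 5] -> Hard; include_lowest keeps 0 in Easy\n",
        "print(pd.cut(demo, bins=[0, 2, 3, 5], labels=['Easy', 'Medium', 'Hard'], include_lowest=True))"
      ]
    },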
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "782de099",
      "metadata": {},
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import json\n",
        "\n",
        "# 1. Load the dataset\n",
        "# Update the filename if it is in your current directory\n",
        "file_path = '/home/mshahidul/readctrl/data/classified_readability/classified_multiclinsum_test_en.json' \n",
        "\n",
        "with open(file_path, 'r') as f:\n",
        "    data = json.load(f)\n",
        "\n",
        "df = pd.DataFrame(data)\n",
        "\n",
        "# 2. Inspect the current distribution to decide on the best strategy\n",
        "print(\"Current Score Distribution:\")\n",
        "print(df['readability_score'].value_counts().sort_index())\n",
        "\n",
        "# 3. Apply the Balanced Split (Strategy 1)\n",
        "def categorize_readability(score):\n",
        "    if score <= 2:\n",
        "        return 'Easy'\n",
        "    elif score == 3:\n",
        "        return 'Medium'\n",
        "    else:\n",
        "        return 'Hard'\n",
        "\n",
        "df['readability_type'] = df['readability_score'].apply(categorize_readability)\n",
        "\n",
        "# 4. Save the results\n",
        "df.to_csv('classified_readability_results.csv', index=False)\n",
        "print(\"\\nTransformation complete. New categories:\")\n",
        "print(df['readability_type'].value_counts())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e4e582a2",
      "metadata": {},
      "outputs": [],
      "source": [
        "python /home/mshahidul/readctrl/code/finetune-inference/inference_extract_subclaims_v3.py --input_file /home/mshahidul/readctrl/data/classified_readability/classified_multiclinsum_test_en.json"
      ]
    },
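    {
      "cell_type": "markdown",
      "id": "3a4b5c6d",
      "metadata": {},
      "source": [
        "If the extraction script needs to run from plain Python instead of a notebook shell escape (e.g. in a loop over several input files), `subprocess.run` is the equivalent:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4b5c6d7e",
      "metadata": {},
      "outputs": [],
      "source": [
        "import subprocess\n",
        "\n",
        "cmd = [\n",
        "    \"python\",\n",
        "    \"/home/mshahidul/readctrl/code/finetune-inference/inference_extract_subclaims_v3.py\",\n",
        "    \"--input_file\",\n",
        "    \"/home/mshahidul/readctrl/data/classified_readability/classified_multiclinsum_test_en.json\",\n",
        "]\n",
        "result = subprocess.run(cmd, capture_output=True, text=True)\n",
        "print(result.stdout[-2000:])  # tail of the script's output\n",
        "if result.returncode != 0:\n",
        "    print(result.stderr[-2000:])"
      ]
    },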
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "7c2df145",
      "metadata": {},
      "outputs": [],
      "source": [
        "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_classified_multiclinsum_test_en_en.json read\n",
        "with open('/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_0_100_qwen3-32B.json', 'r') as f:\n",
        "    extracted_subclaims_data = json.load(f)\n",
        "# print(len(extracted_subclaims_data))\n",
        "print(extracted_subclaims_data[0]['subclaim_evaluations'][0].keys())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "id": "fdd516da",
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "dict_keys(['index', 'id', 'fulltext', 'fulltext_subclaims', 'summary', 'summary_subclaims', 'diff_label_texts', 'diff_label_subclaims', 'readability_score'])\n",
            "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])\n",
            "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])\n"
          ]
        }
      ],
      "source": [
        "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\n",
        "import json\n",
        "with open('/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json', 'r') as f:\n",
        "    extracted_subclaims_syn_data = json.load(f)\n",
        "print(extracted_subclaims_syn_data[0].keys())\n",
        "print(extracted_subclaims_syn_data[0]['diff_label_texts'].keys())\n",
        "print(extracted_subclaims_syn_data[0]['diff_label_subclaims'].keys())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "id": "f2771312",
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "dict_keys(['index', 'literacy_levels'])\n",
            "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])\n",
            "dict_keys(['scores', 'details'])\n",
            "dict_keys(['factual_attribution', 'completeness', 'conciseness', 'source_coverage'])\n",
            "dict_keys(['attribution', 'completeness', 'conciseness', 'source_coverage'])\n",
            "dict_keys(['source_subclaim', 'status'])\n"
          ]
        }
      ],
      "source": [
        "with open('/home/mshahidul/readctrl/data/factual_testing/full_details_evaluation_0_20_qwen3-32B_v2.json', 'r') as f:\n",
        "    full_details_evaluation_data = json.load(f)\n",
        "print(full_details_evaluation_data[0].keys())\n",
        "print(full_details_evaluation_data[0]['literacy_levels'].keys())\n",
        "print(full_details_evaluation_data[0]['literacy_levels']['low_health_literacy'].keys())\n",
        "print(full_details_evaluation_data[0]['literacy_levels']['low_health_literacy']['scores'].keys())\n",
        "print(full_details_evaluation_data[0]['literacy_levels']['low_health_literacy']['details'].keys())\n",
        "print(full_details_evaluation_data[0]['literacy_levels']['low_health_literacy']['details']['source_coverage'][0].keys())"
      ]
    },
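    {
      "cell_type": "markdown",
      "id": "5c6d7e8f",
      "metadata": {},
      "source": [
        "With the nested layout above, mean evaluation scores per literacy level are a short aggregation away. A sketch, assuming each `scores` entry holds numeric values for the four metrics printed above:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "6d7e8f9a",
      "metadata": {},
      "outputs": [],
      "source": [
        "from collections import defaultdict\n",
        "\n",
        "# Sum and count each metric per literacy level, then report the means.\n",
        "# Assumes numeric scores; non-numeric values would need parsing first.\n",
        "metric_sums = defaultdict(lambda: defaultdict(float))\n",
        "metric_counts = defaultdict(lambda: defaultdict(int))\n",
        "for item in full_details_evaluation_data:\n",
        "    for level, payload in item['literacy_levels'].items():\n",
        "        for metric, value in payload['scores'].items():\n",
        "            metric_sums[level][metric] += float(value)\n",
        "            metric_counts[level][metric] += 1\n",
        "for level, metrics in metric_sums.items():\n",
        "    means = {m: round(metric_sums[level][m] / metric_counts[level][m], 3) for m in metrics}\n",
        "    print(level, means)"
      ]
    }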
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "un",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.14"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}