{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "311c1d16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /home/mshahidul/readctrl/data/annotators_validate_data\n",
    "import os\n",
    "print(os.listdir('/home/mshahidul/readctrl/data/annotators_validate_data')[:3])\n",
    "all_folders = os.listdir('/home/mshahidul/readctrl/data/annotators_validate_data')\n",
    "print(os.listdir(f'/home/mshahidul/readctrl/data/annotators_validate_data/{all_folders[0]}'))\n",
    "file_path = f'/home/mshahidul/readctrl/data/annotators_validate_data/{all_folders[0]}/annotation_results.json'\n",
    "import json\n",
    "with open(file_path, 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(data[0].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea2f9f3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "(all_folders)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08ae6eaa",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "\n",
    "# Configuration\n",
    "input_dir = '/home/mshahidul/readctrl/data/annotators_validate_data'\n",
    "output_dir = '/home/mshahidul/readctrl/data/final_result'\n",
    "output_file = os.path.join(output_dir, 'consolidated_ratings.json')\n",
    "\n",
    "# 1. Create the output directory if it doesn't exist\n",
    "if not os.path.exists(output_dir):\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "all_data = []\n",
    "\n",
    "# 2. Collect data from all folders\n",
    "folders = [f for f in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, f))]\n",
    "avg = []\n",
    "for folder in folders:\n",
    "    json_path = os.path.join(input_dir, folder, 'annotation_results.json')\n",
    "    if os.path.exists(json_path):\n",
    "        with open(json_path, 'r') as f:\n",
    "            try:\n",
    "                entries = json.load(f)\n",
    "                if len(entries) <=3 :\n",
    "                    # print(f\"No entries found in {json_path}, skipping.\")\n",
    "                    avg.append(len(entries))\n",
    "                avg\n",
    "                for item in entries:\n",
    "                    all_data.append({\n",
    "                        'doc_id': item.get('doc_id'),\n",
    "                        'health_literacy_label': item.get('health_literacy_label'),\n",
    "                        'rating': item.get('doc_rating')\n",
    "                    })\n",
    "            except Exception as e:\n",
    "                print(f\"Skipping error in {json_path}: {e}\")\n",
    "\n",
    "# 3. Process data\n",
    "df = pd.DataFrame(all_data)\n",
    "\n",
    "# Ensure we drop rows where any of our keys or the rating are missing\n",
    "df = df.dropna(subset=['doc_id', 'health_literacy_label', 'rating'])\n",
    "\n",
    "# 4. Aggregation Logic using both doc_id and health_literacy_label\n",
    "def get_mode(series):\n",
    "    # Returns the most common rating for this specific doc + literacy level\n",
    "    return Counter(series).most_common(1)[0][0]\n",
    "\n",
    "# Grouping by the composite key\n",
    "summary = df.groupby(['doc_id', 'health_literacy_label'])['rating'].agg([\n",
    "    ('num_annotations', 'count'),\n",
    "    ('mean_rating', 'mean'),\n",
    "    ('consensus_rating', get_mode),\n",
    "    ('rating_distribution', lambda x: list(x))\n",
    "]).reset_index()\n",
    "\n",
    "# 5. Save to JSON\n",
    "# orient='records' creates a list of dictionaries\n",
    "summary.to_json(output_file, orient='records', indent=4)\n",
    "\n",
    "print(f\"Success! Processed {len(summary)} unique (doc_id, literacy_label) pairs.\")\n",
    "print(f\"File saved at: {output_file}\")\n",
    "\n",
    "# Preview the first few entries\n",
    "print(summary.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75197961",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import pandas as pd\n",
    "from collections import Counter\n",
    "\n",
    "# Configuration\n",
    "input_dir = '/home/mshahidul/readctrl/data/annotators_validate_data'\n",
    "output_dir = '/home/mshahidul/readctrl/data/final_result'\n",
    "output_file_match = os.path.join(output_dir, 'consolidated_ratings.json')\n",
    "output_file_mismatch = os.path.join(output_dir, 'mismatched_ratings.json')\n",
    "\n",
    "if not os.path.exists(output_dir):\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "all_data = []\n",
    "folders = [f for f in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, f))]\n",
    "\n",
    "# 1. Collect data\n",
    "for folder in folders:\n",
    "    json_path = os.path.join(input_dir, folder, 'annotation_results.json')\n",
    "    if os.path.exists(json_path):\n",
    "        with open(json_path, 'r') as f:\n",
    "            try:\n",
    "                entries = json.load(f)\n",
    "                for item in entries:\n",
    "                    all_data.append({\n",
    "                        'doc_id': item.get('doc_id'),\n",
    "                        'health_literacy_label': item.get('health_literacy_label'),\n",
    "                        'rating': item.get('doc_rating')\n",
    "                    })\n",
    "            except Exception as e:\n",
    "                print(f\"Skipping error in {json_path}: {e}\")\n",
    "\n",
    "df = pd.DataFrame(all_data).dropna(subset=['doc_id', 'health_literacy_label', 'rating'])\n",
    "\n",
    "# 2. Aggregation Logic\n",
    "def get_mode(series):\n",
    "    return Counter(series).most_common(1)[0][0]\n",
    "\n",
    "summary = df.groupby(['doc_id', 'health_literacy_label'])['rating'].agg([\n",
    "    ('num_annotations', 'count'),\n",
    "    ('mean_rating', 'mean'),\n",
    "    ('consensus_rating', get_mode),\n",
    "    ('rating_distribution', lambda x: list(x))\n",
    "]).reset_index()\n",
    "\n",
    "# 3. Validation Logic\n",
    "def check_match(row):\n",
    "    label = row['health_literacy_label']\n",
    "    rating = row['consensus_rating']\n",
    "    \n",
    "    if label == \"low_health_literacy\":\n",
    "        return rating in [1, 2]\n",
    "    elif label == \"intermediate_health_literacy\":\n",
    "        return rating == 3\n",
    "    elif label == \"proficient_health_literacy\":\n",
    "        return rating in [4, 5]\n",
    "    return False\n",
    "\n",
    "# Apply the check\n",
    "summary['is_match'] = summary.apply(check_match, axis=1)\n",
    "\n",
    "# 4. Split and Save\n",
    "matches = summary[summary['is_match'] == True].drop(columns=['is_match'])\n",
    "mismatches = summary[summary['is_match'] == False].drop(columns=['is_match'])\n",
    "\n",
    "matches.to_json(output_file_match, orient='records', indent=4)\n",
    "mismatches.to_json(output_file_mismatch, orient='records', indent=4)\n",
    "\n",
    "print(f\"Success!\")\n",
    "print(f\"Matching entries saved: {len(matches)} -> {output_file_match}\")\n",
    "print(f\"Mismatched entries saved: {len(mismatches)} -> {output_file_mismatch}\")\n",
    "\n",
    "if not mismatches.empty:\n",
    "    print(\"\\nPreview of Mismatches:\")\n",
    "    print(mismatches.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8773257",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f1f1045",
   "metadata": {},
   "outputs": [],
   "source": [
    "min(avg), max(avg), sum(avg)/len(avg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "877ebaac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "# 1. Load your consolidated JSON file\n",
    "file_path = '/home/mshahidul/readctrl/data/final_result/consolidated_ratings.json'\n",
    "with open(file_path, 'r') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "df = pd.DataFrame(data)\n",
    "\n",
    "# 2. Define the \"OK\" logic function\n",
    "def check_if_ok(row):\n",
    "    label = str(row['health_literacy_label']).lower()\n",
    "    rating = row['consensus_rating']\n",
    "    \n",
    "    if label == 'low_health_literacy':\n",
    "        return 1 if rating in [1, 2] else 0\n",
    "    elif label == 'intermediate_health_literacy':\n",
    "        return 1 if rating == 3 else 0\n",
    "    elif label == 'proficient_health_literacy':\n",
    "        return 1 if rating in [4, 5] else 0\n",
    "    return 0\n",
    "\n",
    "# 3. Apply logic and calculate stats\n",
    "df['is_ok'] = df.apply(check_if_ok, axis=1)\n",
    "\n",
    "# Group by literacy label to see performance\n",
    "stats = df.groupby('health_literacy_label')['is_ok'].agg(['count', 'sum']).reset_index()\n",
    "stats.columns = ['Literacy Level', 'Total Docs', 'Number OK']\n",
    "stats['Success Rate (%)'] = (stats['Number OK'] / stats['Total Docs'] * 100).round(2)\n",
    "\n",
    "print(\"--- Accuracy / Success Report ---\")\n",
    "print(stats)\n",
    "\n",
    "# 4. Total overall success\n",
    "total_docs = len(df)\n",
    "total_ok = df['is_ok'].sum()\n",
    "print(f\"\\nOverall Summary: {total_ok}/{total_docs} documents meet the literacy criteria ({round(total_ok/total_docs*100, 2)}%)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "065399a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"/home/mshahidul/readctrl/data/annotators_validate_data/Sharmin Sultana_2025-12-31_14-19-30/annotation_results.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(data[0].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5835ec3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import math\n",
    "\n",
    "# Define paths\n",
    "input_path = \"/home/mshahidul/readctrl/data/annotators_validate_data/Sharmin Sultana_2025-12-31_14-19-30/annotation_results.json\"\n",
    "output_path = \"/home/mshahidul/readctrl/data/annotators_validate_data/Sharmin Sultana_2025-12-31_14-19-30/annotation_results_rescaled.json\"\n",
    "\n",
    "def rescale_rating(val):\n",
    "    if val is None:\n",
    "        return None\n",
    "    # Converts 1-10 to 1-5 (e.g., 10 becomes 5, 1 becomes 1)\n",
    "    return math.ceil(val / 2)\n",
    "\n",
    "# Load data\n",
    "with open(input_path, 'r') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "# Process ratings\n",
    "for entry in data:\n",
    "    if 'doc_rating' in entry:\n",
    "        entry['doc_rating'] = rescale_rating(entry['doc_rating'])\n",
    "    if 'wiki_rating' in entry:\n",
    "        entry['wiki_rating'] = rescale_rating(entry['wiki_rating'])\n",
    "\n",
    "# Save updated data\n",
    "with open(output_path, 'w') as f:\n",
    "    json.dump(data, f, indent=4)\n",
    "\n",
    "print(f\"Successfully saved rescaled data to: {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5865b65",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de9b3b2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /home/mshahidul/readctrl/data/final_result/mismatched_ratings.json\n",
    "with open(\"/home/mshahidul/readctrl/data/final_result/mismatched_ratings.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "id=0\n",
    "index=data[id]['doc_id']\n",
    "label=data[id]['health_literacy_label']\n",
    "print(data[id])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e307543c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\n",
    "with open(\"/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\", 'r') as f:\n",
    "    data2 = json.load(f)\n",
    "src_lang=\"English\"\n",
    "summary=data2[index]['summary']\n",
    "fulltext=data2[index]['fulltext']\n",
    "gen_summary=data2[index]['diff_label_texts'][label]\n",
    "f=open(\"/home/mshahidul/readctrl/prompts/syn_data_gen_diff_label.txt\",\"r\").read()\n",
    "txt=f.replace(\"<<<SOURCE_LANGUAGE>>>\",src_lang).replace(\"<<<GOLD_SUMMARY>>>\",summary).replace(\"<<<FULL_TEXT>>>\",fulltext)\n",
    "print(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59c72954",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(gen_summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b2a2595",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /home/mshahidul/readctrl/data/final_result/consolidated_ratings_edit.json\n",
    "import json\n",
    "with open(\"/home/mshahidul/readctrl/data/final_result/consolidated_ratings_edit.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(data[0].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46f54097",
   "metadata": {},
   "outputs": [],
   "source": [
    "set([x[\"health_literacy_label\"] for x in data])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b1264ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\n",
    "with open(\"/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\", 'r') as f:\n",
    "    data2 = json.load(f)\n",
    "print(data2[0].keys())\n",
    "print(data2[0]['diff_label_texts'].keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d847270d",
   "metadata": {},
   "source": [
    "## Step 0: Prepare Your Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44047dbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# 1. Load the datasets\n",
    "with open(\"/home/mshahidul/readctrl/data/final_result/consolidated_ratings_edit.json\", 'r') as f:\n",
    "    ratings_data = json.load(f)\n",
    "ratings_data=ratings_data[7:]\n",
    "with open(\"/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\", 'r') as f:\n",
    "    text_data = json.load(f)\n",
    "\n",
    "# 2. Updated mapping: Store the whole item or specific keys for fulltext and summary\n",
    "# We map the index to a dictionary containing the variations and the original full text/summary\n",
    "text_map = {\n",
    "    item['index']: {\n",
    "        'variations': item['diff_label_texts'],\n",
    "        'fulltext': item.get('fulltext', \"\"),\n",
    "        'summary': item.get('summary', \"\")\n",
    "    } \n",
    "    for item in text_data\n",
    "}\n",
    "\n",
    "cleaned_data = []\n",
    "\n",
    "# 3. Iterate through ratings and extract data\n",
    "for entry in ratings_data:\n",
    "    doc_id = entry['doc_id']\n",
    "    label = entry['health_literacy_label']\n",
    "    \n",
    "    if doc_id in text_map:\n",
    "        source_info = text_map[doc_id]\n",
    "        \n",
    "        # Retrieve the specific text version based on the label\n",
    "        # .get() handles cases where a specific label might be missing\n",
    "        labeled_text = source_info['variations'].get(label, \"\")\n",
    "        \n",
    "        # Construct the expanded object\n",
    "        cleaned_data.append({\n",
    "            \"doc_id\": doc_id,\n",
    "            \"label\": label,\n",
    "            \"gen_text\": labeled_text,\n",
    "            \"fulltext\": source_info['fulltext'],\n",
    "            \"gs_summary\": source_info['summary']\n",
    "        })\n",
    "\n",
    "# 4. Output the clean JSON\n",
    "output_path = \"/home/mshahidul/readctrl/data/new_exp/cleaned_health_literacy_data.json\"\n",
    "with open(output_path, 'w') as f:\n",
    "    json.dump(cleaned_data, f, indent=4, ensure_ascii=False)\n",
    "\n",
    "print(f\"Successfully processed {len(cleaned_data)} examples.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1e6b0ae",
   "metadata": {},
   "source": [
    "## Step 1: Pick Few-Shot Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71e83ac8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import requests\n",
    "from collections import defaultdict\n",
    "\n",
    "# Configuration\n",
    "API_URL = \"http://172.16.34.29:8004/v1/chat/completions\"\n",
    "MODEL_NAME = \"Qwen/Qwen3-30B-A3B-Instruct-2507\"\n",
    "INPUT_FILE = \"/home/mshahidul/readctrl/data/new_exp/cleaned_health_literacy_data.json\"\n",
    "OUTPUT_FILE = \"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\"\n",
    "\n",
    "def get_text_metadata(text):\n",
    "    \"\"\"Ask the LLM to identify the topic and medical complexity of a text.\"\"\"\n",
    "    prompt = f\"\"\"Analyze the following medical text and provide a 1-word topic (e.g., Cardiology, Nutrition, Medication) and a 1-word complexity level (Simple, Moderate, Technical).\n",
    "    Text: {text}...\n",
    "    Format: Topic | Complexity\"\"\"\n",
    "    \n",
    "    try:\n",
    "        response = requests.post(API_URL, json={\n",
    "            \"model\": MODEL_NAME,\n",
    "            \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
    "            \"temperature\": 0.1\n",
    "        })\n",
    "        return response.json()['choices'][0]['message']['content'].strip()\n",
    "    except:\n",
    "        return \"General | Unknown\"\n",
    "\n",
    "# 1. Load the cleaned data\n",
    "with open(INPUT_FILE, 'r') as f:\n",
    "    data = json.load(f)\n",
    "\n",
    "# 2. Group data by label\n",
    "grouped_data = defaultdict(list)\n",
    "for item in data:\n",
    "    grouped_data[item['label']].append(item)\n",
    "\n",
    "# 3. Select diverse examples for each label\n",
    "few_shot_selection = {}\n",
    "\n",
    "for label, examples in grouped_data.items():\n",
    "    print(f\"Processing label: {label}...\")\n",
    "    \n",
    "    # Analyze a subset (or all) to find diversity\n",
    "    scored_examples = []\n",
    "    for ex in examples: \n",
    "        metadata = get_text_metadata(ex['gen_text'])\n",
    "        ex['metadata'] = metadata\n",
    "        scored_examples.append(ex)\n",
    "    \n",
    "    # Heuristic: Sort by metadata to group similar topics, then pick spread-out indices\n",
    "    scored_examples.sort(key=lambda x: x['metadata'])\n",
    "    \n",
    "    # Pick 5 examples spread across the sorted metadata for maximum diversity\n",
    "    step = max(1, len(scored_examples) // 5)\n",
    "    selected = scored_examples[::step][:5]\n",
    "    few_shot_selection[label] = selected\n",
    "\n",
    "# 4. Save the result\n",
    "with open(OUTPUT_FILE, 'w') as f:\n",
    "    json.dump(few_shot_selection, f, indent=4)\n",
    "\n",
    "print(f\"Few-shot examples saved to: {OUTPUT_FILE}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d48720a6",
   "metadata": {},
   "source": [
    "## Step 2: Decide on LLM(s)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4396ac94",
   "metadata": {},
   "source": [
    "### V1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f96d976b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import requests\n",
    "\n",
    "# Configuration\n",
    "API_URL = \"http://172.16.34.29:8004/v1/chat/completions\"\n",
    "MODEL_NAME = \"Qwen/Qwen3-30B-A3B-Instruct-2507\"\n",
    "FEW_SHOT_FILE = \"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\"\n",
    "\n",
    "# 1. Load the 15 selected examples\n",
    "with open(FEW_SHOT_FILE, 'r') as f:\n",
    "    few_shot_data = json.load(f)\n",
    "\n",
    "def get_reasoning(fulltext, gen_text, label):\n",
    "    \"\"\"Ask the LLM to explain why the text fits the label compared to the source context.\"\"\"\n",
    "    prompt = f\"\"\"Compare the 'Target Text' to the 'Original Fulltext'. \n",
    "Explain why the Target Text fits the health literacy label: {label}.\n",
    "Focus on how vocabulary, jargon, and sentence structure were adapted.\n",
    "\n",
    "Original Fulltext: {fulltext}\n",
    "Target Text: {gen_text}\n",
    "Label: {label}\n",
    "\n",
    "Reasoning (1-2 sentences):\"\"\"\n",
    "    \n",
    "    try:\n",
    "        response = requests.post(API_URL, json={\n",
    "            \"model\": MODEL_NAME,\n",
    "            \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
    "            \"temperature\": 0\n",
    "        })\n",
    "        return response.json()['choices'][0]['message']['content'].strip()\n",
    "    except Exception as e:\n",
    "        return \"Reasoning could not be generated.\"\n",
    "\n",
    "# 2. Build the few-shot string\n",
    "few_shot_string = \"\"\n",
    "\n",
    "for label in [\"low_health_literacy\", \"intermediate_health_literacy\", \"proficient_health_literacy\"]:\n",
    "    examples = few_shot_data.get(label, [])\n",
    "    for ex in examples:\n",
    "        # Pass fulltext to the reasoning generator\n",
    "        reason = get_reasoning(ex.get('fulltext', \"\"), ex['gen_text'], label)\n",
    "        \n",
    "        few_shot_string += f\"Original Fulltext: \\\"{ex.get('fulltext', '')}\\\"\\n\"\n",
    "        few_shot_string += f\"Target Text: \\\"{ex['gen_text']}\\\"\\n\"\n",
    "        few_shot_string += f\"Reasoning: {reason}\\n\"\n",
    "        few_shot_string += f\"Label: {label}\\n\"\n",
    "        few_shot_string += \"-\" * 30 + \"\\n\"\n",
    "\n",
    "# 3. Define the Final Prompt Structure\n",
    "instruction = \"\"\"You are an expert in health communication. Your task is to judge the health literacy level of a target text based on its original medical source.\n",
    "\n",
    "Classify the text into one of three categories:\n",
    "1. low_health_literacy: Uses common words (everyday language), very short sentences, and eliminates all medical jargon.\n",
    "2. intermediate_health_literacy: Uses some medical terms with explanation, standard sentence length, requires basic health knowledge.\n",
    "3. proficient_health_literacy: Uses high-level medical jargon, technical language, and academic or professional structures.\n",
    "\n",
    "### Few-Shot Examples:\n",
    "\"\"\"\n",
    "\n",
    "# 4. Save the prompt template\n",
    "# The placeholder now expects both fulltext and input_text\n",
    "final_prompt_template = (\n",
    "    instruction + \n",
    "    few_shot_string + \n",
    "    \"\\n### Now judge this text:\\n\"\n",
    "    \"Original Fulltext: \\\"{fulltext}\\\"\\n\"\n",
    "    \"Target Text: \\\"{input_text}\\\"\\n\"\n",
    "    \"Reasoning:\"\n",
    ")\n",
    "\n",
    "output_path = \"/home/mshahidul/readctrl/data/new_exp/final_prompt_template.txt\"\n",
    "with open(output_path, 'w') as f:\n",
    "    f.write(final_prompt_template)\n",
    "\n",
    "print(f\"Prompt template with fulltext context saved to {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3bc0564f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3396\n"
     ]
    }
   ],
   "source": [
    "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_with_gs_summary_en.json\n",
    "import json\n",
    "with open(\"/home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "882507f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['id', 'fulltext', 'fulltext_subclaims', 'summary', 'summary_subclaims'])\n",
      "3396\n"
     ]
    }
   ],
   "source": [
    "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\n",
    "import json\n",
    "with open(\"/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_multiclinsum_test_en_full.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(data[0].keys())\n",
    "print(len(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0fcc380",
   "metadata": {},
   "outputs": [],
   "source": [
    "LOCAL_API_URL = \"http://172.16.34.29:8004/v1\"\n",
    "LOCAL_MODEL_NAME = \"/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-extraction-8b_ctx_fp16\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d8b235a6",
   "metadata": {},
   "outputs": [
    {
     "ename": "JSONDecodeError",
     "evalue": "Extra data: line 2 column 1 (char 22694)",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mJSONDecodeError\u001b[39m                           Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 4\u001b[39m\n\u001b[32m      2\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mjson\u001b[39;00m\n\u001b[32m      3\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m/home/mshahidul/LLM_guard/CKA-Agent/results/single_run_20260203_213455/inter_result_sample_0.json\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m'\u001b[39m\u001b[33mr\u001b[39m\u001b[33m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m     data = \u001b[43mjson\u001b[49m\u001b[43m.\u001b[49m\u001b[43mload\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m      5\u001b[39m \u001b[38;5;28mprint\u001b[39m(data[\u001b[32m0\u001b[39m].keys())\n\u001b[32m      6\u001b[39m \u001b[38;5;28mprint\u001b[39m(data[\u001b[32m0\u001b[39m][\u001b[33m'\u001b[39m\u001b[33minter_result\u001b[39m\u001b[33m'\u001b[39m])\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/un/lib/python3.11/json/__init__.py:293\u001b[39m, in \u001b[36mload\u001b[39m\u001b[34m(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[39m\n\u001b[32m    274\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mload\u001b[39m(fp, *, \u001b[38;5;28mcls\u001b[39m=\u001b[38;5;28;01mNone\u001b[39;00m, object_hook=\u001b[38;5;28;01mNone\u001b[39;00m, parse_float=\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m    275\u001b[39m         parse_int=\u001b[38;5;28;01mNone\u001b[39;00m, parse_constant=\u001b[38;5;28;01mNone\u001b[39;00m, object_pairs_hook=\u001b[38;5;28;01mNone\u001b[39;00m, **kw):\n\u001b[32m    276\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"Deserialize ``fp`` (a ``.read()``-supporting file-like object containing\u001b[39;00m\n\u001b[32m    277\u001b[39m \u001b[33;03m    a JSON document) to a Python object.\u001b[39;00m\n\u001b[32m    278\u001b[39m \n\u001b[32m   (...)\u001b[39m\u001b[32m    291\u001b[39m \u001b[33;03m    kwarg; otherwise ``JSONDecoder`` is used.\u001b[39;00m\n\u001b[32m    292\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m293\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloads\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfp\u001b[49m\u001b[43m.\u001b[49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    294\u001b[39m \u001b[43m        \u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mcls\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_hook\u001b[49m\u001b[43m=\u001b[49m\u001b[43mobject_hook\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    295\u001b[39m \u001b[43m        \u001b[49m\u001b[43mparse_float\u001b[49m\u001b[43m=\u001b[49m\u001b[43mparse_float\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mparse_int\u001b[49m\u001b[43m=\u001b[49m\u001b[43mparse_int\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m    296\u001b[39m \u001b[43m        \u001b[49m\u001b[43mparse_constant\u001b[49m\u001b[43m=\u001b[49m\u001b[43mparse_constant\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[43m=\u001b[49m\u001b[43mobject_pairs_hook\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkw\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/un/lib/python3.11/json/__init__.py:346\u001b[39m, in \u001b[36mloads\u001b[39m\u001b[34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[39m\n\u001b[32m    341\u001b[39m     s = s.decode(detect_encoding(s), \u001b[33m'\u001b[39m\u001b[33msurrogatepass\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m    343\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m (\u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[32m    344\u001b[39m         parse_int \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m parse_float \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m\n\u001b[32m    345\u001b[39m         parse_constant \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m object_pairs_hook \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kw):\n\u001b[32m--> \u001b[39m\u001b[32m346\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_default_decoder\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdecode\u001b[49m\u001b[43m(\u001b[49m\u001b[43ms\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    347\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m    348\u001b[39m     \u001b[38;5;28mcls\u001b[39m = JSONDecoder\n",
      "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/un/lib/python3.11/json/decoder.py:340\u001b[39m, in \u001b[36mJSONDecoder.decode\u001b[39m\u001b[34m(self, s, _w)\u001b[39m\n\u001b[32m    338\u001b[39m end = _w(s, end).end()\n\u001b[32m    339\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m end != \u001b[38;5;28mlen\u001b[39m(s):\n\u001b[32m--> \u001b[39m\u001b[32m340\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m JSONDecodeError(\u001b[33m\"\u001b[39m\u001b[33mExtra data\u001b[39m\u001b[33m\"\u001b[39m, s, end)\n\u001b[32m    341\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n",
      "\u001b[31mJSONDecodeError\u001b[39m: Extra data: line 2 column 1 (char 22694)"
     ]
    }
   ],
   "source": [
    "# /home/mshahidul/LLM_guard/CKA-Agent/results/single_run_20260203_213455/inter_result_sample_0.json\n",
    "import json\n",
    "with open(\"/home/mshahidul/LLM_guard/CKA-Agent/results/single_run_20260203_213455/inter_result_sample_0.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(data[0].keys())\n",
    "print(data[0]['inter_result'])\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74b07429",
   "metadata": {},
   "source": [
    "## V2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "912f3d85",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import requests\n",
    "from openai import OpenAI\n",
    "\n",
    "# --- Configuration ---\n",
    "LOCAL_API_URL = \"http://172.16.34.29:8004/v1/chat/completions\"\n",
    "LOCAL_MODEL_NAME = \"Qwen/Qwen3-30B-A3B-Instruct-2507\"\n",
    "\n",
    "api_file = \"/home/mshahidul/api_new.json\"\n",
    "with open(api_file, \"r\") as f:\n",
    "    api_keys = json.load(f)\n",
    "\n",
    "openai_client = OpenAI(api_key=api_keys[\"openai\"])\n",
    "OPENAI_MODEL_NAME = \"gpt-5\" # Note: Ensure your model version is correct\n",
    "\n",
    "FEW_SHOT_FILE = \"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\"\n",
    "OUTPUT_PATH = \"/home/mshahidul/readctrl/data/new_exp/final_prompt_template.txt\"\n",
    "\n",
    "# --- Logic ---\n",
    "\n",
    "def get_reasoning(fulltext, gen_text, label, provider=\"local\"):\n",
    "    \"\"\"\n",
    "    Ask an LLM to explain why the text fits the label in JSON format.\n",
    "    \"\"\"\n",
    "    # Explicitly asking for JSON in the prompt\n",
    "    prompt = f\"\"\"Compare the 'Target Text' to the 'Original Fulltext'. \n",
    "Explain why the Target Text fits the health literacy label: {label}.\n",
    "Focus on how vocabulary, jargon, and sentence structure were adapted.\n",
    "\n",
    "Original Fulltext: {fulltext}\n",
    "Target Text: {gen_text}\n",
    "Label: {label}\n",
    "\n",
    "Return your response ONLY as a JSON object with the following key:\n",
    "\"reasoning\": \"your 1-2 sentence explanation\"\n",
    "\"\"\"\n",
    "\n",
    "    try:\n",
    "        if provider == \"openai\":\n",
    "            response = openai_client.chat.completions.create(\n",
    "                model=OPENAI_MODEL_NAME,\n",
    "                messages=[{\"role\": \"user\", \"content\": prompt}],\n",
    "                response_format={ \"type\": \"json_object\" } # Force JSON for OpenAI\n",
    "            )\n",
    "            content = response.choices[0].message.content.strip()\n",
    "        else:\n",
    "            response = requests.post(LOCAL_API_URL, json={\n",
    "                \"model\": LOCAL_MODEL_NAME,\n",
    "                \"messages\": [{\"role\": \"user\", \"content\": prompt}],\n",
    "                \"temperature\": 0\n",
    "            })\n",
    "            content = response.json()['choices'][0]['message']['content'].strip()\n",
    "        \n",
    "        # Parse JSON and extract reasoning\n",
    "        data = json.loads(content)\n",
    "        return data.get(\"reasoning\", \"Reasoning key not found.\")\n",
    "            \n",
    "    except Exception as e:\n",
    "        print(f\"Error with {provider}: {e}\")\n",
    "        return \"Reasoning could not be generated.\"\n",
    "\n",
    "# 1. Load the selected examples\n",
    "with open(FEW_SHOT_FILE, 'r') as f:\n",
    "    few_shot_data = json.load(f)\n",
    "\n",
    "# 2. Build the few-shot string\n",
    "few_shot_string = \"\"\n",
    "REASONING_PROVIDER = \"openai\" \n",
    "\n",
    "print(f\"Generating reasoning using: {REASONING_PROVIDER}...\")\n",
    "info=[]\n",
    "for label in [\"low_health_literacy\", \"intermediate_health_literacy\", \"proficient_health_literacy\"]:\n",
    "    examples = few_shot_data.get(label, [])\n",
    "    for ex in examples:\n",
    "        reason = get_reasoning(ex.get('fulltext', \"\"), ex['gen_text'], label, provider=REASONING_PROVIDER)\n",
    "        \n",
    "        # Adding structured few-shot examples to the string\n",
    "        few_shot_string += f\"Original Fulltext: \\\"{ex.get('fulltext', '')}\\\"\\n\"\n",
    "        few_shot_string += f\"Target Text: \\\"{ex['gen_text']}\\\"\\n\"\n",
    "        few_shot_string += f\"Reasoning: {reason}\\n\"\n",
    "        few_shot_string += f\"Label: {label}\\n\"\n",
    "        few_shot_string += \"-\" * 30 + \"\\n\"\n",
    "        info.append({\n",
    "            \"doc_id\": ex.get('doc_id', \"\"),\n",
    "            \"fulltext\": ex.get('fulltext', \"\"),\n",
    "            \"gen_text\": ex['gen_text'],\n",
    "            \"reasoning\": reason,\n",
    "            \"label\": label\n",
    "        })                    \n",
    "\n",
    "# 3. Define the Final Prompt Structure\n",
    "instruction = \"\"\"You are an expert in health communication. Your task is to judge the health literacy level of a target text based on its original medical source.\n",
    "\n",
    "Classify the text into one of three categories:\n",
    "1. low_health_literacy: Uses common words (everyday language), very short sentences, and eliminates all medical jargon.\n",
    "2. intermediate_health_literacy: Uses some medical terms with explanation, standard sentence length, requires basic health knowledge.\n",
    "3. proficient_health_literacy: Uses high-level medical jargon, technical language, and academic or professional structures.\n",
    "\n",
    "### Few-Shot Examples:\n",
    "\"\"\"\n",
    "\n",
    "# 4. Final Template Construction\n",
    "final_prompt_template = (\n",
    "    instruction + \n",
    "    few_shot_string + \n",
    "    \"\\n### Now judge this text:\\n\"\n",
    "    \"Original Fulltext: \\\"{fulltext}\\\"\\n\"\n",
    "    \"Target Text: \\\"{input_text}\\\"\\n\"\n",
    "    \"Reasoning:\"\n",
    ")\n",
    "\n",
    "with open(OUTPUT_PATH, 'w') as f:\n",
    "    f.write(final_prompt_template)\n",
    "with open(OUTPUT_PATH.replace('.txt', '_info.json'), 'w') as f:\n",
    "    json.dump(info, f, indent=4)\n",
    "print(f\"Structured prompt template saved to {OUTPUT_PATH}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "feafa46d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['doc_id', 'ai_label', 'rating_plaban', 'category_plaban', 'rating_mahi', 'category_mahi', 'rating_shama', 'category_shama', 'agreement_count'])\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_80_full.json\n",
    "with open(\"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_0_80_full.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(data[0].keys())\n",
    "print(data[0]['diff_label_texts'].keys())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8c470dd5",
   "metadata": {},
   "source": [
    "## Fewshot data selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06158d8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "# --- Configuration ---\n",
    "# Path to your existing data (containing 'reasoning', 'gen_text', and 'label')\n",
    "INPUT_INFO_FILE = \"/home/mshahidul/readctrl/data/new_exp/final_prompt_template_info.json\"\n",
    "OUTPUT_PATH = \"/home/mshahidul/readctrl/data/new_exp/new_prompt_template.txt\"\n",
    "\n",
    "# Decide how many few-shot examples you want to include for each label\n",
    "FEW_SHOT_PER_LABEL = 2  # Change this to 1, 3, etc.\n",
    "\n",
    "# --- Logic ---\n",
    "\n",
    "def generate_prompt_from_json(input_json_path, num_per_label):\n",
    "    if not os.path.exists(input_json_path):\n",
    "        return f\"Error: File {input_json_path} not found. Please check the path.\"\n",
    "        \n",
    "    with open(input_json_path, 'r') as f:\n",
    "        data = json.load(f)\n",
    "    \n",
    "    # Organize the data by label to ensure even distribution\n",
    "    labeled_data = {}\n",
    "    for entry in data:\n",
    "        label = entry['label']\n",
    "        if label not in labeled_data:\n",
    "            labeled_data[label] = []\n",
    "        labeled_data[label].append(entry)\n",
    "    \n",
    "    # Build the few-shot section\n",
    "    few_shot_string = \"\"\n",
    "    # Define labels in a logical order\n",
    "    target_labels = [\"low_health_literacy\", \"intermediate_health_literacy\", \"proficient_health_literacy\"]\n",
    "    \n",
    "    for label in target_labels:\n",
    "        examples = labeled_data.get(label, [])\n",
    "        # Slice the list based on your variable\n",
    "        selected_examples = examples[:num_per_label]\n",
    "        \n",
    "        for ex in selected_examples:\n",
    "            # Construct the example block WITHOUT the fulltext\n",
    "            few_shot_string += f\"Target Text: \\\"{ex['gen_text']}\\\"\\n\"\n",
    "            few_shot_string += f\"Reasoning: {ex['reasoning']}\\n\"\n",
    "            few_shot_string += f\"Label: {label}\\n\"\n",
    "            few_shot_string += \"-\" * 30 + \"\\n\"\n",
    "\n",
    "    # Define the final instruction structure (no mention of fulltext comparison)\n",
    "    instruction = \"\"\"You are an expert in health communication. Your task is to judge the health literacy level of the provided text.\n",
    "\n",
    "Classify the text into one of three categories:\n",
    "1. low_health_literacy: Uses common words (everyday language), very short sentences, and avoids medical jargon.\n",
    "2. intermediate_health_literacy: Uses some medical terms with explanation, standard sentence length, requires basic health knowledge.\n",
    "3. proficient_health_literacy: Uses high-level medical jargon, technical language, and academic or professional structures.\n",
    "\n",
    "### Examples:\n",
    "\"\"\"\n",
    "\n",
    "    # Final Template Construction\n",
    "    final_template = (\n",
    "        instruction + \n",
    "        few_shot_string + \n",
    "        \"\\n### Task:\\n\"\n",
    "        \"Target Text: \\\"{input_text}\\\"\\n\"\n",
    "        \"Reasoning:\"\n",
    "    )\n",
    "    \n",
    "    return final_template\n",
    "\n",
    "# 1. Generate the string\n",
    "new_prompt_template = generate_prompt_from_json(INPUT_INFO_FILE, FEW_SHOT_PER_LABEL)\n",
    "\n",
    "# 2. Save to file\n",
    "with open(OUTPUT_PATH, 'w') as f:\n",
    "    f.write(new_prompt_template)\n",
    "\n",
    "print(f\"Successfully created a prompt with {FEW_SHOT_PER_LABEL} examples per label.\")\n",
    "print(f\"Saved to: {OUTPUT_PATH}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f78d4619",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "with open(\"/home/mshahidul/readctrl/data/new_exp/cleaned_health_literacy_data.json\", 'r') as f:\n",
    "    cleaned_data = json.load(f)\n",
    "with open(\"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\", 'r') as f:\n",
    "    few_shot_examples = json.load(f)\n",
    "\n",
    "list_data = []\n",
    "for item in few_shot_examples:\n",
    "    for ex in few_shot_examples[item]:\n",
    "        list_data.append((ex['doc_id'], ex['label']))\n",
    "\n",
    "test_set = []\n",
    "for item in cleaned_data:\n",
    "    if (item['doc_id'], item['label']) not in list_data:\n",
    "        test_set.append(item)\n",
    "with open(\"/home/mshahidul/readctrl/data/new_exp/test_health_literacy_data.json\", 'w') as f:\n",
    "    json.dump(test_set, f, indent=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d33bb77",
   "metadata": {},
   "source": [
    "## Testing V1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2e888eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import requests\n",
    "\n",
    "# --- Configuration ---\n",
    "TEMPLATE_PATH = \"/home/mshahidul/readctrl/data/new_exp/final_prompt_template_v3.txt\"\n",
    "LOCAL_API_URL = \"http://172.16.34.29:8004/v1/chat/completions\"\n",
    "LOCAL_MODEL_NAME = \"Qwen/Qwen3-30B-A3B-Instruct-2507\"\n",
    "\n",
    "# --- 1. Load the Template ---\n",
    "with open(TEMPLATE_PATH, \"r\") as f:\n",
    "    prompt_template = f.read()\n",
    "\n",
    "# --- 2. Define Test Cases ---\n",
    "with open(\"/home/mshahidul/readctrl/data/new_exp/cleaned_health_literacy_data.json\", 'r') as f:\n",
    "    cleaned_data = json.load(f)\n",
    "with open(\"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\", 'r') as f:\n",
    "    few_shot_examples = json.load(f)\n",
    "\n",
    "list_data = []\n",
    "for item in few_shot_examples:\n",
    "    for ex in few_shot_examples[item]:\n",
    "        list_data.append((ex['doc_id'], ex['label']))\n",
    "\n",
    "test_set = []\n",
    "for item in cleaned_data:\n",
    "    if (item['doc_id'], item['label']) not in list_data:\n",
    "        test_set.append(item)\n",
    "\n",
    "def run_test(fulltext, input_text):\n",
    "    final_prompt = prompt_template.format(fulltext=fulltext, input_text=input_text)\n",
    "    \n",
    "    payload = {\n",
    "        \"model\": LOCAL_MODEL_NAME,\n",
    "        \"messages\": [{\"role\": \"user\", \"content\": final_prompt}],\n",
    "        \"temperature\": 0 \n",
    "    }\n",
    "    \n",
    "    try:\n",
    "        response = requests.post(LOCAL_API_URL, json=payload, timeout=30)\n",
    "        return response.json()['choices'][0]['message']['content'].strip()\n",
    "    except Exception as e:\n",
    "        return f\"Error: {e}\"\n",
    "\n",
    "# --- 3. Execute and Compare ---\n",
    "print(f\"--- Starting Template Evaluation on {len(test_set)} cases ---\\n\")\n",
    "\n",
    "correct_count = 0\n",
    "results_log = []\n",
    "\n",
    "def text_return(text):\n",
    "    if \"low\" in text.lower():\n",
    "        return \"low_health_literacy\"\n",
    "    elif \"intermediate\" in text.lower():\n",
    "        return \"intermediate_health_literacy\"\n",
    "    elif \"proficient\" in text.lower():\n",
    "        return \"proficient_health_literacy\"\n",
    "    return \"unknown\"\n",
    "\n",
    "for i, case in enumerate(test_set):\n",
    "    expected = str(case['label']).strip().lower()\n",
    "    result = run_test(case['fulltext'], case['gen_text'])\n",
    "    \n",
    "    # Clean LLM output for comparison (case-insensitive and removing trailing periods)\n",
    "    prediction = result.strip().lower().rstrip('.')\n",
    "    \n",
    "    # Check if the expected label is the primary answer in the result\n",
    "    is_correct = (text_return(expected) == text_return(prediction) )\n",
    "    \n",
    "    if is_correct:\n",
    "        correct_count += 1\n",
    "    \n",
    "    print(f\"Test Case {i+1}:\")\n",
    "    print(f\"Expected: {case['label']}\")\n",
    "    print(f\"LLM Output: {result}\")\n",
    "    print(f\"Match: {'✅' if is_correct else '❌'}\")\n",
    "    print(\"-\" * 50)\n",
    "\n",
    "# --- 4. Final Accuracy Calculation ---\n",
    "total_cases = len(test_set)\n",
    "if total_cases > 0:\n",
    "    accuracy = (correct_count / total_cases) * 100\n",
    "    print(f\"\\n--- Evaluation Summary ---\")\n",
    "    print(f\"Total Tested: {total_cases}\")\n",
    "    print(f\"Correct: {correct_count}\")\n",
    "    print(f\"Accuracy: {accuracy:.2f}%\")\n",
    "else:\n",
    "    print(\"No test cases found.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0531d7c3",
   "metadata": {},
   "source": [
    "## Testing V2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab8b4c96",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import requests\n",
    "import os\n",
    "\n",
    "# --- Configuration ---\n",
    "DEV_SET_PATH = \"/home/mshahidul/readctrl/data/new_exp/test_health_literacy_data.json\"\n",
    "FEW_SHOT_SET_PATH = \"/home/mshahidul/readctrl/data/new_exp/final_prompt_template_info.json\" # Using the one with reasoning\n",
    "LOCAL_API_URL = \"http://172.16.34.29:8004/v1/chat/completions\"\n",
    "LOCAL_MODEL_NAME = \"Qwen/Qwen3-30B-A3B-Instruct-2507\"\n",
    "\n",
    "# Define the range of few-shots per label you want to test\n",
    "# e.g., [0, 1, 2, 3] will test 0-shot, 1-shot (3 total), 2-shot (6 total), etc.\n",
    "SHOTS_TO_EVALUATE = [0, 1, 2, 3]\n",
    "\n",
    "# --- Core Functions ---\n",
    "\n",
    "def build_dynamic_prompt(few_shot_data, k_per_label):\n",
    "    \"\"\"Constructs a prompt with k examples per literacy category.\"\"\"\n",
    "    instruction = (\n",
    "        \"You are an expert in health communication. Your task is to judge the health literacy level of the provided text.\\n\"\n",
    "        \"Classify the text into: low_health_literacy, intermediate_health_literacy, or proficient_health_literacy.\\n\\n\"\n",
    "    )\n",
    "    \n",
    "    if k_per_label == 0:\n",
    "        return instruction + \"### Task:\\nTarget Text: \\\"{input_text}\\\"\\nReasoning:\"\n",
    "\n",
    "    # Organize few-shot data by label\n",
    "    categorized = {}\n",
    "    for entry in few_shot_data:\n",
    "        label = entry['label']\n",
    "        categorized.setdefault(label, []).append(entry)\n",
    "\n",
    "    few_shot_blocks = \"### Examples:\\n\"\n",
    "    labels = [\"low_health_literacy\", \"intermediate_health_literacy\", \"proficient_health_literacy\"]\n",
    "    \n",
    "    for label in labels:\n",
    "        examples = categorized.get(label, [])[:k_per_label]\n",
    "        for ex in examples:\n",
    "            few_shot_blocks += f\"Target Text: \\\"{ex['gen_text']}\\\"\\n\"\n",
    "            few_shot_blocks += f\"Reasoning: {ex['reasoning']}\\n\"\n",
    "            few_shot_blocks += f\"Label: {label}\\n\"\n",
    "            few_shot_blocks += \"-\" * 30 + \"\\n\"\n",
    "            \n",
    "    return instruction + few_shot_blocks + \"\\n### Task:\\nTarget Text: \\\"{input_text}\\\"\\nReasoning:\"\n",
    "\n",
    "def get_prediction(prompt_template, input_text):\n",
    "    \"\"\"Sends the formatted prompt to the local LLM.\"\"\"\n",
    "    final_prompt = prompt_template.format(input_text=input_text)\n",
    "    payload = {\n",
    "        \"model\": LOCAL_MODEL_NAME,\n",
    "        \"messages\": [{\"role\": \"user\", \"content\": final_prompt}],\n",
    "        \"temperature\": 0 \n",
    "    }\n",
    "    try:\n",
    "        response = requests.post(LOCAL_API_URL, json=payload, timeout=30)\n",
    "        return response.json()['choices'][0]['message']['content'].strip()\n",
    "    except Exception:\n",
    "        return \"Error\"\n",
    "\n",
    "def parse_label(text):\n",
    "    \"\"\"Normalizes LLM output to match dataset labels.\"\"\"\n",
    "    text = text.lower()\n",
    "    if \"low\" in text: return \"low_health_literacy\"\n",
    "    if \"intermediate\" in text: return \"intermediate_health_literacy\"\n",
    "    if \"proficient\" in text: return \"proficient_health_literacy\"\n",
    "    return \"unknown\"\n",
    "\n",
    "# --- Main Execution ---\n",
    "\n",
    "# 1. Load Data\n",
    "with open(DEV_SET_PATH, 'r') as f:\n",
    "    dev_set = json.load(f)\n",
    "with open(FEW_SHOT_SET_PATH, 'r') as f:\n",
    "    few_shot_pool = json.load(f)\n",
    "\n",
    "# 2. Filter Dev Set\n",
    "# Ensure no overlap between few-shot examples and dev set\n",
    "shot_ids = {item['doc_id'] for item in few_shot_pool}\n",
    "clean_dev_set = [item for item in dev_set if item['doc_id'] not in shot_ids]\n",
    "\n",
    "results_summary = []\n",
    "\n",
    "print(f\"Starting Evaluation on {len(clean_dev_set)} samples...\\n\")\n",
    "\n",
    "# 3. Loop through shot counts\n",
    "for k in SHOTS_TO_EVALUATE:\n",
    "    print(f\"Evaluating {k}-shot per label (Total {k*3} examples)...\")\n",
    "    \n",
    "    current_template = build_dynamic_prompt(few_shot_pool, k)\n",
    "    correct = 0\n",
    "    \n",
    "    for case in clean_dev_set:\n",
    "        raw_output = get_prediction(current_template, case['gen_text'])\n",
    "        pred = parse_label(raw_output)\n",
    "        actual = parse_label(case['label'])\n",
    "        \n",
    "        if pred == actual:\n",
    "            correct += 1\n",
    "            \n",
    "    accuracy = (correct / len(clean_dev_set)) * 100\n",
    "    results_summary.append({\"shots_per_label\": k, \"accuracy\": accuracy})\n",
    "    print(f\"-> Accuracy: {accuracy:.2f}%\\n\")\n",
    "\n",
    "# --- Final Report ---\n",
    "print(\"-\" * 30)\n",
    "print(f\"{'Shots/Label':<15} | {'Accuracy':<10}\")\n",
    "print(\"-\" * 30)\n",
    "for res in results_summary:\n",
    "    print(f\"{res['shots_per_label']:<15} | {res['accuracy']:.2f}%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5cd799a",
   "metadata": {},
   "source": [
    "## Step 3: Design Initial Prompt using dspy"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d916470f",
   "metadata": {},
   "source": [
    "## V1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "793a47c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import dspy\n",
    "import json\n",
    "from dspy.teleprompt import BootstrapFewShot\n",
    "\n",
    "# --- 1. Configure the LLM via your vLLM Endpoint ---\n",
    "# DSPy uses an OpenAI-compatible client for vLLM\n",
    "vllm_model = dspy.LM(\n",
    "    model='openai/Qwen/Qwen3-30B-A3B-Instruct-2507', # Use 'openai/' prefix for local endpoints\n",
    "    api_base=\"http://172.16.34.29:8004/v1\",\n",
    "    api_key=\"EMPTY\",\n",
    "    temperature=0.0\n",
    ")\n",
    "dspy.configure(lm=vllm_model)\n",
    "\n",
    "# --- 2. Define the Task Signature ---\n",
    "class HealthLiteracySignature(dspy.Signature):\n",
    "    \"\"\"\n",
    "    Judge the health literacy difficulty of a medical text.\n",
    "    Classify into: low_health_literacy, intermediate_health_literacy, or proficient_health_literacy.\n",
    "    \"\"\"\n",
    "    text = dspy.InputField(desc=\"The medical text or patient note to analyze.\")\n",
    "    reasoning = dspy.OutputField(desc=\"Step-by-step logic identifying jargon, sentence structure, and complexity.\")\n",
    "    label = dspy.OutputField(desc=\"The final classification: low_health_literacy, intermediate_health_literacy, or proficient_health_literacy.\")\n",
    "\n",
    "# --- 3. Load Training Data ---\n",
    "with open(\"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\", 'r') as f:\n",
    "    raw_examples = json.load(f)\n",
    "\n",
    "# Convert your 15 examples into DSPy format\n",
    "trainset = []\n",
    "for label_key, examples in raw_examples.items():\n",
    "    for ex in examples:\n",
    "        trainset.append(dspy.Example(text=ex['text'], label=label_key).with_inputs('text'))\n",
    "\n",
    "# --- 4. Define the Program (Chain of Thought) ---\n",
    "class HealthLiteracyClassifier(dspy.Module):\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        # ChainOfThought automatically adds \"Reasoning\" steps to the prompt\n",
    "        self.predictor = dspy.ChainOfThought(HealthLiteracySignature)\n",
    "\n",
    "    def forward(self, text):\n",
    "        return self.predictor(text=text)\n",
    "\n",
    "# --- 5. Define the Metric (Success = Label Match) ---\n",
    "def metric(gold, pred, trace=None):\n",
    "    return gold.label == pred.label\n",
    "\n",
    "# --- 6. Run the Optimizer (Teleprompter) ---\n",
    "# BootstrapFewShot will test variations of the prompt to see which one works best\n",
    "optimizer = BootstrapFewShot(metric=metric, max_bootstrapped_demos=3, max_labeled_demos=5)\n",
    "optimized_program = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)\n",
    "\n",
    "# --- 7. Save the Optimized Prompt ---\n",
    "optimized_program.save(\"/home/mshahidul/readctrl/data/new_exp/optimized_health_classifier.json\")\n",
    "\n",
    "# Inspect the final prompt logic\n",
    "vllm_model.inspect_history(n=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "06a0eb62",
   "metadata": {},
   "source": [
    "## V2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3529bb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import dspy\n",
    "import json\n",
    "from typing import Literal\n",
    "from dspy.teleprompt import BootstrapFewShotWithRandomSearch\n",
    "from dspy.evaluate import Evaluate\n",
    "\n",
    "# --- 1. LLM Configuration ---\n",
    "api_file = \"/home/mshahidul/api_new.json\"\n",
    "with open(api_file, \"r\") as f:\n",
    "    api_keys = json.load(f)\n",
    "openai_api_key = api_keys[\"openai\"]\n",
    "\n",
    "# Student: Local vLLM (Deployment Model)\n",
    "vllm_model = dspy.LM(\n",
    "    model='openai/Qwen/Qwen3-30B-A3B-Instruct-2507',\n",
    "    api_base=\"http://172.16.34.29:8004/v1\",\n",
    "    api_key=\"EMPTY\",\n",
    "    temperature=0.0\n",
    ")\n",
    "\n",
    "# Teacher: OpenAI (High-quality rationale generation)\n",
    "# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')\n",
    "openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)\n",
    "openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)\n",
    "\n",
    "dspy.configure(lm=openai_model_student)  # Default to OpenAI for optimization\n",
    "\n",
    "# --- 2. Data Processing & Deduplication ---\n",
    "\n",
    "# 2.1 Load Training Data (Few-Shot)\n",
    "with open(\"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\", 'r') as f:\n",
    "    few_shot_data = json.load(f)\n",
    "\n",
    "trainset = []\n",
    "train_identifiers = set()\n",
    "\n",
    "for label_key, examples in few_shot_data.items():\n",
    "    for ex in examples:\n",
    "        # Create a unique ID to prevent data leakage\n",
    "        unique_id = f\"{ex['doc_id']}_{label_key}\"\n",
    "        train_identifiers.add(unique_id)\n",
    "        \n",
    "        # In few_shot, 'gen_text' is the summary we want to judge\n",
    "        trainset.append(dspy.Example(\n",
    "            summary_text=ex['gen_text'], \n",
    "            label=label_key\n",
    "        ).with_inputs('summary_text'))\n",
    "\n",
    "# 2.2 Load Test Data as Dev Set (Updated Path)\n",
    "test_data_path = \"/home/mshahidul/readctrl/data/new_exp/test_health_literacy_data.json\"\n",
    "with open(test_data_path, 'r') as f:\n",
    "    test_data = json.load(f)\n",
    "\n",
    "devset = []\n",
    "for item in test_data:\n",
    "    unique_id = f\"{item['doc_id']}_{item['label']}\"\n",
    "    \n",
    "    # Filter out examples if they accidentally appear in the training set\n",
    "    if unique_id not in train_identifiers:\n",
    "        devset.append(dspy.Example(\n",
    "            summary_text=item['gen_text'], \n",
    "            label=item['label']\n",
    "        ).with_inputs('summary_text'))\n",
    "\n",
    "print(f\"Dataset Stats: Train={len(trainset)}, Dev (Test Set)={len(devset)}\")\n",
    "\n",
    "# --- 3. Robust Signature & Module ---\n",
    "\n",
    "class HealthLiteracySignature(dspy.Signature):\n",
    "    \"\"\"\n",
    "    Judge the health literacy level of a generated medical summary.\n",
    "    Identify if the language is suitable for a layperson (low) or requires medical expertise (proficient).\n",
    "    \"\"\"\n",
    "    summary_text: str = dspy.InputField(desc=\"The generated medical summary to be analyzed.\")\n",
    "    reasoning: str = dspy.OutputField(desc=\"Analysis of jargon, acronyms, and sentence complexity.\")\n",
    "    label: Literal[\"low_health_literacy\", \"intermediate_health_literacy\", \"proficient_health_literacy\"] = dspy.OutputField()\n",
    "\n",
    "class HealthLiteracyClassifier(dspy.Module):\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        self.predictor = dspy.ChainOfThought(HealthLiteracySignature)\n",
    "\n",
    "    def forward(self, summary_text):\n",
    "        return self.predictor(summary_text=summary_text)\n",
    "\n",
    "# --- 4. Metric and Optimization ---\n",
    "\n",
    "def health_literacy_metric(gold, pred, trace=None):\n",
    "    if not pred or not pred.label: return False\n",
    "    return gold.label.strip().lower() == pred.label.strip().lower()\n",
    "\n",
    "optimizer = BootstrapFewShotWithRandomSearch(\n",
    "    metric=health_literacy_metric,\n",
    "    max_bootstrapped_demos=3,\n",
    "    num_candidate_programs=8, \n",
    "    teacher_settings=dict(lm=openai_model_teacher)\n",
    ")\n",
    "\n",
    "# Compile the program\n",
    "optimized_program = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)\n",
    "\n",
    "# --- 5. Evaluation & Saving ---\n",
    "\n",
    "# Evaluate on the provided test dataset\n",
    "evaluator = Evaluate(devset=devset, metric=health_literacy_metric, num_threads=1, display_progress=True)\n",
    "accuracy_score = evaluator(optimized_program)\n",
    "\n",
    "print(f\"\\nOptimization Complete.\")\n",
    "print(f\"Final Accuracy on Test Set: {accuracy_score}%\")\n",
    "\n",
    "# Save the finalized prompt logic\n",
    "optimized_program.save(\"/home/mshahidul/readctrl/data/new_exp/optimized_health_classifier_gpt5-mini.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96f1f99e",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f\"Final Accuracy on Test Set: {accuracy_score}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "814b0186",
   "metadata": {},
   "outputs": [],
   "source": [
    "CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python '/home/mshahidul/readctrl/code/RL_model/finetune.py'\n",
    "CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 python -m vllm.entrypoints.openai.api_server     --model Qwen/Qwen3-30B-A3B-Instruct-2507     --max-model-len 8192     --tensor-parallel-size 1     --port 8004     --dtype auto     --trust_remote_code True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0e0fbb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# To load and use:\n",
    "classifier = HealthLiteracyClassifier()\n",
    "classifier.load(\"/home/mshahidul/readctrl/data/new_exp/optimized_health_classifier.json\")\n",
    "path=\"/home/mshahidul/readctrl/data/new_exp/test_health_literacy_data.json\"\n",
    "with open(path,'r') as f:\n",
    "    test_data = json.load(f)\n",
    "for item in test_data:\n",
    "    expected_label = item['label']\n",
    "    text = item['gen_text']\n",
    "    result = classifier(summary_text=text)\n",
    "    if (result.label == expected_label):\n",
    "        print(f\"Correctly classified: {expected_label} ✅\")\n",
    "    else:\n",
    "        print(f\"Misclassified. Expected: {expected_label}, Got: {result.label} ❌\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8700ac2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(few_shot_data.keys())\n",
    "print(few_shot_data['low_health_literacy'][0].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b5dbe7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import json\n",
    "# import pandas as pd\n",
    "# from tqdm import tqdm\n",
    "# import dspy\n",
    "# from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, classification_report\n",
    "\n",
    "# # --- 1. Load Data and Optimized Program ---\n",
    "# CLEANED_DATA_PATH = \"/home/mshahidul/readctrl/data/new_exp/cleaned_health_literacy_data.json\"\n",
    "# FEW_SHOT_PATH = \"/home/mshahidul/readctrl/data/new_exp/few_shot_examples.json\"\n",
    "# MODEL_SAVE_PATH = \"/home/mshahidul/readctrl/data/new_exp/optimized_health_classifier.json\"\n",
    "\n",
    "# with open(CLEANED_DATA_PATH, 'r') as f:\n",
    "#     full_data = json.load(f)\n",
    "\n",
    "# with open(FEW_SHOT_PATH, 'r') as f:\n",
    "#     few_shot_data = json.load(f)\n",
    "\n",
    "# # Identify which doc_ids were used for training to ensure a clean test set\n",
    "# trained_ids = []\n",
    "# for label in few_shot_data:\n",
    "#     trained_ids.extend([ex['doc_id'] for ex in few_shot_data[label]])\n",
    "\n",
    "# test_set = [item for item in full_data if item['doc_id'] not in trained_ids]\n",
    "# print(f\"Total test examples: {len(test_set)}\")\n",
    "# # --- 2. Initialize DSPy Program ---\n",
    "# vllm_model = dspy.LM(\n",
    "#     model='openai/Qwen/Qwen3-30B-A3B-Instruct-2507',\n",
    "#     api_base=\"http://172.16.34.29:8004/v1\",\n",
    "#     api_key=\"EMPTY\"\n",
    "# )\n",
    "# dspy.configure(lm=vllm_model)\n",
    "\n",
    "# class HealthLiteracySignature(dspy.Signature):\n",
    "#     \"\"\"Judge health literacy difficulty: low, intermediate, or proficient.\"\"\"\n",
    "#     text = dspy.InputField()\n",
    "#     reasoning = dspy.OutputField()\n",
    "#     label = dspy.OutputField()\n",
    "\n",
    "# class HealthLiteracyClassifier(dspy.Module):\n",
    "#     def __init__(self):\n",
    "#         super().__init__()\n",
    "#         self.predictor = dspy.ChainOfThought(HealthLiteracySignature)\n",
    "#     def forward(self, text):\n",
    "#         return self.predictor(text=text)\n",
    "\n",
    "# # Load the optimized state\n",
    "# classifier = HealthLiteracyClassifier()\n",
    "# classifier.load(MODEL_SAVE_PATH)\n",
    "\n",
    "# # --- 3. Run Inference ---\n",
    "# results = []\n",
    "# y_true = []\n",
    "# y_pred = []\n",
    "\n",
    "# print(f\"Starting evaluation on {len(test_set)} examples...\")\n",
    "\n",
    "# for item in tqdm(test_set):\n",
    "#     try:\n",
    "#         prediction = classifier(text=item['text'])\n",
    "        \n",
    "#         # Clean the label (sometimes LLMs add extra text or punctuation)\n",
    "#         pred_label = prediction.label.strip().lower().replace(\" \", \"_\")\n",
    "        \n",
    "#         results.append({\n",
    "#             \"doc_id\": item['doc_id'],\n",
    "#             \"true_label\": item['label'],\n",
    "#             \"pred_label\": pred_label,\n",
    "#             \"reasoning\": prediction.reasoning\n",
    "#         })\n",
    "        \n",
    "#         y_true.append(item['label'])\n",
    "#         y_pred.append(pred_label)\n",
    "#     except Exception as e:\n",
    "#         print(f\"Error processing doc {item['doc_id']}: {e}\")\n",
    "\n",
    "# # --- 4. Calculate Metrics ---\n",
    "# labels = [\"low_health_literacy\", \"intermediate_health_literacy\", \"proficient_health_literacy\"]\n",
    "\n",
    "# accuracy = accuracy_score(y_true, y_pred)\n",
    "# f1 = f1_score(y_true, y_pred, average='weighted')\n",
    "# kappa = cohen_kappa_score(y_true, y_pred)\n",
    "\n",
    "# print(\"\\n--- Evaluation Results ---\")\n",
    "# print(f\"Accuracy: {accuracy:.4f}\")\n",
    "# print(f\"Cohen’s Kappa: {kappa:.4f}\")\n",
    "# print(f\"F1 Score (Weighted): {f1:.4f}\")\n",
    "# print(\"\\nClassification Report:\")\n",
    "# print(classification_report(y_true, y_pred, target_names=labels))\n",
    "\n",
    "# # Save results for failure analysis\n",
    "# output_file = \"/home/mshahidul/readctrl/data/new_exp/evaluation_results.json\"\n",
    "# with open(output_file, 'w') as f:\n",
    "#     json.dump(results, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e935e64c",
   "metadata": {},
   "outputs": [],
   "source": [
    "CUDA_DEVICE_ORDER=PCI_BUS_ID \\\n",
    "CUDA_VISIBLE_DEVICES=\"2\" \\\n",
    "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\\n",
    "VLLM_USE_MODELSCOPE=True \\\n",
    "vllm \\\n",
    "  serve swift/Qwen3-30B-A3B-AWQ \\\n",
    "  --gpu-memory-utilization 0.9 \\\n",
    "  --max-model-len 32768 \\\n",
    "  --max-num-seqs 64 \\\n",
    "  --served-model-name swift/Qwen3-30B-A3B-AWQ \\\n",
    "  --host 127.0.0.1 \\\n",
    "  --port 8004"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8e90b755",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Items processed: 60\n",
      "Max raters per item: 7\n",
      "---\n",
      "Krippendorff's Alpha (Ordinal): 0.7083\n",
      "\n",
      "Note: Fleiss' Kappa skipped because of unequal rater counts per item.\n",
      "Use Krippendorff's Alpha for your final report as it accounts for this.\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "import krippendorff\n",
    "\n",
    "def calculate_iaa_robust(file_path):\n",
    "    with open(file_path, 'r') as f:\n",
    "        data = json.load(f)\n",
    "\n",
    "    # 1. Prepare data for Krippendorff's Alpha\n",
    "    # Matrix shape must be (coders, items)\n",
    "    max_annotations = max(len(entry['rating_distribution']) for entry in data)\n",
    "    \n",
    "    # We create a list for each \"slot\" (rater position)\n",
    "    # If Doc 1 has 3 ratings and Doc 2 has 5, Doc 1 gets two np.nan values\n",
    "    reliability_data = []\n",
    "    for i in range(max_annotations):\n",
    "        row = []\n",
    "        for entry in data:\n",
    "            ratings = entry['rating_distribution']\n",
    "            if i < len(ratings):\n",
    "                row.append(ratings[i])\n",
    "            else:\n",
    "                row.append(np.nan)\n",
    "        reliability_data.append(row)\n",
    "    \n",
    "    reliability_matrix = np.array(reliability_data)\n",
    "\n",
    "    # 2. Calculate Krippendorff's Alpha (The primary metric for your paper)\n",
    "    # Level of measurement 'ordinal' is best for 1-5 scales\n",
    "    alpha = krippendorff.alpha(reliability_data=reliability_matrix, \n",
    "                               level_of_measurement='ordinal')\n",
    "    \n",
    "    print(f\"Items processed: {len(data)}\")\n",
    "    print(f\"Max raters per item: {max_annotations}\")\n",
    "    print(f\"---\")\n",
    "    print(f\"Krippendorff's Alpha (Ordinal): {alpha:.4f}\")\n",
    "\n",
    "    # 3. Handling Fleiss' Kappa (Optional/Conditional)\n",
    "    counts_list = []\n",
    "    rater_counts = []\n",
    "    for entry in data:\n",
    "        counts = [entry['rating_distribution'].count(i) for i in range(1, 6)]\n",
    "        counts_list.append(counts)\n",
    "        rater_counts.append(sum(counts))\n",
    "    \n",
    "    # Only run Fleiss if the raters are equal across all items\n",
    "    if len(set(rater_counts)) == 1:\n",
    "        from statsmodels.stats.inter_rater import fleiss_kappa\n",
    "        f_kappa = fleiss_kappa(np.array(counts_list))\n",
    "        print(f\"Fleiss' Kappa: {f_kappa:.4f}\")\n",
    "    else:\n",
    "        print(\"\\nNote: Fleiss' Kappa skipped because of unequal rater counts per item.\")\n",
    "        print(\"Use Krippendorff's Alpha for your final report as it accounts for this.\")\n",
    "\n",
    "# Usage\n",
    "path = '/home/mshahidul/readctrl/data/final_result/consolidated_ratings_threshold_manual_edit.json'\n",
    "calculate_iaa_robust(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a0776765",
   "metadata": {},
   "outputs": [],
   "source": [
    "# /home/mshahidul/readctrl/data/final_result/consolidated_ratings_threshold.json\n",
    "import json\n",
    "def get_expected_label(rating):\n",
    "    if rating in [1, 2]:\n",
    "        return \"low_health_literacy\"\n",
    "    elif rating == 3:\n",
    "        return \"intermediate_health_literacy\"\n",
    "    elif rating in [4, 5]:\n",
    "        return \"proficient_health_literacy\"\n",
    "    return None\n",
    "with open(\"/home/mshahidul/readctrl/data/final_result/consolidated_ratings_threshold_manual_edit.json\", 'r') as f:\n",
    "    few_shot_data = json.load(f)\n",
    "cnt=0\n",
    "for item in few_shot_data:\n",
    "    expected_label = item['health_literacy_label']\n",
    "    consensus_rating = get_expected_label(item['consensus_rating'])\n",
    "    if expected_label == consensus_rating:\n",
    "        cnt+=1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed0a0618",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76ed37ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['id', 'fulltext', 'summary'])\n"
     ]
    }
   ],
   "source": [
    "# /home/mshahidul/readctrl/data/thresold_finding/junaed/seq0_record3.json\n",
    "import json\n",
    "with open(\"/home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json\", 'r') as f:\n",
    "    data = json.load(f)\n",
    "print(data[0].keys())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "eaefbfc6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Source Type          | Level           | Mean Threshold (%)\n",
      "-------------------------------------------------------\n",
      "Gold Summary         | low             |    61.07% (n=15)\n",
      "Gold Summary         | intermediate    |    81.99% (n=15)\n",
      "Gold Summary         | proficient      |    95.69% (n=2)\n",
      "Full Original Text   | low             |    37.23% (n=14)\n",
      "Full Original Text   | intermediate    |    66.11% (n=14)\n",
      "Full Original Text   | proficient      |    90.69% (n=4)\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import json\n",
    "from collections import defaultdict\n",
    "import numpy as np\n",
    "\n",
    "# Configuration\n",
    "base_path = \"/home/mshahidul/readctrl/data/thresold_finding\"\n",
    "levels = ['low', 'intermediate', 'proficient']\n",
    "source_types = [\"Gold Summary\", \"Full Original Text\"]\n",
    "\n",
    "# Dictionary to store percentages: results[source_type][level] = [list of values]\n",
    "results = {src: {lvl: [] for lvl in levels} for src in source_types}\n",
    "\n",
    "# Iterate through each annotator folder (e.g., 'junaed')\n",
    "annotator_names=['junaed','plabandas','shama']\n",
    "for annotator in annotator_names:\n",
    "    annotator_path = os.path.join(base_path, annotator)\n",
    "    \n",
    "    if os.path.isdir(annotator_path):\n",
    "        # Iterate through each json file in the folder\n",
    "        for filename in os.listdir(annotator_path):\n",
    "            if filename.endswith(\".json\"):\n",
    "                file_path = os.path.join(annotator_path, filename)\n",
    "                \n",
    "                try:\n",
    "                    with open(file_path, 'r') as f:\n",
    "                        data = json.load(f)\n",
    "                    \n",
    "                    src_type = data.get('source_type')\n",
    "                    # Ensure source_type is one we are tracking\n",
    "                    if src_type in source_types:\n",
    "                        for lvl in levels:\n",
    "                            # Extract threshold percentage from the annotations\n",
    "                            # Adjust 'threshold' key name if it differs in your JSON\n",
    "                            val = data['annotations'][lvl].get('percentage').replace('%', '').strip()\n",
    "                            if val is not None:\n",
    "                                if float(val) <= 99:\n",
    "                                    results[src_type][lvl].append(float(val))\n",
    "\n",
    "                                \n",
    "                except Exception as e:\n",
    "                    print(f\"Error processing {file_path}: {e}\")\n",
    "\n",
    "# Calculate and display averages\n",
    "print(f\"{'Source Type':<20} | {'Level':<15} | {'Mean Threshold (%)'}\")\n",
    "print(\"-\" * 55)\n",
    "\n",
    "for src in source_types:\n",
    "    for lvl in levels:\n",
    "        vals = results[src][lvl]\n",
    "        if vals:\n",
    "            mean_val = np.mean(vals)\n",
    "            count = len(vals)\n",
    "            print(f\"{src:<20} | {lvl:<15} | {mean_val:>8.2f}% (n={count})\")\n",
    "        else:\n",
    "            print(f\"{src:<20} | {lvl:<15} | No data found\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1aa3cd60",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "un",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}