File size: 8,462 Bytes

1db7196

{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "8a9d70f0",
      "metadata": {},
      "outputs": [],
      "source": [
        "import dspy\n",
        "import json\n",
        "from typing import Literal\n",
        "from dspy.teleprompt import BootstrapFewShotWithRandomSearch\n",
        "from dspy.evaluate import Evaluate\n",
        "\n",
        "# --- 1. LLM Configuration ---\n",
        "# API keys are read from a local JSON file; only the \"openai\" entry is used here.\n",
        "api_file = \"/home/mshahidul/api_new.json\"\n",
        "with open(api_file, \"r\") as f:\n",
        "    api_keys = json.load(f)\n",
        "openai_api_key = api_keys[\"openai\"]\n",
        "\n",
        "# Student: local vLLM server (deployment model).\n",
        "# The `openai/` prefix marks `api_base` as an OpenAI-compatible endpoint for the\n",
        "# LiteLLM backend; a bare 'Qwen/...' name carries no provider it can route to.\n",
        "vllm_model = dspy.LM(\n",
        "    model='openai/Qwen/Qwen3-30B-A3B-Instruct-2507',\n",
        "    api_base=\"http://172.16.34.29:8030/v1\",\n",
        "    api_key=\"EMPTY\",\n",
        "    temperature=0.0\n",
        ")\n",
        "\n",
        "# Teacher: OpenAI model handed to the optimizer via `teacher_settings`.\n",
        "# NOTE: confirm that 'gpt-5' and 'gpt-5-mini' are available to this API key.\n",
        "openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)\n",
        "openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)\n",
        "\n",
        "# Default LM for the DSPy runtime.\n",
        "# The OpenAI student is the active default; to run against the local vLLM server,\n",
        "# uncomment the vllm_model line and comment out the openai_model_student line.\n",
        "# dspy.configure(lm=vllm_model)\n",
        "dspy.configure(lm=openai_model_student)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "0f350ef4",
      "metadata": {},
      "outputs": [],
      "source": [
        "class HealthLiteracySignature(dspy.Signature):\n",
        "    \"\"\"\n",
        "    Classify the health literacy level of a generated text \n",
        "    based on the original full source text.\n",
        "    \"\"\"\n",
        "    # NOTE: the docstring above and the `desc` strings below become part of the LM\n",
        "    # prompt, so they are kept verbatim.\n",
        "    full_text: str = dspy.InputField(desc=\"The original clinical or source medical text.\")\n",
        "    generated_text: str = dspy.InputField(desc=\"The rewritten medical text to classify for health literacy based on the original source text.\")\n",
        "\n",
        "    # The Literal annotation is what constrains the output to the three categories;\n",
        "    # the `desc` text alone is only a hint to the model.\n",
        "    literacy_label: Literal[\n",
        "        \"low_health_literacy\",\n",
        "        \"intermediate_health_literacy\",\n",
        "        \"proficient_health_literacy\",\n",
        "    ] = dspy.OutputField(desc=\"One of: low_health_literacy, intermediate_health_literacy, proficient_health_literacy\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e369f8e8",
      "metadata": {},
      "outputs": [],
      "source": [
        "class HealthLiteracyClassifier(dspy.Module):\n",
        "    \"\"\"DSPy module that runs HealthLiteracySignature through a chain-of-thought predictor.\"\"\"\n",
        "\n",
        "    def __init__(self):\n",
        "        super().__init__()\n",
        "        # ChainOfThought is chosen in the hope of better reasoning on medical jargon.\n",
        "        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)\n",
        "\n",
        "    def forward(self, full_text, generated_text):\n",
        "        \"\"\"Classify `generated_text` against `full_text` and return the predictor's output.\"\"\"\n",
        "        signature_inputs = {\"full_text\": full_text, \"generated_text\": generated_text}\n",
        "        return self.classifier(**signature_inputs)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "055542d5",
      "metadata": {},
      "outputs": [],
      "source": [
        "def prepare_data(raw_data, train_size=100):\n",
        "    \"\"\"Convert raw records into DSPy examples and split them into train/test sets.\n",
        "\n",
        "    Args:\n",
        "        raw_data: Iterable of records indexable by the keys 'fulltext',\n",
        "            'diff_label_texts' and 'label'.\n",
        "        train_size: Number of leading examples placed in the training split.\n",
        "            Defaults to 100, the previously hard-coded value.\n",
        "\n",
        "    Returns:\n",
        "        A (trainset, testset) tuple: the first `train_size` examples and the\n",
        "        remaining ones, both in input order (no shuffling is done).\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If `train_size` is negative, which would make the two\n",
        "            slices overlap instead of partitioning the data.\n",
        "    \"\"\"\n",
        "    if train_size < 0:\n",
        "        raise ValueError(\"train_size must be non-negative\")\n",
        "    dataset = [\n",
        "        dspy.Example(\n",
        "            full_text=item['fulltext'],\n",
        "            generated_text=item['diff_label_texts'],\n",
        "            literacy_label=item['label'],  # Matches the Signature field\n",
        "        ).with_inputs('full_text', 'generated_text')\n",
        "        for item in raw_data\n",
        "    ]\n",
        "    return dataset[:train_size], dataset[train_size:]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "e570be47",
      "metadata": {},
      "outputs": [],
      "source": [
        "import json\n",
        "# Load the labelled records and split them into train/test sets.\n",
        "path = \"/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json\"\n",
        "# Read inside a context manager so the file handle is closed right after parsing.\n",
        "with open(path) as f:\n",
        "    raw_data = json.load(f)\n",
        "trainset, testset = prepare_data(raw_data)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "39e90da8",
      "metadata": {},
      "outputs": [],
      "source": [
        "def health_literacy_metric(gold, pred, trace=None):\n",
        "    # Use 'literacy_label' because that is what's in your Signature\n",
        "    if not pred or not hasattr(pred, 'literacy_label'):\n",
        "        return False\n",
        "    \n",
        "    # Standardize both for comparison\n",
        "    gold_label = str(gold.literacy_label).strip().lower()\n",
        "    pred_label = str(pred.literacy_label).strip().lower()\n",
        "    \n",
        "    return gold_label == pred_label\n",
        "\n",
        "# Optimizer: scored with the exact-match metric, using the teacher LM settings.\n",
        "optimizer = BootstrapFewShotWithRandomSearch(\n",
        "    metric=health_literacy_metric,\n",
        "    max_bootstrapped_demos=3,\n",
        "    num_candidate_programs=8,\n",
        "    teacher_settings={\"lm\": openai_model_teacher},\n",
        ")\n",
        "\n",
        "# Compile a fresh classifier on the training split.\n",
        "compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)\n",
        "\n",
        "# Score the compiled program on the remaining examples, then persist it to disk.\n",
        "evaluator = Evaluate(\n",
        "    devset=testset,\n",
        "    metric=health_literacy_metric,\n",
        "    num_threads=1,\n",
        "    display_progress=True,\n",
        ")\n",
        "accuracy_score = evaluator(compiled_classifier)\n",
        "compiled_classifier.save(\"health_literacy_model.json\")"
      ]
    },
    {
      "cell_type": "markdown",
      "id": "425291ff",
      "metadata": {},
      "source": [
        "## Accuracy of previously saved models"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "id": "f8ae33e8",
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "vllm-gpt-oss-20b_teacher-gpt5_v1\n",
            "{'accuracy_score': 78.57, 'num_results': 84}\n",
            "vllm-gemma-3-12b-it_teacher-gpt5_v1\n",
            "{'accuracy_score': 79.76, 'num_results': 84}\n",
            "vllm-Qwen2.5-7B-Instruct_teacher-gpt5_v1\n",
            "{'accuracy_score': 59.52, 'num_results': 84}\n",
            "student-gpt5-mini_teacher-gpt5_(fulltxt+gen_sum)\n",
            "{'score': 88.1, 'results': 84}\n",
            "vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1\n",
            "{'accuracy_score': 78.57, 'num_results': 84}\n",
            "vllm-phi-4_teacher-gpt5_v1\n",
            "{'accuracy_score': 73.81, 'num_results': 84}\n",
            "vllm-qwen3-8b_teacher-gpt5_v1\n",
            "{'accuracy_score': 73.81, 'num_results': 84}\n",
            "student-gpt5-mini_teacher-gpt5_v1\n",
            "{'accuracy_score': 78.57, 'num_results': 84}\n"
          ]
        }
      ],
      "source": [
        "# Print every `*accuracy.json` file found one level below the saved-model directory,\n",
        "# preceded by the name of the folder that contains it.\n",
        "import os,json\n",
        "model_root = \"/home/mshahidul/readctrl/code/text_classifier/dspy_model\"\n",
        "for folder in os.listdir(model_root):\n",
        "    folder_path = f\"{model_root}/{folder}\"\n",
        "    if not os.path.isdir(folder_path):\n",
        "        continue\n",
        "    for file in os.listdir(folder_path):\n",
        "        if file.endswith(\"accuracy.json\"):\n",
        "            print(folder)\n",
        "            # Context manager so the file handle is closed as soon as it is parsed.\n",
        "            with open(f\"{folder_path}/{file}\") as f:\n",
        "                data = json.load(f)\n",
        "            print(data)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "4c236110",
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "unsloth",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.11.11"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}