{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f1a0c3d2",
   "metadata": {},
   "source": [
    "# Health literacy classifier (DSPy)\n",
    "\n",
    "Bootstraps few-shot demonstrations for a chain-of-thought DSPy classifier that labels a rewritten medical text as `low_health_literacy`, `intermediate_health_literacy`, or `proficient_health_literacy`, scores it on the held-out split, and then lists the accuracy files saved for earlier runs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a9d70f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from typing import Literal\n",
    "\n",
    "import dspy\n",
    "from dspy.evaluate import Evaluate\n",
    "from dspy.teleprompt import BootstrapFewShotWithRandomSearch\n",
    "\n",
    "# --- Paths and knobs (edit these for your environment) ---\n",
    "API_KEYS_PATH = \"/home/mshahidul/api_new.json\"\n",
    "DATA_PATH = \"/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json\"\n",
    "RESULTS_DIR = \"/home/mshahidul/readctrl/code/text_classifier/dspy_model\"\n",
    "TRAIN_SIZE = 100  # the first TRAIN_SIZE examples are used for bootstrapping; the rest are held out\n",
    "\n",
    "# --- LLM configuration ---\n",
    "# The OpenAI key is read from a local JSON file so it never appears in the notebook.\n",
    "with open(API_KEYS_PATH, \"r\", encoding=\"utf-8\") as f:\n",
    "    api_keys = json.load(f)\n",
    "openai_api_key = api_keys[\"openai\"]\n",
    "\n",
    "# Local vLLM endpoint: an alternative student model (not the default, see dspy.configure below).\n",
    "# NOTE(review): DSPy routes calls through LiteLLM, which normally expects a provider prefix\n",
    "# such as 'openai/<model>' for OpenAI-compatible servers -- confirm this model string resolves.\n",
    "vllm_model = dspy.LM(\n",
    "    model='Qwen/Qwen3-30B-A3B-Instruct-2507',\n",
    "    api_base=\"http://172.16.34.29:8030/v1\",\n",
    "    api_key=\"EMPTY\",\n",
    "    temperature=0.0\n",
    ")\n",
    "\n",
    "# Teacher: only used while bootstrapping demonstrations (passed via teacher_settings).\n",
    "openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)\n",
    "# Student: the model the compiled program runs on.\n",
    "openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)\n",
    "\n",
    "# Default LM for the DSPy runtime is the OpenAI student.\n",
    "# To run against the local vLLM server instead, swap in:\n",
    "# dspy.configure(lm=vllm_model)\n",
    "dspy.configure(lm=openai_model_student)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f350ef4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The three labels the classifier is allowed to emit.\n",
    "LiteracyLabel = Literal[\n",
    "    \"low_health_literacy\",\n",
    "    \"intermediate_health_literacy\",\n",
    "    \"proficient_health_literacy\",\n",
    "]\n",
    "\n",
    "\n",
    "class HealthLiteracySignature(dspy.Signature):\n",
    "    \"\"\"\n",
    "    Classify the health literacy level of a generated text \n",
    "    based on the original full source text.\n",
    "    \"\"\"\n",
    "\n",
    "    # NOTE: the class docstring above is sent to the LM as the task instruction; edit with care.\n",
    "    full_text: str = dspy.InputField(desc=\"The original clinical or source medical text.\")\n",
    "    generated_text: str = dspy.InputField(desc=\"The rewritten medical text to classify for health literacy based on the original source text.\")\n",
    "\n",
    "    # The Literal annotation (not the desc text) is what constrains the parsed output\n",
    "    # to the three categories.\n",
    "    literacy_label: LiteracyLabel = dspy.OutputField(desc=\"One of: low_health_literacy, intermediate_health_literacy, proficient_health_literacy\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e369f8e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "class HealthLiteracyClassifier(dspy.Module):\n",
    "    \"\"\"DSPy module that wraps HealthLiteracySignature in a ChainOfThought predictor.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        # Use ChainOfThought for better reasoning on medical jargon\n",
    "        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)\n",
    "\n",
    "    def forward(self, full_text, generated_text):\n",
    "        \"\"\"Run the predictor on one (full_text, generated_text) pair and return its prediction.\"\"\"\n",
    "        return self.classifier(full_text=full_text, generated_text=generated_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "055542d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_data(raw_data, train_size=100):\n",
    "    \"\"\"Convert raw records into dspy.Example objects and split them into train/test.\n",
    "\n",
    "    Args:\n",
    "        raw_data: iterable of dicts with the keys 'fulltext', 'diff_label_texts' and 'label'.\n",
    "        train_size: number of leading examples that go into the training split.\n",
    "\n",
    "    Returns:\n",
    "        (trainset, testset): the first `train_size` examples and the remaining ones.\n",
    "        The input order is preserved; no shuffling is done.\n",
    "    \"\"\"\n",
    "    dataset = [\n",
    "        dspy.Example(\n",
    "            full_text=item['fulltext'],\n",
    "            generated_text=item['diff_label_texts'],\n",
    "            literacy_label=item['label'],  # Matches the Signature output field\n",
    "        ).with_inputs('full_text', 'generated_text')\n",
    "        for item in raw_data\n",
    "    ]\n",
    "    return dataset[:train_size], dataset[train_size:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e570be47",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(DATA_PATH, \"r\", encoding=\"utf-8\") as f:\n",
    "    raw_data = json.load(f)\n",
    "\n",
    "trainset, testset = prepare_data(raw_data, train_size=TRAIN_SIZE)\n",
    "\n",
    "# Fail early if the file holds too few examples for the requested split.\n",
    "assert trainset and testset, \"need more than TRAIN_SIZE examples to get a non-empty test split\"\n",
    "len(trainset), len(testset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39e90da8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def health_literacy_metric(gold, pred, trace=None):\n",
    "    \"\"\"Exact-match metric on the literacy label.\n",
    "\n",
    "    Args:\n",
    "        gold: example carrying the reference `literacy_label`.\n",
    "        pred: prediction returned by the classifier.\n",
    "        trace: unused here.\n",
    "\n",
    "    Returns:\n",
    "        True when both labels are equal after stripping whitespace and lower-casing;\n",
    "        False otherwise, including when `pred` is empty or has no `literacy_label`.\n",
    "    \"\"\"\n",
    "    # Use 'literacy_label' because that is the output field name in the Signature\n",
    "    if not pred or not hasattr(pred, 'literacy_label'):\n",
    "        return False\n",
    "\n",
    "    # Standardize both for comparison\n",
    "    gold_label = str(gold.literacy_label).strip().lower()\n",
    "    pred_label = str(pred.literacy_label).strip().lower()\n",
    "\n",
    "    return gold_label == pred_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b2d7c9e",
   "metadata": {},
   "outputs": [],
   "source": [
    "optimizer = BootstrapFewShotWithRandomSearch(\n",
    "    metric=health_literacy_metric,\n",
    "    max_bootstrapped_demos=3,\n",
    "    num_candidate_programs=8,\n",
    "    teacher_settings=dict(lm=openai_model_teacher)\n",
    ")\n",
    "\n",
    "# Compile: search for the few-shot demonstrations that make up the optimized prompt.\n",
    "compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)\n",
    "\n",
    "# Save straight away so a failure during evaluation cannot lose the compiled program.\n",
    "compiled_classifier.save(\"health_literacy_model.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c4e1a76",
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)\n",
    "accuracy_score = evaluator(compiled_classifier)\n",
    "accuracy_score"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "425291ff",
   "metadata": {},
   "source": [
    "## Compare saved accuracy results\n",
    "\n",
    "Prints the folder name and the contents of every `*accuracy.json` file found one level below `RESULTS_DIR`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f8ae33e8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "vllm-gpt-oss-20b_teacher-gpt5_v1\n",
      "{'accuracy_score': 78.57, 'num_results': 84}\n",
      "vllm-gemma-3-12b-it_teacher-gpt5_v1\n",
      "{'accuracy_score': 79.76, 'num_results': 84}\n",
      "vllm-Qwen2.5-7B-Instruct_teacher-gpt5_v1\n",
      "{'accuracy_score': 59.52, 'num_results': 84}\n",
      "student-gpt5-mini_teacher-gpt5_(fulltxt+gen_sum)\n",
      "{'score': 88.1, 'results': 84}\n",
      "vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1\n",
      "{'accuracy_score': 78.57, 'num_results': 84}\n",
      "vllm-phi-4_teacher-gpt5_v1\n",
      "{'accuracy_score': 73.81, 'num_results': 84}\n",
      "vllm-qwen3-8b_teacher-gpt5_v1\n",
      "{'accuracy_score': 73.81, 'num_results': 84}\n",
      "student-gpt5-mini_teacher-gpt5_v1\n",
      "{'accuracy_score': 78.57, 'num_results': 84}\n"
     ]
    }
   ],
   "source": [
    "# Walk each run folder under RESULTS_DIR and print its saved accuracy file(s).\n",
    "for folder in os.listdir(RESULTS_DIR):\n",
    "    folder_path = os.path.join(RESULTS_DIR, folder)\n",
    "    if not os.path.isdir(folder_path):\n",
    "        continue\n",
    "    for file in os.listdir(folder_path):\n",
    "        if file.endswith(\"accuracy.json\"):\n",
    "            print(folder)\n",
    "            with open(os.path.join(folder_path, file), encoding=\"utf-8\") as f:\n",
    "                data = json.load(f)\n",
    "            print(data)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "unsloth",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}