import dspy
import json
import os
import random
from typing import Literal
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate import Evaluate
# --- 1. LLM Configuration ---
# Load API keys from a local JSON file; expects a top-level "openai" entry.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]
# Student: Local vLLM (Deployment Model)
# Points at a local OpenAI-compatible vLLM server; the key is unused there.
vllm_model = dspy.LM(
    model="openai/dspy",
    api_base="http://172.16.34.29:8030/v1",
    api_key="EMPTY",
    temperature=0.0  # deterministic decoding for classification
)
# Teacher: OpenAI (High-quality rationale generation)
# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
openai_model_teacher = dspy.LM(model="gpt-5", api_key=openai_api_key)
# Default LM for DSPy runtime
# Use the local vLLM for fast iteration.
dspy.configure(lm=vllm_model)
class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' relative to 'full_text' to determine
    the health literacy level.
    """
    # NOTE(review): the docstring and `desc` strings above/below are sent to
    # the LM as prompt text in DSPy — editing them changes runtime behavior.
    full_text = dspy.InputField(desc="Original clinical or medical source text containing jargon and technical details.")
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    # Output is one of three label strings; the metric below matches them
    # by case-insensitive substring inclusion.
    literacy_label = dspy.OutputField(
        desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
    )
class HealthLiteracyClassifier(dspy.Module):
    """DSPy module that classifies the health-literacy level of a rewrite."""

    def __init__(self):
        super().__init__()
        # Use ChainOfThought for better reasoning on medical jargon
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, full_text, generated_text):
        # Returns a dspy Prediction carrying a 'literacy_label' field
        # (plus the chain-of-thought rationale added by ChainOfThought).
        return self.classifier(full_text=full_text, generated_text=generated_text)
def prepare_data(raw_data, seed=42, train_ratio=0.6):
    """Build balanced, shuffled train/test splits of dspy.Example objects.

    Records whose "label" is not one of the three known literacy labels are
    skipped. Every label contributes the same number of examples (the size
    of the rarest label), split per-label by `train_ratio` with at least one
    example kept on each side.

    Raises ValueError if any label has zero usable records.
    """
    labels = [
        "low_health_literacy",
        "intermediate_health_literacy",
        "proficient_health_literacy",
    ]
    rng = random.Random(seed)

    # Group convertible records by label, dropping unknown labels.
    buckets = {name: [] for name in labels}
    for record in raw_data:
        tag = record.get("label")
        if tag in buckets:
            ex = dspy.Example(
                full_text=record["fulltext"],
                generated_text=record["diff_label_texts"],
                literacy_label=tag,  # Matches the Signature field
            ).with_inputs("full_text", "generated_text")
            buckets[tag].append(ex)

    # Balance on the rarest label.
    per_label_total = min(len(buckets[name]) for name in labels)
    if per_label_total == 0:
        raise ValueError("One or more labels has no examples; cannot balance.")

    # Per-label train size, clamped so both splits are non-empty.
    per_label_train = int(round(per_label_total * train_ratio))
    per_label_train = max(1, min(per_label_train, per_label_total - 1))

    trainset, testset = [], []
    for name in labels:
        rng.shuffle(buckets[name])
        chosen = buckets[name][:per_label_total]
        trainset += chosen[:per_label_train]
        testset += chosen[per_label_train:]
    rng.shuffle(trainset)
    rng.shuffle(testset)
    return trainset, testset
# --- 2. Data Loading ---
# (The duplicate `import json` that used to live here was removed; json is
# already imported at the top of the file.)
path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
# Use a context manager so the file handle is closed deterministically
# (`json.load(open(path))` leaked the handle), and pin the encoding so
# parsing does not depend on the platform's default locale.
with open(path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)
trainset, testset = prepare_data(raw_data)
def _example_to_dict(example):
return {
"full_text": example.full_text,
"generated_text": example.generated_text,
"literacy_label": example.literacy_label,
}
def save_jsonl(path, examples):
    """Write `examples` to `path` as JSON Lines (one object per line).

    The file is opened with an explicit UTF-8 encoding: records are
    serialized with ensure_ascii=False, so relying on the platform default
    encoding could raise UnicodeEncodeError for non-ASCII clinical text.
    """
    with open(path, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(_example_to_dict(ex), ensure_ascii=False) + "\n")
# Persist the balanced splits next to the source data for reproducibility.
train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
save_jsonl(train_path, trainset)
save_jsonl(test_path, testset)
def health_literacy_metric(gold, pred, trace=None):
    """Boolean DSPy metric: does the prediction contain the gold label?

    Both labels are lowercased and stripped before comparing; substring
    (rather than equality) matching tolerates wordy model outputs.
    `trace` is accepted for DSPy's metric signature and ignored.
    """
    def canon(obj):
        # Normalize away case and surrounding whitespace.
        return str(obj.literacy_label).strip().lower()

    if not pred or not hasattr(pred, 'literacy_label'):
        return False
    # Simple inclusion check helps if the LLM gets wordy
    return canon(gold) in canon(pred)
# Bootstrap few-shot demos with the GPT-5 teacher, then pick the best of
# several candidate programs via random search over demo subsets.
optimizer = BootstrapFewShotWithRandomSearch(
    metric=health_literacy_metric,
    max_bootstrapped_demos=3,
    num_candidate_programs=8,
    teacher_settings=dict(lm=openai_model_teacher)
)
# 3. Compile! This creates the "optimized prompt"
compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)
# Evaluate the compiled program on the held-out split, single-threaded.
evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
evaluation_result = evaluator(compiled_classifier)
# Some dspy versions return an object exposing `.score`; others return a
# bare number. Normalize to a float either way.
accuracy_score = (
    float(evaluation_result.score)
    if hasattr(evaluation_result, "score")
    else float(evaluation_result)
)
def _extract_usage(record):
if isinstance(record, dict):
usage = record.get("usage")
if usage:
return usage
response = record.get("response")
if isinstance(response, dict) and response.get("usage"):
return response["usage"]
return None
def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
prompt_tokens = 0
completion_tokens = 0
cached_tokens = 0
for record in getattr(lm, "history", []) or []:
usage = _extract_usage(record)
if not usage:
continue
prompt_tokens += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
completion_tokens += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
cached_tokens += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)
cost = (prompt_tokens / 1_000_000) * price_in_per_1m
cost += (completion_tokens / 1_000_000) * price_out_per_1m
if price_cached_in_per_1m is not None:
cost += (cached_tokens / 1_000_000) * price_cached_in_per_1m
return {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"cached_tokens": cached_tokens,
"cost_usd": cost,
}
# Fill these with current OpenAI pricing (USD per 1M tokens).
GPT5_PRICE_INPUT_PER_1M = 1.25
GPT5_PRICE_OUTPUT_PER_1M = 10.0
# Teacher-side spend only: the local vLLM student has no per-token cost.
teacher_cost = calc_cost_usd(
    openai_model_teacher,
    GPT5_PRICE_INPUT_PER_1M,
    GPT5_PRICE_OUTPUT_PER_1M,
)
cost_report = {
    "gpt-5": teacher_cost,
}
# Persist artifacts: compiled program state, accuracy summary, cost report.
folder_name = "vllm-qwen3-8b_teacher-gpt5_v1"
os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")
print(evaluation_result)
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
    json.dump(
        {
            "accuracy_score": accuracy_score,
            # Per-example result count, when the dspy version exposes it.
            "num_results": len(getattr(evaluation_result, "results", []) or []),
        },
        f,
        indent=2,
    )
print(json.dumps(cost_report, indent=2))
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
    json.dump(cost_report, f, indent=2)