# readctrl / code / text_classifier / text_classifier_dspy_only_gen_text.py
# Uploaded by shahidul034 using the upload-large-folder tool (commit 1db7196, verified).
import dspy
import json
import os
import random
from typing import Literal
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate import Evaluate
# --- 1. LLM Configuration ---
# API keys are read from a local JSON file; an "openai" entry is required.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]
# Student: Local vLLM (Deployment Model)
# Served through an OpenAI-compatible endpoint; "EMPTY" is the usual
# placeholder API key for a local vLLM server.
vllm_model = dspy.LM(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    api_base="http://172.16.34.29:8030/v1",
    api_key="EMPTY",
    temperature=0.0  # deterministic decoding for classification
)
# Teacher: OpenAI (High-quality rationale generation)
# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)
openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)
# Default LM for DSPy runtime
# Use the local vLLM for fast iteration; switch to openai_model_student if needed.
# dspy.configure(lm=vllm_model)
dspy.configure(lm=openai_model_student)
class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' to determine the health literacy level.
    """
    # Single text input; the rewritten passage to be classified.
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    # Output must be one of the three label strings named in the desc below;
    # health_literacy_metric matches on these labels case-insensitively.
    literacy_label = dspy.OutputField(
        desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
    )
class HealthLiteracyClassifier(dspy.Module):
    """DSPy module that classifies a text's health-literacy level.

    Wraps a single ChainOfThought predictor over HealthLiteracySignature.
    """

    def __init__(self):
        super().__init__()
        # Use ChainOfThought for better reasoning on medical jargon
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, generated_text):
        # Delegate to the predictor; the returned Prediction carries a
        # 'literacy_label' field per HealthLiteracySignature.
        return self.classifier(generated_text=generated_text)
def prepare_data(raw_data, seed=42, train_ratio=0.6):
    """Build balanced, label-stratified train/test splits of dspy Examples.

    Records whose "label" is not one of the three known literacy labels are
    skipped. Each label contributes the same number of examples (the size of
    the smallest label bucket), split per-label by ``train_ratio`` so both
    splits stay balanced. Shuffling is driven by ``seed`` for reproducibility.

    Raises:
        ValueError: if any label has zero usable examples.
    """
    label_names = [
        "low_health_literacy",
        "intermediate_health_literacy",
        "proficient_health_literacy",
    ]
    rng = random.Random(seed)
    grouped = {name: [] for name in label_names}
    for record in raw_data:
        current = record.get("label")
        if current not in grouped:
            continue  # unknown or missing label: drop the record
        grouped[current].append(
            dspy.Example(
                generated_text=record["diff_label_texts"],
                literacy_label=current,  # Matches the Signature field
            ).with_inputs("generated_text")
        )
    smallest = min(len(grouped[name]) for name in label_names)
    if smallest == 0:
        raise ValueError("One or more labels has no examples; cannot balance.")
    take_per_label = smallest
    # Per-label train count, clamped so both splits are non-degenerate.
    train_per_label = int(round(take_per_label * train_ratio))
    train_per_label = max(1, min(train_per_label, take_per_label - 1))
    train_split = []
    test_split = []
    for name in label_names:
        rng.shuffle(grouped[name])
        chosen = grouped[name][:take_per_label]
        train_split.extend(chosen[:train_per_label])
        test_split.extend(chosen[train_per_label:take_per_label])
    rng.shuffle(train_split)
    rng.shuffle(test_split)
    return train_split, test_split
# Load the labeled dataset and build balanced train/test splits.
# (json is already imported at the top of the file; the previous duplicate
# `import json` here was redundant, and `json.load(open(path))` leaked the
# file handle — a context manager closes it deterministically.)
path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
with open(path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)
trainset, testset = prepare_data(raw_data)
def _example_to_dict(example):
return {
"generated_text": example.generated_text,
"literacy_label": example.literacy_label,
}
def save_jsonl(path, examples):
    """Write *examples* to *path* in JSON Lines format, one example per line."""
    serialized = (
        json.dumps(_example_to_dict(ex), ensure_ascii=False) + "\n"
        for ex in examples
    )
    with open(path, "w") as f:
        f.writelines(serialized)
# Persist the balanced splits as JSONL next to the source data so the
# exact train/test partition is reproducible and inspectable.
train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
save_jsonl(train_path, trainset)
save_jsonl(test_path, testset)
def health_literacy_metric(gold, pred, trace=None):
    """Return True when the predicted literacy label matches the gold label.

    Comparison is case-insensitive and uses substring containment so that a
    wordy model output (e.g. "The label is low_health_literacy.") still
    counts as correct. A missing or label-less prediction scores False.
    """
    if not pred or not hasattr(pred, 'literacy_label'):
        return False
    expected = str(gold.literacy_label).strip().lower()
    predicted = str(pred.literacy_label).strip().lower()
    # Containment (rather than strict equality) tolerates verbose outputs.
    return expected in predicted
# --- 2. Optimizer setup ---
# The teacher LM bootstraps few-shot demonstrations; random search over
# candidate programs picks the best scorer under health_literacy_metric.
optimizer = BootstrapFewShotWithRandomSearch(
    metric=health_literacy_metric,
    max_bootstrapped_demos=3,
    num_candidate_programs=8,
    teacher_settings=dict(lm=openai_model_teacher)
)
# 3. Compile! This creates the "optimized prompt"
compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)
# Evaluate the compiled program on the held-out balanced test set.
evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
evaluation_result = evaluator(compiled_classifier)
# The evaluator may return an object exposing .score or a bare float,
# depending on the dspy version — handle both.
accuracy_score = (
    float(evaluation_result.score)
    if hasattr(evaluation_result, "score")
    else float(evaluation_result)
)
def _extract_usage(record):
if isinstance(record, dict):
usage = record.get("usage")
if usage:
return usage
response = record.get("response")
if isinstance(response, dict) and response.get("usage"):
return response["usage"]
return None
def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
prompt_tokens = 0
completion_tokens = 0
cached_tokens = 0
for record in getattr(lm, "history", []) or []:
usage = _extract_usage(record)
if not usage:
continue
prompt_tokens += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
completion_tokens += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
cached_tokens += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)
cost = (prompt_tokens / 1_000_000) * price_in_per_1m
cost += (completion_tokens / 1_000_000) * price_out_per_1m
if price_cached_in_per_1m is not None:
cost += (cached_tokens / 1_000_000) * price_cached_in_per_1m
return {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"cached_tokens": cached_tokens,
"cost_usd": cost,
}
# Fill these with current OpenAI pricing (USD per 1M tokens).
GPT5_PRICE_INPUT_PER_1M = 1.25
GPT5_PRICE_OUTPUT_PER_1M = 10.0
GPT5_MINI_PRICE_INPUT_PER_1M = 0.25
GPT5_MINI_PRICE_OUTPUT_PER_1M = 2.0
# Tally tokens recorded on each LM's call history into USD estimates.
teacher_cost = calc_cost_usd(
    openai_model_teacher,
    GPT5_PRICE_INPUT_PER_1M,
    GPT5_PRICE_OUTPUT_PER_1M,
)
student_cost = calc_cost_usd(
    openai_model_student,
    GPT5_MINI_PRICE_INPUT_PER_1M,
    GPT5_MINI_PRICE_OUTPUT_PER_1M,
)
# Keyed by model name for the final JSON cost report.
cost_report = {
    "gpt-5": teacher_cost,
    "gpt-5-mini": student_cost,
}
# --- 4. Persist artifacts: compiled program, accuracy, and cost report ---
folder_name="student-gpt5-mini_teacher-gpt5_v1"
os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
# Save the optimized program (prompt + demos) for later reloading.
compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")
print(evaluation_result)
print(json.dumps(cost_report, indent=2))
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
    json.dump(
        {
            "accuracy_score": accuracy_score,
            # Per-example result count, when the evaluator exposes .results.
            "num_results": len(getattr(evaluation_result, "results", []) or []),
        },
        f,
        indent=2,
    )
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
    json.dump(cost_report, f, indent=2)