# readctrl / code / text_classifier / text_classifier_dspy_only_gen_text.py
# Uploaded by shahidul034 using the upload-large-folder tool (commit 1db7196, verified).
import dspy
import json
import os
import random
from typing import Literal
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate import Evaluate
# --- 1. LLM Configuration ---
# API keys are read from a local JSON file; an "openai" entry is required.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]
# Student: Local vLLM (Deployment Model)
# Served through an OpenAI-compatible endpoint; "EMPTY" is the usual
# placeholder API key for a local vLLM server.
vllm_model = dspy.LM(
    model='Qwen/Qwen3-30B-A3B-Instruct-2507',
    api_base="http://172.16.34.29:8030/v1",
    api_key="EMPTY",
    temperature=0.0  # deterministic decoding for classification
)
# Teacher: OpenAI (High-quality rationale generation)
# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
openai_model_teacher = dspy.LM(model='gpt-5', api_key=openai_api_key)
openai_model_student = dspy.LM(model='gpt-5-mini', api_key=openai_api_key)
# Default LM for DSPy runtime
# Use the local vLLM for fast iteration; switch to openai_model_student if needed.
# dspy.configure(lm=vllm_model)
dspy.configure(lm=openai_model_student)
class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' to determine the health literacy level.
    """
    # Single text input; the rewritten passage to be classified.
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    # Output must be one of the three label strings named in the desc below;
    # health_literacy_metric matches on these labels case-insensitively.
    literacy_label = dspy.OutputField(
        desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
    )
class HealthLiteracyClassifier(dspy.Module):
    """DSPy module that classifies a text's health-literacy level.

    Wraps a single ChainOfThought predictor over HealthLiteracySignature.
    """

    def __init__(self):
        super().__init__()
        # Use ChainOfThought for better reasoning on medical jargon
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, generated_text):
        # Delegate to the predictor; the returned Prediction carries a
        # 'literacy_label' field per HealthLiteracySignature.
        return self.classifier(generated_text=generated_text)
def prepare_data(raw_data, seed=42, train_ratio=0.6):
    """Build balanced, label-stratified train/test splits of dspy Examples.

    Records whose "label" is not one of the three known literacy labels are
    skipped. Each label contributes the same number of examples (the size of
    the smallest label bucket), split per-label by ``train_ratio`` so both
    splits stay balanced. Shuffling is driven by ``seed`` for reproducibility.

    Raises:
        ValueError: if any label has zero usable examples.
    """
    label_names = [
        "low_health_literacy",
        "intermediate_health_literacy",
        "proficient_health_literacy",
    ]
    rng = random.Random(seed)
    grouped = {name: [] for name in label_names}
    for record in raw_data:
        current = record.get("label")
        if current not in grouped:
            continue  # unknown or missing label: drop the record
        grouped[current].append(
            dspy.Example(
                generated_text=record["diff_label_texts"],
                literacy_label=current,  # Matches the Signature field
            ).with_inputs("generated_text")
        )
    smallest = min(len(grouped[name]) for name in label_names)
    if smallest == 0:
        raise ValueError("One or more labels has no examples; cannot balance.")
    take_per_label = smallest
    # Per-label train count, clamped so both splits are non-degenerate.
    train_per_label = int(round(take_per_label * train_ratio))
    train_per_label = max(1, min(train_per_label, take_per_label - 1))
    train_split = []
    test_split = []
    for name in label_names:
        rng.shuffle(grouped[name])
        chosen = grouped[name][:take_per_label]
        train_split.extend(chosen[:train_per_label])
        test_split.extend(chosen[train_per_label:take_per_label])
    rng.shuffle(train_split)
    rng.shuffle(test_split)
    return train_split, test_split
# Load the labeled dataset and build balanced train/test splits.
# (json is already imported at the top of the file; the previous duplicate
# `import json` here was redundant, and `json.load(open(path))` leaked the
# file handle — a context manager closes it deterministically.)
path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
with open(path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)
trainset, testset = prepare_data(raw_data)
def _example_to_dict(example):
return {
"generated_text": example.generated_text,
"literacy_label": example.literacy_label,
}
def save_jsonl(path, examples):
    """Write *examples* to *path* in JSON Lines format, one example per line."""
    serialized = (
        json.dumps(_example_to_dict(ex), ensure_ascii=False) + "\n"
        for ex in examples
    )
    with open(path, "w") as f:
        f.writelines(serialized)
# Persist the balanced splits as JSONL next to the source data so the
# exact train/test partition is reproducible and inspectable.
train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
save_jsonl(train_path, trainset)
save_jsonl(test_path, testset)
def health_literacy_metric(gold, pred, trace=None):
    """Return True when the predicted literacy label matches the gold label.

    Comparison is case-insensitive and uses substring containment so that a
    wordy model output (e.g. "The label is low_health_literacy.") still
    counts as correct. A missing or label-less prediction scores False.
    """
    if not pred or not hasattr(pred, 'literacy_label'):
        return False
    expected = str(gold.literacy_label).strip().lower()
    predicted = str(pred.literacy_label).strip().lower()
    # Containment (rather than strict equality) tolerates verbose outputs.
    return expected in predicted
# --- 2. Optimizer setup ---
# The teacher LM bootstraps few-shot demonstrations; random search over
# candidate programs picks the best scorer under health_literacy_metric.
optimizer = BootstrapFewShotWithRandomSearch(
    metric=health_literacy_metric,
    max_bootstrapped_demos=3,
    num_candidate_programs=8,
    teacher_settings=dict(lm=openai_model_teacher)
)
# 3. Compile! This creates the "optimized prompt"
compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)
# Evaluate the compiled program on the held-out balanced test set.
evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
evaluation_result = evaluator(compiled_classifier)
# The evaluator may return an object exposing .score or a bare float,
# depending on the dspy version — handle both.
accuracy_score = (
    float(evaluation_result.score)
    if hasattr(evaluation_result, "score")
    else float(evaluation_result)
)
def _extract_usage(record):
if isinstance(record, dict):
usage = record.get("usage")
if usage:
return usage
response = record.get("response")
if isinstance(response, dict) and response.get("usage"):
return response["usage"]
return None
def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
prompt_tokens = 0
completion_tokens = 0
cached_tokens = 0
for record in getattr(lm, "history", []) or []:
usage = _extract_usage(record)
if not usage:
continue
prompt_tokens += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
completion_tokens += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
cached_tokens += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)
cost = (prompt_tokens / 1_000_000) * price_in_per_1m
cost += (completion_tokens / 1_000_000) * price_out_per_1m
if price_cached_in_per_1m is not None:
cost += (cached_tokens / 1_000_000) * price_cached_in_per_1m
return {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"cached_tokens": cached_tokens,
"cost_usd": cost,
}
# Fill these with current OpenAI pricing (USD per 1M tokens).
GPT5_PRICE_INPUT_PER_1M = 1.25
GPT5_PRICE_OUTPUT_PER_1M = 10.0
GPT5_MINI_PRICE_INPUT_PER_1M = 0.25
GPT5_MINI_PRICE_OUTPUT_PER_1M = 2.0
# Tally tokens recorded on each LM's call history into USD estimates.
teacher_cost = calc_cost_usd(
    openai_model_teacher,
    GPT5_PRICE_INPUT_PER_1M,
    GPT5_PRICE_OUTPUT_PER_1M,
)
student_cost = calc_cost_usd(
    openai_model_student,
    GPT5_MINI_PRICE_INPUT_PER_1M,
    GPT5_MINI_PRICE_OUTPUT_PER_1M,
)
# Keyed by model name for the final JSON cost report.
cost_report = {
    "gpt-5": teacher_cost,
    "gpt-5-mini": student_cost,
}
# --- 4. Persist artifacts: compiled program, accuracy, and cost report ---
folder_name="student-gpt5-mini_teacher-gpt5_v1"
os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
# Save the optimized program (prompt + demos) for later reloading.
compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")
print(evaluation_result)
print(json.dumps(cost_report, indent=2))
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
    json.dump(
        {
            "accuracy_score": accuracy_score,
            # Per-example result count, when the evaluator exposes .results.
            "num_results": len(getattr(evaluation_result, "results", []) or []),
        },
        f,
        indent=2,
    )
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
    json.dump(cost_report, f, indent=2)