import dspy
import json
import os
import random
from typing import Literal
from dspy.teleprompt import BootstrapFewShotWithRandomSearch
from dspy.evaluate import Evaluate
# --- 1. LLM Configuration ---
# Load API keys from a local JSON file; expects a top-level "openai" entry.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]
# Student: Local vLLM (Deployment Model)
# Points at a local OpenAI-compatible vLLM server; the key is unused there.
vllm_model = dspy.LM(
    model="openai/dspy",
    api_base="http://172.16.34.29:8030/v1",
    api_key="EMPTY",
    temperature=0.0  # deterministic decoding for classification
)
# Teacher: OpenAI (High-quality rationale generation)
# Note: Ensure 'gpt-5' is the correct model name in your environment (usually 'gpt-4-turbo' or 'gpt-4o')
openai_model_teacher = dspy.LM(model="gpt-5", api_key=openai_api_key)
# Default LM for DSPy runtime
# Use the local vLLM for fast iteration.
dspy.configure(lm=vllm_model)
class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' relative to 'full_text' to determine
    the health literacy level.
    """
    # NOTE(review): the docstring and `desc` strings above/below are sent to
    # the LM as prompt text in DSPy — editing them changes runtime behavior.
    full_text = dspy.InputField(desc="Original clinical or medical source text containing jargon and technical details.")
    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    # Output is one of three label strings; the metric below matches them
    # by case-insensitive substring inclusion.
    literacy_label = dspy.OutputField(
        desc="Classification: low_health_literacy (simple words, no jargon), intermediate_health_literacy (moderate technicality), or proficient_health_literacy (highly technical/original level)."
    )
class HealthLiteracyClassifier(dspy.Module):
    """DSPy module that classifies the health-literacy level of a rewrite."""

    def __init__(self):
        super().__init__()
        # Use ChainOfThought for better reasoning on medical jargon
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, full_text, generated_text):
        # Returns a dspy Prediction carrying a 'literacy_label' field
        # (plus the chain-of-thought rationale added by ChainOfThought).
        return self.classifier(full_text=full_text, generated_text=generated_text)
def prepare_data(raw_data, seed=42, train_ratio=0.6):
    """Build balanced, shuffled train/test splits of dspy.Example objects.

    Records whose "label" is not one of the three known literacy labels are
    skipped. Every label contributes the same number of examples (the size
    of the rarest label), split per-label by `train_ratio` with at least one
    example kept on each side.

    Raises ValueError if any label has zero usable records.
    """
    labels = [
        "low_health_literacy",
        "intermediate_health_literacy",
        "proficient_health_literacy",
    ]
    rng = random.Random(seed)

    # Group convertible records by label, dropping unknown labels.
    buckets = {name: [] for name in labels}
    for record in raw_data:
        tag = record.get("label")
        if tag in buckets:
            ex = dspy.Example(
                full_text=record["fulltext"],
                generated_text=record["diff_label_texts"],
                literacy_label=tag,  # Matches the Signature field
            ).with_inputs("full_text", "generated_text")
            buckets[tag].append(ex)

    # Balance on the rarest label.
    per_label_total = min(len(buckets[name]) for name in labels)
    if per_label_total == 0:
        raise ValueError("One or more labels has no examples; cannot balance.")

    # Per-label train size, clamped so both splits are non-empty.
    per_label_train = int(round(per_label_total * train_ratio))
    per_label_train = max(1, min(per_label_train, per_label_total - 1))

    trainset, testset = [], []
    for name in labels:
        rng.shuffle(buckets[name])
        chosen = buckets[name][:per_label_total]
        trainset += chosen[:per_label_train]
        testset += chosen[per_label_train:]
    rng.shuffle(trainset)
    rng.shuffle(testset)
    return trainset, testset
# --- 2. Data Loading ---
# (The duplicate `import json` that used to live here was removed; json is
# already imported at the top of the file.)
path = "/home/mshahidul/readctrl/code/text_classifier/verified_combined_0-80.json"
# Use a context manager so the file handle is closed deterministically
# (`json.load(open(path))` leaked the handle), and pin the encoding so
# parsing does not depend on the platform's default locale.
with open(path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)
trainset, testset = prepare_data(raw_data)
def _example_to_dict(example):
return {
"full_text": example.full_text,
"generated_text": example.generated_text,
"literacy_label": example.literacy_label,
}
def save_jsonl(path, examples):
    """Write `examples` to `path` as JSON Lines (one object per line).

    The file is opened with an explicit UTF-8 encoding: records are
    serialized with ensure_ascii=False, so relying on the platform default
    encoding could raise UnicodeEncodeError for non-ASCII clinical text.
    """
    with open(path, "w", encoding="utf-8") as f:
        for ex in examples:
            f.write(json.dumps(_example_to_dict(ex), ensure_ascii=False) + "\n")
# Persist the balanced splits next to the source data for reproducibility.
train_path = "/home/mshahidul/readctrl/code/text_classifier/train.jsonl"
test_path = "/home/mshahidul/readctrl/code/text_classifier/test.jsonl"
save_jsonl(train_path, trainset)
save_jsonl(test_path, testset)
def health_literacy_metric(gold, pred, trace=None):
    """Boolean DSPy metric: does the prediction contain the gold label?

    Both labels are lowercased and stripped before comparing; substring
    (rather than equality) matching tolerates wordy model outputs.
    `trace` is accepted for DSPy's metric signature and ignored.
    """
    def canon(obj):
        # Normalize away case and surrounding whitespace.
        return str(obj.literacy_label).strip().lower()

    if not pred or not hasattr(pred, 'literacy_label'):
        return False
    # Simple inclusion check helps if the LLM gets wordy
    return canon(gold) in canon(pred)
# Bootstrap few-shot demos with the GPT-5 teacher, then pick the best of
# several candidate programs via random search over demo subsets.
optimizer = BootstrapFewShotWithRandomSearch(
    metric=health_literacy_metric,
    max_bootstrapped_demos=3,
    num_candidate_programs=8,
    teacher_settings=dict(lm=openai_model_teacher)
)
# 3. Compile! This creates the "optimized prompt"
compiled_classifier = optimizer.compile(HealthLiteracyClassifier(), trainset=trainset)
# Evaluate the compiled program on the held-out split, single-threaded.
evaluator = Evaluate(devset=testset, metric=health_literacy_metric, num_threads=1, display_progress=True)
evaluation_result = evaluator(compiled_classifier)
# Some dspy versions return an object exposing `.score`; others return a
# bare number. Normalize to a float either way.
accuracy_score = (
    float(evaluation_result.score)
    if hasattr(evaluation_result, "score")
    else float(evaluation_result)
)
def _extract_usage(record):
if isinstance(record, dict):
usage = record.get("usage")
if usage:
return usage
response = record.get("response")
if isinstance(response, dict) and response.get("usage"):
return response["usage"]
return None
def calc_cost_usd(lm, price_in_per_1m, price_out_per_1m, price_cached_in_per_1m=None):
prompt_tokens = 0
completion_tokens = 0
cached_tokens = 0
for record in getattr(lm, "history", []) or []:
usage = _extract_usage(record)
if not usage:
continue
prompt_tokens += int(usage.get("prompt_tokens", usage.get("input_tokens", 0)) or 0)
completion_tokens += int(usage.get("completion_tokens", usage.get("output_tokens", 0)) or 0)
cached_tokens += int(usage.get("cached_tokens", usage.get("prompt_tokens_cached", 0)) or 0)
cost = (prompt_tokens / 1_000_000) * price_in_per_1m
cost += (completion_tokens / 1_000_000) * price_out_per_1m
if price_cached_in_per_1m is not None:
cost += (cached_tokens / 1_000_000) * price_cached_in_per_1m
return {
"prompt_tokens": prompt_tokens,
"completion_tokens": completion_tokens,
"cached_tokens": cached_tokens,
"cost_usd": cost,
}
# Fill these with current OpenAI pricing (USD per 1M tokens).
GPT5_PRICE_INPUT_PER_1M = 1.25
GPT5_PRICE_OUTPUT_PER_1M = 10.0
# Teacher-side spend only: the local vLLM student has no per-token cost.
teacher_cost = calc_cost_usd(
    openai_model_teacher,
    GPT5_PRICE_INPUT_PER_1M,
    GPT5_PRICE_OUTPUT_PER_1M,
)
cost_report = {
    "gpt-5": teacher_cost,
}
# Persist artifacts: compiled program state, accuracy summary, cost report.
folder_name = "vllm-qwen3-8b_teacher-gpt5_v1"
os.makedirs(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}", exist_ok=True)
compiled_classifier.save(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/model.json")
print(evaluation_result)
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/accuracy.json", "w") as f:
    json.dump(
        {
            "accuracy_score": accuracy_score,
            # Per-example result count, when the dspy version exposes it.
            "num_results": len(getattr(evaluation_result, "results", []) or []),
        },
        f,
        indent=2,
    )
print(json.dumps(cost_report, indent=2))
with open(f"/home/mshahidul/readctrl/code/text_classifier/dspy_model/{folder_name}/cost.json", "w") as f:
    json.dump(cost_report, f, indent=2)