import os
import json
from datetime import datetime
from datasets import Dataset
from openai import OpenAI
from transformers import AutoTokenizer
from unsloth.chat_templates import get_chat_template
# -----------------------------
# Configuration
# -----------------------------
# vLLM server (OpenAI-compatible) URL, e.g. "http://localhost:8000/v1"
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "http://localhost:8040/v1")
# Model name as seen by vLLM server (can be HF repo id or local path)
VLLM_MODEL_NAME = os.getenv(
"VLLM_MODEL_NAME",
"classifier", # adjust if needed
)
# Dummy key is fine for vLLM if auth is disabled
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "EMPTY")
# Data and output paths (mirrors finetune script)
data_path = "/home/mshahidul/readctrl/code/text_classifier/bn/testing_bn_full.json"
test_size = 0.2
seed = 42
prompt_language = "en" # "bn" or "en"
model_info_dir = "/home/mshahidul/readctrl/code/text_classifier/bn/model_info"
ablation_dir = "/home/mshahidul/readctrl/code/text_classifier/bn/ablation_studies"
os.makedirs(model_info_dir, exist_ok=True)
os.makedirs(ablation_dir, exist_ok=True)
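# This script expects a vLLM OpenAI-compatible server to already be running.
# An illustrative launch (the checkpoint path is an assumption, not taken
# from this repo; the served name and port mirror the defaults above):
#   vllm serve /path/to/finetuned_checkpoint \
#       --served-model-name classifier --port 8040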
# -----------------------------
# Chat template / tokenizer (match finetune script)
# -----------------------------
BASE_MODEL_FOR_TEMPLATE = "unsloth/gemma-3-4b-it"
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_FOR_TEMPLATE)
tokenizer = get_chat_template(tokenizer, chat_template="gemma-3")
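# For reference: with the gemma-3 template, apply_chat_template renders a
# single user turn roughly as (exact text depends on the template version):
#   <bos><start_of_turn>user\n{content}<end_of_turn>\n<start_of_turn>model\n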
# -----------------------------
# Prompt construction (copied from finetune script)
# -----------------------------
def build_classification_user_prompt(fulltext, gen_text):
    # Task framing: fulltext (reference) + gen_text (text to classify) -> one label.
    # This function only builds the user prompt; the label comes from the model.
if prompt_language == "en":
return (
"You will be given a medical case description as reference (full text) and a generated text to classify. "
"Determine the patient's health literacy level based only on the generated text.\n\n"
f"Reference (full text):\n{fulltext}\n\n"
f"Generated text (to classify):\n{gen_text}\n\n"
"Reply with exactly one label from this set:\n"
"low_health_literacy, intermediate_health_literacy, proficient_health_literacy"
)
    # Bangla fallback (used when prompt_language == "bn"); mirrors the English prompt above
return (
"আপনাকে রেফারেন্স হিসেবে মেডিকেল কেসের পূর্ণ বর্ণনা (reference full text) এবং মূলভাবে শ্রেণিবিন্যাস করার জন্য তৈরি করা টেক্সট (generated text) দেওয়া হবে। "
"শুধুমাত্র তৈরি করা টেক্সট (generated text)-এর উপর ভিত্তি করে রোগীর স্বাস্থ্যজ্ঞান (health literacy) কোন স্তরের তা নির্ধারণ করুন।\n\n"
f"Reference (full text):\n{fulltext}\n\n"
f"Generated text (যেটি শ্রেণিবিন্যাস করতে হবে):\n{gen_text}\n\n"
"শুধু নিচের সেট থেকে একটি লেবেল দিয়ে উত্তর দিন:\n"
"low_health_literacy, intermediate_health_literacy, proficient_health_literacy"
)
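# Example (illustrative values):
#   build_classification_user_prompt("62-year-old male with chest pain ...",
#                                    "Your heart test came back okay ...")
# returns a single instruction block ending with the three allowed labels.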
def build_classification_examples(raw_records):
examples = []
for record in raw_records:
fulltext = record.get("fulltext", "")
gen_text = record.get("gen_text", "")
label = (record.get("label") or "").strip()
if not label:
continue
user_prompt = build_classification_user_prompt(fulltext, gen_text)
examples.append(
{
"fulltext": fulltext,
"gen_text": gen_text,
"gold_label": label,
"user_prompt": user_prompt,
}
)
return examples
# -----------------------------
# vLLM client
# -----------------------------
client = OpenAI(
base_url=VLLM_BASE_URL,
api_key=OPENAI_API_KEY,
)
def vllm_generate_label(user_prompt: str, max_tokens: int = 32) -> str:
"""Call vLLM endpoint using the same chat template as finetuning."""
prompt = tokenizer.apply_chat_template(
[{"role": "user", "content": user_prompt}],
tokenize=False,
add_generation_prompt=True,
)
    # Stop sequences: Gemma-3 ends a turn with <end_of_turn> (and <eos>);
    # the remaining entries are harmless no-ops for this template. Since the
    # labels are single lines, stopping at a newline is the safest guard.
    stop_sequences = [tokenizer.eos_token, "<|endoftext|>", "\n", "<|im_end|>", "<eos>", "<end_of_turn>"]
response = client.completions.create(
model=VLLM_MODEL_NAME,
prompt=prompt,
temperature=0.0,
max_tokens=max_tokens,
        stop=stop_sequences,  # stop generation as soon as the label line ends
)
content = response.choices[0].text or ""
    # Clean up: take the first non-empty line, which handles outputs like "label\n\n"
    predicted_label = content.strip().split("\n")[0].strip()
return predicted_label
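# Alternative sketch (not wired into main): query the server's chat endpoint
# and let vLLM apply its own chat template instead of rendering the prompt
# locally. This matches the completions path above only if the server-side
# template equals the one used for finetuning.
def vllm_generate_label_chat(user_prompt: str, max_tokens: int = 32) -> str:
    response = client.chat.completions.create(
        model=VLLM_MODEL_NAME,
        messages=[{"role": "user", "content": user_prompt}],
        temperature=0.0,
        max_tokens=max_tokens,
    )
    content = response.choices[0].message.content or ""
    return content.strip().split("\n")[0].strip()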
# -----------------------------
# Data loading & test split
# -----------------------------
def load_test_split():
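    """Recreate the finetune script's held-out split (same seed and test_size)."""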
with open(data_path, "r", encoding="utf-8") as f:
raw_data = json.load(f)
raw_dataset = Dataset.from_list(raw_data)
split_dataset = raw_dataset.train_test_split(
test_size=test_size, seed=seed, shuffle=True
)
test_raw = split_dataset["test"]
return test_raw
# -----------------------------
# Evaluation
# -----------------------------
def evaluate_with_vllm(test_split):
examples = build_classification_examples(test_split)
results = []
total = 0
correct = 0
for idx, ex in enumerate(examples):
fulltext = ex["fulltext"]
gen_text = ex["gen_text"]
gold_label = ex["gold_label"]
user_prompt = ex["user_prompt"]
try:
pred_label = vllm_generate_label(user_prompt)
except Exception as e:
pred_label = f"ERROR: {e}"
total += 1
is_correct = pred_label == gold_label
if is_correct:
correct += 1
results.append(
{
"sample_index": idx,
"fulltext": fulltext,
"gen_text": gen_text,
"gold_label": gold_label,
"predicted_label": pred_label,
"correct": is_correct,
}
)
accuracy = correct / total if total else 0.0
return results, accuracy
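# Optional sketch: per-gold-label accuracy computed from the results produced
# above. Field names follow evaluate_with_vllm's output; this helper is
# illustrative and not called by main().
def per_label_accuracy(results):
    from collections import Counter
    totals, hits = Counter(), Counter()
    for r in results:
        totals[r["gold_label"]] += 1
        hits[r["gold_label"]] += int(r["correct"])
    return {label: hits[label] / totals[label] for label in totals}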
def main():
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_tag = os.path.basename(str(VLLM_MODEL_NAME)).replace(".", "_")
test_raw = load_test_split()
results, accuracy = evaluate_with_vllm(test_raw)
metrics = {
"mode": "fulltext_gen_text_classification",
"model_name": VLLM_MODEL_NAME,
"dataset_path": data_path,
"prompt_language": prompt_language,
"seed": seed,
"test_size": test_size,
"examples_evaluated": len(results),
"accuracy": accuracy,
"timestamp": timestamp,
"inference_backend": "vllm_openai_server",
}
predictions_path = os.path.join(
model_info_dir, f"{model_tag}_vllm_test_inference_{timestamp}.json"
)
accuracy_path = os.path.join(
ablation_dir, f"{model_tag}_vllm_classification_{timestamp}.json"
)
with open(predictions_path, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=2)
with open(accuracy_path, "w", encoding="utf-8") as f:
json.dump(metrics, f, ensure_ascii=False, indent=2)
print(f"Saved vLLM test inference to: {predictions_path}")
print(f"Saved vLLM test accuracy to: {accuracy_path}")
print(f"Accuracy: {accuracy:.4f}")
if __name__ == "__main__":
main()