mshahidul

Initial commit of readCtrl code without large models

030876e 6 days ago

6.31 kB

	import argparse
	import os
	import json
	import sys
	sys.path.append(os.path.abspath('/home/mshahidul/'))
	from gpu_selection import _gpu_selection_

	# ---- Command-line options ------------------------------------------------
	# --cuda        value written to CUDA_VISIBLE_DEVICES
	# --model_name  local checkpoint directory or model identifier
	# --temperature sampling temperature passed to model.generate
	parser = argparse.ArgumentParser(description="Readability Controlled Generation")
	parser.add_argument("--cuda", default="3", type=str)
	parser.add_argument(
	    "--model_name",
	    default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2",
	    type=str,
	)
	parser.add_argument("--temperature", default=0.1, type=float)
	args = parser.parse_args()

	model_name, temperature = args.model_name, args.temperature

	# Restrict the visible GPU(s) through the environment before the model
	# libraries are imported further down the script.
	# NOTE(review): --cuda defaults to "3", so args.cuda is never None here and
	# the _gpu_selection_() branch is only reached if that default is changed.
	if args.cuda is None:
	    # Fall back to the project helper imported from gpu_selection.
	    _gpu_selection_()
	else:
	    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
	    print(f"🎮🎮 Using CUDA device: {args.cuda}")

	# System prompts keyed by target readability band. Each one asks the model,
	# in Spanish, to rewrite a Spanish medical report so that it falls inside a
	# given "FH" score range. The generation loop sends prompts[band].strip()
	# as the system message.
	# NOTE(review): "FH" presumably refers to the Fernández-Huerta readability
	# index — confirm.
	prompts={
	# easy: FH 70-100; short direct sentences, everyday vocabulary, medical
	# terms explained, empathetic tone for patients or relatives.
	"easy":'''
	Reescribe el siguiente informe médico en español con un nivel de lectura fácil correspondiente a un puntaje FH entre 70 y 100 (texto muy comprensible).
	Usa oraciones cortas y directas, vocabulario cotidiano, estructuras simples y explicaciones claras de términos médicos. El tono debe ser empático y accesible, como si estuvieras explicando la situación a un paciente o familiar sin conocimientos médicos.
	Mantén los datos clínicos y resultados esenciales, pero reemplaza o aclara tecnicismos con frases simples. Evita abreviaturas o siglas sin explicación.
	''',
	# intermediate: FH 50-70; formal but understandable language, technical
	# terms kept and followed by brief explanations.
	"intermediate": '''
	Reformula el siguiente informe médico en español con un nivel de lectura intermedio, correspondiente a un puntaje FH entre 50 y 70 (texto de dificultad moderada).
	Usa lenguaje formal pero comprensible, adecuado para lectores con educación general o estudiantes del área de salud. Mantén la precisión médica, pero agrega explicaciones breves tras los términos técnicos. Alterna oraciones simples y compuestas, con buena fluidez y cohesión.
	El texto debe sonar profesional, informativo y claro, sin llegar a la densidad típica de lenguaje técnico especializado.
	''',
	# hard: FH 0-50; precise medical terminology, complex syntax, no
	# simplification, aimed at physicians, specialists or researchers.
	"hard": '''
	Reescribe el siguiente informe médico en español con un nivel de lectura avanzado o técnico, correspondiente a un puntaje FH entre 0 y 50 (texto especializado).
	Usa terminología médica precisa, estructuras sintácticas complejas y tono formal típico de documentos clínicos o publicaciones científicas. No simplifiques ni expliques los tecnicismos; conserva la exactitud conceptual y la nomenclatura profesional.
	Refleja el razonamiento clínico, hallazgos y juicios médicos con lenguaje apropiado para médicos, especialistas o investigadores.
	'''
	}
	# -------- Keyword–definition glossary ----------
	# Read the keyword/definition records and turn them into a lookup from
	# record id to a ready-to-paste glossary string.
	kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
	with open(kw_file, "r", encoding="utf-8") as kw_handle:
	    definitions_data = json.load(kw_handle)

	# id -> glossary text; records without keywords map to the empty string.
	def_map = {}
	for entry in definitions_data:
	    entry_id = entry.get("id")
	    keywords = entry.get("medical_keywords", [])
	    if not keywords:
	        def_map[entry_id] = ""
	        continue
	    # One bullet per keyword: "• term — definition".
	    bullet_block = "\n".join(
	        f"• {kw['term']} — {kw['definition']}" for kw in keywords
	    )
	    def_map[entry_id] = "Relevant medical definitions:\n" + bullet_block
	# --------------------------------------------------------------

	# Input dataset and the directory that receives the generated summaries.
	path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
	out_dir = "/home/mshahidul/readctrl/results/custom_promptsV1"
	os.makedirs(out_dir, exist_ok=True)

	# The file name is tagged "finetuned" when model_name exists on the local
	# filesystem and "base" otherwise.
	model_variant = "finetuned" if os.path.exists(model_name) else "base"
	out_path = f"{out_dir}/temp{temperature}_qwen3-14B_{model_variant}_with_defs.json"

	# Resume support: reuse whatever a previous run already wrote to out_path.
	results = []
	if os.path.exists(out_path):
	    with open(out_path, "r", encoding="utf-8") as previous_run:
	        results = json.load(previous_run)

	# Source texts that already have output; the main loop skips these.
	completed_keys = {record["fulltext"] for record in results}

	# -------- Load main dataset -----------
	# Only the first 50 records of the test file are processed.
	with open(path, "r", encoding="utf-8") as dataset_file:
	    dataset = json.load(dataset_file)[:50]

	# NOTE(review): these imports sit below the CUDA_VISIBLE_DEVICES assignment,
	# presumably so the device restriction is in place before the GPU libraries
	# load — confirm before moving them to the top of the file.
	from unsloth import FastLanguageModel
	import torch

	# Load the model without 4-bit/8-bit quantisation and without enabling
	# full fine-tuning.
	# NOTE(review): max_seq_length=4092 may be a typo for 4096 — confirm.
	load_options = {
	    "model_name": model_name,
	    "max_seq_length": 4092,
	    "load_in_4bit": False,
	    "load_in_8bit": False,
	    "full_finetuning": False,
	}
	model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

	import tqdm

	# Readability bands, generated in this order for every report.
	bands = ("easy", "intermediate", "hard")

	# Generation budget per band as a fraction of the prompt length in tokens;
	# the result is clamped to the range [150, 1200] below.
	length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

	# Glossary injection is currently switched off (it was commented out in the
	# prompt construction). Keeping it behind one switch makes the recorded
	# "definitions_used" flag reflect what was actually sent to the model.
	append_glossary = False


	def _save_results(records, destination):
	    """Write *records* to *destination* as JSON without leaving a partial file.

	    The data is written to a sibling ``.tmp`` file first and then moved over
	    the destination with ``os.replace``, so an interruption during the write
	    cannot truncate the checkpoint that the resume logic reads back.
	    """
	    tmp_path = destination + ".tmp"
	    with open(tmp_path, "w", encoding="utf-8") as handle:
	        json.dump(records, handle, ensure_ascii=False, indent=2)
	    os.replace(tmp_path, destination)


	for item in tqdm.tqdm(dataset):
	    key = item["fulltext"]
	    if key in completed_keys:
	        # Already present in the output of a previous run.
	        continue
	    item_id = item["id"]
	    glossary = def_map.get(item_id, "")  # empty string when no glossary exists
	    glossary_applied = append_glossary and bool(glossary)

	    # The user message is identical for all three bands, so build it once.
	    user_content = f"Input text:\n{item['fulltext'].strip()}"
	    if glossary_applied:
	        user_content += "\n\n" + glossary

	    for band in bands:
	        messages = [
	            {"role": "system", "content": prompts[band].strip()},
	            {"role": "user", "content": user_content},
	        ]

	        text = tokenizer.apply_chat_template(
	            messages,
	            tokenize=False,
	            add_generation_prompt=True,
	            enable_thinking=False,
	        )

	        inputs = tokenizer(text, return_tensors="pt").to("cuda")
	        input_len = inputs.input_ids.shape[1]
	        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

	        # NOTE(review): temperature/top_p/top_k only matter when sampling is
	        # enabled; this relies on the model's generation config turning
	        # do_sample on — confirm.
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=max_new_tokens,
	            temperature=temperature,
	            top_p=0.9,
	            top_k=45,
	        )

	        # generate() returns the prompt tokens followed by the new tokens.
	        # Decode only the continuation so the stored summary does not
	        # contain the system prompt and the source report.
	        generated_ids = output_ids[0][input_len:]
	        output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

	        results.append({
	            "id": item_id,
	            "fulltext": item["fulltext"],
	            "band": band,
	            "lang": "es",
	            "synthetic_summary": output_text,
	            "definitions_used": glossary_applied,  # True only if the glossary was in the prompt
	        })

	    completed_keys.add(key)
	    # Checkpoint after every finished report so an interrupted run can resume.
	    _save_results(results, out_path)

	# Final write; also covers the case where every report was already done.
	_save_results(results, out_path)


	# Announce completion through the project's notifier helper.
	# NOTE(review): the first positional argument looks like a topic/channel
	# identifier — confirm against notifier.send_notification.
	from notifier import send_notification

	completion_message = (
	    f"Finished inference with model {model_name} at temperature {temperature}. "
	    f"Results saved to {out_path}"
	)
	send_notification(
	    "process-complete1507034",
	    completion_message,
	    title="Inference Complete",
	    priority="default",
	    tags="tada",
	)