| import argparse |
| import os |
| import json |
| import sys |
|
|
|
|
# --- Command-line interface -------------------------------------------------
# Two knobs: which checkpoint to load and the sampling temperature.
_DEFAULT_MODEL = "/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2"

parser = argparse.ArgumentParser(description="Readability Controlled Generation")
parser.add_argument("--model_name", type=str, default=_DEFAULT_MODEL)
parser.add_argument("--temperature", type=float, default=0.1)
args = parser.parse_args()

# Unpack into module-level names used throughout the script.
model_name, temperature = args.model_name, args.temperature
|
|
|
|
# System prompts for the three readability bands. Each one pins a target
# Fernández Huerta readability range and instructs the model to return only
# the rewritten Spanish text, with no extra commentary.
_EASY_PROMPT = '''
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
'''

_INTERMEDIATE_PROMPT = '''
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
'''

_HARD_PROMPT = '''
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
'''

prompts = {
    "easy": _EASY_PROMPT,
    "intermediate": _INTERMEDIATE_PROMPT,
    "hard": _HARD_PROMPT,
}
|
|
| |
# Load the pre-generated keyword/definition file and index it by case id.
# Each entry maps: case id -> a formatted glossary string ("" when the entry
# has no medical keywords), later appended to the user prompt.
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, "r", encoding="utf-8") as f:
    definitions_data = json.load(f)


def_map = {}
for entry in definitions_data:
    keywords = entry.get("medical_keywords", [])
    glossary_text = ""
    if keywords:
        bullets = "\n".join(f"• {d['term']} — {d['definition']}" for d in keywords)
        glossary_text = "Relevant medical definitions:\n" + bullets
    def_map[entry.get("id")] = glossary_text
| |
|
|
# --- Input corpus and (resumable) output file -------------------------------
path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v3"
os.makedirs(out_dir, exist_ok=True)

# A model_name that exists on disk is a local fine-tuned checkpoint directory;
# anything else is treated as a hub id for the base model. Only the variant
# tag differs between the two output filenames, so build it once (DRY) and
# join with os.path.join instead of string concatenation.
variant = "finetuned" if os.path.exists(model_name) else "base"
out_path = os.path.join(out_dir, f"temp{temperature}_qwen3-14B_{variant}_with_defs.json")

# Resume support: reload any earlier results and skip source texts already
# processed. NOTE(review): completion is tracked per source text (fulltext),
# not per readability band — a run that died mid-item may leave partial bands
# in the file that will not be regenerated; confirm this is acceptable.
results, completed_keys = [], set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    completed_keys = {r["fulltext"] for r in results}
|
|
| |
# Load the evaluation set and cap this run at the first 50 records.
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)[:50]
|
|
# Heavyweight imports are deferred until after argument parsing so that
# `--help` and early failures stay fast.
from unsloth import FastLanguageModel
import torch


# Load the model (fine-tuned local checkpoint or base hub id) plus its
# tokenizer in full precision — both quantization flags are off.
# NOTE(review): max_seq_length=4092 looks like a typo for the usual 4096;
# confirm before changing, since it caps the usable context window.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=4092,
    load_in_4bit=False,   # no 4-bit quantization
    load_in_8bit=False,   # no 8-bit quantization
    full_finetuning=False,  # inference only; no training state needed
)
|
|
import tqdm

# Target output length as a fraction of the (tokenized) prompt length.
# Hoisted out of the loop: it is invariant across items and bands.
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

# For every source text, generate one rewrite per readability band.
# Results are checkpointed to disk after each completed item (3 new entries).
for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        continue  # already fully processed in a previous (resumed) run
    item_id = item["id"]
    glossary = def_map.get(item_id, "")

    # The user message is band-independent, so build it once per item.
    user_content = f"Input text:\n{item['fulltext'].strip()}"
    if glossary:
        user_content += "\n\n" + glossary

    for band in ["easy", "intermediate", "hard"]:
        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content}
        ]

        # Render the chat template to plain text (thinking mode disabled),
        # then tokenize for generation on GPU.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Budget: scale with prompt length, clamped to [150, 1200] tokens.
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )
        # BUG FIX: generate() returns prompt + completion for decoder-only
        # models; decode only the newly generated tokens, otherwise the whole
        # rendered chat prompt leaks into synthetic_summary.
        output_text = tokenizer.decode(
            output_ids[0][input_len:], skip_special_tokens=True
        )

        results.append({
            "id": item_id,
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": bool(glossary)
        })

    completed_keys.add(key)
    # Each item contributes exactly 3 results, so this checkpoints once per
    # item whenever the running total is a multiple of 3.
    if len(results) % 3 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
|
# Final write: persist everything, including any trailing partial checkpoint.
payload = json.dumps(results, ensure_ascii=False, indent=2)
with open(out_path, "w", encoding="utf-8") as f:
    f.write(payload)