# File size: 6,306 Bytes

# 9c6961c

import argparse
import os
import json
import sys
sys.path.append(os.path.abspath('/home/mshahidul/'))
from gpu_selection import _gpu_selection_

# Command-line options: GPU index, model path/name, and sampling temperature.
parser = argparse.ArgumentParser(description="Readability Controlled Generation")
for option, caster, fallback in (
    ("--cuda", str, "3"),
    ("--model_name", str, "/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2"),
    ("--temperature", float, 0.1),
):
    parser.add_argument(option, type=caster, default=fallback)
args = parser.parse_args()

model_name, temperature = args.model_name, args.temperature

# Pin the visible GPU when one was requested; otherwise defer to the helper
# imported from gpu_selection.
# NOTE: with the current "--cuda" default of "3", args.cuda is never None, so
# the helper branch is not reached.
if args.cuda is None:
    _gpu_selection_()
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")

# System prompts (written in Spanish), one per target readability band.
# Each one asks the model to rewrite a medical report so that its "FH" score
# falls in a given range: easy 70-100, intermediate 50-70, hard 0-50.
# NOTE(review): "FH" presumably refers to the Fernández-Huerta readability
# index for Spanish — confirm.
# Every literal starts and ends with a newline; these are removed with
# .strip() where the system message is built.
prompts={
# Plain-language rewrite aimed at patients or relatives without medical training.
"easy":'''
Reescribe el siguiente informe médico en español con un nivel de lectura fácil correspondiente a un puntaje FH entre 70 y 100 (texto muy comprensible).
Usa oraciones cortas y directas, vocabulario cotidiano, estructuras simples y explicaciones claras de términos médicos. El tono debe ser empático y accesible, como si estuvieras explicando la situación a un paciente o familiar sin conocimientos médicos.
Mantén los datos clínicos y resultados esenciales, pero reemplaza o aclara tecnicismos con frases simples. Evita abreviaturas o siglas sin explicación.
''',
# Formal but understandable rewrite; technical terms get brief explanations.
"intermediate": '''
Reformula el siguiente informe médico en español con un nivel de lectura intermedio, correspondiente a un puntaje FH entre 50 y 70 (texto de dificultad moderada).
Usa lenguaje formal pero comprensible, adecuado para lectores con educación general o estudiantes del área de salud. Mantén la precisión médica, pero agrega explicaciones breves tras los términos técnicos. Alterna oraciones simples y compuestas, con buena fluidez y cohesión.
El texto debe sonar profesional, informativo y claro, sin llegar a la densidad típica de lenguaje técnico especializado.
''',
# Specialist-level rewrite; terminology is kept precise and left unexplained.
"hard": '''
Reescribe el siguiente informe médico en español con un nivel de lectura avanzado o técnico, correspondiente a un puntaje FH entre 0 y 50 (texto especializado).
Usa terminología médica precisa, estructuras sintácticas complejas y tono formal típico de documentos clínicos o publicaciones científicas. No simplifiques ni expliques los tecnicismos; conserva la exactitud conceptual y la nomenclatura profesional.
Refleja el razonamiento clínico, hallazgos y juicios médicos con lenguaje apropiado para médicos, especialistas o investigadores.
'''
}
# -------- Keyword–definition dataset ----------
# The JSON file is read as a list of records; each record is queried for an
# "id" and an optional "medical_keywords" list whose entries expose "term"
# and "definition".
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, "r", encoding="utf-8") as f:
    definitions_data = json.load(f)


def _format_glossary(keywords):
    """Render keyword entries as a bulleted glossary; empty input yields ""."""
    if not keywords:
        return ""
    bullets = "\n".join(
        f"• {entry['term']} — {entry['definition']}" for entry in keywords
    )
    return "Relevant medical definitions:\n" + bullets


# Lookup table: record id -> glossary text ("" when the record has no keywords).
# When two records share an id, the later one wins.
def_map = {
    record.get("id"): _format_glossary(record.get("medical_keywords", []))
    for record in definitions_data
}
# --------------------------------------------------------------

# Input dataset and output directory.
path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/custom_promptsV1"
os.makedirs(out_dir, exist_ok=True)

# A model_name that exists on disk gets the "finetuned" file name; anything
# else gets the "base" file name.
out_path = out_dir + (
    f"/temp{temperature}_qwen3-14B_finetuned_with_defs.json"
    if os.path.exists(model_name)
    else f"/temp{temperature}_qwen3-14B_base_with_defs.json"
)

# Resume support: reload earlier results (if any) and remember which source
# texts they already cover.
results, completed_keys = [], set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    completed_keys.update(record["fulltext"] for record in results)

# -------- Load main dataset (only the first 50 records are kept) -----------
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)[0:50]

# These imports sit here rather than at the top of the file, so they run
# after CUDA_VISIBLE_DEVICES has been set.
from unsloth import FastLanguageModel
import torch

# Loader settings: 4-bit and 8-bit loading are off, full fine-tuning is off.
# NOTE(review): max_seq_length is 4092 rather than 4096 — confirm this is intended.
load_options = dict(
    model_name=model_name,
    max_seq_length=4092,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

import tqdm

# Readability bands, in the order they are generated for every item.
BANDS = ("easy", "intermediate", "hard")
# Token budget for the output as a multiple of the prompt length, per band.
# Defined once here instead of being rebuilt on every loop iteration.
LENGTH_FACTORS = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}
# The glossary is currently NOT appended to the prompt. Set this to True to
# append it; "definitions_used" in the output follows this switch so the
# saved records describe what was really sent to the model.
APPEND_GLOSSARY = False


def _save_results(records, destination):
    """Write *records* to *destination* as JSON without risking a truncated file.

    The data is written to a sibling temporary file and then moved into place
    with os.replace, so an interruption in the middle of a write cannot
    corrupt the checkpoint that a later run resumes from.
    """
    tmp_path = destination + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, destination)


# Resume at (text, band) granularity. Tracking only the text would mark an
# item as finished as soon as its first band was stored, so a run interrupted
# between bands would never produce the missing ones.
done_pairs = {(r["fulltext"], r.get("band")) for r in results}

for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    pending_bands = [band for band in BANDS if (key, band) not in done_pairs]
    if not pending_bands:
        continue

    item_id = item["id"]
    glossary = def_map.get(item_id, "")  # "" when no glossary exists for this id
    glossary_applied = APPEND_GLOSSARY and bool(glossary)

    for band in pending_bands:
        user_content = f"Input text:\n{key.strip()}"
        if glossary_applied:
            # Append definitions below the case text.
            user_content += "\n\n" + glossary

        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content},
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Budget scales with the prompt length, clamped to [150, 1200] tokens.
        max_new_tokens = int(min(1200, max(150, input_len * LENGTH_FACTORS[band])))

        # NOTE(review): temperature/top_p/top_k only influence decoding when
        # sampling is enabled, which here depends on the model's generation
        # config — confirm, or pass do_sample=True explicitly.
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )
        # The returned sequence starts with the prompt tokens; drop them so
        # only the newly generated text is stored as the summary.
        generated_ids = output_ids[0][input_len:]
        output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

        results.append({
            "id": item_id,
            "fulltext": key,
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": glossary_applied,  # True only if the glossary was in the prompt
        })
        done_pairs.add((key, band))

    # All bands for this item are now stored; checkpoint once per item.
    completed_keys.add(key)
    _save_results(results, out_path)

# Final write, so the output file exists even when nothing new was generated.
_save_results(results, out_path)


# Send a completion notice through the notifier helper once the results
# have been written.
from notifier import send_notification

completion_message = f"Finished inference with model {model_name} at temperature {temperature}. Results saved to {out_path}"
send_notification(
    "process-complete1507034",
    completion_message,
    title="Inference Complete",
    priority="default",
    tags="tada",
)