| | import argparse |
| | import os |
| | import json |
| | import sys |
| | sys.path.append(os.path.abspath('/home/mshahidul/')) |
| | from gpu_selection import _gpu_selection_ |
| |
|
# Command-line options: which GPU to expose, which checkpoint to load and the
# sampling temperature used for generation.
parser = argparse.ArgumentParser(description="Readability Controlled Generation")
for option, value_type, fallback in (
    ("--cuda", str, "3"),
    (
        "--model_name",
        str,
        "/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2",
    ),
    ("--temperature", float, 0.1),
):
    parser.add_argument(option, type=value_type, default=fallback)
args = parser.parse_args()

model_name, temperature = args.model_name, args.temperature

# The GPU is chosen before the later `unsloth`/`torch` imports further down
# the script, so the restriction is in place when CUDA gets initialised.
if args.cuda is None:
    # Only reachable if the "--cuda" default is changed to None; with the
    # current default of "3" argparse always yields a string.
    # `_gpu_selection_` is a project helper (presumably picks a free GPU;
    # verify against gpu_selection.py).
    _gpu_selection_()
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
| |
|
# System prompts (in Spanish), one per target readability band. Each asks the
# model to rewrite a medical report for a given "FH" score range (presumably
# the Fernández-Huerta readability index — confirm). The strings are
# `.strip()`-ed where they are used, so the surrounding newlines are harmless.
_EASY_PROMPT = '''
Reescribe el siguiente informe médico en español con un nivel de lectura fácil correspondiente a un puntaje FH entre 70 y 100 (texto muy comprensible).
Usa oraciones cortas y directas, vocabulario cotidiano, estructuras simples y explicaciones claras de términos médicos. El tono debe ser empático y accesible, como si estuvieras explicando la situación a un paciente o familiar sin conocimientos médicos.
Mantén los datos clínicos y resultados esenciales, pero reemplaza o aclara tecnicismos con frases simples. Evita abreviaturas o siglas sin explicación.
'''

_INTERMEDIATE_PROMPT = '''
Reformula el siguiente informe médico en español con un nivel de lectura intermedio, correspondiente a un puntaje FH entre 50 y 70 (texto de dificultad moderada).
Usa lenguaje formal pero comprensible, adecuado para lectores con educación general o estudiantes del área de salud. Mantén la precisión médica, pero agrega explicaciones breves tras los términos técnicos. Alterna oraciones simples y compuestas, con buena fluidez y cohesión.
El texto debe sonar profesional, informativo y claro, sin llegar a la densidad típica de lenguaje técnico especializado.
'''

_HARD_PROMPT = '''
Reescribe el siguiente informe médico en español con un nivel de lectura avanzado o técnico, correspondiente a un puntaje FH entre 0 y 50 (texto especializado).
Usa terminología médica precisa, estructuras sintácticas complejas y tono formal típico de documentos clínicos o publicaciones científicas. No simplifiques ni expliques los tecnicismos; conserva la exactitud conceptual y la nomenclatura profesional.
Refleja el razonamiento clínico, hallazgos y juicios médicos con lenguaje apropiado para médicos, especialistas o investigadores.
'''

# Keyed by band name; insertion order is easy -> intermediate -> hard.
prompts = {
    "easy": _EASY_PROMPT,
    "intermediate": _INTERMEDIATE_PROMPT,
    "hard": _HARD_PROMPT,
}
| | |
# Load the per-document keyword/definition records from disk.
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, "r", encoding="utf-8") as f:
    definitions_data = json.load(f)


def _render_glossary(keywords):
    """Return a bulleted glossary for ``keywords``, or "" when there are none.

    Each entry is indexed with the ``term`` and ``definition`` keys, so an
    entry missing either key raises ``KeyError``.
    """
    if not keywords:
        return ""
    bullets = [f"• {entry['term']} — {entry['definition']}" for entry in keywords]
    return "Relevant medical definitions:\n" + "\n".join(bullets)


# Maps a record's "id" to its rendered glossary text ("" when the record has
# no "medical_keywords"). If an id occurs more than once, the last one wins.
def_map = {
    record.get("id"): _render_glossary(record.get("medical_keywords", []))
    for record in definitions_data
}
| | |
| |
|
# Input dataset and output directory (created on demand).
path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/custom_promptsV1"
os.makedirs(out_dir, exist_ok=True)

# A model name that exists on the local filesystem is labelled "finetuned" in
# the output filename; anything else is labelled "base".
out_path = out_dir + (
    f"/temp{temperature}_qwen3-14B_finetuned_with_defs.json"
    if os.path.exists(model_name)
    else f"/temp{temperature}_qwen3-14B_base_with_defs.json"
)

# Resume support: reload earlier results and remember which source texts
# already have entries so the generation loop can skip them.
results = []
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
completed_keys = {entry["fulltext"] for entry in results}

# Only the first 50 dataset records are processed.
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)[0:50]
| |
|
# Imported here rather than at the top of the file, i.e. only after
# CUDA_VISIBLE_DEVICES has been set earlier in the script.
from unsloth import FastLanguageModel
import torch  # not referenced in the visible part of the script; kept as-is

# Load the model and tokenizer without 4-bit/8-bit quantisation.
# NOTE(review): max_seq_length is 4092 — possibly meant to be 4096; confirm.
load_options = {
    "model_name": model_name,
    "max_seq_length": 4092,
    "load_in_4bit": False,
    "load_in_8bit": False,
    "full_finetuning": False,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)
| |
|
import tqdm


def _save_results(records, destination):
    """Write ``records`` to ``destination`` as pretty-printed UTF-8 JSON.

    The data is written to a sibling ``.tmp`` file first and then moved into
    place with ``os.replace`` so that an interruption during the write cannot
    leave a truncated checkpoint behind (the resume logic re-reads this file).
    """
    tmp_path = destination + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)
    os.replace(tmp_path, destination)


# Generation budget per band, as a fraction of the prompt length in tokens
# (clamped to the range [150, 1200] below). Loop-invariant, so defined once.
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

# Request sampling explicitly: `generate` only honours temperature/top_p/top_k
# when do_sample is enabled, and a non-positive temperature is not valid for
# sampling, so fall back to greedy decoding in that case.
if temperature > 0:
    sampling_kwargs = {
        "do_sample": True,
        "temperature": temperature,
        "top_p": 0.9,
        "top_k": 45,
    }
else:
    sampling_kwargs = {"do_sample": False}

for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        # Already present in a previously saved results file.
        continue
    item_id = item["id"]
    glossary = def_map.get(item_id, "")
    # NOTE(review): `glossary` is looked up but never inserted into the
    # prompt; "definitions_used" below therefore only records that
    # definitions exist for this id, not that the model saw them. Confirm
    # whether the glossary was meant to be appended to `user_content`.

    # The user turn is the same for all three bands; only the system prompt
    # changes.
    user_content = f"Input text:\n{key.strip()}"

    for band in ("easy", "intermediate", "hard"):
        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content},
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            **sampling_kwargs,
        )
        # The returned sequence starts with the prompt tokens; keep only the
        # newly generated part so the stored summary does not contain the
        # system prompt and the source report.
        new_token_ids = output_ids[0][input_len:]
        output_text = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

        results.append({
            "id": item_id,
            "fulltext": key,
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": bool(glossary),
        })

    completed_keys.add(key)
    # Checkpoint after every finished item (all three bands) so that a crash
    # loses at most the item currently being processed.
    _save_results(results, out_path)

# Final write; also covers the case where every item was skipped.
_save_results(results, out_path)
| |
|
| |
|
# Announce completion through the project-local `notifier` helper.
# NOTE(review): the first positional argument looks like a topic/channel
# identifier — confirm against notifier.send_notification.
from notifier import send_notification

completion_message = (
    f"Finished inference with model {model_name} at temperature {temperature}. Results saved to {out_path}"
)
send_notification(
    "process-complete1507034",
    completion_message,
    title="Inference Complete",
    priority="default",
    tags="tada",
)