| import argparse |
| import os |
| import json |
| import sys |
|
|
|
|
# --- Command-line interface -------------------------------------------------
# Two knobs: which checkpoint to load and the sampling temperature.
_DEFAULT_MODEL = "/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2"

parser = argparse.ArgumentParser(description="Readability Controlled Generation")
parser.add_argument("--model_name", type=str, default=_DEFAULT_MODEL)
parser.add_argument("--temperature", type=float, default=0.1)
args = parser.parse_args()

# Unpack into module-level names used throughout the script.
model_name, temperature = args.model_name, args.temperature
|
|
|
|
# System prompts for the three readability bands. Each one pins a target
# Fernández Huerta readability range and instructs the model to return only
# the rewritten Spanish text, with no extra commentary.
_EASY_PROMPT = '''
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
'''

_INTERMEDIATE_PROMPT = '''
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
'''

_HARD_PROMPT = '''
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
'''

prompts = {
    "easy": _EASY_PROMPT,
    "intermediate": _INTERMEDIATE_PROMPT,
    "hard": _HARD_PROMPT,
}
|
|
| |
# Load the pre-generated keyword/definition file and index it by case id.
# Each entry maps: case id -> a formatted glossary string ("" when the entry
# has no medical keywords), later appended to the user prompt.
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, "r", encoding="utf-8") as f:
    definitions_data = json.load(f)


def_map = {}
for entry in definitions_data:
    keywords = entry.get("medical_keywords", [])
    glossary_text = ""
    if keywords:
        bullets = "\n".join(f"• {d['term']} — {d['definition']}" for d in keywords)
        glossary_text = "Relevant medical definitions:\n" + bullets
    def_map[entry.get("id")] = glossary_text
| |
|
|
# --- Input corpus and (resumable) output file -------------------------------
path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v3"
os.makedirs(out_dir, exist_ok=True)

# A model_name that exists on disk is a local fine-tuned checkpoint directory;
# anything else is treated as a hub id for the base model. Only the variant
# tag differs between the two output filenames, so build it once (DRY) and
# join with os.path.join instead of string concatenation.
variant = "finetuned" if os.path.exists(model_name) else "base"
out_path = os.path.join(out_dir, f"temp{temperature}_qwen3-14B_{variant}_with_defs.json")

# Resume support: reload any earlier results and skip source texts already
# processed. NOTE(review): completion is tracked per source text (fulltext),
# not per readability band — a run that died mid-item may leave partial bands
# in the file that will not be regenerated; confirm this is acceptable.
results, completed_keys = [], set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
    completed_keys = {r["fulltext"] for r in results}
|
|
| |
# Load the evaluation set and cap this run at the first 50 records.
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)[:50]
|
|
# Heavyweight imports are deferred until after argument parsing so that
# `--help` and early failures stay fast.
from unsloth import FastLanguageModel
import torch


# Load the model (fine-tuned local checkpoint or base hub id) plus its
# tokenizer in full precision — both quantization flags are off.
# NOTE(review): max_seq_length=4092 looks like a typo for the usual 4096;
# confirm before changing, since it caps the usable context window.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=4092,
    load_in_4bit=False,   # no 4-bit quantization
    load_in_8bit=False,   # no 8-bit quantization
    full_finetuning=False,  # inference only; no training state needed
)
|
|
import tqdm

# Target output length as a fraction of the (tokenized) prompt length.
# Hoisted out of the loop: it is invariant across items and bands.
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

# For every source text, generate one rewrite per readability band.
# Results are checkpointed to disk after each completed item (3 new entries).
for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        continue  # already fully processed in a previous (resumed) run
    item_id = item["id"]
    glossary = def_map.get(item_id, "")

    # The user message is band-independent, so build it once per item.
    user_content = f"Input text:\n{item['fulltext'].strip()}"
    if glossary:
        user_content += "\n\n" + glossary

    for band in ["easy", "intermediate", "hard"]:
        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content}
        ]

        # Render the chat template to plain text (thinking mode disabled),
        # then tokenize for generation on GPU.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Budget: scale with prompt length, clamped to [150, 1200] tokens.
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )
        # BUG FIX: generate() returns prompt + completion for decoder-only
        # models; decode only the newly generated tokens, otherwise the whole
        # rendered chat prompt leaks into synthetic_summary.
        output_text = tokenizer.decode(
            output_ids[0][input_len:], skip_special_tokens=True
        )

        results.append({
            "id": item_id,
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": bool(glossary)
        })

    completed_keys.add(key)
    # Each item contributes exactly 3 results, so this checkpoints once per
    # item whenever the running total is a multiple of 3.
    if len(results) % 3 == 0:
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
|
# Final write: persist everything, including any trailing partial checkpoint.
payload = json.dumps(results, ensure_ascii=False, indent=2)
with open(out_path, "w", encoding="utf-8") as f:
    f.write(payload)