| | import argparse |
| | import os |
| | import json |
| | import sys |
| |
|
| |
|
# Command-line interface: the model to load (local path or hub name) and the
# sampling temperature used for generation.
parser = argparse.ArgumentParser(description="Readability Controlled Generation")
parser.add_argument(
    "--model_name",
    type=str,
    default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2",
)
parser.add_argument("--temperature", type=float, default=0.1)
args = parser.parse_args()

# Unpack once so the rest of the script can use plain module-level names.
model_name, temperature = args.model_name, args.temperature
| |
|
| |
|
# System prompts, one per readability band. Each states the target
# Fernández Huerta range inside the prompt text itself. Callers are expected
# to strip the surrounding newlines before use.
_PROMPT_EASY = """
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
"""

_PROMPT_INTERMEDIATE = """
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
"""

_PROMPT_HARD = """
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
"""

# Band name -> system prompt, in the order the bands are generated.
prompts = {
    "easy": _PROMPT_EASY,
    "intermediate": _PROMPT_INTERMEDIATE,
    "hard": _PROMPT_HARD,
}
| |
|
| | |
# Read the keyword/definition JSON file into memory in one go.
kw_file = "/home/mshahidul/readctrl/data/kyw_def_train/kyw_gen_gpt5.json"
with open(kw_file, encoding="utf-8") as kw_handle:
    definitions_data = json.load(kw_handle)
| |
|
| | |
# Map each record id to a ready-to-append glossary string. Records whose
# "medical_keywords" list is missing or empty map to the empty string.
def_map = {}
for entry in definitions_data:
    entry_id = entry.get("id")
    keywords = entry.get("medical_keywords", [])
    if keywords:
        bullets = "\n".join(
            f"• {kw['term']} — {kw['definition']}" for kw in keywords
        )
        glossary_text = "Relevant medical definitions:\n" + bullets
    else:
        glossary_text = ""
    def_map[entry_id] = glossary_text
| | |
| |
|
# Input dataset and output directory; the directory is created if needed.
path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v3"
os.makedirs(out_dir, exist_ok=True)

# A model_name that exists on disk is labelled "finetuned" in the output file
# name; anything else is labelled "base".
out_path = out_dir + (
    f"/temp{temperature}_qwen3-14B_finetuned_with_defs.json"
    if os.path.exists(model_name)
    else f"/temp{temperature}_qwen3-14B_base_with_defs.json"
)
| |
|
# Resume support: reload results from a previous run and remember which
# source texts are already present so they can be skipped.
results = []
completed_keys = set()
if os.path.exists(out_path):
    with open(out_path, encoding="utf-8") as prev_handle:
        results = json.load(prev_handle)
    completed_keys.update(record["fulltext"] for record in results)
| |
|
| | |
# Load the dataset and keep only its first 50 entries.
with open(path, encoding="utf-8") as data_handle:
    dataset = json.load(data_handle)[:50]
| |
|
| | from unsloth import FastLanguageModel |
| | import torch |
| |
|
# Load the model and tokenizer without 4-bit or 8-bit quantization and with
# full fine-tuning switched off.
# NOTE(review): max_seq_length=4092 looks like it may be a typo for 4096 —
# confirm against the training configuration before changing it.
load_kwargs = {
    "model_name": model_name,
    "max_seq_length": 4092,
    "load_in_4bit": False,
    "load_in_8bit": False,
    "full_finetuning": False,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)
| |
|
| | import tqdm |
def _save_results(save_path, records):
    """Write *records* to *save_path* as JSON, replacing the file atomically.

    The data is written to a sibling temporary file first and then moved into
    place with ``os.replace``, so an interruption mid-write cannot leave a
    truncated checkpoint that would break resuming.
    """
    tmp_path = save_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, save_path)


# Generation budget per band, as a fraction of the prompt length in tokens.
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

# Sampling parameters only take effect when do_sample is enabled, so set it
# explicitly rather than relying on the checkpoint's generation config.
# A temperature of exactly 0 selects greedy decoding.
if temperature == 0:
    generation_kwargs = {"do_sample": False}
else:
    generation_kwargs = {
        "do_sample": True,
        "temperature": temperature,
        "top_p": 0.9,
        "top_k": 45,
    }

for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        # Already generated in a previous run.
        continue
    item_id = item["id"]
    glossary = def_map.get(item_id, "")

    # The user message is the same for all bands; only the system prompt changes.
    user_content = f"Input text:\n{key.strip()}"
    if glossary:
        user_content += "\n\n" + glossary

    for band in ["easy", "intermediate", "hard"]:
        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": user_content},
        ]

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]
        # Scale the budget with the prompt length, clamped to [150, 1200].
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            **generation_kwargs,
        )
        # generate() returns the prompt tokens followed by the continuation;
        # drop the prompt so only the rewritten text is stored.
        output_text = tokenizer.decode(
            output_ids[0][input_len:], skip_special_tokens=True
        )

        results.append({
            "id": item_id,
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
            "definitions_used": bool(glossary),
        })

    completed_keys.add(key)
    # Checkpoint after every completed item so an interrupted run can resume.
    _save_results(out_path, results)

# Final write; also covers the case where every item was already completed.
_save_results(out_path, results)