| import argparse |
| import os |
| import json |
| import sys |
| sys.path.append(os.path.abspath('/home/mshahidul/')) |
| from gpu_selection import _gpu_selection_ |
| |
# Command-line options for a generation run.
parser = argparse.ArgumentParser(description="Translation Evaluation")
parser.add_argument(
    "--cuda",
    type=str,
    default="3",
    help="CUDA device id, e.g., '0' or '0,1' for multiple GPUs",
)
parser.add_argument(
    "--model_name",
    type=str,
    default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2",
    help="Path to the finetuned model",
)
parser.add_argument(
    "--temperature",
    type=float,
    default=0.1,
    help="Generation temperature",
)
args = parser.parse_args()

model_name = args.model_name
temperature = args.temperature

# Restrict the visible GPUs through the environment. unsloth/torch are
# imported later in the script, after this assignment has been made.
if args.cuda is None:
    # NOTE(review): with the declared default of "3" this branch is not
    # reached from the command line. _gpu_selection_ is a project helper;
    # presumably it picks a device automatically -- confirm.
    _gpu_selection_()
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
|
|
# System prompts, one per target readability band. Each string starts and
# ends with a newline; callers that need a clean prompt strip it themselves.

# Band "easy": Fernández Huerta 70-100 (grade 5-7).
_EASY_PROMPT = """
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
"""

# Band "intermediate": Fernández Huerta 50-70 (grade 8-12).
_INTERMEDIATE_PROMPT = """
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
"""

# Band "hard": Fernández Huerta 0-50 (university/professional level).
_HARD_PROMPT = """
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
"""

# Lookup table used by the generation loop: band name -> system prompt.
prompts = {
    "easy": _EASY_PROMPT,
    "intermediate": _INTERMEDIATE_PROMPT,
    "hard": _HARD_PROMPT,
}
|
|
| |
# Input dataset and the directory that receives this experiment's results.
path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v2_without_context"
os.makedirs(out_dir, exist_ok=True)

# A model_name that exists on disk is labelled "finetuned" in the file name;
# anything else is labelled "base".
out_path = out_dir + (
    f"/temp{temperature}_qwen3-14B_finetuned.json"
    if os.path.exists(model_name)
    else f"/temp{temperature}_qwen3-14B_base.json"
)

# Resume support: reload results written by an earlier run and remember
# which source texts already have output so they can be skipped.
results = []
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as previous_run:
        results = json.load(previous_run)
completed_keys = {record["fulltext"] for record in results}
|
|
| |
# Load the evaluation set and keep only its first 50 entries.
with open(path, "r", encoding="utf-8") as dataset_file:
    dataset = json.load(dataset_file)[:50]

# NOTE(review): these imports sit below the CUDA_VISIBLE_DEVICES assignment
# made earlier in the script -- presumably on purpose, so the device mask is
# in place before the GPU libraries initialise. Keep them here.
from unsloth import FastLanguageModel
import torch

# Settings handed to unsloth's loader: no 4-bit/8-bit loading and no full
# fine-tuning mode.
# NOTE(review): 4092 may have been meant as 4096 -- confirm.
_load_options = {
    "model_name": model_name,
    "max_seq_length": 4092,
    "load_in_4bit": False,
    "load_in_8bit": False,
    "full_finetuning": False,
}
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)
|
|
import tqdm

# Per-band multiplier applied to the prompt length (in tokens) to size the
# generation budget. Defined once instead of on every iteration.
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}

# Generate one rewrite per readability band for every dataset item that does
# not already have results from a previous run.
for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        continue

    for band in ["easy", "intermediate", "hard"]:
        # The band prompt is the system message; the source text is the user turn.
        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": "Input text:\n" + item["fulltext"].strip()},
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )

        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]

        # Budget scales with the prompt length, clamped to [150, 1200] tokens.
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        # NOTE(review): do_sample is not passed, so whether temperature /
        # top_p / top_k take effect depends on the model's own generation
        # config -- confirm that sampling is actually enabled.
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )

        # The returned sequence starts with the prompt tokens. Decode only
        # what was generated after them, otherwise the system prompt and the
        # input text are stored as part of the summary.
        generated_ids = output_ids[0][input_len:]
        output_text = tokenizer.decode(generated_ids, skip_special_tokens=True)

        results.append({
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
        })

    # Mark the item done only after all three bands exist, then checkpoint.
    # The file therefore never holds a partially processed item, which the
    # resume logic would otherwise treat as complete.
    completed_keys.add(key)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

# Final write, so the output file exists even when every item was skipped.
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)