# File size: 5,672 Bytes
import argparse
import os
import json
import sys
sys.path.append(os.path.abspath('/home/mshahidul/'))
from gpu_selection import _gpu_selection_
# 1. Command-line options: GPU id(s), model location and sampling temperature.
parser = argparse.ArgumentParser(description="Translation Evaluation")
parser.add_argument(
    "--cuda",
    type=str,
    default="3",
    help="CUDA device id, e.g., '0' or '0,1' for multiple GPUs",
)
parser.add_argument(
    "--model_name",
    type=str,
    default="/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v2",
    help="Path to the finetuned model",
)
parser.add_argument(
    "--temperature",
    type=float,
    default=0.1,
    help="Generation temperature",
)
args = parser.parse_args()
model_name, temperature = args.model_name, args.temperature

# Restrict the visible GPUs before any CUDA-aware library gets imported.
# NOTE(review): --cuda defaults to "3", so args.cuda is never None when it
# comes from the command line and the _gpu_selection_() branch is effectively
# unreachable. Confirm whether the default was meant to be None.
if args.cuda is None:
    _gpu_selection_()
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
# System prompts keyed by target readability band. Each one asks the model to
# rewrite a Spanish input text for a given Fernández Huerta readability range
# and to return the rewritten text only.
prompts = {
    # Very simple rewrite (Fernández Huerta 70-100; grade 5-7).
    "easy": """
You are an assistant that rewrites Spanish texts to make them very simple and easy to understand.
Your goal is to rewrite the provided input text for younger readers (Fernández Huerta 70–100; grade 5–7).
Use short sentences, simple words, and friendly tone. Avoid technical or complex expressions.
Keep all important factual details, but remove jargon.
Return only the rewritten text without commentary.
""",
    # Medium readability (Fernández Huerta 50-70; grade 8-12).
    "intermediate": """
You are an assistant specialized in rewriting Spanish texts with medium readability.
Your task is to rewrite the provided input text for general or high‑school‑level readers (Fernández Huerta 50–70; grade 8–12).
Use clear and complete sentences, moderately complex vocabulary, and structured narration.
Retain all relevant medical or factual information, but phrase it in accessible language.
Return only the rewritten text with no explanations.
""",
    # Professional / technical register (Fernández Huerta 0-50).
    "hard": """
You are an assistant that rewrites Spanish medical texts with professional, technical precision.
Rewrite the following input text using specialized, academic terminology and information‑dense phrasing.
The output must target a Fernández Huerta readability index between 0 and 50 (university/professional level).
Use clinical vocabulary, formal register, and detailed description of pathophysiology, procedures, and findings.
Return only the rewritten text.
""",
}
# 2. Input dataset and output location.
path = "/home/mshahidul/readctrl/data/testing_data/multiclinsum_test_es.json"
out_dir = "/home/mshahidul/readctrl/results/v2_without_context"
os.makedirs(out_dir, exist_ok=True)

# The output file name records the sampling temperature and whether
# model_name exists on the local filesystem ("finetuned") or not ("base").
model_tag = "finetuned" if os.path.exists(model_name) else "base"
out_path = out_dir + f"/temp{temperature}_qwen3-14B_{model_tag}.json"
# 3. Resume support: reload results written by an earlier run and remember
#    which source texts they cover.
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as f:
        results = json.load(f)
else:
    results = []
completed_keys = {record["fulltext"] for record in results}

# 4. Load the dataset and keep only its first 50 entries.
with open(path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
dataset = dataset[:50]
# These imports sit after the CUDA_VISIBLE_DEVICES handling above rather than
# at the top of the file, presumably so the device mask is in place before
# CUDA is initialised. Keep this order.
from unsloth import FastLanguageModel
import torch

# 5. Load the model and tokenizer (no 4-bit / 8-bit quantisation).
# NOTE(review): max_seq_length is 4092, not 4096; confirm this is intended.
load_options = {
    "model_name": model_name,
    "max_seq_length": 4092,
    "load_in_4bit": False,
    "load_in_8bit": False,
    "full_finetuning": False,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)
import tqdm

# Output-length budget per readability band, as a fraction of the prompt
# length in tokens. Loop-invariant, so it is defined once here.
length_factors = {"easy": 0.5, "intermediate": 0.8, "hard": 1.1}


def _save_results(records, destination):
    """Write *records* to *destination* as JSON without leaving a partial file.

    The data is written to a sibling ``.tmp`` file first and then moved into
    place with ``os.replace``. An interruption mid-write therefore cannot
    truncate the file that the resume logic reloads on the next run.
    """
    tmp_path = destination + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, destination)


# 6. Generate one rewrite per readability band for every pending item.
for item in tqdm.tqdm(dataset):
    key = item["fulltext"]
    if key in completed_keys:
        # Already present in a previous run's output file.
        continue

    for band in ["easy", "intermediate", "hard"]:
        # The band instructions go in the system turn, the source text in the
        # user turn.
        messages = [
            {"role": "system", "content": prompts[band].strip()},
            {"role": "user", "content": "Input text:\n" + item["fulltext"].strip()},
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        input_len = inputs.input_ids.shape[1]

        # Adaptive budget: proportional to the prompt length, clamped to
        # the range [150, 1200] new tokens.
        max_new_tokens = int(min(1200, max(150, input_len * length_factors[band])))

        # NOTE(review): temperature / top_p / top_k only take effect when
        # sampling is enabled; this relies on the model's generation config
        # enabling it. Confirm for the checkpoints used.
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=0.9,
            top_k=45,
        )

        # generate() returns the prompt tokens followed by the continuation.
        # Decode only the continuation so the stored summary does not contain
        # the system prompt and the source text.
        generated_ids = output_ids[0][input_len:]
        output_text = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

        results.append({
            "fulltext": item["fulltext"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
        })

    completed_keys.add(key)
    # Checkpoint after every completed item (all three bands), so a restart
    # never sees a half-finished item and loses at most one item of work.
    _save_results(results, out_path)

# 7. Final save (also writes the file when nothing new was generated).
_save_results(results, out_path)