# Standard-library imports: CLI parsing, filesystem paths, JSON I/O.
import argparse
import os
import json
import sys
# Make the home directory importable so the project-local gpu_selection
# module (a loose .py file, not an installed package) can be resolved.
sys.path.append(os.path.abspath('/home/mshahidul/'))
from gpu_selection import _gpu_selection_
| |
# --- CLI arguments and GPU pinning ------------------------------------------
parser = argparse.ArgumentParser(description="Translation Evaluation")
parser.add_argument("--path", type=str, default="/home/mshahidul/readctrl/generating_data/tik_ache/es_syntheticV3.json", help="Path to the JSON file")
# Fix: default was "3", which made args.cuda always non-None and left the
# _gpu_selection_() fallback below unreachable dead code. With default=None,
# an explicit --cuda pins the device and omitting it triggers auto-selection.
parser.add_argument("--cuda", type=str, default=None, help="CUDA device id, e.g., '0' or '0,1' for multiple GPUs (omit to auto-select)")
args = parser.parse_args()


# Set CUDA_VISIBLE_DEVICES before any CUDA-using library is imported so the
# pinning actually takes effect; otherwise delegate to the auto-selector.
if args.cuda is not None:
    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    print(f"🎮🎮 Using CUDA device: {args.cuda}")
else:
    _gpu_selection_()
|
|
| |
# --- Output location ---------------------------------------------------------
# Results are written under a fixed results directory, reusing the input
# file's basename so each dataset maps to one output file.
out_dir = "/home/mshahidul/readctrl/results/"
# Fix: create out_dir itself. The original used os.path.dirname(out_dir),
# which only produced the same directory because of the trailing slash in
# out_dir; this states the intent directly and is robust to the slash.
os.makedirs(out_dir, exist_ok=True)
file_name = os.path.basename(args.path)
out_path = os.path.join(out_dir, file_name)
|
|
| |
# Resume support: reload any previously written results so already-finished
# (article, gold_summary) pairs are skipped on a restarted run.
results = []
completed_keys = set()
if os.path.exists(out_path):
    with open(out_path, "r", encoding="utf-8") as handle:
        results = json.load(handle)
    completed_keys = {(entry["article"], entry["gold_summary"]) for entry in results}
|
|
| |
# Load the evaluation dataset (a JSON list of records; each record provides
# at least "article" and "gold_summary", as read by the loop below).
with open(args.path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
# unsloth/torch are imported late, after CUDA_VISIBLE_DEVICES is set above,
# so the GPU pinning takes effect before any CUDA context is created.
from unsloth import FastLanguageModel
import torch

# Load the fine-tuned Qwen3-14B checkpoint for 4-bit quantized inference.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/home/mshahidul/readctrl/finetuned_models/es_synthetic_data_creation_Qwen3_14B_v1",
    max_seq_length = 4092,  # NOTE(review): unusual value — 4096 is customary; confirm this is intentional
    load_in_4bit = True,    # 4-bit weights to fit the 14B model in GPU memory
    load_in_8bit = False,
    full_finetuning = False,
)
from prompt_generate import generate_prompt

import tqdm
# Generate one synthetic summary per readability band (B1/B2/B3) for every
# dataset item, skipping pairs already present in the resumed output file.
# NOTE(review): resume granularity is per item, not per band — an item whose
# run was interrupted mid-bands is marked complete on restart and its missing
# bands are never regenerated; fixing this needs the saved key to include the
# band. Flagged here rather than changed, since the on-disk key format is
# shared with the resume-loading code above.
for item in tqdm.tqdm(dataset):
    key = (item["article"], item["gold_summary"])
    if key in completed_keys:
        continue


    for band in ["B1", "B2", "B3"]:
        prompt = generate_prompt(item['article'], item['gold_summary'], band, "es")


        messages = [{"role": "user", "content": prompt + "\n"}]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,  # disable Qwen3 "thinking" traces in the output
        )
        inputs = tokenizer(text, return_tensors="pt").to("cuda")
        output_ids = model.generate(
            **inputs,
            max_new_tokens=1000,
            temperature=0.1,  # near-greedy sampling for stable synthetic data
            top_p=0.8,
            top_k=5,
        )
        # Fix: decode only the newly generated tokens. For decoder-only models,
        # generate() returns prompt + completion, so decoding output_ids[0]
        # in full would embed the entire chat-template prompt inside the saved
        # "synthetic_summary" field.
        prompt_len = inputs["input_ids"].shape[1]
        output_text = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)


        results.append({
            "article": item["article"],
            "gold_summary": item["gold_summary"],
            "band": band,
            "lang": "es",
            "synthetic_summary": output_text,
        })
        completed_keys.add(key)

        # Checkpoint periodically so progress survives an interrupted run.
        if len(results) % 30 == 0:
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(results, f, ensure_ascii=False, indent=2)
|
|
| |
# Final write: persist the complete result list (the periodic checkpoints
# above may have left only a partial version of this same file on disk).
payload = json.dumps(results, ensure_ascii=False, indent=2)
with open(out_path, "w", encoding="utf-8") as sink:
    sink.write(payload)