In [1]:
from transformers import AutoProcessor, Gemma3ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch
from peft import LoraConfig, get_peft_model

import os
from tqdm import tqdm
import json

import random
from datasets import load_dataset
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data_path = (
    "/root/notebooks/MT_TQ/Caches/May2025/tquality.annotated.data/parsed/pldl/"
)

json_files = [
    os.path.join(root, file)
    for root, _, files in os.walk(data_path)
    for file in files
    if file.endswith(".json")
]

training_samples = []
testing_samples  = []

for json_file in tqdm(json_files):
    with open(json_file, "r") as file:
        data = json.load(file)
    sampled_items = data["data"]
    if "test" in json_file:
        testing_samples.extend(sampled_items)
    if "train" in json_file:
        training_samples.extend(sampled_items)

training_datapoints = []
testing_datapoints  = []

for idx, sample in enumerate(training_samples):
    datapoint = {"input": {}}
    datapoint["input"]["src_text"]    = sample["src_text"]
    datapoint["input"]["tgt_text"]    = sample["main_tgt_text"]
    datapoint["input"]["src_prev"]    = sample["tt_src_prev"]
    datapoint["input"]["src_next"]    = sample["tt_src_next"]
    datapoint["input"]["tgt_prev"]    = sample["tt_tgt_prev"]
    datapoint["input"]["tgt_next"]    = sample["tt_tgt_next"]
    datapoint["input"]["src_lang"]    = sample["src_lang"]
    datapoint["input"]["tgt_lang"]    = sample["tgt_lang"]
    datapoint["input"]["start_frame"] = sample["start_frame"]
    datapoint["input"]["end_frame"]   = sample["end_frame"]
    datapoint["input"]["title_id"]    = sample["title_id"]
    datapoint["input"]["alt_tgt_text"]= sample["alt_tgt_text"]
    datapoint["input"]["id"]          = idx
    datapoint["evaluation"]           = sample["labelers"][0]["annotation"]
    training_datapoints.append(datapoint)

for idx, sample in enumerate(testing_samples):
    datapoint = {"input": {}}
    datapoint["input"]["src_text"]    = sample["src_text"]
    datapoint["input"]["tgt_text"]    = sample["main_tgt_text"]
    datapoint["input"]["src_prev"]    = sample["tt_src_prev"]
    datapoint["input"]["src_next"]    = sample["tt_src_next"]
    datapoint["input"]["tgt_prev"]    = sample["tt_tgt_prev"]
    datapoint["input"]["tgt_next"]    = sample["tt_tgt_next"]
    datapoint["input"]["src_lang"]    = sample["src_lang"]
    datapoint["input"]["tgt_lang"]    = sample["tgt_lang"]
    datapoint["input"]["start_frame"] = sample["start_frame"]
    datapoint["input"]["end_frame"]   = sample["end_frame"]
    datapoint["input"]["title_id"]    = sample["title_id"]
    datapoint["input"]["alt_tgt_text"]= sample["alt_tgt_text"]
    datapoint["input"]["id"]          = idx
    datapoint["evaluation"]           = sample["labelers"][0]["annotation"]
    testing_datapoints.append(datapoint)

system_message = "You are a helpful assistant who is an expert in estimating quality of translations."

output_template = '''
{
        "Accuracy Issues": [
                {
                        "Error Span": "",
                        "Error Explanation": "",
                        "Error Quality Category": "",
                        "Error Quality Tags": [],
                        "Error Severity": ""
                }
        ],
        "Accuracy Score": "",
        "Readability Issues": [
                {
                        "Error Span": "",
                        "Error Explanation": "",
                        "Error Quality Category": "",
                        "Error Quality Tags": [],
                        "Error Severity": ""
                }
        ],
        "Readability Score": ""
}'''

def create_conversation(input_sample, output_sample):
  return {
    "messages": [
      # {"role": "system", "content": system_message},
      {"role": "user", "content": input_sample},
      {"role": "assistant", "content": output_sample}
    ]
  }

def create_dataset(datapoints, template_string):
    dataset = []
    meta    = []
    for datapoint in datapoints:
        src_text = datapoint['input']['src_text']
        tgt_text = datapoint['input']['tgt_text']
        src_prev = datapoint['input']['src_prev']
        src_next = datapoint['input']['src_next']        
        tgt_prev = datapoint['input']['tgt_prev']
        tgt_next = datapoint['input']['tgt_next']
        src_lang = datapoint['input']['src_lang']
        tgt_lang = datapoint['input']['tgt_lang']
        
        start_frame = datapoint['input']['start_frame']
        end_frame   = datapoint['input']['end_frame']
        title_id    = datapoint['input']['title_id']
        output      = datapoint['evaluation']
        idx         = datapoint['input']['id']
        if len(output['Accuracy Issues']) != 0 or len(output['Readability Issues']) != 0:
            item = template_string.format(src_text=src_text, tgt_text=tgt_text, 
                                          src_prev=src_prev, src_next=src_next, 
                                          tgt_prev=tgt_prev, tgt_next=tgt_next, 
                                          src_lang=src_lang, tgt_lang=tgt_lang,
                                          template=output_template)
            
            dataset.append(create_conversation(item, json.dumps(output)))
            meta.append({"id": idx, "start_frame": start_frame, "end_frame": end_frame, "title_id": title_id})
    
    return dataset, meta
    
def dataset_prep(datapoints):
    with open("prompts.txt") as file:
        template_string = file.read()
    dataset, meta = create_dataset(datapoints, template_string)
    return dataset, meta

train_dataset, train_meta = dataset_prep(training_datapoints)
test_dataset,  test_meta  = dataset_prep(testing_datapoints)

dataset = {"train": train_dataset, "test": test_dataset}

def convert_to_hf_dataset(dataset):
    train_dataset = Dataset.from_list(dataset['train'])
    test_dataset  = Dataset.from_list(dataset['test'])
    
    hf_dataset = DatasetDict({
        'train': train_dataset,
        'test': test_dataset
    })
    
    return hf_dataset

hf_dataset = convert_to_hf_dataset(dataset)
print(hf_dataset)

100%|██████████| 8/8 [00:00<00:00, 22.76it/s]


DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 309
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 343
    })
})


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, BitsAndBytesConfig
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
device = torch.device("cuda:0")

# Hugging Face model id
model_id = "google/gemma-3-27b-it" # or `google/gemma-3-4b-pt`, `google/gemma-3-12b-pt`, `google/gemma-3-27b-pt`

# Select model class based on id
if model_id == "google/gemma-3-27b-it":
    model_class = Gemma3ForConditionalGeneration
else:
    model_class = AutoModelForImageTextToText

torch_dtype = torch.bfloat16

model_kwargs = dict(
    attn_implementation="eager",
    torch_dtype=torch_dtype,
    device_map="auto",
)

model = model_class.from_pretrained(model_id, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-27b-it") # Load the Instruction Tokenizer to use the official Gemma template

Loading checkpoint shards: 100%|██████████| 12/12 [00:18<00:00,  1.58s/it]


In [5]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head", "embed_tokens"] # make sure to save the lm_head and embed_tokens as you train the special tokens
)

In [6]:
from trl import SFTConfig

args = SFTConfig(
    output_dir="may13-gemma-27b-tq_sft_finetuned-model",
    max_seq_length=2048,
    packing=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adamw_torch_fused",
    logging_steps=1,
    save_strategy="epoch",
    learning_rate=1e-4,
    fp16=True if torch_dtype == torch.float16 else False,
    bf16=True if torch_dtype == torch.bfloat16 else False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    push_to_hub=True,
    report_to="tensorboard",
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": True,
    },
    no_cuda=False,
)

In [7]:
from trl import SFTTrainer

# Create Trainer object
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=hf_dataset["train"],
    peft_config=peft_config,
    processing_class=tokenizer
)

Converting train dataset to ChatML: 100%|██████████| 309/309 [00:00<00:00, 9533.70 examples/s]
Applying chat template to train dataset: 100%|██████████| 309/309 [00:00<00:00, 4443.06 examples/s]
Tokenizing train dataset: 100%|██████████| 309/309 [00:01<00:00, 226.22 examples/s]
Packing train dataset: 100%|██████████| 309/309 [00:00<00:00, 102364.74 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
trainer.train()
trainer.save_model()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
1,10.8019
2,8.3814
3,6.9702
4,5.7843
5,4.9708
6,4.3897
7,4.325
8,3.557
9,3.3577
10,3.0925


In [10]:
lora_model = trainer.model
merged_model = lora_model.merge_and_unload()
# Save the model with fused weights
merged_model.save_pretrained('/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full')
trainer.tokenizer.save_pretrained('/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full')

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full/tokenizer_config.json',
 '/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full/special_tokens_map.json',
 '/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full/tokenizer.model',
 '/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full/added_tokens.json',
 '/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full/tokenizer.json')

In [1]:
# Merge LoRA weights into the base model
for name, param in model.named_parameters():
    if name in trainer.peft_model.lora_weights:
        param.data += trainer.peft_model.lora_weights[name]

# Save the model with fused weights
model.save_pretrained('/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full')
tokenizer.save_pretrained('/root/notebooks/MT_TQ/TQ/TQTune/gemma-27b-tq_sft_finetuned-model-full')

NameError: name 'model' is not defined

In [9]:
import torch
from transformers import pipeline
from random import randint
import re

model_id = "google/gemma-3-27b-it"
model = model_class.from_pretrained(
  model_id,
  device_map="auto",
  torch_dtype=torch_dtype,
  attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)


Loading checkpoint shards: 100%|██████████| 12/12 [00:19<00:00,  1.60s/it]


NameError: name 'trainer' is not defined

In [None]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
rand_idx = randint(0, len(dataset["test"]))
test_sample = hf_dataset["test"][rand_idx]
stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]
prompt = pipe.tokenizer.apply_chat_template(test_sample["messages"][:1], tokenize=False, add_generation_prompt=True)

outputs = pipe(prompt, max_new_tokens=1024, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=stop_token_ids, disable_compile=True)

In [None]:
start = outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().find("{")
end   = outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().rfind("}")
print(start, end)
print(outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip()[start:end + 1])
json.loads(outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip()[start:end + 1])
rand_idx

In [None]:
json.loads(hf_dataset["test"][81]["messages"][1]['content'])

In [None]:
import torch
from transformers import pipeline
from random import randint
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, BitsAndBytesConfig
from transformers import AutoProcessor, Gemma3ForConditionalGeneration
device = torch.device("cuda:0")

model_class = Gemma3ForConditionalGeneration
torch_dtype = torch.bfloat16

model_id = "gemma-27b-tq_sft_finetuned-model"
model = model_class.from_pretrained(
  model_id,
  device_map="auto",
  torch_dtype=torch_dtype,
  attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
def extract_json_data(json_string):
    key_pattern = r'"(.*?)"\s*:\s*'
    value_pattern = r'(?:"(.*?)"|(\d+)|$$(.*?)$$|\{(.*?)\})'
    matches = re.finditer(key_pattern + value_pattern, json_string, re.DOTALL)    
    data = {}
    for match in matches:
        key = match.group(1)
        value = match.group(2) or match.group(3) or match.group(4) or match.group(5)        
        if value:
            try:
                value = json.loads(value)
            except (json.JSONDecodeError, TypeError):
                pass
        data[key] = value
    return data

In [None]:
rand_idx = randint(0, len(dataset["test"]))
test_predictions = []

index = 9

meta_data = test_meta[index]
stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]
prompt = pipe.tokenizer.apply_chat_template(hf_dataset["test"][index]["messages"][:1], tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=2048, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=stop_token_ids, disable_compile=True)
start = outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().find("{")
end   = outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().rfind("}")
try:
    pred_dict = json.loads(outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip()[start:end + 1])
except:
    start     = outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().find("{")
    end       = outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().rfind("}")
    pred_dict = outputs[0]['generated_text'].split(r"<start_of_turn>model")[1].strip()[start:end + 1]

In [None]:
pred_dict

In [None]:
hf_dataset["test"][index]["messages"][1]

In [None]:
batch_size = 8
test_predictions = []

for i in tqdm(range(0, len(hf_dataset["test"]), batch_size)):
    batch_samples = hf_dataset["test"][i:i + batch_size]["messages"]
    batch_meta = test_meta[i:i + batch_size]
    prompts = [
        pipe.tokenizer.apply_chat_template(sample[:1], tokenize=False, add_generation_prompt=True)
        for sample in batch_samples
    ]
    outputs = pipe(prompts, max_new_tokens=2048, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")], disable_compile=True)

    for index, output in tqdm(enumerate(tqdm(outputs))):
        output_dict = {}
        start = output[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().find("{")
        end = output[0]['generated_text'].split(r"<start_of_turn>model")[1].strip().rfind("}")
        try:
            pred_dict = json.loads(output[0]['generated_text'].split(r"<start_of_turn>model")[1].strip()[start:end + 1])
        except:
            pred_dict = output[0]['generated_text'].split(r"<start_of_turn>model")[1].strip()[start:end + 1]
    
        output_dict.update(batch_meta[index])
        output_dict["predictions"]      = pred_dict
        output_dict["human-annotation"] = batch_samples[index][1]['content']
        output_dict["prompt"]           = batch_samples[index][0]['content']
        test_predictions.append(output_dict)

In [None]:
with open("/root/notebooks/trashspace/gemma_finetuned_expertdata/test_pred.json", 'w') as json_file:
    json.dump(test_predictions, json_file)

In [None]:
data = json.loads(test_sample['messages'][1]['content'])
data

In [None]:
print(len(hf_dataset["test"]))