piyazon's picture
Upload folder using huggingface_hub
551f95a verified
metadata
language:
  - ug
  - en
tags:
  - translation
pipeline_tag: translation

## Usage

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging

# Only let ERROR-and-above records through from the tokenizer base module's
# logger, so lower-severity tokenizer messages do not clutter the output.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

# Model repository id, and the source / target language codes used further
# down (source is assigned to the tokenizer, target is looked up as a token).
model_name = "piyazon/uyghur_translate_dev2"
src_lang = "eng_Latn"
tgt_lang = "uig_Arab"

# Priority: CUDA > MPS > CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Report the device that was actually selected rather than a fixed label.
print(f"Using device: {device}")

# Load the tokenizer and the seq2seq model from the same repository, then
# move the model's weights onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Tell the tokenizer which language the input text is written in.
tokenizer.src_lang = src_lang

text = "Let's answer a question: What is the radius of the Earth? The Earth's average radius is approximately 6371 kilometers, which is the average value of the distance from the equator to the poles."


# 1. TOKENIZE
# Encode the text as PyTorch tensors (padding and truncation enabled) and put
# the resulting tensors on the same device as the model.
tokenizer_options = {
    "return_tensors": "pt",
    "padding": True,
    "truncation": True,
}
inputs = tokenizer(text, **tokenizer_options).to(device)

# 2. TARGET LANGUAGE TOKEN
# Look up the vocabulary id of the target-language code; it is handed to
# generate() below as the forced beginning-of-sequence token.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

# 3. GENERATE
# Beam search with 4 beams, at most 128 new tokens, and no repeated trigrams.
generation_options = {
    "forced_bos_token_id": forced_bos_token_id,
    "max_new_tokens": 128,
    "num_beams": 4,
    "no_repeat_ngram_size": 3,
}
# Gradients are not needed for inference, so disable autograd tracking.
with torch.no_grad():
    out = model.generate(**inputs, **generation_options)

# 4. DECODE
# Convert the generated ids back to text, dropping special tokens, and keep
# the first (only) sequence of the batch.
decoded_batch = tokenizer.batch_decode(out, skip_special_tokens=True)
translation = decoded_batch[0]

print(translation)