piyazon's picture
Upload folder using huggingface_hub
551f95a verified
---
language:
- ug
- en
tags:
- translation
pipeline_tag: translation
---
# Usage
```python
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging

# Silence noisy per-call tokenizer warnings (e.g. sequence-length notices).
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

model_name = "piyazon/uyghur_translate_dev2"
# NLLB-style language codes: English (Latin script) -> Uyghur (Arabic script).
src_lang = "eng_Latn"
tgt_lang = "uig_Arab"

# Device priority: CUDA > MPS > CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
# Report the device that was actually selected (was hardcoded to "cuda").
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Tell the tokenizer which source language tag to prepend.
tokenizer.src_lang = src_lang
text = "Let's answer a question: What is the radius of the Earth? The Earth's average radius is approximately 6371 kilometers, which is the average value of the distance from the equator to the poles."

# 1. PRE-PROCESSING (Crucial Step)
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
).to(device)

# 2. PREPARE TARGET TOKEN
# Force generation to start with the target-language tag token.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

# 3. GENERATION
with torch.no_grad():
    out = model.generate(
        **inputs,
        forced_bos_token_id=forced_bos_token_id,
        max_new_tokens=128,
        num_beams=4,
        no_repeat_ngram_size=3,
    )

# 4. DECODE
# Drop special tokens (language tags, EOS) and take the first batch item.
translation = tokenizer.batch_decode(out, skip_special_tokens=True)[0]
print(translation)
```