|
|
--- |
|
|
{} |
|
|
--- |
|
|
# Tradutor |
|
|
|
|
|
A translation system from English to European Portuguese. |
|
|
|
|
|
## Usage |
|
|
|
|
|
|
|
|
### Using the pipeline |
|
|
|
|
|
|
|
|
```python |
|
|
from transformers import pipeline

# Build a text-generation pipeline backed by the Tradutor model.
translator = pipeline("text-generation", model="liaad/Tradutor")

text = "Hello, how are you?"

# Chat-style prompt: a system instruction plus the user's request.
messages = [
    {
        "role": "system",
        "content": "You are a translator from English to European Portuguese",
    },
    {
        "role": "user",
        "content": f"Translate this text from English to European Portuguese: {text}",
    },
]

outputs = translator(
    messages,
    max_length=1024,
    # Not necessary. Just to avoid warning.
    pad_token_id=translator.model.config.eos_token_id,
)

# The final generated turn holds the assistant's translation.
print(outputs[-1]["generated_text"][-1]["content"])
|
|
``` |
|
|
|
|
|
### Using model and tokenizer |
|
|
|
|
|
```python |
|
|
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "liaad/Tradutor"
max_tokens = 1024

# Load the tokenizer and model. bfloat16 halves memory versus float32,
# and device_map="auto" places the weights on the available device(s).
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

text = "Hello, how are you?"

# Chat-style prompt: a system instruction plus the user's request.
messages = [
    {
        "role": "system",
        "content": "You are a translator from English to European Portuguese",
    },
    {
        "role": "user",
        "content": f"Translate this text from English to European Portuguese: {text}",
    },
]

# Render the chat template and tokenize it, appending the generation prompt.
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    max_length=max_tokens,
)

output_ids = model.generate(
    input_ids,
    max_length=max_tokens,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # avoids the missing-pad-token warning
)

# Drop the prompt tokens so only the newly generated reply is decoded.
new_token_ids = output_ids[0, input_ids.shape[1]:]
translated_text = tokenizer.decode(new_token_ids, skip_special_tokens=True)

print(translated_text.strip())
|
|
``` |
|
|
|
|
|
## Citation |
|
|
|
|
|
If you use this model in your work, please cite the following paper: |
|
|
|
|
|
```bibtex
|
|
@article{Sousa2025, |
|
|
author = {Hugo Sousa and Satya Almasian and Ricardo Campos and Alipio Jorge}, |
|
|
title = {Tradutor: Building a Variety Specific Translation Model}, |
|
|
journal = {Proceedings of the AAAI Conference on Artificial Intelligence}, |
|
|
volume = {39}, |
|
|
number = {24}, |
|
|
pages = {25183--25191}, |
|
|
year = {2025}, |
|
|
doi = {10.1609/aaai.v39i24.34704}, |
|
|
issn = {2374-3468}, |
|
|
month = {April} |
|
|
} |
|
|
``` |
|
|
|