|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse

import torch
from transformers import AutoModelForSeq2SeqLM, MBartForConditionalGeneration, NllbTokenizer
|
|
|
|
|
|
|
|
# Run on GPU when one is available; every model and input tensor is moved here.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading models on {DEVICE.upper()}...")

# Checkpoint directory for each supported source language. Adding a language
# means adding one entry here (and its FLORES-200 code in translate_text).
_MODEL_PATHS = {
    "nepali": "models/nllb-finetuned-nepali-en",
}

# BUG FIX: NLLB checkpoints use the M2M100 architecture, not MBart, so loading
# them through MBartForConditionalGeneration mismatches the saved config.
# AutoModelForSeq2SeqLM resolves the correct model class from the checkpoint.
# `.eval()` disables dropout — these models are used for inference only.
models = {
    lang: AutoModelForSeq2SeqLM.from_pretrained(path).to(DEVICE).eval()
    for lang, path in _MODEL_PATHS.items()
}

tokenizers = {
    lang: NllbTokenizer.from_pretrained(path)
    for lang, path in _MODEL_PATHS.items()
}

print("All models loaded successfully!")
|
|
|
|
|
# FLORES-200 source-language codes for the NLLB tokenizer, keyed by the same
# language names used in the module-level `models` / `tokenizers` dicts.
_SOURCE_LANG_CODES = {
    "nepali": "npi_Deva",  # Nepali, Devanagari script
}


def translate_text(text_to_translate: str, source_language: str) -> str:
    """
    Translate a single string of text to English using our fine-tuned models.

    Args:
        text_to_translate: The source-language sentence to translate.
        source_language: Key into the module-level model/tokenizer dicts
            (currently only "nepali").

    Returns:
        The English translation as a plain string.

    Raises:
        KeyError: If ``source_language`` has no loaded model/tokenizer.
    """
    model = models[source_language]
    tokenizer = tokenizers[source_language]

    # BUG FIX: the original hard-coded "nep_Npan", which is not a valid
    # FLORES-200 code (NLLB uses "npi_Deva" for Nepali). An unknown code is
    # silently mapped to <unk>, degrading translation quality. Keying on
    # source_language also keeps this correct if more languages are added.
    tokenizer.src_lang = _SOURCE_LANG_CODES[source_language]

    inputs = tokenizer(text_to_translate, return_tensors="pt").to(DEVICE)

    # Inference only: no_grad avoids building the autograd graph.
    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            # Force English as the generation target language.
            forced_bos_token_id=tokenizer.convert_tokens_to_ids("eng_Latn"),
            max_length=128,
        )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
parser = argparse.ArgumentParser(description="Translate text using a fine-tuned model.") |
|
|
parser.add_argument("--text", type=str, required=True, help="Text to translate.") |
|
|
parser.add_argument("--lang", type=str, required=True, choices=["nepali"], help="Source language: 'nepali'.") |
|
|
args = parser.parse_args() |
|
|
|
|
|
translated_sentence = translate_text(args.text, args.lang) |
|
|
|
|
|
print(f"\nOriginal ({args.lang}): {args.text}") |
|
|
print(f"Translated (en): {translated_sentence}") |
|
|
|