piyazon's picture
Upload folder using huggingface_hub
551f95a verified
---
language:
- ug
- en
tags:
- translation
pipeline_tag: translation
---
# Usage
```python
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import logging

# Silence noisy per-call tokenizer warnings (e.g. sequence-length notices).
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

model_name = "piyazon/uyghur_translate_dev2"
# NLLB-style language codes: English (Latin script) -> Uyghur (Arabic script).
src_lang = "eng_Latn"
tgt_lang = "uig_Arab"

# Device priority: CUDA > MPS > CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
# Report the device that was actually selected (was hardcoded to "cuda").
print(f"Using device: {device}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Tell the tokenizer which source language tag to prepend.
tokenizer.src_lang = src_lang
text = "Let's answer a question: What is the radius of the Earth? The Earth's average radius is approximately 6371 kilometers, which is the average value of the distance from the equator to the poles."

# 1. PRE-PROCESSING (Crucial Step)
inputs = tokenizer(
    text,
    return_tensors="pt",
    padding=True,
    truncation=True,
).to(device)

# 2. PREPARE TARGET TOKEN
# Force generation to start with the target-language tag token.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

# 3. GENERATION
with torch.no_grad():
    out = model.generate(
        **inputs,
        forced_bos_token_id=forced_bos_token_id,
        max_new_tokens=128,
        num_beams=4,
        no_repeat_ngram_size=3,
    )

# 4. DECODE
# Drop special tokens (language tags, EOS) and take the first batch item.
translation = tokenizer.batch_decode(out, skip_special_tokens=True)[0]
print(translation)
```