Spaces:

Redmind
/

hindi_DS_Training

Runtime error

App Files Files Community

hindi_DS_Training / app.py

Redmind

Update app.py

0d125e0 verified about 1 year ago

raw

history blame contribute delete

2.72 kB

	from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
	from datasets import Dataset, DatasetDict
	import pandas as pd

	# Load the dataset: a tab-separated file that must provide the 'english'
	# (source) and 'hindi' (target) columns read by tokenize_function.
	file_path = "hindi_dataset.tsv" # Update with your actual file path
	data = pd.read_csv(file_path, delimiter="\t")

	# Fail early with a readable message instead of a KeyError raised deep
	# inside Dataset.map when a column is absent or misnamed.
	required_columns = ["english", "hindi"]
	missing_columns = [col for col in required_columns if col not in data.columns]
	if missing_columns:
	    raise ValueError(f"{file_path} is missing required column(s): {missing_columns}")

	# Empty cells are read as NaN (a float), which the tokenizer cannot encode.
	# Drop incomplete pairs and rebuild the index so no stray index column is
	# carried over into the Hugging Face dataset.
	data = data.dropna(subset=required_columns).reset_index(drop=True)

	# Convert the dataset to Hugging Face Dataset
	hf_dataset = Dataset.from_pandas(data)

	# Split the dataset into train and test subsets (80/20). The fixed seed keeps
	# the split identical between runs so evaluation numbers are comparable.
	split_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

	# Create a DatasetDict with train and test splits
	dataset = DatasetDict({
	    "train": split_dataset["train"],
	    "test": split_dataset["test"],
	})

	# Load the tokenizer and model
	model_name = "Helsinki-NLP/opus-mt-en-hi" # Pre-trained English-to-Hindi model
	tokenizer = MarianTokenizer.from_pretrained(model_name)
	model = MarianMTModel.from_pretrained(model_name)

	# Tokenize source and target text
	def tokenize_function(examples):
	    """Tokenize a batch of English/Hindi pairs for seq2seq fine-tuning.

	    Args:
	        examples: Batch mapping with an 'english' entry (source text) and a
	            'hindi' entry (target text). It is invoked through
	            ``dataset.map(..., batched=True)``, so each entry is presumably
	            a list of strings.

	    Returns:
	        The tokenizer output for the source text with an added 'labels'
	        entry holding the target token ids. Both sides are truncated and
	        padded to 128 tokens; padding positions in 'labels' are set to -100.
	    """
	    # text_target tokenizes the Hindi side in target mode and stores its ids
	    # under 'labels'; it replaces the deprecated as_target_tokenizer() context.
	    model_inputs = tokenizer(
	        examples['english'],
	        text_target=examples['hindi'],
	        truncation=True,
	        padding='max_length',
	        max_length=128,
	    )

	    # Labels were padded with the real pad token id, so the loss would also be
	    # computed on padding. -100 is the index the loss ignores.
	    pad_id = tokenizer.pad_token_id
	    model_inputs['labels'] = [
	        [-100 if token_id == pad_id else token_id for token_id in label_ids]
	        for label_ids in model_inputs['labels']
	    ]
	    return model_inputs

	# Apply tokenization to the dataset. batched=True hands tokenize_function a
	# batch of rows per call rather than one row at a time.
	tokenized_datasets = dataset.map(tokenize_function, batched=True)

	# Define the training arguments
	# NOTE(review): newer transformers releases rename `evaluation_strategy` to
	# `eval_strategy` -- confirm against the installed version before changing it.
	training_args = Seq2SeqTrainingArguments(
	    output_dir="./results",
	    evaluation_strategy="epoch",
	    learning_rate=2e-5,
	    per_device_train_batch_size=16,
	    per_device_eval_batch_size=16,
	    num_train_epochs=3,
	    weight_decay=0.01,
	    save_total_limit=3,
	    predict_with_generate=True,
	    logging_dir="./logs",
	    logging_steps=10,
	    save_steps=500,
	)

	# Use the DataCollatorForSeq2Seq for padding; passing the model lets the
	# collator build decoder inputs from the labels.
	data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

	# Define the Trainer
	# NOTE(review): newer transformers releases prefer `processing_class=` over
	# `tokenizer=` -- confirm against the installed version before changing it.
	trainer = Seq2SeqTrainer(
	    model=model,
	    args=training_args,
	    train_dataset=tokenized_datasets['train'],
	    eval_dataset=tokenized_datasets['test'],
	    tokenizer=tokenizer,
	    data_collator=data_collator,
	)

	# Train the model
	trainer.train()

	# Persist the final fine-tuned weights explicitly. Checkpoints are only
	# written every `save_steps` (500) steps, so a short run could otherwise
	# finish without leaving any saved model in output_dir.
	trainer.save_model()

	# Evaluate the model
	eval_results = trainer.evaluate()
	print("Evaluation Results:", eval_results)

	# Test the model with sample inputs
	def translate_text(text):
	    """Translate English text to Hindi with the module-level model.

	    Args:
	        text: The English input handed to the tokenizer; the visible caller
	            passes a single string.

	    Returns:
	        A list with one decoded string (special tokens stripped) per
	        sequence returned by ``model.generate``.
	    """
	    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
	    # The tokenizer creates CPU tensors, but training may have moved the model
	    # to an accelerator; generate() fails unless both live on the same device.
	    inputs = inputs.to(model.device)
	    translated = model.generate(**inputs)
	    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

	# Smoke-test the fine-tuned model on one sentence and show the source text
	# next to the first returned translation.
	sample_text = "How are you?"
	hindi_translation = translate_text(sample_text)
	print(
	    f"English: {sample_text}",
	    f"Hindi: {hindi_translation[0]}",
	    sep="\n",
	)