---
datasets:
- asr-malayalam/indicvoices-v1a
- Tensoic/GPTeacher-Malayalam
language:
- ml
- en
metrics:
- accuracy
base_model:
- google/mt5-small
pipeline_tag: translation
---

Training script for fine-tuning `google/mt5-small` on English→Malayalam translation:

```python
import argparse

import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

from utils import compute_metrics


def load_translation_dataset(file_path):
    """Load the CSV corpus and split it into train/validation sets."""
    df = pd.read_csv(file_path)
    dataset = Dataset.from_pandas(df)
    # Hold out 10% for validation; fix the seed so the split is reproducible.
    return dataset.train_test_split(test_size=0.1, seed=42)


def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize source and target texts.

    Padding is deferred to DataCollatorForSeq2Seq, which pads labels with
    -100 so that padding tokens are ignored by the loss.
    """
    return tokenizer(
        examples["english_text"],
        text_target=examples["malayalam_text"],
        max_length=max_length,
        truncation=True,
    )


def main(args):
    # Load tokenizer and model
    model_name = "google/mt5-small"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Load and split the dataset
    dataset = load_translation_dataset(args.data_path)

    # Tokenize both splits, dropping the raw text columns
    tokenized_datasets = dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir="./model",
        evaluation_strategy="epoch",
        learning_rate=args.learning_rate,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        num_train_epochs=args.epochs,
        weight_decay=0.01,
        save_total_limit=2,
        predict_with_generate=True,  # generate sequences during evaluation
        logging_dir="./logs",
        logging_steps=100,
        push_to_hub=True,  # requires a prior `huggingface-cli login`
    )

    # Dynamic padding; label padding uses -100 so it is masked in the loss
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Train, then save the final model and tokenizer
    trainer.train()
    trainer.save_model("./model")
    tokenizer.save_pretrained("./model")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="dataset/malayalam_dataset.csv")
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--learning_rate", type=float, default=2e-5)
    main(parser.parse_args())
```
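The script expects a CSV file with `english_text` and `malayalam_text` columns. A minimal sketch of producing a file in that shape; the two rows are illustrative only, a real corpus would pair many English sentences with their Malayalam translations:

```python
import os

import pandas as pd

# Hypothetical two-row example of the expected CSV schema.
os.makedirs("dataset", exist_ok=True)
pd.DataFrame(
    {
        "english_text": ["Hello", "Thank you"],
        "malayalam_text": ["ഹലോ", "നന്ദി"],
    }
).to_csv("dataset/malayalam_dataset.csv", index=False)
```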
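The script imports `compute_metrics` from a local `utils` module that is not shown here. A minimal sketch of what such a module might contain, assuming sacreBLEU via the `evaluate` library; the card metadata lists accuracy, but BLEU is the usual choice for translation, so the metric and all names below are assumptions rather than the repository's actual code:

```python
# utils.py -- hypothetical sketch; the real module is not included above.
import numpy as np
import evaluate
from transformers import AutoTokenizer

# Assumption: the tokenizer matches the one used for training.
_tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
_bleu = evaluate.load("sacrebleu")


def compute_metrics(eval_preds):
    """Decode generated ids and references, then score with sacreBLEU."""
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 marks positions ignored by the loss; map them back to pad ids
    # so the tokenizer can decode the label sequences.
    labels = np.where(labels != -100, labels, _tokenizer.pad_token_id)
    decoded_preds = _tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = _tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = _bleu.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )
    return {"bleu": result["score"]}
```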
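Assuming the script is saved as `train.py` (the filename is not given above), training runs as `python train.py --epochs 3 --batch_size 8 --learning_rate 2e-5`. Afterwards the checkpoint saved to `./model` can be loaded for inference; a minimal sketch, with an illustrative input sentence:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("./model")
model = AutoModelForSeq2SeqLM.from_pretrained("./model")

# Translate one English sentence to Malayalam.
inputs = tokenizer("How are you today?", return_tensors="pt")
output_ids = model.generate(**inputs, max_length=128, num_beams=4)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```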