innocentpeter committed
Commit 6089f58 · verified · 1 Parent(s): c3a047c

Upload 7 files
training/data/benin_en.txt ADDED
File without changes
training/data/hausa_en.txt ADDED
@@ -0,0 +1,5 @@
+ Yaya kake \t How are you
+ Lafiya lau \t I am fine
+ Na gode \t Thank you
+ Don Allah \t Please
+ Ya isa \t Enough
training/data/igbo_en.txt ADDED
File without changes
training/data/yoruba_en.txt ADDED
@@ -0,0 +1,5 @@
+ Bawo ni \t How are you
+ Mo wa daadaa \t I am fine
+ E se \t Thank you
+ Jowo \t Please
+ O to \t Enough
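
Note that train_trenslation.py (below) loads a JSONL file with "src"/"tgt" fields, while these data files are tab-separated plain text. A minimal conversion sketch bridges the two; the txt_to_jsonl helper and the .jsonl output path are illustrative and not part of the commit, and it assumes the "\t" rendered above stands for a separator in the file (either a real tab or a literal backslash-t).

import json

def txt_to_jsonl(txt_path, jsonl_path):
    with open(txt_path, encoding="utf-8") as fin, \
         open(jsonl_path, "w", encoding="utf-8") as fout:
        for raw in fin:
            # Normalize a literal backslash-t to a real tab, then split once
            # into source and target.
            line = raw.strip().replace("\\t", "\t")
            if not line or "\t" not in line:
                continue
            src, tgt = (part.strip() for part in line.split("\t", 1))
            fout.write(json.dumps({"src": src, "tgt": tgt}, ensure_ascii=False) + "\n")

txt_to_jsonl("training/data/hausa_en.txt", "training/data/hausa_en.jsonl")
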
training/outputs/model/text.txt ADDED
File without changes
training/outputs/text.py ADDED
File without changes
training/train_trenslation.py ADDED
@@ -0,0 +1,50 @@
+ from datasets import load_dataset
+ from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
+
+ MODEL_NAME = "Helsinki-NLP/opus-mt-ha-en"  # Hausa-English base model
+ OUTPUT_DIR = "./training/outputs/model"
+
+ def train_from_jsonl(jsonl_path):
+     dataset = load_dataset("json", data_files={"train": jsonl_path}, split="train")
+
+     # Hold out 10% of the pairs for validation
+     dataset = dataset.train_test_split(test_size=0.1)
+
+     tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
+     model = MarianMTModel.from_pretrained(MODEL_NAME)
+
+     def preprocess(batch):
+         # text_target tokenizes the labels with the target-side vocabulary;
+         # padding is left to the collator so padded label positions get -100
+         # and are ignored by the loss.
+         return tokenizer(batch["src"], text_target=batch["tgt"], truncation=True, max_length=128)
+
+     tokenized = dataset.map(preprocess, batched=True, remove_columns=["src", "tgt"])
+     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
+
+     training_args = Seq2SeqTrainingArguments(
+         output_dir=OUTPUT_DIR,
+         evaluation_strategy="epoch",
+         learning_rate=5e-5,
+         per_device_train_batch_size=8,
+         per_device_eval_batch_size=8,
+         num_train_epochs=3,
+         weight_decay=0.01,
+         save_total_limit=2,
+         predict_with_generate=True,
+         logging_dir="./training/logs",
+     )
+
+     trainer = Seq2SeqTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized["train"],
+         eval_dataset=tokenized["test"],
+         tokenizer=tokenizer,
+         data_collator=data_collator,
+     )
+
+     trainer.train()
+     trainer.save_model(OUTPUT_DIR)
+     tokenizer.save_pretrained(OUTPUT_DIR)
+     print("✅ Training complete. Model saved at", OUTPUT_DIR)
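
The script defines train_from_jsonl() but never invokes it. A minimal driver and smoke test might look like the following sketch; the JSONL path is an assumption (it presumes the tab-separated files above were converted first, as in the earlier sketch), and the inference step simply reloads the artifacts the script saves to OUTPUT_DIR.

from transformers import MarianTokenizer, MarianMTModel

# Train on a converted pair file (path is illustrative, not in the commit).
train_from_jsonl("training/data/hausa_en.jsonl")

# Reload the saved artifacts and translate one line from the data file.
tokenizer = MarianTokenizer.from_pretrained("./training/outputs/model")
model = MarianMTModel.from_pretrained("./training/outputs/model")
batch = tokenizer(["Yaya kake"], return_tensors="pt")
generated = model.generate(**batch, max_new_tokens=64)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))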