innocentpeter committed
Commit d0c0c08 · verified · 1 Parent(s): 00e5e26

Upload 8 files

Files changed (1)
  1. training/train_trenslation.py +55 -55
training/train_trenslation.py CHANGED
@@ -1,55 +1,55 @@
# voice_translator/training/train_translation.py

from datasets import load_dataset
from transformers import (
    MarianTokenizer,
    MarianMTModel,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

MODEL_NAME = "Helsinki-NLP/opus-mt-mul-en"
OUTPUT_DIR = "./training/outputs/model"


def train_from_jsonl(file_path):
    # Load the parallel corpus: a JSONL file with "src" and "tgt" fields per record
    dataset = load_dataset("json", data_files=file_path, split="train")

    tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
    model = MarianMTModel.from_pretrained(MODEL_NAME)

    def preprocess(batch):
        # Tokenize source sentences as model inputs
        inputs = tokenizer(batch["src"], truncation=True, padding="max_length", max_length=128)
        # Tokenize targets via text_target so the target-side tokenizer is applied
        targets = tokenizer(text_target=batch["tgt"], truncation=True, padding="max_length", max_length=128)
        inputs["labels"] = targets["input_ids"]
        return inputs

    tokenized = dataset.map(preprocess, batched=True)

    # Pads inputs and labels per batch for seq2seq training
    collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR,
        evaluation_strategy="no",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        save_total_limit=1,
        predict_with_generate=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized,
        tokenizer=tokenizer,
        data_collator=collator,
    )

    trainer.train()
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    return f"✅ Model trained and saved to {OUTPUT_DIR}"
 
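For context, train_from_jsonl reads a JSON Lines file in which every line is a record with "src" and "tgt" text fields. A minimal usage sketch, assuming the script is importable from the repo root; the file name and example sentences are illustrative, not part of the commit:

    # data/pairs.jsonl -- one JSON object per line (illustrative content)
    # {"src": "bonjour le monde", "tgt": "hello world"}
    # {"src": "merci beaucoup", "tgt": "thank you very much"}

    from training.train_trenslation import train_from_jsonl

    # Fine-tunes Helsinki-NLP/opus-mt-mul-en on the pairs and saves the result
    print(train_from_jsonl("data/pairs.jsonl"))
    # -> ✅ Model trained and saved to ./training/outputs/model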