Spaces:
Sleeping
Sleeping
| import torch | |
| from transformers import Trainer, TrainingArguments | |
| from app.model.model import NigerianLanguageModel | |
| from app.model.config import ModelConfig | |
def train_model(
    model: NigerianLanguageModel,
    train_dataset,
    eval_dataset=None,
    *,
    output_dir: str = "outputs",
    save_steps: int = 500,
):
    """Train the wrapped model with the Hugging Face ``Trainer``.

    Args:
        model: Wrapper whose ``model`` attribute is handed to ``Trainer`` and
            whose ``config`` supplies ``num_train_epochs``, ``batch_size``
            and ``learning_rate``.
        train_dataset: Dataset forwarded to ``Trainer`` as ``train_dataset``.
        eval_dataset: Optional dataset forwarded to ``Trainer`` as
            ``eval_dataset``.
        output_dir: Directory passed to ``TrainingArguments`` as
            ``output_dir``. Defaults to ``"outputs"``, the value that was
            previously hard-coded.
        save_steps: Value passed to ``TrainingArguments`` as ``save_steps``.
            Defaults to ``500``, the value that was previously hard-coded.
    """
    hyperparams = model.config
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=hyperparams.num_train_epochs,
        per_device_train_batch_size=hyperparams.batch_size,
        learning_rate=hyperparams.learning_rate,
        save_steps=save_steps,
    )

    # NOTE(review): eval_dataset is forwarded, but no evaluation schedule is
    # configured in TrainingArguments here -- presumably evaluation only runs
    # if the caller triggers it separately; confirm against the installed
    # transformers version.
    trainer = Trainer(
        model=model.model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )
    trainer.train()
| # scripts/preprocess.py | |
| from app.utils.data_preprocessing import load_language_data, preprocess_text | |
| import os | |
def main():
    """Preprocess the raw text of each supported language and write it to disk.

    For every language in the built-in list, the raw data is loaded from
    ``data/raw`` via ``load_language_data``, each item is passed through
    ``preprocess_text``, and the results are written to
    ``data/processed/<language>/processed_texts.txt`` (UTF-8), one text per
    line. The output directory is created if it does not exist.
    """
    languages = ["yoruba", "igbo", "hausa"]
    for lang in languages:
        data = load_language_data("data/raw", lang)
        processed_data = [preprocess_text(text) for text in data]

        output_dir = f"data/processed/{lang}"
        os.makedirs(output_dir, exist_ok=True)

        with open(f"{output_dir}/processed_texts.txt", 'w', encoding='utf-8') as f:
            # writelines() inserts no separators, so terminate each text
            # explicitly; texts that already end with a newline are written
            # unchanged to avoid introducing blank lines.
            # NOTE(review): assumes preprocess_text returns str -- confirm.
            f.writelines(
                text if text.endswith("\n") else f"{text}\n"
                for text in processed_data
            )