Spaces:
Build error
Build error
| import torch | |
| import torch.nn as nn | |
| from torch.utils.data import DataLoader, Dataset | |
| from transformers import MarianMTModel, MarianTokenizer | |
# Define dataset class
class TranslationDataset(Dataset):
    """Parallel-sentence dataset yielding fixed-length tensors for seq2seq training.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``,
    all 1-D tensors of length ``max_length`` so the default ``DataLoader``
    collate function can stack them into a batch.

    Args:
        source_sentences: Sequence of source-language strings.
        target_sentences: Sequence of target-language strings, aligned
            index-by-index with ``source_sentences``.
        tokenizer: Hugging Face style tokenizer; it is called with
            ``text_target=`` so the target side is encoded as labels.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If the two sentence sequences differ in length.
    """

    def __init__(self, source_sentences, target_sentences, tokenizer, max_length=128):
        if len(source_sentences) != len(target_sentences):
            raise ValueError(
                'source_sentences and target_sentences must have the same length '
                f'(got {len(source_sentences)} and {len(target_sentences)})'
            )
        self.source_sentences = source_sentences
        self.target_sentences = target_sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.source_sentences)

    def __getitem__(self, idx):
        """Tokenize the pair at ``idx`` and return its tensors."""
        encoded = self.tokenizer(
            self.source_sentences[idx],
            text_target=self.target_sentences[idx],
            max_length=self.max_length,
            # Pad to a fixed length: per-item padding would leave items with
            # different lengths, which the default collate cannot stack.
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # return_tensors='pt' adds a leading batch dimension of size 1; drop it
        # so that DataLoader produces (batch, seq) rather than (batch, 1, seq).
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)
        labels = encoded['labels'].squeeze(0).clone()

        # -100 is the index ignored by the cross-entropy loss, so padded label
        # positions do not contribute to training.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            labels[labels == pad_id] = -100

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels,
        }
# Define training function
def train(model, dataloader, optimizer, criterion, num_epochs):
    """Fine-tune ``model`` for ``num_epochs`` passes over ``dataloader``.

    Args:
        model: Module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object exposing ``.loss``.
        dataloader: Iterable of dict batches with ``input_ids`` and ``labels``
            tensors, and optionally ``attention_mask``.
        optimizer: Optimizer over the model's parameters.
        criterion: Unused; the loss comes from ``outputs.loss``. Kept so
            existing call sites remain valid.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        A list with the mean batch loss of each epoch (``nan`` for an epoch
        that saw no batches).
    """
    # Take the device from the model itself rather than relying on a
    # module-level ``device`` global being defined.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            # Forward the mask when the dataset supplies one so padded
            # positions are not attended to.
            attention_mask = batch.get('attention_mask')
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)

            optimizer.zero_grad()
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Count batches ourselves so an empty loader cannot divide by zero.
        mean_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(mean_loss)
        print(f'Epoch {epoch + 1}, Loss: {mean_loss}')
    return epoch_losses
# Select the compute device; it must exist before the model is moved onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load tokenizer and model
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-fr')
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-fr').to(device)

# Parallel training corpus (English -> French), aligned index-by-index.
# NOTE(review): placeholder examples so the script runs end to end; these
# names were never defined in this file. Replace with the real corpus.
source_sentences = [
    'Hello, how are you?',
    'I would like a cup of coffee.',
    'The weather is nice today.',
]
target_sentences = [
    'Bonjour, comment allez-vous ?',
    'Je voudrais une tasse de café.',
    "Il fait beau aujourd'hui.",
]

# Prepare dataset and dataloader
dataset = TranslationDataset(source_sentences, target_sentences, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define optimizer and criterion
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# The criterion is passed to train() to satisfy its signature.
criterion = nn.CrossEntropyLoss()

# Train the model
train(model, dataloader, optimizer, criterion, num_epochs=10)

# Save the trained model
torch.save(model.state_dict(), 'translation_model.pth')