# -*- coding: utf-8 -*-
"""train_model.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1BMInZz4vjJ1PfgTbbqIknpJYcbM5cwV0
"""

import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

print("Downloading dataset...")
dataset = load_dataset("papluca/language-identification", split="train")

# Keep only the four target languages; the dataset's "labels" column holds
# language codes as strings.
target_langs = {"en", "fr", "es", "de"}
filtered_dataset = dataset.filter(lambda example: example["labels"] in target_langs)

label2id = {"en": 0, "fr": 1, "es": 2, "de": 3}
id2label = {0: "en", 1: "fr", 2: "es", 3: "de"}

model_ckpt = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def preprocess(examples):
    # Tokenize a batch of texts and replace the string language codes
    # with the integer ids the model expects.
    tokenized = tokenizer(
        examples["text"], truncation=True, padding="max_length", max_length=64
    )
    tokenized["labels"] = [label2id[lang] for lang in examples["labels"]]
    return tokenized


print("Preprocessing data...")
train_subset = filtered_dataset.shuffle(seed=42).select(range(1500))
tokenized_data = train_subset.map(preprocess, batched=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=4,
    id2label=id2label,
    label2id=label2id,
)

args = TrainingArguments(
    output_dir="my_real_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="no",
    use_cpu=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
)

print("Starting training...")
trainer.train()

print("Saving model to './production_model'...")
trainer.save_model("production_model")
print("Done!")
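
# A minimal sketch of how the saved model could be loaded back for inference,
# assuming the standard transformers pipeline API. The directory name matches
# the save_model() call above; the sample sentences are illustrative
# assumptions, not part of the original notebook.
from transformers import pipeline

classifier = pipeline("text-classification", model="production_model")

for sentence in [
    "Hello, how are you?",
    "Bonjour tout le monde",
    "¿Dónde está la biblioteca?",
]:
    prediction = classifier(sentence)[0]
    print(f"{sentence!r} -> {prediction['label']} (score={prediction['score']:.3f})")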