rishitpant committed on
Commit
de0e425
·
verified ·
1 Parent(s): 28db6e9

Upload train_model.py

Browse files
Files changed (1) hide show
  1. train_model.py +65 -0
train_model.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """train_model.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1BMInZz4vjJ1PfgTbbqIknpJYcbM5cwV0
8
+ """
9
+
10
+ import torch
11
+ import numpy as np
12
+ from datasets import load_dataset
13
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
14
+
15
+ print("Downloading dataset...")
16
+ dataset = load_dataset("papluca/language-identification", split="train")
17
+
18
+ target_langs = {'en', 'fr', 'es', 'de'}
19
+ filtered_dataset = dataset.filter(lambda example: example['labels'] in target_langs)
20
+
21
+ label2id = {"en": 0, "fr": 1, "es": 2, "de": 3}
22
+ id2label = {0: "en", 1: "fr", 2: "es", 3: "de"}
23
+
24
+ model_ckpt = "distilbert-base-multilingual-cased"
25
+ tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
26
+
27
+ def preprocess(examples):
28
+ tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64)
29
+ tokenized["labels"] = [label2id[lang] for lang in examples["labels"]]
30
+ return tokenized
31
+
32
+ print("Preprocessing data...")
33
+
34
+ train_subset = filtered_dataset.shuffle(seed=42).select(range(1500))
35
+ tokenized_data = train_subset.map(preprocess, batched=True)
36
+
37
+ model = AutoModelForSequenceClassification.from_pretrained(
38
+ model_ckpt,
39
+ num_labels=4,
40
+ id2label=id2label,
41
+ label2id=label2id
42
+ )
43
+
44
+ args = TrainingArguments(
45
+ output_dir="my_real_model",
46
+ learning_rate=2e-5,
47
+ per_device_train_batch_size=16,
48
+ num_train_epochs=2,
49
+ weight_decay=0.01,
50
+ save_strategy="no",
51
+ use_cpu=True
52
+
53
+ trainer = Trainer(
54
+ model=model,
55
+ args=args,
56
+ train_dataset=tokenized_data,
57
+ tokenizer=tokenizer,
58
+ )
59
+
60
+ print("Starting training...")
61
+ trainer.train()
62
+
63
+ print("Saving model to './production_model'...")
64
+ trainer.save_model("production_model")
65
+ print("Done!")