Spaces:

rishitpant
/

hf_workshop_deployment_IITM

Sleeping

App Files Files Community

rishitpant committed on Jan 29

Commit

de0e425

verified ·

1 Parent(s): 28db6e9

Upload train_model.py

Browse files

Files changed (1) hide show

train_model.py +65 -0

train_model.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# -*- coding: utf-8 -*-
+"""train_model.ipynb
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/1BMInZz4vjJ1PfgTbbqIknpJYcbM5cwV0
+"""
+import torch
+import numpy as np
+from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
+print("Downloading dataset...")
+dataset = load_dataset("papluca/language-identification", split="train")
+target_langs = {'en', 'fr', 'es', 'de'}
+filtered_dataset = dataset.filter(lambda example: example['labels'] in target_langs)
+label2id = {"en": 0, "fr": 1, "es": 2, "de": 3}
+id2label = {0: "en", 1: "fr", 2: "es", 3: "de"}
+model_ckpt = "distilbert-base-multilingual-cased"
+tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+def preprocess(examples):
+    tokenized = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=64)
+    tokenized["labels"] = [label2id[lang] for lang in examples["labels"]]
+    return tokenized
+print("Preprocessing data...")
+train_subset = filtered_dataset.shuffle(seed=42).select(range(1500))
+tokenized_data = train_subset.map(preprocess, batched=True)
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_ckpt,
+    num_labels=4,
+    id2label=id2label,
+    label2id=label2id
+)
+args = TrainingArguments(
+    output_dir="my_real_model",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    num_train_epochs=2,
+    weight_decay=0.01,
+    save_strategy="no",
+    use_cpu=True
+trainer = Trainer(
+    model=model,
+    args=args,
+    train_dataset=tokenized_data,
+    tokenizer=tokenizer,
+)
+print("Starting training...")
+trainer.train()
+print("Saving model to './production_model'...")
+trainer.save_model("production_model")
+print("Done!")