| | from transformers import BertTokenizer, BertForTokenClassification, Trainer, TrainingArguments |
| | from datasets import Dataset |
| | import json |
| | import torch |
| |
|
| | |
# Italian uncased BERT checkpoint; the classification head is freshly
# initialized below for a 5-label token-classification task.
checkpoint = "dbmdz/bert-base-italian-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForTokenClassification.from_pretrained(checkpoint, num_labels=5)
| |
|
| | |
# Load the entity-annotated training data.  Each entry is expected to carry
# a "query" string and an "entities" label list (see prepare_dataset).
# Explicit UTF-8 avoids depending on the platform default encoding, which
# matters for Italian text on non-UTF-8 locales.
with open('entity_dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)
| |
|
| | |
def prepare_dataset(dataset):
    """Tokenize queries and attach per-token labels as a HF ``Dataset``.

    Args:
        dataset: iterable of dicts, each with a "query" string and an
            "entities" list of integer label ids.

    Returns:
        datasets.Dataset with the tokenizer's fields plus a "labels" column.

    NOTE(review): this assumes each entry["entities"] is already aligned
    one-to-one with the tokenizer's subword tokens.  If the labels are
    word-level, they must be realigned via ``encodings.word_ids()`` before
    training -- confirm the dataset format.
    """
    input_texts = [entry["query"] for entry in dataset]
    labels = [entry["entities"] for entry in dataset]

    encodings = tokenizer(input_texts, truncation=True, padding=True, max_length=512)

    # With padding=True every encoded sequence shares one padded length.
    # Pad/truncate each label sequence to that length so torch.tensor()
    # receives a rectangular list (the original code raised on ragged label
    # lists).  -100 is the index ignored by the token-classification loss.
    seq_len = len(encodings["input_ids"][0])
    padded_labels = [
        (seq + [-100] * (seq_len - len(seq)))[:seq_len] for seq in labels
    ]
    encodings["labels"] = torch.tensor(padded_labels)

    return Dataset.from_dict(encodings)
| |
|
| | |
| | train_dataset = prepare_dataset(dataset) |
| |
|
| | |
# Training hyperparameters.  Evaluation is disabled because no eval_dataset
# is passed to the Trainer below; with evaluation_strategy="epoch" and no
# eval set, Trainer raises a ValueError before training starts.
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints and trainer state
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)
| |
|
| | |
# Trainer drives the optimization loop, batching, and device placement.
# Only a train split is supplied, so no evaluation metrics are produced.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop.
trainer.train()
| |
|
| | |
# Persist the fine-tuned weights together with the matching tokenizer so the
# pair can be reloaded later via from_pretrained on the same directory.
save_dir = "./hotel_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
| |
|