In [1]:
import json
import wandb
from datasets import Dataset
from seqeval.metrics import classification_report
import wandb



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _find_entity_span(lower_words, search_terms):
    """Return ``(start, width)`` of the leftmost, longest match of any term, or ``None``.

    Each term is split on whitespace and compared against consecutive items of
    ``lower_words``. Terms that split into zero words are skipped so that an
    empty entity text is reported as "not found" instead of silently matching.
    """
    best = None
    for term in search_terms:
        term_words = term.split()
        if not term_words:
            continue
        width = len(term_words)
        for i in range(len(lower_words) - width + 1):
            if lower_words[i:i + width] == term_words:
                # Prefer the earliest position; at the same position prefer the
                # longer phrase so "ve tham quan" is not cut down to "ve".
                if best is None or i < best[0] or (i == best[0] and width > best[1]):
                    best = (i, width)
                break  # only the first occurrence of each term is considered
    return best


def prepare_data(file_path):
    """Load an annotated query file and convert it to word-level BIO labels.

    Args:
        file_path: Path to a UTF-8 JSON file with a top-level ``"queries"``
            list. Every query provides a ``"text"`` string and an
            ``"entities"`` list whose items unpack as
            ``(start, end, entity_type, entity_text)``. The ``start``/``end``
            values are not used; entities are located by matching words.

    Returns:
        ``Dataset.from_list`` of ``{'words': [...], 'labels': [...]}`` records.
        ``words`` keeps the original capitalisation; ``labels`` holds ``'O'``,
        ``'B-<type>'`` or ``'I-<type>'`` for each whitespace-separated word.

    Matching is case-insensitive. For ``SERVICE`` entities the entity text is
    looked up in ``service_mapping`` and any listed synonym may match. A
    warning is printed for every entity that cannot be located in its text.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Synonyms must be real UTF-8 Vietnamese text; mis-decoded bytes could
    # never equal the words of an actual query.
    service_mapping = {
        "hotel": ["hotel", "hotels", "khách sạn", "khach san", "ks"],
        "flight": ["flight", "flights", "vé máy bay", "máy bay", "may bay"],
        "car rental": ["car rental", "car rentals", "thuê xe", "xe"],
        "ticket": ["ticket", "tickets", "vé", "vé tham quan", "ve", "ve tham quan"],
        "tour": ["tour", "tours", "du lịch", "du lich"]
    }

    processed_data = []
    for query in data['queries']:
        words = query['text'].split()
        labels = ['O'] * len(words)
        lower_words = [w.lower() for w in words]

        for _start, _end, entity_type, entity_text in query['entities']:
            entity_key = entity_text.lower()
            if entity_type == "SERVICE":
                search_terms = service_mapping.get(entity_key, [entity_key])
            else:
                search_terms = [entity_key]

            span = _find_entity_span(lower_words, search_terms)
            if span is None:
                print(f"Warning: Entity '{entity_text}' not found in text '{query['text']}'")
                continue

            first, width = span
            labels[first] = f'B-{entity_type}'
            for offset in range(1, width):
                labels[first + offset] = f'I-{entity_type}'

        # Keep the original capitalization in 'words'
        processed_data.append({'words': words, 'labels': labels})

    return Dataset.from_list(processed_data)

# Load data
# The dataset directory defaults to the original location but can be
# overridden with the NER_DATA_DIR environment variable, so the notebook can
# run on another machine without editing this cell.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get("NER_DATA_DIR", "/home/ebk/Desktop/NER model"))
train_dataset = prepare_data(DATA_DIR / "train_dataset.json")
eval_dataset = prepare_data(DATA_DIR / "eval_dataset.json")


In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import DataCollatorForTokenClassification
import numpy as np

# Prepare label list and id2label, label2id mappings.
# Defined before the model is created so the classifier head size and the
# label names stored in the model config both derive from this one list.
label_list = ["O", "B-SERVICE", "I-SERVICE", "B-LOCATION", "I-LOCATION"]
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}

# Load tokenizer and model
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),  # was a hard-coded 5
    id2label=id2label,           # saved with the checkpoint so inference code
    label2id=label2id,           # can read tag names from model.config
)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: Batch mapping with ``"words"`` (list of word lists) and
            ``"labels"`` (list of tag-name lists, one tag per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every
        sequence, the first sub-token of each word carries ``label2id`` of
        that word's tag; positions whose word id is ``None`` and the
        remaining sub-tokens of a word carry ``-100``.
    """
    # NOTE(review): padding=True pads at map time; a data collator is also
    # used later in this notebook — confirm both are needed.
    encoding = tokenizer(
        examples["words"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        max_length=256,
    )

    aligned = []
    for row, word_tags in enumerate(examples["labels"]):
        row_ids = []
        last_seen = None
        for current in encoding.word_ids(batch_index=row):
            starts_new_word = current is not None and current != last_seen
            row_ids.append(label2id[word_tags[current]] if starts_new_word else -100)
            last_seen = current
        aligned.append(row_ids)

    encoding["labels"] = aligned
    return encoding

# Data augmentation: add lowercase versions
def augment_data(examples):
    """Return the batch followed by a lowercase copy of every sentence.

    Args:
        examples: Batch mapping with ``"words"`` (list of word lists) and
            ``"labels"`` (list of label lists).

    Returns:
        A new mapping whose ``"words"`` holds the original sentences and then
        their lowercased versions, and whose ``"labels"`` repeats the label
        lists once so both halves stay aligned.
    """
    original_sentences = examples["words"]

    lowered_sentences = []
    for sentence in original_sentences:
        lowered_sentences.append([token.lower() for token in sentence])

    return {
        "words": original_sentences + lowered_sentences,
        "labels": examples["labels"] * 2,
    }

# Apply data augmentation, then tokenization, to both splits.
# NOTE(review): the eval split is augmented as well, so every eval example is
# scored twice (original + lowercase) — confirm this is intended.
def _augment_split(split):
    """Replace the split's columns with the augmented words/labels."""
    return split.map(augment_data, batched=True, remove_columns=split.column_names)


def _tokenize_split(split):
    """Add tokenizer outputs and aligned label ids to the split."""
    return split.map(tokenize_and_align_labels, batched=True)


augmented_train = _augment_split(train_dataset)
augmented_eval = _augment_split(eval_dataset)

tokenized_train = _tokenize_split(augmented_train)
tokenized_eval = _tokenize_split(augmented_eval)

# Authenticate with Weights & Biases and open a run.
# SECURITY: an API key must never be stored in the notebook. The key that was
# previously hard-coded here has been exposed and should be revoked in the
# W&B account settings. Provide the replacement through the WANDB_API_KEY
# environment variable instead.
import os

# None when the variable is unset; wandb.login then relies on whatever
# credentials wandb already has configured (see wandb.login documentation).
wandb_token = os.environ.get("WANDB_API_KEY")
wandb.login(key=wandb_token)
run = wandb.init(
    project='',
    job_type="training",
    anonymous="allow"
)
# Define training arguments.
# The hyper-parameters are gathered in one mapping so they can be inspected
# before the TrainingArguments object is built.
# NOTE(review): the logged run ended at global_step=680, so a 500-step warmup
# spans most of training — confirm this is intended.
_training_kwargs = {
    # checkpoint location and retention
    "output_dir": "./results",
    "save_total_limit": 2,
    # evaluation and checkpointing follow the same schedule
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    # best checkpoint = lowest evaluation loss
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    "greater_is_better": False,
    # optimisation
    "learning_rate": 2e-5,
    "weight_decay": 0.01,
    "warmup_steps": 500,
    "lr_scheduler_type": "linear",
    "num_train_epochs": 10,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    # logging
    "logging_dir": './logs',
    "logging_steps": 100,
    "report_to": "wandb",
}
training_args = TrainingArguments(**_training_kwargs)

# Stop when the monitored metric has not improved for 3 consecutive evaluations.
_early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Collator that assembles token-classification batches with this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Wire model, data and configuration together.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    callbacks=[_early_stopping],
)

# Run fine-tuning; as the last expression its TrainOutput is displayed.
trainer.train()


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 544/544 [00:00<00:00, 62863.71 examples/s]
Map: 100%|██████████| 48/48 [00:00<00:00, 23307.08 examples/s]
Map: 100%|██████████| 1088/1088 [00:00<00:00, 35149.87 examples/s]
Map: 100%|██████████| 96/96 [00:00<00:00, 22730.79 examples/s]
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mprodranek007[0m ([33mprodranek007-eh[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ebk/.netrc




Epoch,Training Loss,Validation Loss
1,No log,1.455081
2,1.499700,0.568691
3,0.661400,0.099232
4,0.661400,0.03271
5,0.090000,0.021099
6,0.025300,0.016014
7,0.025300,0.027342
8,0.011600,0.000233
9,0.005600,0.000714
10,0.005600,0.000568


TrainOutput(global_step=680, training_loss=0.3376634743283777, metrics={'train_runtime': 977.2101, 'train_samples_per_second': 11.134, 'train_steps_per_second': 0.696, 'total_flos': 99948720268800.0, 'train_loss': 0.3376634743283777, 'epoch': 10.0})

In [4]:
# After training: write the model and its tokenizer to the same directory so
# the pair can be reloaded together.
_best_model_dir = "./results/best_model"
trainer.save_model(_best_model_dir)
tokenizer.save_pretrained(_best_model_dir)

('./results/best_model/tokenizer_config.json',
 './results/best_model/special_tokens_map.json',
 './results/best_model/tokenizer.json')

In [5]:
# Evaluate the model on the eval split given to the Trainer and show the metrics.
eval_results = trainer.evaluate()
_eval_summary = f"Evaluation results: {eval_results}"
print(_eval_summary)

# Function to align predictions with labels
def align_predictions(predictions, label_ids):
    """Convert raw scores and gold label ids into per-sequence tag-name lists.

    Args:
        predictions: Score array; the class is taken as the argmax over axis 2.
        label_ids: Array of gold label ids indexed as ``[sequence, position]``;
            positions holding ``-100`` are ignored.

    Returns:
        ``(preds_list, out_label_list)``: two lists with one inner list per
        sequence, containing predicted and gold tag names (via ``id2label``)
        for every position whose gold id is not ``-100``.
    """
    predicted_ids = np.argmax(predictions, axis=2)
    n_rows, n_cols = predicted_ids.shape

    gold_tags, predicted_tags = [], []
    for row in range(n_rows):
        # Positions that carry a real gold label in this sequence.
        kept = [col for col in range(n_cols) if label_ids[row, col] != -100]
        gold_tags.append([id2label[label_ids[row][col]] for col in kept])
        predicted_tags.append([id2label[predicted_ids[row][col]] for col in kept])

    return predicted_tags, gold_tags

# Run the model over the tokenized eval split and keep scores and gold ids.
test_results = trainer.predict(tokenized_eval)
predictions = test_results.predictions
labels = test_results.label_ids

# Turn ids into tag names, dropping positions labelled -100.
preds_list, out_label_list = align_predictions(predictions, labels)

# Entity-level precision / recall / F1 (gold first, predictions second).
_report = classification_report(out_label_list, preds_list)
print(_report)

Evaluation results: {'eval_loss': 0.00023286677605938166, 'eval_runtime': 0.5843, 'eval_samples_per_second': 164.298, 'eval_steps_per_second': 10.269, 'epoch': 10.0}
              precision    recall  f1-score   support

    LOCATION       1.00      1.00      1.00        96
     SERVICE       1.00      1.00      1.00        96

   micro avg       1.00      1.00      1.00       192
   macro avg       1.00      1.00      1.00       192
weighted avg       1.00      1.00      1.00       192



In [9]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Tag names used to decode predicted class ids (list position == class id).
id2label = dict(enumerate(["O", "B-SERVICE", "I-SERVICE", "B-LOCATION", "I-LOCATION"]))

# Reload the saved model and tokenizer from the directory written after training.
# These assignments rebind the `tokenizer` and `model` names used earlier.
model_path = "./results/best_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Canonical service name -> surface forms (English and Vietnamese, with and
# without diacritics). The literals must be real UTF-8 text; mis-decoded bytes
# can never equal what a user types.
service_mapping = {
    "hotel": ["hotel", "hotels", "khách sạn", "khach san", "ks"],
    "flight": ["flight", "flights", "vé máy bay", "máy bay", "may bay"],
    "car rental": ["car rental", "car rentals", "thuê xe", "xe"],
    "ticket": ["ticket", "tickets", "vé", "vé tham quan", "ve", "ve tham quan"],
    "tour": ["tour", "tours", "du lịch", "du lich"]
}


def map_service(service):
    """Map a predicted service phrase to its canonical service name.

    Args:
        service: Text of a predicted SERVICE entity (any capitalisation).

    Returns:
        The first key of ``service_mapping`` (in dictionary order) one of whose
        surface forms occurs in ``service`` as a whole word or phrase, or
        ``None`` when nothing matches.

    Surface forms are matched on word boundaries instead of as raw substrings,
    so short forms such as "ks", "xe" or "ve" no longer fire inside unrelated
    words like "thanks" or "xem".
    """
    import re

    service = service.lower()
    for key, values in service_mapping.items():
        for value in values:
            # The form must not be glued to a word character on either side.
            pattern = r"(?<!\w)" + re.escape(value) + r"(?!\w)"
            if re.search(pattern, service):
                return key
    return None

def predict(text):
    """Extract SERVICE and LOCATION entities from a query string.

    The text is split on whitespace and the word list is tokenized with
    ``is_split_into_words=True`` — the same way the training data was
    tokenized — so every predicted label maps back to exactly one word of
    ``text.split()``. The label of a word is the label predicted for its first
    sub-token; special tokens are ignored.

    Args:
        text: Raw query string.

    Returns:
        ``{"SERVICE": [...], "LOCATION": [...]}``. SERVICE phrases are
        normalised through ``map_service`` (unmapped phrases are dropped) and
        only the first detected service is kept. LOCATION phrases are returned
        as they appear in the text.
    """
    entities = {"SERVICE": [], "LOCATION": []}

    words = text.split()
    if not words:
        # Nothing to tag; avoid sending an empty word list to the tokenizer.
        return entities

    # Tokenize the pre-split words so word ids index directly into `words`.
    inputs = tokenizer(
        words,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )

    # Make prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Predicted label name for every sub-token of the single input sequence.
    predictions = torch.argmax(outputs.logits, dim=2)
    predicted_labels = [id2label[p.item()] for p in predictions[0]]

    # Keep the label of the first sub-token of each word; skip special tokens
    # (word id None).
    word_labels = {}
    for word_id, label in zip(inputs.word_ids(), predicted_labels):
        if word_id is not None and word_id not in word_labels:
            word_labels[word_id] = label

    def flush(entity_type, tokens):
        """Store the finished entity, if any, in ``entities``."""
        if not entity_type:
            return
        phrase = " ".join(tokens)
        if entity_type == "SERVICE":
            mapped_service = map_service(phrase)
            if mapped_service:
                entities[entity_type].append(mapped_service)
        else:
            entities[entity_type].append(phrase)

    current_entity = None
    current_tokens = []
    for index, word in enumerate(words):
        # Words cut off by truncation have no prediction and count as "O".
        label = word_labels.get(index, "O")
        if label.startswith("B-"):
            flush(current_entity, current_tokens)
            current_entity = label[2:]
            current_tokens = [word]
        elif label.startswith("I-") and current_entity:
            current_tokens.append(word)
        else:
            flush(current_entity, current_tokens)
            current_entity = None
            current_tokens = []
    flush(current_entity, current_tokens)

    # Keep only the first service if multiple are detected
    if entities["SERVICE"]:
        entities["SERVICE"] = [entities["SERVICE"][0]]

    return entities

# Test function
def test_ner(text):
    """Run ``predict`` on ``text``, print the input and the result, and return the result."""
    print(f"Input: {text}")
    entities = predict(text)
    print("Output:", entities)
    return entities

In [17]:
# Sample queries for a quick manual check of the reloaded model.
test_texts = ["du lich china"]

for sample in test_texts:
    test_ner(sample)
    print()  # blank line after each sample

Input: du lich china
Output: {'SERVICE': ['tour'], 'LOCATION': ['china']}

