# Vietnamese_NER/src/preprocessing.py
import pandas as pd
import torch
from pyvi import ViTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from src.configs import configs

def join_tokens(tokens):
    # Join word-segmented tokens back into one space-separated string.
    return ' '.join(tokens)


def reform_raw_text(tokens):
    # Rebuild the raw (unsegmented) text by undoing the "_" word joins.
    return ' '.join(tokens).replace("_", " ")

def label(x):
    # Map numeric tag ids back to BIO tag strings.
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    return [id_tag[int(i)] for i in x]


def replace_7_8(lst):
    # Collapse the out-of-scope tag ids 7 and 8 to 0 ('O').
    return [0 if x in (7, 8) else x for x in lst]

# Merge the embedding vectors of tokens that were split into subwords by
# PhoBERT's BPE tokenizer (non-final subword pieces carry a trailing "@@").
def group_embeddings(tokens, embeddings):
    word_embeddings = []
    current_vecs = []
    for token, emb in zip(tokens, embeddings):
        if token in ["<s>", "</s>"]:
            continue
        current_vecs.append(emb)
        if not token.endswith("@@"):
            # Final piece of the current word: average its subword vectors.
            word_emb = torch.mean(torch.stack(current_vecs), dim=0)
            word_embeddings.append(word_emb)
            current_vecs = []
    if current_vecs:  # Flush any pieces left over at the end of the sentence
        word_emb = torch.mean(torch.stack(current_vecs), dim=0)
        word_embeddings.append(word_emb)
    return word_embeddings
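
# A quick sanity check for group_embeddings (hypothetical tokens; zero
# vectors are used only for shape):
# >>> toks = ["<s>", "xin@@", "chào", "Việt_Nam", "</s>"]
# >>> embs = [torch.zeros(768)] * len(toks)
# >>> len(group_embeddings(toks, embs))  # "xin@@" + "chào" merge into 1 word
# 2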

# Download the dataset from Hugging Face
def download_raw_data():
    splits = {
        'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet',
        'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet',
    }
    df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"])
    df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"])
    # Merge train and valid; the final train/val/test split happens later.
    df = pd.concat([df_train, df_valid]).reset_index(drop=True)
    return df

# Process the dataframe for EDA
def preprocess_data_for_EDA(df):
    # Tag <-> id mappings for the 7 kept classes
    tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    # Add derived columns and collapse the out-of-scope tags (7, 8) to 'O'
    df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
    df['text_withseg'] = df['tokens'].apply(join_tokens)
    df['text_raw'] = df['tokens'].apply(reform_raw_text)
    df["ner_labels"] = df.ner_tags.apply(label)
    # Rename columns: tokens, ner_tags, text_withseg, text_raw, ner_labels
    df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']
    return df
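
# For illustration, preprocess_data_for_EDA turns a hypothetical row with
# tokens ["Nguyễn_Văn_A", "ở", "Hà_Nội"] and ner_tags [1, 0, 5] into
# seg_text "Nguyễn_Văn_A ở Hà_Nội", raw_text "Nguyễn Văn A ở Hà Nội",
# and labels ['B-PER', 'O', 'B-LOC'].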

def load_phoBERT_model_and_tokenizer():
    # Load the PhoBERT tokenizer and model; use_fast=False selects the slow
    # BPE tokenizer, whose tokens keep the "@@" subword markers.
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    model = AutoModel.from_pretrained("vinai/phobert-base")
    model.eval()  # Inference only: PhoBERT is used as a frozen feature extractor
    return model, tokenizer

# Embed each sentence with PhoBERT
def create_embeddings(df, model, tokenizer):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_embeddings = []  # list of [seq_len_i, 768] tensors
    all_labels = []      # list of [seq_len_i,] tensors
    remove_index = []    # indices of skipped rows, kept for debugging
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Pull out the fields of the current row
        sentence = row['seg_text']
        gold_labels = row["id_labels"]
        # Run the sentence through the BPE tokenizer
        input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
        # Encode to produce contextual embeddings
        with torch.no_grad():
            outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
        # Merge the embeddings of subwords that the tokenizer split apart
        word_embeds = group_embeddings(tokens, last_hidden_state)
        # If the embedding and label counts conflict, drop the row
        if len(word_embeds) != len(gold_labels):
            # print(f"Warning: Skip row {i} - length mismatch")
            remove_index.append(i)
            continue
        # Append to the aggregate lists; from here the data is training-ready
        all_embeddings.append(torch.stack(word_embeds))
        all_labels.append(torch.tensor(gold_labels))
    # Collect into a dict
    processed_data = {
        "embeddings": all_embeddings,
        "labels": all_labels
    }
    return processed_data
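
# The encoding pass above is the slow step, so it is worth caching the
# result. A minimal sketch (the path is an assumption, not part of this repo):
#
#   data = create_embeddings(df, model, tokenizer)
#   torch.save(data, "data/processed_embeddings.pt")  # hypothetical path
#   data = torch.load("data/processed_embeddings.pt")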

def split_dataset(data):
    # Train+Val / Test split
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        data["embeddings"], data["labels"],
        test_size=configs["test_ratio"], random_state=42)
    # Train / Val split: rescale val_ratio relative to the remaining data
    val_rest_ratio = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train_val, Y_train_val,
        test_size=val_rest_ratio, random_state=42)
    return X_train, Y_train, X_val, Y_val, X_test, Y_test
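
# Note: the split sequences still have different lengths. A typical next
# step (a sketch, not part of this module) is to pad them into batches:
#
#   from torch.nn.utils.rnn import pad_sequence
#   batch_x = pad_sequence(X_train[:8], batch_first=True)  # [8, max_len, 768]
#   batch_y = pad_sequence(Y_train[:8], batch_first=True, padding_value=-100)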

# TODO: Refactor process_demo_sentence and the demo predict function; warn
# when len(tokens_word) differs from the length returned by group_embeddings
def process_demo_sentence(text):
    """
    Returns a tensor of shape 1 x seq_length x 768 plus the segmented words.
    """
    # Word-segment the raw text the same way the training data was segmented
    segmented_text = ViTokenizer.tokenize(text)
    tokens_word = segmented_text.strip().split(" ")
    model, tokenizer = load_phoBERT_model_and_tokenizer()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
    word_embeds = group_embeddings(tokens, last_hidden_state)
    all_embeddings = torch.stack(word_embeds)     # seq_length x 768
    all_embeddings = all_embeddings.unsqueeze(0)  # add batch dim -> 1 x seq_length x 768
    return all_embeddings, tokens_word
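

# A minimal end-to-end sketch of the pipeline in this module (the demo
# sentence is made up; everything else uses the functions defined above):
if __name__ == "__main__":
    df = download_raw_data()
    df = preprocess_data_for_EDA(df)
    model, tokenizer = load_phoBERT_model_and_tokenizer()
    data = create_embeddings(df, model, tokenizer)
    X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(data)
    print(f"train={len(X_train)} val={len(X_val)} test={len(X_test)}")
    embeds, words = process_demo_sentence("Tôi sống ở Hà Nội")
    print(embeds.shape, words)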