# Vietnamese_NER/src/preprocessing.py
import pandas as pd
import torch
from pyvi import ViTokenizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

from src.configs import configs

def join_tokens(tokens):
    # Join word-segmented tokens back into one space-separated string.
    return ' '.join(tokens)


def reform_raw_text(tokens):
    # Rebuild the raw (unsegmented) text by undoing the "_" word joins.
    return ' '.join(tokens).replace("_", " ")

def label(x):
    # Map numeric tag ids back to BIO tag strings.
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    return [id_tag[int(i)] for i in x]


def replace_7_8(lst):
    # Collapse the out-of-scope tag ids 7 and 8 to 0 ('O').
    return [0 if x in (7, 8) else x for x in lst]

# Merge the embedding vectors of tokens that were split into subwords by
# PhoBERT's BPE tokenizer (non-final subword pieces carry a trailing "@@").
def group_embeddings(tokens, embeddings):
    word_embeddings = []
    current_vecs = []
    for token, emb in zip(tokens, embeddings):
        if token in ["<s>", "</s>"]:
            continue
        current_vecs.append(emb)
        if not token.endswith("@@"):
            # Final piece of the current word: average its subword vectors.
            word_emb = torch.mean(torch.stack(current_vecs), dim=0)
            word_embeddings.append(word_emb)
            current_vecs = []
    if current_vecs:  # Flush any pieces left over at the end of the sentence
        word_emb = torch.mean(torch.stack(current_vecs), dim=0)
        word_embeddings.append(word_emb)
    return word_embeddings
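
# A quick sanity check for group_embeddings (hypothetical tokens; zero
# vectors are used only for shape):
# >>> toks = ["<s>", "xin@@", "chào", "Việt_Nam", "</s>"]
# >>> embs = [torch.zeros(768)] * len(toks)
# >>> len(group_embeddings(toks, embs))  # "xin@@" + "chào" merge into 1 word
# 2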

# Download the dataset from Hugging Face
def download_raw_data():
    splits = {
        'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet',
        'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet',
    }
    df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"])
    df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"])
    # Merge train and valid; the final train/val/test split happens later.
    df = pd.concat([df_train, df_valid]).reset_index(drop=True)
    return df

# Process the dataframe for EDA
def preprocess_data_for_EDA(df):
    # Tag <-> id mappings for the 7 kept classes
    tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    # Add derived columns and collapse the out-of-scope tags (7, 8) to 'O'
    df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
    df['text_withseg'] = df['tokens'].apply(join_tokens)
    df['text_raw'] = df['tokens'].apply(reform_raw_text)
    df["ner_labels"] = df.ner_tags.apply(label)
    # Rename columns: tokens, ner_tags, text_withseg, text_raw, ner_labels
    df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']
    return df
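
# For illustration, preprocess_data_for_EDA turns a hypothetical row with
# tokens ["Nguyễn_Văn_A", "ở", "Hà_Nội"] and ner_tags [1, 0, 5] into
# seg_text "Nguyễn_Văn_A ở Hà_Nội", raw_text "Nguyễn Văn A ở Hà Nội",
# and labels ['B-PER', 'O', 'B-LOC'].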

def load_phoBERT_model_and_tokenizer():
    # Load the PhoBERT tokenizer and model; use_fast=False selects the slow
    # BPE tokenizer, whose tokens keep the "@@" subword markers.
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    model = AutoModel.from_pretrained("vinai/phobert-base")
    model.eval()  # Inference only: PhoBERT is used as a frozen feature extractor
    return model, tokenizer

# Embed each sentence with PhoBERT
def create_embeddings(df, model, tokenizer):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_embeddings = []  # list of [seq_len_i, 768] tensors
    all_labels = []      # list of [seq_len_i,] tensors
    remove_index = []    # indices of skipped rows, kept for debugging
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Pull out the fields of the current row
        sentence = row['seg_text']
        gold_labels = row["id_labels"]
        # Run the sentence through the BPE tokenizer
        input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
        # Encode to produce contextual embeddings
        with torch.no_grad():
            outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
        # Merge the embeddings of subwords that the tokenizer split apart
        word_embeds = group_embeddings(tokens, last_hidden_state)
        # If the embedding and label counts conflict, drop the row
        if len(word_embeds) != len(gold_labels):
            # print(f"Warning: Skip row {i} - length mismatch")
            remove_index.append(i)
            continue
        # Append to the aggregate lists; from here the data is training-ready
        all_embeddings.append(torch.stack(word_embeds))
        all_labels.append(torch.tensor(gold_labels))
    # Collect into a dict
    processed_data = {
        "embeddings": all_embeddings,
        "labels": all_labels
    }
    return processed_data
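
# The encoding pass above is the slow step, so it is worth caching the
# result. A minimal sketch (the path is an assumption, not part of this repo):
#
#   data = create_embeddings(df, model, tokenizer)
#   torch.save(data, "data/processed_embeddings.pt")  # hypothetical path
#   data = torch.load("data/processed_embeddings.pt")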

def split_dataset(data):
    # Train+Val / Test split
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        data["embeddings"], data["labels"],
        test_size=configs["test_ratio"], random_state=42)
    # Train / Val split: rescale val_ratio relative to the remaining data
    val_rest_ratio = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train_val, Y_train_val,
        test_size=val_rest_ratio, random_state=42)
    return X_train, Y_train, X_val, Y_val, X_test, Y_test
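
# Note: the split sequences still have different lengths. A typical next
# step (a sketch, not part of this module) is to pad them into batches:
#
#   from torch.nn.utils.rnn import pad_sequence
#   batch_x = pad_sequence(X_train[:8], batch_first=True)  # [8, max_len, 768]
#   batch_y = pad_sequence(Y_train[:8], batch_first=True, padding_value=-100)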

# TODO: Refactor process_demo_sentence and the demo predict function; warn
# when len(tokens_word) differs from the length returned by group_embeddings
def process_demo_sentence(text):
    """
    Returns a tensor of shape 1 x seq_length x 768 plus the segmented words.
    """
    # Word-segment the raw text the same way the training data was segmented
    segmented_text = ViTokenizer.tokenize(text)
    tokens_word = segmented_text.strip().split(" ")
    model, tokenizer = load_phoBERT_model_and_tokenizer()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
    word_embeds = group_embeddings(tokens, last_hidden_state)
    all_embeddings = torch.stack(word_embeds)     # seq_length x 768
    all_embeddings = all_embeddings.unsqueeze(0)  # add batch dim -> 1 x seq_length x 768
    return all_embeddings, tokens_word
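

# A minimal end-to-end sketch of the pipeline in this module (the demo
# sentence is made up; everything else uses the functions defined above):
if __name__ == "__main__":
    df = download_raw_data()
    df = preprocess_data_for_EDA(df)
    model, tokenizer = load_phoBERT_model_and_tokenizer()
    data = create_embeddings(df, model, tokenizer)
    X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(data)
    print(f"train={len(X_train)} val={len(X_val)} test={len(X_test)}")
    embeds, words = process_demo_sentence("Tôi sống ở Hà Nội")
    print(embeds.shape, words)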