import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from src.configs import configs
from pyvi import ViTokenizer
def join_tokens(tokens):
    text = ' '.join(tokens)
    return text

def reform_raw_text(tokens):
    text = ' '.join(tokens)
    return text.replace("_", " ")
def label(x):
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    return [id_tag[int(i)] for i in x]
# Collapse tag ids 7 and 8 (ids outside the 7-class PER/ORG/LOC scheme used
# below, presumably the MISC tags) to 0 ('O')
def replace_7_8(lst):
    return [0 if x in (7, 8) else x for x in lst]
# Merge the embedding vectors of words that the subword (BPE) tokenizer split apart
def group_embeddings(tokens, embeddings):
    word_embeddings = []
    current_vecs = []
    for token, emb in zip(tokens, embeddings):
        if token in ["<s>", "</s>"]:
            continue
        current_vecs.append(emb)
        # A trailing "@@" marks a non-final sub-token piece; keep accumulating
        if not token.endswith("@@"):
            # Final piece of the word: average the accumulated sub-token vectors
            word_emb = torch.mean(torch.stack(current_vecs), dim=0)
            word_embeddings.append(word_emb)
            current_vecs = []
    if current_vecs:  # In case sub-tokens are left over at the end of the sentence
        word_emb = torch.mean(torch.stack(current_vecs), dim=0)
        word_embeddings.append(word_emb)
    return word_embeddings
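
# Minimal sanity check for group_embeddings (an illustrative sketch; the token
# strings below are assumptions, not actual PhoBERT vocabulary entries). Two BPE
# pieces of one word collapse into a single averaged vector, and the special
# tokens are dropped.
def _demo_group_embeddings():
    tokens = ["<s>", "học@@", "sinh", "giỏi", "</s>"]
    embeddings = [torch.randn(768) for _ in tokens]
    merged = group_embeddings(tokens, embeddings)
    assert len(merged) == 2  # "học@@" + "sinh" -> 1 word vector, "giỏi" -> 1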
# Download the dataset from Hugging Face
def download_raw_data():
    splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}
    df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"])
    df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"])
    df = pd.concat([df_train, df_valid]).reset_index(drop=True)
    return df
# Process dataframe for EDA
def preprocess_data_for_EDA(df):
    # Define tag-id mappings
    tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    # Add columns and remove inappropriate tags
    df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
    df['text_withseg'] = df['tokens'].apply(join_tokens)
    df['text_raw'] = df['tokens'].apply(reform_raw_text)
    df["ner_labels"] = df.ner_tags.apply(label)
    df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']
    return df
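
# Quick illustration of the columns preprocess_data_for_EDA produces (a sketch;
# the single row below is made up, not taken from the actual dataset):
def _demo_preprocess():
    df = pd.DataFrame({"tokens": [["Nguyễn_Văn_A", "sống", "ở", "Hà_Nội"]],
                       "ner_tags": [[1, 0, 0, 5]]})
    df = preprocess_data_for_EDA(df)
    print(df.loc[0, "raw_text"])  # "Nguyễn Văn A sống ở Hà Nội"
    print(df.loc[0, "labels"])    # ['B-PER', 'O', 'O', 'B-LOC']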
def load_phoBERT_model_and_tokenizer():
    # Load PhoBERT tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    model = AutoModel.from_pretrained("vinai/phobert-base")
    model.eval()
    return model, tokenizer
# Embedding text
def create_embeddings(df, model, tokenizer):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    all_embeddings = []  # list of [seq_len_i, 768] tensors
    all_labels = []      # list of [seq_len_i,] tensors
    remove_index = []    # indices of rows skipped because of a length mismatch
    for i, row in tqdm(df.iterrows(), total=len(df)):
        # Read the fields of the current row
        sentence = row['seg_text']
        gold_labels = row["id_labels"]
        # Run the sentence through the subword tokenizer
        input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
        # Encode to create embeddings
        with torch.no_grad():
            outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
        # Merge the embeddings that the subword tokenizer split apart
        word_embeds = group_embeddings(tokens, last_hidden_state)
        # Compare the number of embeddings with the number of labels; on a mismatch, skip the row
        if len(word_embeds) != len(gold_labels):
            # print(f"Warning: Skip row {i} - length mismatch")
            remove_index.append(i)
            continue
        # Append to the aggregate lists; at this point the data is ready for training
        all_embeddings.append(torch.stack(word_embeds))
        all_labels.append(torch.tensor(gold_labels))
    # Create dict
    processed_data = {
        "embeddings": all_embeddings,
        "labels": all_labels
    }
    return processed_data
def split_dataset(data):
    # Train_Val / Test split
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(data["embeddings"], data["labels"], test_size=configs["test_ratio"], random_state=42)
    # Train / Val split
    val_rest_ratio = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
    X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=val_rest_ratio, random_state=42)
    return X_train, Y_train, X_val, Y_val, X_test, Y_test
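
# End-to-end sketch of how the functions above compose (assumes configs defines
# "train_ratio", "val_ratio", and "test_ratio"; downloading and embedding the
# full corpus needs network access and is slow):
def _demo_build_dataset():
    df = preprocess_data_for_EDA(download_raw_data())
    model, tokenizer = load_phoBERT_model_and_tokenizer()
    data = create_embeddings(df, model, tokenizer)
    X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(data)
    print(len(X_train), len(X_val), len(X_test))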
# TODO: Refactor process_demo_sentence and the demo predict function; warn if the length of tokens_word differs from the length after group_embeddings
def process_demo_sentence(text):
    """
    Returns a tensor of shape 1 x seq_length x 768
    """
    segmented_text = ViTokenizer.tokenize(text)
    tokens_word = segmented_text.strip().split(" ")
    model, tokenizer = load_phoBERT_model_and_tokenizer()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
    word_embeds = group_embeddings(tokens, last_hidden_state)
    all_embeddings = torch.stack(word_embeds)  # seq_length x 768
    all_embeddings = all_embeddings.unsqueeze(0)  # Add a batch dimension of 1 -> 1 x seq_length x 768
    return all_embeddings, tokens_word
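
# Hedged usage example: embed one raw Vietnamese sentence at demo time. The
# sentence is illustrative; a downstream tagger (not defined in this file)
# would consume the returned [1, seq_len, 768] tensor.
if __name__ == "__main__":
    embeddings, words = process_demo_sentence("Nguyễn Văn A sống ở Hà Nội.")
    print(embeddings.shape)  # [1, len(words), 768] when grouping aligns with words
    print(words)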