import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from src.configs import configs
from pyvi import ViTokenizer

def join_tokens(tokens):
    text = ' '.join(tokens)
    return text

def reform_raw_text(tokens):
    text = ' '.join(tokens)
    return text.replace("_", " ")

# Convert a list of tag ids to their BIO tag strings
def label(x):
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}
    return [id_tag[int(i)] for i in x]

# Map tag ids 7 and 8 (classes outside the PER/ORG/LOC scheme) to 0 ('O'),
# e.g. replace_7_8([1, 7, 8, 0]) -> [1, 0, 0, 0]
def replace_7_8(lst):
    return [0 if x in (7, 8) else x for x in lst]

# Merge the embedding vectors of words that the subword tokenizer split into multiple pieces
def group_embeddings(tokens, embeddings):
    word_embeddings = []
    current_vecs = []

    for token, emb in zip(tokens, embeddings):
        if token in ["<s>", "</s>"]:
            continue

        if token.endswith("@@"):
            current_vecs.append(emb)
        else:
            current_vecs.append(emb)
            word_emb = torch.mean(torch.stack(current_vecs), dim=0)
            word_embeddings.append(word_emb)
            current_vecs = []

    if current_vecs:  # handle any pieces left over at the end of the sentence
        word_emb = torch.mean(torch.stack(current_vecs), dim=0)
        word_embeddings.append(word_emb)

    return word_embeddings
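

# Illustrative sanity check for group_embeddings (not part of the pipeline);
# dummy 4-dim vectors stand in for PhoBERT's 768-dim hidden states.
def _sanity_check_group_embeddings():
    toks = ["<s>", "ab@@", "cd", "ef", "</s>"]
    embs = [torch.ones(4) * i for i in range(len(toks))]
    merged = group_embeddings(toks, embs)
    assert len(merged) == 2                                  # "ab cd" and "ef"
    assert torch.allclose(merged[0], torch.full((4,), 1.5))  # mean of 1.0 and 2.0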


# Download the VLSP2016 NER dataset from Hugging Face (reading "hf://" paths
# requires the huggingface_hub package, which pandas resolves through fsspec)
def download_raw_data():
    splits = {'train': 'data/train-00000-of-00001-b0417886a268b83a.parquet', 'valid': 'data/valid-00000-of-00001-846411c236133ba3.parquet'}
    df_train = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["train"])
    df_valid = pd.read_parquet("hf://datasets/datnth1709/VLSP2016-NER-data/" + splits["valid"])
    df = pd.concat([df_train, df_valid]).reset_index(drop=True)

    return df

# Process dataframe for EDA
def preprocess_data_for_EDA(df):
    # Tag <-> id mappings (kept for reference; label() defines its own mapping)
    tag_id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
    id_tag = {0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC'}

    # Add columns and remove inappropriate tags
    df['ner_tags'] = df['ner_tags'].apply(replace_7_8)
    df['text_withseg'] = df['tokens'].apply(join_tokens)
    df['text_raw'] = df['tokens'].apply(reform_raw_text)
    df["ner_labels"] = df.ner_tags.apply(label)
    df.columns = ['tokens', 'id_labels', 'seg_text', 'raw_text', 'labels']

    return df
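
# Illustrative example of a processed row (made-up sentence, not taken from the dataset):
#   tokens:    ['Ông', 'Nguyễn_Văn_A', 'sống', 'ở', 'Hà_Nội']
#   id_labels: [0, 1, 0, 0, 5]
#   seg_text:  'Ông Nguyễn_Văn_A sống ở Hà_Nội'
#   raw_text:  'Ông Nguyễn Văn A sống ở Hà Nội'
#   labels:    ['O', 'B-PER', 'O', 'O', 'B-LOC']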




def load_phoBERT_model_and_tokenizer():
    # Load the PhoBERT tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
    model = AutoModel.from_pretrained("vinai/phobert-base")
    model.eval()
    return model, tokenizer


# Embedding text
def create_embeddings(df, model, tokenizer):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    all_embeddings = []  # list of [seq_len_i, 768] tensors
    all_labels = [] # list of [seq_len_i,] tensors
    remove_index = []  # rows skipped due to embedding/label length mismatch

    for i, row in tqdm(df.iterrows(), total=len(df)):

        # Pull out the fields of this row
        sentence = row['seg_text']
        gold_labels = row["id_labels"]

        # Run the sentence through the subword tokenizer
        input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)

        tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

        # Encode to produce contextual embeddings
        with torch.no_grad():
            outputs = model(input_ids)
            last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()

        # Merge embeddings of pieces that the subword tokenizer split apart
        word_embeds = group_embeddings(tokens, last_hidden_state)

        # If the number of word embeddings does not match the number of labels, skip the row
        if len(word_embeds) != len(gold_labels):
            # print(f"Warning: Skip row {i} - length mismatch")
            remove_index.append(i)
            continue

        # Append to the aggregate lists; at this point the row is ready for training
        all_embeddings.append(torch.stack(word_embeds))
        all_labels.append(torch.tensor(gold_labels))

    # Assemble the result once, after the whole dataframe has been processed
    processed_data = {
        "embeddings": all_embeddings,
        "labels": all_labels
    }

    return processed_data


def split_dataset(data):

    # Train_Val / Test Split
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(data["embeddings"], data["labels"], test_size=configs["test_ratio"], random_state=42)

    # Train / Val split: express val as a fraction of the remaining train+val pool
    val_rest_ratio = configs["val_ratio"] / (configs["val_ratio"] + configs["train_ratio"])
    X_train, X_val, Y_train, Y_val = train_test_split(X_train_val, Y_train_val, test_size=val_rest_ratio, random_state=42)

    return X_train, Y_train, X_val, Y_val, X_test, Y_test
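
# End-to-end usage sketch (assumes configs defines "train_ratio", "val_ratio"
# and "test_ratio", as used above):
#
#   df = preprocess_data_for_EDA(download_raw_data())
#   model, tokenizer = load_phoBERT_model_and_tokenizer()
#   data = create_embeddings(df, model, tokenizer)
#   X_train, Y_train, X_val, Y_val, X_test, Y_test = split_dataset(data)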


# TODO: Refactor process_demo_sentence and the demo predict function; warn when
# len(tokens_word) differs from the number of embeddings returned by group_embeddings

def process_demo_sentence(text):
    """
    Trả về tensor shape 1 x Seq_length x 768
    """
    segmented_text = ViTokenizer.tokenize(text)
    tokens_word = segmented_text.strip().split(" ")

    model, tokenizer = load_phoBERT_model_and_tokenizer()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    input_ids = tokenizer.encode(segmented_text, return_tensors="pt").to(device)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0].cpu())

    with torch.no_grad():
        outputs = model(input_ids)
        last_hidden_state = outputs.last_hidden_state.squeeze(0).cpu()
    
    word_embeds = group_embeddings(tokens, last_hidden_state)

    all_embeddings = torch.stack(word_embeds) # seq_length x 768

    all_embeddings = all_embeddings.unsqueeze(0) # add a batch dimension of 1 -> 1 x seq_length x 768

    return all_embeddings, tokens_word
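

if __name__ == "__main__":
    # Minimal smoke test with an illustrative sentence: embed it and inspect
    # the shapes that downstream prediction code would receive.
    embs, words = process_demo_sentence("Tôi sống ở Hà Nội .")
    print(embs.shape)  # torch.Size([1, seq_length, 768])
    print(words)       # word-segmented tokens from pyvi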