import json
import random
random.seed(42)
from functools import lru_cache
from sentence_transformers import SentenceTransformer, util
from pyvi.ViTokenizer import tokenize
import re
from collections import Counter
import copy

# Define Augmentation Process
@lru_cache(maxsize=None)
def find_similar_entity_with_simcse(entity_type, entity_value, intent):
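    """
    Pick the most similar replacement filler for an entity.

    Among the other fillers recorded for (intent, entity_type), return the
    one whose SimCSE embedding has the highest cosine similarity to the
    original filler; fall back to the original value if there are no
    alternatives. Results are memoised via lru_cache, so repeated lookups
    for the same (entity_type, entity_value, intent) triple are free.
    """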
    possible_entities = intent_entities[intent].get(entity_type, [])
    possible_entities = [e for e in possible_entities if e != entity_value]

    if not possible_entities:
        return entity_value

    source_tokenizer = tokenize(entity_value)
    embed_source = model.encode(source_tokenizer)

    similarities = [
        util.pytorch_cos_sim(embed_source, entity_to_embedding[possible_entity]).item()
        for possible_entity in possible_entities
    ]

    return possible_entities[similarities.index(max(similarities))]

def regenerate_entities_from_annotation(annotation):
    """
    Extract entities and their types from the sentence_annotation.
    Return a list of entities with their type and filler.
    """
    entity_pattern = re.compile(r'\[ ([^\]]+) : ([^\]]+) \]')
    entities = []
    for match in entity_pattern.findall(annotation):
        entity_type, filler = match
        entities.append({
            'type': entity_type,
            'filler': filler
        })
    return entities
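
# Example of the annotation format the regex above assumes (hypothetical values):
#   "bật [ device : đèn ] ở [ location : phòng khách ]"
#   -> [{'type': 'device', 'filler': 'đèn'},
#       {'type': 'location', 'filler': 'phòng khách'}]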

def augment_and_correct_all(sample, augmentation_ratio=0.5):
    """
    Augment based on the sentence_annotation, reconstruct the sentence,
    and regenerate the entities list.
    """
    if random.random() > augmentation_ratio:
        return None

    augmented_sample = copy.deepcopy(sample)
    original_annotation = sample['sentence_annotation']
    annotation = original_annotation

    # Guard against samples without entities; random.randint(1, 0) would raise.
    if not sample['entities']:
        return None

    num_entities_to_replace = random.randint(1, len(sample['entities']))
    for entity in reversed(sample['entities'][:num_entities_to_replace]):
        similar_entity = find_similar_entity_with_simcse(entity['type'], entity['filler'], sample['intent'])
        if similar_entity != entity['filler']:
            annotation = annotation.replace(f"[ {entity['type']} : {entity['filler']} ]", f"[ {entity['type']} : {similar_entity} ]")

    # Reconstruct sentence
    sentence = re.sub(r'\[ [^\]]+ : ([^\]]+) \]', r'\1', annotation)

    if sentence == sample['sentence']:
        return None

    augmented_sample['sentence'] = sentence
    augmented_sample['sentence_annotation'] = annotation
    augmented_sample['entities'] = regenerate_entities_from_annotation(annotation)

    return augmented_sample
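
# Sketch of the flow above (hypothetical values): an annotation such as
# "bật [ device : đèn ]" may become "bật [ device : quạt ]" after the swap,
# and stripping the "[ type : ... ]" brackets yields the new sentence "bật quạt".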


def augment_entry_owner(entry, chosen_word):
    device_filler = next(entity['filler'] for entity in entry['entities'] if entity['type'] == 'device')
    augmented_sentence = entry['sentence'].replace(device_filler, device_filler + " của " + chosen_word, 1)
    augmented_annotation = entry['sentence_annotation'].replace(device_filler, device_filler + " của " + chosen_word, 1)

    # Entities are regenerated from the updated annotation; do not mutate
    # entry['entities'] in place, or the original training sample is corrupted.
    return {
        'sentence': augmented_sentence,
        'intent': entry['intent'],
        'sentence_annotation': augmented_annotation,
        'entities': regenerate_entities_from_annotation(augmented_annotation),
    }
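
# Illustrative call (hypothetical values): with chosen_word "tôi", a device
# filler "đèn" becomes "đèn của tôi" in the sentence, the annotation, and the
# regenerated entity list.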

def augment_entry_num_loc(entry, chosen_word):
    device_filler = next(entity['filler'] for entity in entry['entities'] if entity['type'] == 'device')

    # Append the number/direction phrase after the device filler
    augmented_sentence = entry['sentence'].replace(device_filler, device_filler + " " + chosen_word, 1)
    augmented_annotation = entry['sentence_annotation'].replace(device_filler, device_filler + " " + chosen_word, 1)

    # Regenerate entities from the annotation instead of mutating and returning
    # entry['entities'], which would alias (and corrupt) the original sample.
    return {
        'sentence': augmented_sentence,
        'intent': entry['intent'],
        'sentence_annotation': augmented_annotation,
        'entities': regenerate_entities_from_annotation(augmented_annotation),
    }
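
# Illustrative call (hypothetical values): with chosen_word "số 2" or
# "bên trái", a device filler "đèn" becomes "đèn số 2" / "đèn bên trái".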

with open("raw_data/intent_entities.json", 'r') as f:
    intent_entities = json.load(f)

with open("raw_data/train_final_20230919.jsonl", 'r') as f:
    training_samples = [json.loads(line) for line in f.readlines()]

for sample in training_samples:
    del sample['id']
    del sample['file']

model = SentenceTransformer('VoVanPhuc/sup-SimCSE-VietNamese-phobert-base')

# Augment by swapping entities
all_entities = set()
for intent, entities in intent_entities.items():
    for entity_type, entity_values in entities.items():
        all_entities.update(entity_values)

all_entities_tokenized = [tokenize(entity) for entity in all_entities]
entity_to_embedding = {
    entity: model.encode(tokenized_entity)
    for entity, tokenized_entity in zip(all_entities, all_entities_tokenized)
}
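# Precomputing one embedding per unique entity filler means
# find_similar_entity_with_simcse only encodes the source value at call time;
# candidate embeddings become plain dictionary lookups.
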
# 1. Analyze Intent Distribution
intent_counts = Counter([sample['intent'] for sample in training_samples])

# 2. Identify Less Frequent Intents
num_intents_to_augment = 4
less_frequent_intents = sorted(intent_counts, key=intent_counts.get)[:num_intents_to_augment]

augmented_samples = []

# 3. Augment Samples for Each Less Frequent Intent
for intent in less_frequent_intents:
    samples_for_intent = [sample for sample in training_samples if sample['intent'] == intent]

    for sample in samples_for_intent:
        augmented_sample = augment_and_correct_all(sample)
        if augmented_sample:  # Ensure the augmented sample is not None
            augmented_samples.append(augmented_sample)

seen_sentences = set()
swapped_samples = []
for sample in augmented_samples:
    sentence = sample['sentence']
    if sentence not in seen_sentences:
        seen_sentences.add(sentence)
        swapped_samples.append(sample)

# Augment by adding owner

words_after_cua_in_locations = []

for entry in training_samples:
    location_entities = [entity for entity in entry['entities'] if entity['type'] == 'location']

    for loc in location_entities:
        if "của" in loc['filler']:
            # Split the location filler at "của" and then split by spaces
            parts = loc['filler'].split("của", 1)
            if len(parts) > 1:
                words = parts[1].split()

                # Capture all words until the end or until a "]"
                phrase_after_cua = []
                for word in words:
                    if "]" in word:
                        phrase_after_cua.append(word.replace("]", ""))
                        break
                    phrase_after_cua.append(word)

                if phrase_after_cua:
                    words_after_cua_in_locations.append(" ".join(phrase_after_cua))

# Get the unique phrases after "của" in location entities
unique_phrases_after_cua = list(set(words_after_cua_in_locations))
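
# E.g. (hypothetical) a location filler "phòng ngủ của tôi" contributes the
# owner phrase "tôi", which can then be attached to device fillers below.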

device_entries = [entry for entry in training_samples if any(entity['type'] == 'device' for entity in entry['entities'])]
augment_count = len(device_entries) // 15
to_augment = random.sample(device_entries, augment_count)

augmented_entries_owner = [augment_entry_owner(entry, random.choice(unique_phrases_after_cua)) for entry in to_augment]

# Augment by adding number and location
# Collect phrases to append after device fillers: "số <number>" and the two directions
so_numbers = []
directions = ["bên trái", "bên phải"]

# Iterate over each entry and extract the desired phrases
for entry in training_samples:
    for entity in entry.get('entities', []):
        if entity['type'] == 'location':
            filler = entity['filler']

            # Extract "số" followed by a number
            match = re.search(r'số (\d+)', filler)
            if match:
                so_numbers.append(match.group(0))

            # Extract directions "bên trái" and "bên phải"
            for direction in directions:
                if direction in filler:
                    so_numbers.append(direction)
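
# E.g. (hypothetical) so_numbers might now hold entries such as "số 1" and
# "bên trái", extracted from the location fillers above.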

device_entries = [entry for entry in training_samples if any(entity['type'] == 'device' for entity in entry['entities'])]
augment_count = len(device_entries) // 15
to_augment = random.sample(device_entries, augment_count)

# Use the combined augmentation function
augmented_entries_num = [augment_entry_num_loc(entry, random.choice(so_numbers)) for entry in to_augment]

# Combine all augmented samples
combined_augmented_samples = training_samples + swapped_samples + augmented_entries_owner + augmented_entries_num

combined_augmented_file_path = 'augmented_data.jsonl'
with open(combined_augmented_file_path, 'w', encoding='utf-8') as file:
    for entry in combined_augmented_samples:
        # ensure_ascii=False keeps Vietnamese text readable in the output file
        file.write(json.dumps(entry, ensure_ascii=False) + '\n')

with open(combined_augmented_file_path, 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in f]

# Filter duplicates based on the 'sentence' field
seen_sentences = set()
unique_samples = []
for sample in samples:
    if sample['sentence'] not in seen_sentences:
        seen_sentences.add(sample['sentence'])
        unique_samples.append(sample)

# Save the unique samples to a new JSONL file
cleaned_file_path = combined_augmented_file_path.replace(".jsonl", "_unique.jsonl")
with open(cleaned_file_path, 'w', encoding='utf-8') as f:
    for sample in unique_samples:
        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

print(f"Cleaned data saved to: {cleaned_file_path}")