Spaces:

ShynBui
/

Create_Vietnamese_spelling_errors

Build error

App Files Files Community

ShynBui commited on Aug 5, 2024

Commit

f462bd2

1 Parent(s): 4ff8b83

updata

Browse files

Files changed (5) hide show

__pycache__/fomula.cpython-310.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
app.py +39 -45
fomula.py +78 -3
utils.py +134 -34

__pycache__/fomula.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/fomula.cpython-310.pyc and b/__pycache__/fomula.cpython-310.pyc differ

__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -1,55 +1,49 @@
 import gradio as gr
 import fomula
 from utils import *
 def final_result(input_text):
-    # list_text_response = []
-    #
-    # ##character_replacement
-    # input_text_process = character_replacement(input_text, error_rate=0.1)
-    # list_text_response.append(input_text_process)
-    # ##character_insertion
-    # input_text_process = character_insertion(input_text, error_rate=0.01)
-    # list_text_response.append(input_text_process)
-    # ##character_deletion
-    # input_text_process = character_deletion(input_text, error_rate=0.01)
-    # list_text_response.append(input_text_process)
-    # ##character_transposition
-    # input_text_process = character_transposition(input_text, error_rate=0.03)
-    # list_text_response.append(input_text_process)
-    # ##homophone_replacement
-    # input_text_process = homophone_replacement(input_text, error_rate=0.12)
-    # list_text_response.append(input_text_process)
-    # ## common_misspelling_replacement
-    # input_text_process = common_misspelling_replacement(input_text)
-    # list_text_response.append(input_text_process)
-    # ##similar_character_replacement
-    # input_text_process = similar_character_replacement(input_text)
-    # list_text_response.append(input_text_process)
-    # ##random_space_insertion
-    # input_text_process = random_space_insertion(input_text)
-    # list_text_response.append(input_text_process)
-    # string_text_response = '\n'.join(list_text_response)
-    ##character_replacement
-    input_text_process = character_replacement(input_text, error_rate=0.01)
-    ##character_insertion
-    input_text_process = character_insertion(input_text_process, error_rate=0.01)
-    ##character_deletion
-    input_text_process = character_deletion(input_text_process, error_rate=0.01)
-    ##character_transposition
-    input_text_process = character_transposition(input_text_process, error_rate=0.01)
-    ##homophone_replacement
-    input_text_process = homophone_replacement(input_text_process, error_rate=0.01)
-    ## common_misspelling_replacement
-    input_text_process = common_misspelling_replacement(input_text_process, error_rate=0.01)
-    ##similar_character_replacement
-    input_text_process = similar_character_replacement(input_text_process, error_rate=0.01)
-    ##random_space_insertion
-    input_text_process = random_space_insertion(input_text_process, error_rate=0.01)
-    ##random_space_removal
-    input_text_process = random_space_removal(input_text_process)
     string_text_response = input_text_process
     return str(string_text_response)

 import gradio as gr
+import random
 import fomula
 from utils import *
 def final_result(input_text):
     '''
     Inject a randomly ordered mix of synthetic spelling errors into a text.
     Every error generator is applied exactly once, in a shuffled order. Each
     generator receives the current text plus the share of changes made so far
     (total changes divided by the length of the original input) and returns
     the new text together with the number of changes it made.
     :param input_text: text to corrupt
     :return: the corrupted text as a str; "" when input_text is empty or None
     '''
     # Nothing to corrupt; returning early also avoids dividing by
     # len(input_text) == 0 further down.
     if not input_text:
         return ""
     # (generator, error_rate) pairs; the position in this tuple is the id
     # drawn from the shuffled order below.
     operations = (
         (character_replacement, 0.02),
         (character_insertion, 0.03),
         (character_deletion, 0.03),
         (character_transposition, 0.01),
         (homophone_replacement, 0.01),
         (common_misspelling_replacement, 0.01),
         (similar_character_replacement, 0.01),
         (random_space_insertion, 0.01),
         (random_space_removal, 0.03),
         (remove_vietnamese_accents, 0.03),
     )
     original_length = len(input_text)
     total_change = 0
     input_text_process = input_text
     # Sampling the whole range without replacement yields a random permutation.
     for i in random.sample(range(len(operations)), len(operations)):
         operation, error_rate = operations[i]
         input_text_process, num_change = operation(
             input_text_process, total_change / original_length, error_rate=error_rate
         )
         total_change = total_change + num_change
     print("Total change: ", total_change)
     print("Tỷ lệ", total_change / original_length)
     return str(input_text_process)

fomula.py CHANGED Viewed

@@ -5,14 +5,89 @@ import random
 import math
 '''
-AP = a.e^(bx)
-e: Tỉ lệ lỗi trong 1 văn bản: 0.5% - 10%
-AP: Tỉ lệ sửa (0.1 <= AP <= 0.5)
 a: weight (0.5442)
 b: bias (-16.94145)
 '''
 def AP_fomula(a = 0.5442, b = -16.94145, error_rate=0.1):
     augmentation_probability = a * math.pow(np.e, b * error_rate)
     return augmentation_probability

 import math
 '''
+AP = a * e^(b * ER) * sigmoid(x)
+ER: Tỉ lệ lỗi trong 1 phép tạo lỗi: 0.5% - 10%
+AP (augmentation_probability) : Tỉ lệ sửa (0.1 <= AP <= 0.5)
 a: weight (0.5442)
 b: bias (-16.94145)
+SER: Tỷ lệ lỗi đã thêm (mong muốn: < 15% - tối đa <= 30%)
 '''
+ER_min = 0.01
+ER_max = 0.1
def find_a_b_for_AP(min_AP=0.5, max_AP=1, er_min=None, er_max=None):
    '''
    Fit a and b of AP = a * e^(b * ER) to two anchor points.

    The curve is pinned so that AP equals max_AP at the smallest error rate
    and min_AP at the largest one:
        a * e^(b * er_min) = max_AP
        a * e^(b * er_max) = min_AP
    Dividing the second equation by the first gives
        b = ln(min_AP / max_AP) / (er_max - er_min)
    and substituting b back into the first gives
        a = max_AP / e^(b * er_min)

    :param min_AP: augmentation probability wanted at er_max (must be > 0)
    :param max_AP: augmentation probability wanted at er_min (must be > 0)
    :param er_min: lower error-rate anchor; defaults to the module-level ER_min
    :param er_max: upper error-rate anchor; defaults to the module-level ER_max
    :return: tuple (a, b)
    :raises ValueError: if a probability is not positive or the anchors coincide
    '''
    if er_min is None:
        er_min = ER_min
    if er_max is None:
        er_max = ER_max
    # The logarithm below is undefined for non-positive ratios; fail loudly
    # instead of letting nan/inf leak into every later probability.
    if min_AP <= 0 or max_AP <= 0:
        raise ValueError("min_AP and max_AP must be positive")
    if er_max == er_min:
        raise ValueError("er_min and er_max must differ")
    b = math.log(min_AP / max_AP) / (er_max - er_min)
    a = max_AP / math.exp(b * er_min)
    return a, b
 def AP_fomula(a=None, b=None, error_rate=0.1):
     '''
     Return the augmentation probability a * e^(b * error_rate).
     :param a: scale of the curve; fitted with find_a_b_for_AP() when None
     :param b: exponent coefficient; fitted with find_a_b_for_AP() when None
     :param error_rate: error rate of the error generator being applied
     :return: the augmentation probability as a float
     '''
     # Explicit coefficients used to be overwritten unconditionally; only
     # fall back to the fitted values for the ones the caller left out.
     if a is None or b is None:
         fitted_a, fitted_b = find_a_b_for_AP()
         a = fitted_a if a is None else a
         b = fitted_b if b is None else b
     return a * math.exp(b * error_rate)
+'''
+Hàm sigmoid:
+Sigmoid(x) = 1 / (1 + e^(-k * x))
+Giới hạn tỉ lệ lỗi tối đa của một câu là <= 35%
+x: là SER_max - SER_mong muốn
+=> Sigmoid - lim(x) -> 0.35 ~ 1
+Sigmoid(x) = 1 / (1 + e^(-k * x)) ~ 1
+=> 1 / (1 + e^(-0.35k)) = 999/1000
+Điều kiện kích hoạt hàm sigmoid => khi SER đạt đến một ngưỡng tối thiểu nhất định, vd: 10%
+'''
def get_k_sigmoid(x=0.2, lim1=0.999):
    '''
    Solve 1 / (1 + e^(-k * x)) = lim1 for the sigmoid steepness k.

    Rearranging gives k = ln(lim1 / (1 - lim1)) / x.

    :param x: input value at which the sigmoid should reach lim1 (non-zero)
    :param lim1: sigmoid value wanted at x, strictly between 0 and 1
    :return: k
    :raises ValueError: if lim1 is outside (0, 1) or x is zero
    '''
    # Outside these domains the formula yields nan or inf, which would
    # silently corrupt every probability scaled by the sigmoid.
    if not 0 < lim1 < 1:
        raise ValueError("lim1 must be strictly between 0 and 1")
    if x == 0:
        raise ValueError("x must be non-zero")
    return math.log(lim1 / (1 - lim1)) / x
+print(1 - (1 / (1 + pow(math.e, 0.0 * get_k_sigmoid(x=0.1)))))

utils.py CHANGED Viewed

@@ -6,12 +6,17 @@ import math
 import fomula
-def character_replacement(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     :param C: hằng số để tính toán xác suất thay thế ký tự
     :return: Gồm 1 câu sai chính tả
     Kết hợp phương pháp error_rate và augmentation_probability để tạo lỗi chính tả.
@@ -130,8 +135,15 @@ def character_replacement(text, error_rate=0.03):
     # Chọn ngẫu nhiên các vị trí để thay thế ký tự
     indices = random.sample(range(len(text)), num_errors)
     for index in indices:
-        if text[index].lower() in keyboard and random.random() <= augmentation_probability:
             # Lấy ngẫu nhiên một ký tự từ các ký tự gần trên bàn phím
             replacement_char = random.choice(keyboard[text[index].lower()])
             # Giữ nguyên kiểu chữ hoa hoặc chữ thường
@@ -142,10 +154,12 @@ def character_replacement(text, error_rate=0.03):
     # Ghép các ký tự lại thành chuỗi văn bản hoàn chỉnh
     final_text = ''.join(text)
-    return final_text
-def character_insertion(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -184,10 +198,17 @@ def character_insertion(text, error_rate=0.03):
     # Tính toán xác suất chèn ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thêm ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
         # Chọn ngẫu nhiên một vị trí trong văn bản để thêm ký tự
-        if random.random() <= augmentation_probability:
             index = random.randint(0, len(text))
             # Chọn ngẫu nhiên một ký tự từ bàn phím
             char_to_insert = random.choice(keyboard)
@@ -195,10 +216,11 @@ def character_insertion(text, error_rate=0.03):
             text.insert(index, char_to_insert)
     # Ghép các ký tự lại thành chuỗi văn bản
-    return ''.join(text)
-def character_deletion(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -226,19 +248,26 @@ def character_deletion(text, error_rate=0.03):
     # Tính toán xác suất xóa ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện xóa ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
         # Chọn ngẫu nhiên một vị trí trong văn bản để xóa ký tự
-        if random.random() <= augmentation_probability:
             if len(text) > 0:  # Đảm bảo rằng danh sách không trống
                 index = random.randint(0, len(text) - 1)
                 # Xóa ký tự tại vị trí đã chọn
                 text.pop(index)
     # Ghép các ký tự lại thành chuỗi văn bản
-    return ''.join(text)
-def character_transposition(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -264,20 +293,26 @@ def character_transposition(text, error_rate=0.03):
     # Tính toán xác suất hoán đổi ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện hoán đổi ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
         # Chọn ngẫu nhiên một vị trí trong văn bản để hoán đổi ký tự
-        if random.random() <= augmentation_probability:
             if len(text) > 1:  # Đảm bảo rằng danh sách có ít nhất 2 ký tự để hoán đổi
                 index = random.randint(0, len(text) - 2)
                 # Hoán đổi hai ký tự liên tiếp tại vị trí đã chọn
                 text[index], text[index + 1] = text[index + 1], text[index]
     # Ghép các ký tự lại thành chuỗi văn bản
-    return ''.join(text)
-def homophone_replacement(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -323,28 +358,33 @@ def homophone_replacement(text, error_rate=0.03):
     # Tính toán xác suất thay thế từ dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
     for _ in range(num_errors):
-        if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)
             word = words[index]
             for key in homophones.keys():
                 if key in word:
                     word = word.replace(key, homophones[key])
                     break  # Dừng lại sau khi thay thế lần đầu tiên để tránh thay thế nhiều lần
             words[index] = word
             candidate_indices.remove(index)  # Đảm bảo từ này không bị thay thế nhiều lần
     # Ghép các từ lại thành chuỗi văn bản
-    return ' '.join(words)
-def common_misspelling_replacement(text, error_rate=0.12):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -377,30 +417,35 @@ def common_misspelling_replacement(text, error_rate=0.12):
     #Kiểm tra xem có thể thay không
     if len(candidate_indices) == 0:
-        return ' '.join(words)
     # Tính toán xác suất thay thế từ dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
     for _ in range(num_errors):
-        if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)
             word = words[index]
             # Chọn ngẫu nhiên một phiên bản sai chính tả từ từ điển
             misspelled_word = random.choice(common_misspellings[word])
             words[index] = misspelled_word
             candidate_indices.remove(index)  # Đảm bảo từ này không bị thay thế nhiều lần
     # Ghép các từ lại thành chuỗi văn bản
-    return ' '.join(words)
-def similar_character_replacement(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -438,14 +483,18 @@ def similar_character_replacement(text, error_rate=0.03):
     # Tính toán xác suất thay thế ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thay thế ký tự tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
     for _ in range(num_errors):
-        if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)
             char = characters[index]
             # Chọn ngẫu nhiên một ký tự tương tự từ từ điển
@@ -454,10 +503,11 @@ def similar_character_replacement(text, error_rate=0.03):
             candidate_indices.remove(index)  # Đảm bảo ký tự này không bị thay thế nhiều lần
     # Ghép các ký tự lại thành chuỗi văn bản
-    return ''.join(characters)
-def random_space_insertion(text, error_rate=0.025):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -475,22 +525,27 @@ def random_space_insertion(text, error_rate=0.025):
     # Tính toán xác suất chèn khoảng trắng dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện chèn khoảng trắng tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
     for _ in range(num_errors):
-        if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)
             characters.insert(index, ' ')
             candidate_indices.remove(index)  # Đảm bảo không chèn khoảng trắng nhiều lần tại cùng một vị trí
     # Ghép các ký tự lại thành chuỗi văn bản
-    return ''.join(characters)
-def random_space_removal(text, error_rate=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -502,7 +557,7 @@ def random_space_removal(text, error_rate=0.01):
     # Tính toán xác suất bỏ dấu cách dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Tìm các vị trí có thể bỏ dấu cách (giữa các từ)
     words = text.split()
     candidate_indices = [i for i in range(len(words) - 1)]
@@ -512,15 +567,60 @@ def random_space_removal(text, error_rate=0.01):
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
     # Thực hiện bỏ dấu cách tại các vị trí ngẫu nhiên
     selected_indices = random.sample(candidate_indices, min(num_errors, len(candidate_indices)))
     for index in selected_indices:
-        if random.random() <= augmentation_probability:
             words[index] = words[index] + words[index + 1]
             words[index + 1] = ''  # Xóa từ đã ghép để tránh lặp lại
     # Ghép các từ lại thành chuỗi văn bản, bỏ qua các từ trống
-    return ' '.join([word for word in words if word])

 import fomula
+##SER: Tỷ lệ lỗi đã thêm (mong muốn: < 15% - tối đa <= 30%)
+SER_want = 0.15
+SER_max = 0.3
+k_sigmoid = fomula.get_k_sigmoid(x=SER_max - SER_want)
+print("k_sigmoid", k_sigmoid)
+def character_replacement(text, SER, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     :param C: hằng số để tính toán xác suất thay thế ký tự
+    :SER: Tỷ lệ lỗi đã sửa / tổng lỗi
     :return: Gồm 1 câu sai chính tả
     Kết hợp phương pháp error_rate và augmentation_probability để tạo lỗi chính tả.
     # Chọn ngẫu nhiên các vị trí để thay thế ký tự
     indices = random.sample(range(len(text)), num_errors)
+    num_of_change = 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, - (SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     for index in indices:
+        if text[index].lower() in keyboard and random.random() <= augmentation_probability * sigmoid:
+            num_of_change += 1
             # Lấy ngẫu nhiên một ký tự từ các ký tự gần trên bàn phím
             replacement_char = random.choice(keyboard[text[index].lower()])
             # Giữ nguyên kiểu chữ hoa hoặc chữ thường
     # Ghép các ký tự lại thành chuỗi văn bản hoàn chỉnh
     final_text = ''.join(text)
+    print("character_replacement: ", num_of_change)
+    return final_text, num_of_change
+def character_insertion(text, SER, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Tính toán xác suất chèn ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
     # Thực hiện thêm ký tự tại các vị trí ngẫu nhiên
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     for _ in range(num_errors):
         # Chọn ngẫu nhiên một vị trí trong văn bản để thêm ký tự
+        if random.random() <= augmentation_probability * sigmoid:
+            num_of_change += 1
             index = random.randint(0, len(text))
             # Chọn ngẫu nhiên một ký tự từ bàn phím
             char_to_insert = random.choice(keyboard)
             text.insert(index, char_to_insert)
     # Ghép các ký tự lại thành chuỗi văn bản
+    print("character_insertion: ", num_of_change)
+    return ''.join(text), num_of_change
+def character_deletion(text, SER, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Tính toán xác suất xóa ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     # Thực hiện xóa ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
         # Chọn ngẫu nhiên một vị trí trong văn bản để xóa ký tự
+        if random.random() <= augmentation_probability * sigmoid:
             if len(text) > 0:  # Đảm bảo rằng danh sách không trống
+                num_of_change += 1
                 index = random.randint(0, len(text) - 1)
                 # Xóa ký tự tại vị trí đã chọn
                 text.pop(index)
     # Ghép các ký tự lại thành chuỗi văn bản
+    print("character_deletion: ", num_of_change)
+    return ''.join(text), num_of_change
+def character_transposition(text, SER, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Tính toán xác suất hoán đổi ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     # Thực hiện hoán đổi ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
         # Chọn ngẫu nhiên một vị trí trong văn bản để hoán đổi ký tự
+        if random.random() <= augmentation_probability * sigmoid:
             if len(text) > 1:  # Đảm bảo rằng danh sách có ít nhất 2 ký tự để hoán đổi
+                num_of_change += 2
                 index = random.randint(0, len(text) - 2)
                 # Hoán đổi hai ký tự liên tiếp tại vị trí đã chọn
                 text[index], text[index + 1] = text[index + 1], text[index]
     # Ghép các ký tự lại thành chuỗi văn bản
+    print("character_transposition: ", num_of_change)
+    return ''.join(text), num_of_change
+def homophone_replacement(text, SER, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Tính toán xác suất thay thế từ dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     for _ in range(num_errors):
+        if len(candidate_indices) > 0 and random.random() <= augmentation_probability * sigmoid:
             index = random.choice(candidate_indices)
             word = words[index]
             for key in homophones.keys():
                 if key in word:
                     word = word.replace(key, homophones[key])
+                    num_of_change += np.abs(len(key) - len(homophones[key]))
                     break  # Dừng lại sau khi thay thế lần đầu tiên để tránh thay thế nhiều lần
             words[index] = word
             candidate_indices.remove(index)  # Đảm bảo từ này không bị thay thế nhiều lần
     # Ghép các từ lại thành chuỗi văn bản
+    print("homophone_replacement: ", num_of_change)
+    return ' '.join(words), num_of_change
+def common_misspelling_replacement(text, SER, error_rate=0.12):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     #Kiểm tra xem có thể thay không
     if len(candidate_indices) == 0:
+        return ' '.join(words), 0
     # Tính toán xác suất thay thế từ dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     for _ in range(num_errors):
+        if len(candidate_indices) > 0 and random.random() <= augmentation_probability * sigmoid:
             index = random.choice(candidate_indices)
             word = words[index]
             # Chọn ngẫu nhiên một phiên bản sai chính tả từ từ điển
             misspelled_word = random.choice(common_misspellings[word])
             words[index] = misspelled_word
+            num_of_change += np.abs(len(misspelled_word) - len(word))
             candidate_indices.remove(index)  # Đảm bảo từ này không bị thay thế nhiều lần
     # Ghép các từ lại thành chuỗi văn bản
+    print("common_misspelling_replacement: ", num_of_change)
+    return ' '.join(words), num_of_change
+def similar_character_replacement(text, SER, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Tính toán xác suất thay thế ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
     # Thực hiện thay thế ký tự tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     for _ in range(num_errors):
+        if len(candidate_indices) > 0 and random.random() <= augmentation_probability * sigmoid:
+            num_of_change += 1
             index = random.choice(candidate_indices)
             char = characters[index]
             # Chọn ngẫu nhiên một ký tự tương tự từ từ điển
             candidate_indices.remove(index)  # Đảm bảo ký tự này không bị thay thế nhiều lần
     # Ghép các ký tự lại thành chuỗi văn bản
+    print("similar_character_replacement: ", num_of_change)
+    return ''.join(characters), num_of_change
+def random_space_insertion(text, SER, error_rate=0.025):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Tính toán xác suất chèn khoảng trắng dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
     # Thực hiện chèn khoảng trắng tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     for _ in range(num_errors):
+        if len(candidate_indices) > 0 and random.random() <= augmentation_probability * sigmoid:
+            num_of_change += 1
             index = random.choice(candidate_indices)
             characters.insert(index, ' ')
             candidate_indices.remove(index)  # Đảm bảo không chèn khoảng trắng nhiều lần tại cùng một vị trí
     # Ghép các ký tự lại thành chuỗi văn bản
+    print("random_space_insertion: ", num_of_change)
+    return ''.join(characters), num_of_change
+def random_space_removal(text, SER, error_rate=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Tính toán xác suất bỏ dấu cách dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
+    num_of_change = 0
     # Tìm các vị trí có thể bỏ dấu cách (giữa các từ)
     words = text.split()
     candidate_indices = [i for i in range(len(words) - 1)]
     if num_errors < 1:
         num_errors = 1 if random.random() > 0.5 else 0
+    sigmoid = 1 if SER <= SER_want else (1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid))))
+    print(sigmoid)
     # Thực hiện bỏ dấu cách tại các vị trí ngẫu nhiên
     selected_indices = random.sample(candidate_indices, min(num_errors, len(candidate_indices)))
     for index in selected_indices:
+        if random.random() <= augmentation_probability * sigmoid:
+            num_of_change += 1
             words[index] = words[index] + words[index + 1]
             words[index + 1] = ''  # Xóa từ đã ghép để tránh lặp lại
     # Ghép các từ lại thành chuỗi văn bản, bỏ qua các từ trống
+    print("random_space_removal: ", num_of_change)
+    return ' '.join([word for word in words if word]), num_of_change
+import unidecode
def remove_vietnamese_accents(text, SER, error_rate=0.05):
    '''
    Replace a random subset of characters with their unidecode transliteration.

    A character is a candidate when unidecode maps it to something other than
    itself. At most int(len(text) * error_rate) candidates are drawn (when that
    is below 1, a coin flip decides between 1 and 0), and each drawn candidate
    is converted with probability AP * damping, where AP comes from
    fomula.AP_fomula(error_rate) and damping is 1 while SER <= SER_want and a
    decreasing sigmoid of (SER - SER_want) afterwards.

    :param text: input sentence
    :param SER: share of the text already changed by earlier error generators
    :param error_rate: share of characters this generator may try to change
    :return: tuple (new text, number of characters converted)
    '''
    base_probability = fomula.AP_fomula(error_rate=error_rate)
    chars = list(text)
    budget = int(len(text) * error_rate)
    if budget < 1:
        # Short texts: coin flip between one attempt and none.
        budget = 1 if random.random() > 0.5 else 0
    convertible = [pos for pos, ch in enumerate(chars) if unidecode.unidecode(ch) != ch]
    if SER <= SER_want:
        damping = 1
    else:
        damping = 1 - (1 / (1 + pow(math.e, -(SER - SER_want) * k_sigmoid)))
    print(damping)
    threshold = base_probability * damping
    picked = random.sample(convertible, min(budget, len(convertible)))
    converted = 0
    for pos in picked:
        if random.random() <= threshold:
            converted += 1
            chars[pos] = unidecode.unidecode(chars[pos])
    print("remove_vietnamese_accents: ", converted)
    return ''.join(chars), converted