Spaces:

ShynBui
/

Create_Vietnamese_spelling_errors

Build error

App Files Files Community

ShynBui committed on Jul 14, 2024

Commit

ed5fd92

1 Parent(s): e8d0176

add calc num_errors algorithm and homophone_replacement def

Browse files

Files changed (3) hide show

__pycache__/utils.cpython-310.pyc +0 -0
app.py +3 -1
utils.py +70 -8

__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -17,7 +17,9 @@ def final_result(input_text):
     ##character_transposition
     input_text_process = character_transposition(input_text, error_rate=0.03)
     list_text_response.append(input_text_process)
     ##
     string_text_response = '\n'.join(list_text_response)
     return str(string_text_response)

     ##character_transposition
     input_text_process = character_transposition(input_text, error_rate=0.03)
     list_text_response.append(input_text_process)
+    ##homophone_replacement
+    input_text_process = homophone_replacement(input_text, error_rate=0.12)
+    list_text_response.append(input_text_process)
     ##
     string_text_response = '\n'.join(list_text_response)
     return str(string_text_response)

utils.py CHANGED Viewed

@@ -119,7 +119,7 @@ def character_replacement(text, error_rate=0.03, C=0.01):
     text = list(text)
     # Xác định số lượng ký tự cần thay thế dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate)
     # Chọn ngẫu nhiên các vị trí để thay thế ký tự
     indices = random.sample(range(len(text)), num_errors)
@@ -139,7 +139,7 @@ def character_replacement(text, error_rate=0.03, C=0.01):
     return final_text
-def character_insertion(text, error_rate=0.03, C=0.5):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -171,7 +171,7 @@ def character_insertion(text, error_rate=0.03, C=0.5):
     text = list(text)
     # Xác định số lượng ký tự cần thêm dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate)
     # Tính toán xác suất chèn ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
@@ -190,7 +190,7 @@ def character_insertion(text, error_rate=0.03, C=0.5):
     return ''.join(text)
-def character_deletion(text, error_rate=0.03, C=0.5):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -210,7 +210,7 @@ def character_deletion(text, error_rate=0.03, C=0.5):
     text = list(text)
     # Xác định số lượng ký tự cần xóa dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate)
     # Tính toán xác suất xóa ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
@@ -227,7 +227,7 @@ def character_deletion(text, error_rate=0.03, C=0.5):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
-def character_transposition(text, error_rate=0.03, C=0.5):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -247,7 +247,7 @@ def character_transposition(text, error_rate=0.03, C=0.5):
     text = list(text)
     # Xác định số lượng lần hoán đổi cần thực hiện dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate)
     # Tính toán xác suất hoán đổi ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
@@ -262,4 +262,66 @@ def character_transposition(text, error_rate=0.03, C=0.5):
                 text[index], text[index + 1] = text[index + 1], text[index]
     # Ghép các ký tự lại thành chuỗi văn bản
-    return ''.join(text)

     text = list(text)
     # Xác định số lượng ký tự cần thay thế dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Chọn ngẫu nhiên các vị trí để thay thế ký tự
     indices = random.sample(range(len(text)), num_errors)
     return final_text
+def character_insertion(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     text = list(text)
     # Xác định số lượng ký tự cần thêm dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Tính toán xác suất chèn ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
     return ''.join(text)
+def character_deletion(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     text = list(text)
     # Xác định số lượng ký tự cần xóa dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Tính toán xác suất xóa ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
+def character_transposition(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     text = list(text)
     # Xác định số lượng lần hoán đổi cần thực hiện dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Tính toán xác suất hoán đổi ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
                 text[index], text[index + 1] = text[index + 1], text[index]
     # Ghép các ký tự lại thành chuỗi văn bản
+    return ''.join(text)
def homophone_replacement(text, error_rate=0.03, C=0.01):
    '''
    Inject homophone-style spelling errors into a correctly spelled sentence.

    :param text: one correctly spelled sentence
    :param error_rate: desired error rate; values <= 0 disable the augmentation
    :param C: constant used to derive the per-attempt replacement probability
    :return: the sentence with some words misspelled, words joined by single spaces

    How it works:
    - Split the text on whitespace and collect the words that contain at
      least one letter group from the homophone table.
    - Make ``num_errors`` attempts; each attempt succeeds with probability
      ``C / log(error_rate + 1)`` and rewrites one randomly chosen candidate word.
    - A word is rewritten at most once, using the longest letter group it contains.
    '''
    # Each letter group maps to every group it can be confused with.
    # 'd' has two confusions (gi / r); a plain dict literal with two 'd'
    # keys would silently keep only the last one.
    homophones = {
        'gi': ('d',),
        'd': ('gi', 'r'),
        's': ('x',),
        'x': ('s',),
        'tr': ('ch',),
        'ch': ('tr',),
        'r': ('d',),
        'l': ('n',),
        'n': ('l',),
        'c': ('t',),
        't': ('c',),
        'v': ('b',),
        'b': ('v',),
        'ng': ('ngh',),
        'ngh': ('ng',),
    }
    # Try longer groups first so 'ngh'/'ng'/'tr'/'ch' are not shadowed by the
    # single letters they contain ('n', 't', 'c', ...). sorted() is stable, so
    # groups of equal length keep their table order.
    ordered_keys = sorted(homophones, key=len, reverse=True)

    # Work on a list of words so individual words can be rewritten.
    words = text.split()

    # log(error_rate + 1) is zero (or undefined) for non-positive rates;
    # treat that as "no errors requested" instead of raising.
    if error_rate <= 0:
        return ' '.join(words)

    # Indices of the words that contain at least one replaceable group.
    candidate_indices = [
        i for i, word in enumerate(words)
        if any(key in word for key in ordered_keys)
    ]

    # Probability that a single attempt actually performs a replacement.
    augmentation_probability = C / math.log(error_rate + 1)

    # Number of attempts. For short texts the expected count is at most 1,
    # so flip a coin between 0 and 1 (int(random.random()) is always 0).
    expected_errors = len(text) * error_rate
    if expected_errors > 1:
        num_errors = int(expected_errors)
    else:
        num_errors = random.randint(0, 1)

    for _ in range(num_errors):
        if not candidate_indices:
            break  # every eligible word has already been rewritten
        if random.random() > augmentation_probability:
            continue
        # Remove the chosen index so the same word is never rewritten twice.
        index = candidate_indices.pop(random.randrange(len(candidate_indices)))
        word = words[index]
        for key in ordered_keys:
            if key in word:
                # Only the first (longest) matching group is rewritten.
                words[index] = word.replace(key, random.choice(homophones[key]))
                break

    # Join the words back into a single string.
    return ' '.join(words)