Spaces:

ShynBui
/

Create_Vietnamese_spelling_errors

Build error

App Files Community

ShynBui committed on Jul 15, 2024

Commit

e3e69c0

1 Parent(s): fa274a5

update num_errors formula

Browse files

Files changed (3) hide show

__pycache__/utils.cpython-310.pyc +0 -0
app.py +3 -0
utils.py +22 -7

__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -20,6 +20,9 @@ def final_result(input_text):
     ##homophone_replacement
     input_text_process = homophone_replacement(input_text, error_rate=0.12)
     list_text_response.append(input_text_process)
     ##
     string_text_response = '\n'.join(list_text_response)
     return str(string_text_response)

     ##homophone_replacement
     input_text_process = homophone_replacement(input_text, error_rate=0.12)
     list_text_response.append(input_text_process)
+    ## common_misspelling_replacement
+    input_text_process = common_misspelling_replacement(input_text)
+    list_text_response.append(input_text_process)
     ##
     string_text_response = '\n'.join(list_text_response)
     return str(string_text_response)

utils.py CHANGED Viewed

@@ -119,7 +119,9 @@ def character_replacement(text, error_rate=0.03, C=0.01):
     text = list(text)
     # Xác định số lượng ký tự cần thay thế dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Chọn ngẫu nhiên các vị trí để thay thế ký tự
     indices = random.sample(range(len(text)), num_errors)
@@ -171,7 +173,9 @@ def character_insertion(text, error_rate=0.03, C=0.01):
     text = list(text)
     # Xác định số lượng ký tự cần thêm dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Tính toán xác suất chèn ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
@@ -210,7 +214,10 @@ def character_deletion(text, error_rate=0.03, C=0.01):
     text = list(text)
     # Xác định số lượng ký tự cần xóa dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Tính toán xác suất xóa ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
@@ -247,7 +254,9 @@ def character_transposition(text, error_rate=0.03, C=0.01):
     text = list(text)
     # Xác định số lượng lần hoán đổi cần thực hiện dựa trên tỷ lệ lỗi
-    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     # Tính toán xác suất hoán đổi ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
@@ -312,7 +321,10 @@ def homophone_replacement(text, error_rate=0.03, C=0.01):
     augmentation_probability = C / math.log(error_rate + 1)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
-    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     for _ in range(num_errors):
         if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)
@@ -328,7 +340,7 @@ def homophone_replacement(text, error_rate=0.03, C=0.01):
     return ' '.join(words)
-def common_misspelling_replacement(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -367,7 +379,10 @@ def common_misspelling_replacement(text, error_rate=0.03, C=0.01):
     augmentation_probability = C / math.log(error_rate + 1)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
-    num_errors = int(len(text) * error_rate) if len(text) * error_rate > 1 else int(random.random())
     for _ in range(num_errors):
         if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)

     text = list(text)
     # Xác định số lượng ký tự cần thay thế dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate)
+    if num_errors < 1:
+        num_errors = 1 if random.random() > 0.5 else 0
     # Chọn ngẫu nhiên các vị trí để thay thế ký tự
     indices = random.sample(range(len(text)), num_errors)
     text = list(text)
     # Xác định số lượng ký tự cần thêm dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate)
+    if num_errors < 1:
+        num_errors = 1 if random.random() > 0.5 else 0
     # Tính toán xác suất chèn ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
     text = list(text)
     # Xác định số lượng ký tự cần xóa dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate)
+    if num_errors < 1:
+        num_errors = 1 if random.random() > 0.5 else 0
     # Tính toán xác suất xóa ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
     text = list(text)
     # Xác định số lượng lần hoán đổi cần thực hiện dựa trên tỷ lệ lỗi
+    num_errors = int(len(text) * error_rate)
+    if num_errors < 1:
+        num_errors = 1 if random.random() > 0.5 else 0
     # Tính toán xác suất hoán đổi ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1)
     augmentation_probability = C / math.log(error_rate + 1)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
+    num_errors = int(len(text) * error_rate)
+    if num_errors < 1:
+        num_errors = 1 if random.random() > 0.5 else 0
     for _ in range(num_errors):
         if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)
     return ' '.join(words)
+def common_misspelling_replacement(text, error_rate=0.12, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     augmentation_probability = C / math.log(error_rate + 1)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
+    num_errors = int(len(text) * error_rate)
+    if num_errors < 1:
+        num_errors = 1 if random.random() > 0.5 else 0
     for _ in range(num_errors):
         if len(candidate_indices) > 0 and random.random() <= augmentation_probability:
             index = random.choice(candidate_indices)