Spaces:

ShynBui
/

Create_Vietnamese_spelling_errors

Build error

App Files Files Community

ShynBui committed on Jul 15, 2024

Commit

b213fb9

1 Parent(s): 6191d4c

add space remove

Browse files

Files changed (4) hide show

__pycache__/utils.cpython-310.pyc +0 -0
app.py +40 -18
requirements.txt +0 -0
utils.py +83 -7

__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -2,32 +2,54 @@ import gradio as gr
 from utils import *
 def final_result(input_text):
-    list_text_response = []
     ##character_replacement
-    input_text_process = character_replacement(input_text, error_rate=0.1)
-    list_text_response.append(input_text_process)
     ##character_insertion
-    input_text_process = character_insertion(input_text, error_rate=0.01)
-    list_text_response.append(input_text_process)
     ##character_deletion
-    input_text_process = character_deletion(input_text, error_rate=0.01)
-    list_text_response.append(input_text_process)
     ##character_transposition
-    input_text_process = character_transposition(input_text, error_rate=0.03)
-    list_text_response.append(input_text_process)
     ##homophone_replacement
-    input_text_process = homophone_replacement(input_text, error_rate=0.12)
-    list_text_response.append(input_text_process)
     ## common_misspelling_replacement
-    input_text_process = common_misspelling_replacement(input_text)
-    list_text_response.append(input_text_process)
     ##similar_character_replacement
-    input_text_process = similar_character_replacement(input_text)
-    list_text_response.append(input_text_process)
-    ##
-    string_text_response = '\n'.join(list_text_response)
     return str(string_text_response)
 demo = gr.Interface(fn=final_result, inputs="textbox", outputs="textbox")

 from utils import *
 def final_result(input_text):
     '''
     Run the input text through every spelling-error generator in sequence.
     :param input_text: text to corrupt
     :return: the text after all generators have been applied, as a str
     Each step receives the output of the previous one, so the errors accumulate.
     '''
     # The first step works on the raw input.
     noisy_text = character_replacement(input_text, error_rate=0.01)
     # These steps all take the same explicit error rate and are chained in order.
     chained_steps = (
         character_insertion,
         character_deletion,
         character_transposition,
         homophone_replacement,
         common_misspelling_replacement,
         similar_character_replacement,
         random_space_insertion,
     )
     for apply_errors in chained_steps:
         noisy_text = apply_errors(noisy_text, error_rate=0.01)
     # The last step is called without error_rate, so it uses its own default.
     noisy_text = random_space_removal(noisy_text)
     return str(noisy_text)
 # Gradio interface: one input textbox passed to final_result, one output textbox for its result.
 demo = gr.Interface(fn=final_result, inputs="textbox", outputs="textbox")

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

utils.py CHANGED Viewed

@@ -4,7 +4,14 @@ import numpy as np
 import random
 import math
-def character_replacement(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -31,6 +38,7 @@ def character_replacement(text, error_rate=0.03, C=0.01):
     Sử dụng ''.join(text) để ghép danh sách các ký tự lại thành chuỗi văn bản hoàn chỉnh.
     '''
     # Tính toán xác suất thay thế ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1, 10)
@@ -141,7 +149,7 @@ def character_replacement(text, error_rate=0.03, C=0.01):
     return final_text
-def character_insertion(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -194,7 +202,7 @@ def character_insertion(text, error_rate=0.03, C=0.01):
     return ''.join(text)
-def character_deletion(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -234,7 +242,7 @@ def character_deletion(text, error_rate=0.03, C=0.01):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
-def character_transposition(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -273,7 +281,7 @@ def character_transposition(text, error_rate=0.03, C=0.01):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
-def homophone_replacement(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -340,7 +348,7 @@ def homophone_replacement(text, error_rate=0.03, C=0.01):
     return ' '.join(words)
-def common_misspelling_replacement(text, error_rate=0.12, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -396,7 +404,7 @@ def common_misspelling_replacement(text, error_rate=0.12, C=0.01):
     return ' '.join(words)
-def similar_character_replacement(text, error_rate=0.03, C=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -452,3 +460,71 @@ def similar_character_replacement(text, error_rate=0.03, C=0.01):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(characters)

 import random
 import math
+'''
+AP = C / log(e + 1)
+e: Tỉ lệ lỗi trong 1 văn bản: 0.5% - 10%
+AP: Tỉ lệ sửa (0.5 < AP <= 1)
+'''
+Ceta = 0.02069634258
+def character_replacement(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     Sử dụng ''.join(text) để ghép danh sách các ký tự lại thành chuỗi văn bản hoàn chỉnh.
     '''
     # Tính toán xác suất thay thế ký tự dựa trên error_rate sử dụng hàm logarit
     augmentation_probability = C / math.log(error_rate + 1, 10)
     return final_text
+def character_insertion(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     return ''.join(text)
+def character_deletion(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
+def character_transposition(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
+def homophone_replacement(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     return ' '.join(words)
+def common_misspelling_replacement(text, error_rate=0.12, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     return ' '.join(words)
+def similar_character_replacement(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(characters)
def random_space_insertion(text, error_rate=0.025, C=Ceta):
    '''
    Insert random spaces inside the words of a text.
    :param text: a correctly spelled sentence
    :param error_rate: share of errors to add; values <= 0 leave the text unchanged
    :param C: constant used to compute the per-attempt insertion probability
    :return: the sentence with extra spaces inserted
    A space is only inserted between two non-whitespace characters, so the
    result never gains double, leading or trailing spaces.
    '''
    # log10(error_rate + 1) is 0 for error_rate == 0 and undefined for
    # error_rate <= -1; nothing should be inserted in those cases anyway.
    if not text or error_rate <= 0:
        return text
    # Per-attempt insertion probability, derived from error_rate via a logarithm.
    augmentation_probability = C / math.log(error_rate + 1, 10)
    # Positions (in the original text) that lie strictly inside a word.
    candidate_indices = [
        i for i in range(1, len(text))
        if not text[i - 1].isspace() and not text[i].isspace()
    ]
    # Number of insertion attempts; very short texts get one attempt half the time.
    num_errors = int(len(text) * error_rate)
    if num_errors < 1:
        num_errors = 1 if random.random() > 0.5 else 0
    attempts = min(num_errors, len(candidate_indices))
    # Draw distinct positions, then keep each one with the augmentation probability.
    chosen = [
        index for index in random.sample(candidate_indices, attempts)
        if random.random() <= augmentation_probability
    ]
    characters = list(text)
    # Insert from right to left so earlier insertions do not shift later positions.
    for index in sorted(chosen, reverse=True):
        characters.insert(index, ' ')
    # Join the characters back into a string.
    return ''.join(characters)
def random_space_removal(text, error_rate=0.01, C=Ceta):
    '''
    Randomly drop some of the spaces between the words of a text.
    :param text: a correctly spelled sentence
    :param error_rate: share of errors to add; values <= 0 remove no spaces
    :param C: constant used to compute the per-gap removal probability
    :return: the sentence with some spaces between words removed
    The text is split on whitespace, so runs of whitespace between words are
    normalised to single spaces in the result.
    '''
    words = text.split()
    # Nothing to merge with fewer than two words. A non-positive error_rate
    # would make log10(error_rate + 1) zero or undefined, so treat it as "no errors".
    if len(words) < 2 or error_rate <= 0:
        return ' '.join(words)
    # Per-gap removal probability, derived from error_rate via a logarithm.
    augmentation_probability = C / math.log(error_rate + 1, 10)
    # Number of gaps to try; very short texts get one attempt half the time.
    num_errors = int(len(text) * error_rate)
    if num_errors < 1:
        num_errors = 1 if random.random() > 0.5 else 0
    # Gap g sits between words[g] and words[g + 1].
    gap_count = len(words) - 1
    selected_gaps = random.sample(range(gap_count), min(num_errors, gap_count))
    dropped_gaps = {
        gap for gap in selected_gaps
        if random.random() <= augmentation_probability
    }
    # Rebuild the text gap by gap instead of merging words in place, so that
    # adjacent dropped gaps are all honoured regardless of the sampling order.
    pieces = [words[0]]
    for gap, word in enumerate(words[1:]):
        pieces.append(word if gap in dropped_gaps else ' ' + word)
    return ''.join(pieces)