Spaces:

ShynBui
/

Create_Vietnamese_spelling_errors

Build error

App Files Files Community

ShynBui committed on Jul 16, 2024

Commit

4ff8b83

1 Parent(s): b213fb9

Update AP formula

Browse files

Files changed (5) hide show

__pycache__/fomula.cpython-310.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
app.py +3 -0
fomula.py +18 -0
utils.py +21 -25

__pycache__/fomula.cpython-310.pyc ADDED Viewed

Binary file (450 Bytes). View file

__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import gradio as gr
 from utils import *
 def final_result(input_text):
@@ -55,4 +57,5 @@ def final_result(input_text):
 demo = gr.Interface(fn=final_result, inputs="textbox", outputs="textbox")
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+import fomula
 from utils import *
 def final_result(input_text):
 demo = gr.Interface(fn=final_result, inputs="textbox", outputs="textbox")
 if __name__ == "__main__":
+    print(fomula.AP_fomula(error_rate=0.12))
     demo.launch()

fomula.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import pandas as pd
+import numpy as np
+import random
+import math
def AP_fomula(a=0.5442, b=-16.94145, error_rate=0.1):
    """Return the augmentation probability AP = a * e^(b * error_rate).

    AP is the "correction rate" of the original notes. With the default
    coefficients the curve maps an error rate of 0.5% (0.005) to AP ~= 0.5
    and an error rate of 10% (0.1) to AP ~= 0.1; error rates outside that
    interval produce values outside the 0.1-0.5 band (the result is not
    clamped).

    :param a: scale factor of the exponential (the value of AP at error_rate == 0)
    :param b: exponent coefficient; negative by default, so AP decays as
        error_rate grows
    :param error_rate: fraction of errors in a text, e.g. 0.03 for 3%
    :return: the augmentation probability as a float
    :raises ValueError: if error_rate is negative
    """
    # A negative rate is meaningless and, with the default negative b, would
    # silently yield a "probability" larger than a (possibly above 1).
    if error_rate < 0:
        raise ValueError(f"error_rate must be non-negative, got {error_rate!r}")
    # math.exp(x) is the direct, more accurate form of math.pow(e, x).
    return a * math.exp(b * error_rate)

utils.py CHANGED Viewed

@@ -4,14 +4,10 @@ import numpy as np
 import random
 import math
-'''
-AP = C / log(e + 1)
-e: Tỉ lệ lỗi trong 1 văn bản: 0.5% - 10%
-AP: Tỉ lệ sửa (0.5 < AP <= 1)
-'''
-Ceta = 0.02069634258
-def character_replacement(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -40,7 +36,7 @@ def character_replacement(text, error_rate=0.03, C=Ceta):
     # Tính toán xác suất thay thế ký tự dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Bàn phím QWERTY tiếng Việt
     keyboard = {
@@ -149,7 +145,7 @@ def character_replacement(text, error_rate=0.03, C=Ceta):
     return final_text
-def character_insertion(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -186,7 +182,7 @@ def character_insertion(text, error_rate=0.03, C=Ceta):
         num_errors = 1 if random.random() > 0.5 else 0
     # Tính toán xác suất chèn ký tự dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Thực hiện thêm ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
@@ -202,7 +198,7 @@ def character_insertion(text, error_rate=0.03, C=Ceta):
     return ''.join(text)
-def character_deletion(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -228,7 +224,7 @@ def character_deletion(text, error_rate=0.03, C=Ceta):
     # Tính toán xác suất xóa ký tự dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Thực hiện xóa ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
@@ -242,7 +238,7 @@ def character_deletion(text, error_rate=0.03, C=Ceta):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
-def character_transposition(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -267,7 +263,7 @@ def character_transposition(text, error_rate=0.03, C=Ceta):
         num_errors = 1 if random.random() > 0.5 else 0
     # Tính toán xác suất hoán đổi ký tự dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Thực hiện hoán đổi ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
@@ -281,7 +277,7 @@ def character_transposition(text, error_rate=0.03, C=Ceta):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
-def homophone_replacement(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -326,7 +322,7 @@ def homophone_replacement(text, error_rate=0.03, C=Ceta):
                 break
     # Tính toán xác suất thay thế từ dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
@@ -348,7 +344,7 @@ def homophone_replacement(text, error_rate=0.03, C=Ceta):
     return ' '.join(words)
-def common_misspelling_replacement(text, error_rate=0.12, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -384,7 +380,7 @@ def common_misspelling_replacement(text, error_rate=0.12, C=Ceta):
         return ' '.join(words)
     # Tính toán xác suất thay thế từ dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
@@ -404,7 +400,7 @@ def common_misspelling_replacement(text, error_rate=0.12, C=Ceta):
     return ' '.join(words)
-def similar_character_replacement(text, error_rate=0.03, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -441,7 +437,7 @@ def similar_character_replacement(text, error_rate=0.03, C=Ceta):
     candidate_indices = [i for i, char in enumerate(characters) if char in similar_characters]
     # Tính toán xác suất thay thế ký tự dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Thực hiện thay thế ký tự tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
@@ -461,7 +457,7 @@ def similar_character_replacement(text, error_rate=0.03, C=Ceta):
     return ''.join(characters)
-def random_space_insertion(text, error_rate=0.025, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -478,7 +474,7 @@ def random_space_insertion(text, error_rate=0.025, C=Ceta):
     candidate_indices = [i for i in range(1, len(characters))]
     # Tính toán xác suất chèn khoảng trắng dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Thực hiện chèn khoảng trắng tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
@@ -494,7 +490,7 @@ def random_space_insertion(text, error_rate=0.025, C=Ceta):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(characters)
-def random_space_removal(text, error_rate=0.01, C=Ceta):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
@@ -505,7 +501,7 @@ def random_space_removal(text, error_rate=0.01, C=Ceta):
     '''
     # Tính toán xác suất bỏ dấu cách dựa trên error_rate sử dụng hàm logarit
-    augmentation_probability = C / math.log(error_rate + 1, 10)
     # Tìm các vị trí có thể bỏ dấu cách (giữa các từ)
     words = text.split()

 import random
 import math
+import fomula
+def character_replacement(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Compute the character-replacement probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Bàn phím QWERTY tiếng Việt
     keyboard = {
     return final_text
+def character_insertion(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
         num_errors = 1 if random.random() > 0.5 else 0
     # Compute the character-insertion probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thêm ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
     return ''.join(text)
+def character_deletion(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     # Compute the character-deletion probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện xóa ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
+def character_transposition(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
         num_errors = 1 if random.random() > 0.5 else 0
     # Compute the character-transposition probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện hoán đổi ký tự tại các vị trí ngẫu nhiên
     for _ in range(num_errors):
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(text)
+def homophone_replacement(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
                 break
     # Compute the word-replacement probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     return ' '.join(words)
+def common_misspelling_replacement(text, error_rate=0.12):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
         return ' '.join(words)
     # Compute the word-replacement probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thay thế từ tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     return ' '.join(words)
+def similar_character_replacement(text, error_rate=0.03):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     candidate_indices = [i for i, char in enumerate(characters) if char in similar_characters]
     # Compute the character-replacement probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện thay thế ký tự tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     return ''.join(characters)
+def random_space_insertion(text, error_rate=0.025):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     candidate_indices = [i for i in range(1, len(characters))]
     # Compute the space-insertion probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Thực hiện chèn khoảng trắng tại các vị trí ngẫu nhiên
     num_errors = int(len(text) * error_rate)
     # Ghép các ký tự lại thành chuỗi văn bản
     return ''.join(characters)
+def random_space_removal(text, error_rate=0.01):
     '''
     :param text: Gồm 1 câu đúng chính tả
     :param error_rate: tỷ lệ lỗi sai muốn thêm
     '''
     # Compute the space-removal probability from error_rate using the exponential AP formula
+    augmentation_probability = fomula.AP_fomula(error_rate=error_rate)
     # Tìm các vị trí có thể bỏ dấu cách (giữa các từ)
     words = text.split()