Spaces:
Runtime error
Runtime error
Commit
·
79be08a
1
Parent(s):
80c704e
feat: pretty number in thai word to numeric
Browse files- app.py +58 -10
- tests/test_thai_word.py +71 -0
- utils/thai_word.py +88 -0
app.py
CHANGED
|
@@ -1,11 +1,17 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
-
from transformers import pipeline
|
| 4 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
MODEL_NAME = "biodatlab/whisper-th-medium-combined"
|
| 7 |
DEVICE = 0 if torch.cuda.is_available() else "cpu"
|
|
|
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
transcriber = pipeline(
|
| 10 |
"automatic-speech-recognition",
|
| 11 |
model=MODEL_NAME,
|
|
@@ -14,16 +20,30 @@ transcriber = pipeline(
|
|
| 14 |
)
|
| 15 |
|
| 16 |
def transcribe(audio):
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
return
|
| 22 |
-
{"sampling_rate": sr, "raw": y},
|
| 23 |
-
generate_kwargs={"language":"<|th|>", "task":"transcribe"},
|
| 24 |
-
return_timestamps=False,
|
| 25 |
-
batch_size=16
|
| 26 |
-
)["text"]
|
| 27 |
|
| 28 |
|
| 29 |
demo = gr.Interface(
|
|
@@ -32,4 +52,32 @@ demo = gr.Interface(
|
|
| 32 |
"text",
|
| 33 |
)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
from utils.thai_word import ThaiWord
|
| 6 |
+
from pythainlp.tokenize import word_tokenize
|
| 7 |
|
| 8 |
MODEL_NAME = "biodatlab/whisper-th-medium-combined"
|
| 9 |
DEVICE = 0 if torch.cuda.is_available() else "cpu"
|
| 10 |
+
thw = ThaiWord()
|
| 11 |
|
| 12 |
+
# stride_length_s is a tuple of the left and right stride length.
|
| 13 |
+
# With only 1 number, both sides get the same stride. By default,
|
| 14 |
+
# the stride_length on one side is 1/6th of the chunk_length_s
|
| 15 |
transcriber = pipeline(
|
| 16 |
"automatic-speech-recognition",
|
| 17 |
model=MODEL_NAME,
|
|
|
|
| 20 |
)
|
| 21 |
|
| 22 |
def transcribe(audio):
    """Transcribe one recording to Thai text and prettify spoken numbers.

    Args:
        audio: A ``(sampling_rate, samples)`` pair; ``samples`` must support
            ``astype`` (presumably the NumPy array handed over by the Gradio
            audio input -- confirm against the interface definition).

    Returns:
        str: ``'pretty: ...\\n\\n original: ...'`` on success, a Thai
        "please try again" prompt when the model yields no text, or a Thai
        error message that embeds the exception text when anything fails.
    """
    try:
        sr, y = audio
        y = y.astype(np.float32)

        # Peak-normalise the signal. An all-zero (silent) clip has a peak of
        # 0; dividing by it would turn every sample into NaN, so the scaling
        # is skipped in that case and the silent signal is passed on as-is.
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        text = transcriber(
            {"sampling_rate": sr, "raw": y},
            generate_kwargs={"language": "<|th|>", "task": "transcribe"},
            return_timestamps=False,
            batch_size=16,
        )["text"]

        if text is None:
            return 'โปรดลองพูดอีกครั้ง'

        # Tokenise so number words can be grouped and rewritten as numerals.
        tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
        # Kept from the original: echoes the tokenisation to the console.
        print(tokens)
        return f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f'ไม่สามารถแปลงข้อความเสียงได้ โปรดลองอีกครั้ง\n\nพบข้อผิดพลาด: {str(e)}'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
demo = gr.Interface(
|
|
|
|
| 52 |
"text",
|
| 53 |
)
|
| 54 |
|
| 55 |
+
# def transcribe(stream, new_chunk):
|
| 56 |
+
# sr, y = new_chunk
|
| 57 |
+
# y = y.astype(np.float32)
|
| 58 |
+
# y /= np.max(np.abs(y))
|
| 59 |
+
|
| 60 |
+
# if stream is not None:
|
| 61 |
+
# stream = np.concatenate([stream, y])
|
| 62 |
+
# else:
|
| 63 |
+
# stream = y
|
| 64 |
+
|
| 65 |
+
# text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
|
| 66 |
+
# if text is not None:
|
| 67 |
+
# # pretty text
|
| 68 |
+
# tokens = word_tokenize(text, engine="attacut", join_broken_num=True)
|
| 69 |
+
# result = f'pretty: {thw.pretty(tokens)}\n\n original: {text}'
|
| 70 |
+
# else:
|
| 71 |
+
# result = 'โปรดลองพูดอีกครั้ง'
|
| 72 |
+
|
| 73 |
+
# return stream, result
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# demo = gr.Interface(
|
| 77 |
+
# transcribe,
|
| 78 |
+
# ["state", gr.Audio(sources=["microphone"], streaming=True)],
|
| 79 |
+
# ["state", "text"],
|
| 80 |
+
# live=True,
|
| 81 |
+
# )
|
| 82 |
+
|
| 83 |
demo.launch()
|
tests/test_thai_word.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
from utils.thai_word import ThaiWord
|
| 3 |
+
|
| 4 |
+
class TestThaiWord(unittest.TestCase):
    """Checks that ThaiWord.pretty rewrites Thai number words as numerals.

    NOTE(review): the expected strings below contain bare digits (no padding
    spaces, no thousands separators), whereas ThaiWord.words_to_number in
    this change formats its result as f' {int(num):,} ' -- confirm which of
    the two output formats is the intended one.
    """

    def setUp(self) -> None:
        # A fresh converter for every test.
        self.thw = ThaiWord()

    def test_pretty_text_to_numeric(self):
        tokens = ['ฮา','โหล','หนึ่ง','สอง','สาม','สี่']
        self.assertEqual(
            self.thw.pretty(tokens),
            'ฮาโหล1234',
            'should convert single word number in thai to numeric'
        )

    def test_pretty_long_words_to_numeric(self):
        # Case 1: two full numbers separated by ordinary words and a space.
        case1 = [
            'ปี','นี้','สอง','พัน','ห้า','ร้อย','หก','สิบ','เจ็ด','นะ',
            ' ',
            'ปี','หน้า','ก็','สอง','พัน','ห้า','ร้อย','หก','สิบ','แปด'
        ]
        self.assertEqual(
            self.thw.pretty(case1),
            'ปีนี้2567นะ ปีหน้าก็2568',
            'should convert full-words number in thai to numeric in long words (case1)'
        )

        # Case 2: the tokenizer glued 'เอ็ด' and 'ล้าน' into one token.
        case2 = [
            'อืม', ' ', 'อยาก', 'ได้', 'ราย', 'ได้', 'ยี่', 'สิบ',
            'เอ็ดล้าน', 'แบบ', 'เข้า', 'บ้าง', ' ', 'ทำ', 'ยัง', 'ไง', 'ดี'
        ]
        self.assertEqual(
            self.thw.pretty(case2),
            'อืม อยากได้รายได้21000000แบบเข้าบ้าง ทำยังไงดี',
            'should convert full-words number in thai to numeric in long words (case2)'
        )

        # Case 3: the tokenizer glued 'ยี่' and 'สิบ' into one token.
        case3 = [
            'อืม',' ','อยาก','ได้','ราย','ได้','ยี่สิบ','เอ็ด','ล้าน',
            'แบบ', 'ร้าน','พร้อม','ทำ','ยัง','ไง','ดี'
        ]
        self.assertEqual(
            self.thw.pretty(case3),
            'อืม อยากได้รายได้21000000แบบร้านพร้อมทำยังไงดี',
            'should convert full-words number in thai to numeric in long words (case3)'
        )

    def test_pretty_word11_to_numeric(self):
        split_tokens = ['ซื้อ','มา','สิบ','เอ็ด','บาท']
        self.assertEqual(
            self.thw.pretty(split_tokens),
            'ซื้อมา11บาท',
            'should correct specific numeric "สิบ" and "เอ็ด"'
        )

        joined_tokens = ['ซื้อ','มา','สิบเอ็ด','บาท']
        self.assertEqual(
            self.thw.pretty(joined_tokens),
            'ซื้อมา11บาท',
            'should correct specific numeric "สิบเอ็ด"'
        )

    def test_pretty_word2x_to_numeric(self):
        split_tokens = ['ซื้อ','มา','ยี่','สิบ','ห้า','บาท']
        self.assertEqual(
            self.thw.pretty(split_tokens),
            'ซื้อมา25บาท',
            'should correct specific numeric "ยี่" and "สิบ"'
        )

        joined_tokens = ['ซื้อ','มา','ยี่สิบ','ห้า','บาท']
        self.assertEqual(
            self.thw.pretty(joined_tokens),
            'ซื้อมา25บาท',
            'should correct specific numeric "ยี่สิบ"'
        )

    def tearDown(self) -> None:
        # Drop the reference to the converter after each test.
        self.thw = None
|
utils/thai_word.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pythainlp.util import text_to_num, text_to_arabic_digit
|
| 2 |
+
|
| 3 |
+
class ThaiWord:
    """Rewrite runs of Thai number words in a token list as Arabic numerals.

    ``pretty`` is the entry point: it walks a list of word tokens, collects
    consecutive number tokens, converts each run with ``words_to_number`` and
    joins everything back into one string.
    """

    def __init__(self) -> None:
        # Digit words one .. nine.
        self.word_number = ['หนึ่ง','สอง','สาม','สี่','ห้า','หก','เจ็ด','แปด','เก้า']
        # Place-value (unit) words: ten, hundred, thousand, ... million.
        self.word_digit = ['สิบ','ร้อย','พัน','หมื่น','แสน','ล้าน']
        # Irregular digit words: index 0 is the "one" of 11/21/..., index 1
        # is the "two" of 20-29 (see iscontains11 / iscontains2x).
        self.word_number_specific = ['เอ็ด', 'ยี่']
        # Not read by any method of this class.
        self.word_digit_specific = ['สิบ']

    def iscontains11(self, word) -> bool:
        """Return True when ``word`` starts or ends with 'เอ็ด'."""
        marker = self.word_number_specific[0]
        return word.endswith(marker) or word.startswith(marker)

    def iscontains2x(self, word) -> bool:
        """Return True when ``word`` starts with 'ยี่'."""
        return word.startswith(self.word_number_specific[1])

    def words_to_number(self, words) -> str:
        """Convert one run of number tokens into a formatted numeral.

        Args:
            words: Tokens that together spell one number.

        Returns:
            str: The value with thousands separators and one space of padding
            on each side (e.g. ``' 2,567 '``). A lone unit word such as
            'ร้อย' is returned unchanged, and so is the joined original text
            when no integer can be derived from the tokens.
        """
        if len(words) == 1 and words[0] in self.word_digit:
            # A bare unit word carries no value of its own; keep it as text.
            return words[0]

        try:
            # text_to_num presumably returns a list whose first item is the
            # numeral string -- verify against the pythainlp documentation.
            parsed = text_to_num("".join(words))
            num = parsed[0] if len(parsed) > 0 else ''
        except Exception:
            # Fall back to a digit-by-digit reading (e.g. spoken digit runs).
            num = "".join(text_to_arabic_digit(word) for word in words)

        try:
            return f' {int(num):,} '
        except (TypeError, ValueError):
            # Nothing integer-like came out; keep the original wording
            # instead of aborting the whole sentence.
            return "".join(words)

    def pretty(self, words) -> str:
        """Join ``words`` into a string, replacing number runs with numerals.

        A run starts at a token accepted by ``is_start_number`` and continues
        while tokens satisfy ``is_number`` or ``is_digit``.

        Args:
            words: Word tokens in reading order.

        Returns:
            str: The concatenated text.
        """
        pieces = []
        pending = []  # tokens of the number run currently being collected

        for word in words:
            if pending:
                if self.is_number(word) or self.is_digit(word):
                    pending.append(word)
                    continue
                # The run ended on the previous token: emit it, then handle
                # the current token as ordinary input below.
                pieces.append(self.words_to_number(pending))
                pending = []

            if self.is_start_number(word):
                pending.append(word)
            else:
                pieces.append(word)

        # Emit a run that reaches the end of the input.
        if pending:
            pieces.append(self.words_to_number(pending))

        return ''.join(pieces)

    def is_start_number(self, word) -> bool:
        """Return True when ``word`` may open a number run."""
        return (
            word in self.word_number
            or word in self.word_digit
            or self.iscontains2x(word)
            or self.iscontains11(word)
        )

    def is_digit(self, word) -> bool:
        """Return True when ``word`` is a place-value (unit) word."""
        return word in self.word_digit

    def is_number(self, word) -> bool:
        """Return True when ``word`` is a digit word or an irregular form."""
        return (
            word in self.word_number
            or word in self.word_number_specific
            or self.iscontains11(word)
        )
|