change translation metric
Browse files
src/calc_metrics.py
CHANGED
|
@@ -22,7 +22,8 @@ class TokenizerZh:
|
|
| 22 |
|
| 23 |
@dataclass
|
| 24 |
class Event:
|
| 25 |
-
|
|
|
|
| 26 |
transcription: str
|
| 27 |
translation: str
|
| 28 |
segment_id: int
|
|
@@ -38,7 +39,7 @@ class WordTiming:
|
|
| 38 |
def segment_id(self):
|
| 39 |
return self.event.segment_id
|
| 40 |
|
| 41 |
-
def read_data(tag):
|
| 42 |
prev_lines = []
|
| 43 |
def is_prefix_of(a, b):
|
| 44 |
return a == b[:len(a)]
|
|
@@ -50,17 +51,15 @@ def read_data(tag):
|
|
| 50 |
|
| 51 |
segments = []
|
| 52 |
records = []
|
| 53 |
-
suffix = "-interim"
|
| 54 |
-
# suffix = ""
|
| 55 |
with (
|
| 56 |
open(f"data/{tag}/transcribe{suffix}.txt", 'r') as transcription_f,
|
| 57 |
open(f"data/{tag}/translate{suffix}.txt", 'r') as translate_f,
|
| 58 |
):
|
| 59 |
for transcription_line, translation_line in zip(transcription_f, translate_f):
|
| 60 |
# print("got:", transcription_line)
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
assert ms == ms2
|
| 64 |
if not is_any_prefix_of(transcription, prev_lines): # new line
|
| 65 |
segments.append(records)
|
| 66 |
prev_lines = []
|
|
@@ -70,7 +69,8 @@ def read_data(tag):
|
|
| 70 |
|
| 71 |
records.append(
|
| 72 |
Event(
|
| 73 |
-
|
|
|
|
| 74 |
transcription=transcription,
|
| 75 |
translation=translation,
|
| 76 |
segment_id=len(segments)
|
|
@@ -114,7 +114,7 @@ def get_words_stable(records, key='transcription'):
|
|
| 114 |
|
| 115 |
words.extend([
|
| 116 |
WordTiming(
|
| 117 |
-
timestamp=
|
| 118 |
word=word,
|
| 119 |
event=rec,
|
| 120 |
)
|
|
@@ -151,7 +151,7 @@ def get_words_edit(records, key='transcription'):
|
|
| 151 |
|
| 152 |
result = [
|
| 153 |
WordTiming(
|
| 154 |
-
timestamp=event
|
| 155 |
word=word,
|
| 156 |
event=event
|
| 157 |
)
|
|
@@ -207,7 +207,7 @@ def calc_retranslation_latency(transcription_words, translation_words):
|
|
| 207 |
transcription_segments = [[] for _ in range(num_segments)]
|
| 208 |
for w in transcription_words:
|
| 209 |
transcription_segments[w.segment_id].append(w)
|
| 210 |
-
cur_end = WordTiming(timestamp=0,word='',event=Event(0,'','',0),policy_timestamp=0)
|
| 211 |
for segments in transcription_segments:
|
| 212 |
if segments:
|
| 213 |
cur_end = segments[-1]
|
|
@@ -243,10 +243,20 @@ def calc_retranslation_latency(transcription_words, translation_words):
|
|
| 243 |
cur_words += 1
|
| 244 |
return sum(latency) / len(latency)
|
| 245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
def main():
|
| 247 |
-
|
| 248 |
tag = 'nhk'
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
| 250 |
all_events = flatten_segments(segments)
|
| 251 |
# for e in all_events:
|
| 252 |
# input(e)
|
|
@@ -257,7 +267,7 @@ def main():
|
|
| 257 |
print("Transcribe DAL: %.2f" % calc_lagging(transcription_words, reduction='mean'))
|
| 258 |
|
| 259 |
translation_words = get_words_edit(all_events, key='translation')
|
| 260 |
-
latency =
|
| 261 |
print("Translate Latency: %.2f" % latency)
|
| 262 |
|
| 263 |
erasure = calc_normalized_erasure(all_events)
|
|
|
|
| 22 |
|
| 23 |
@dataclass
class Event:
    """One paired transcription/translation record read from the data files.

    Instances are created in read_data(), one per line pair of
    ``transcribe{suffix}.txt`` / ``translate{suffix}.txt``.
    """
    # Time at which the transcript line was emitted, parsed via
    # float(transcription_ms) from the file's first tab field.
    # Presumably milliseconds relative to stream start
    # (writer computes get_current_time() - stream.start_time) — TODO confirm.
    transcription_timestamp: float
    # Time at which the translated line was emitted; same clock/units as
    # transcription_timestamp — TODO confirm units.
    translation_timestamp: float
    # Lower-cased transcript text (second tab field of the line).
    transcription: str
    # Lower-cased translation text (second tab field of the line).
    translation: str
    # Index of the segment this event belongs to; assigned as
    # len(segments) at read time in read_data().
    segment_id: int
|
|
|
|
| 39 |
def segment_id(self):
|
| 40 |
return self.event.segment_id
|
| 41 |
|
| 42 |
+
def read_data(tag, suffix):
|
| 43 |
prev_lines = []
|
| 44 |
def is_prefix_of(a, b):
|
| 45 |
return a == b[:len(a)]
|
|
|
|
| 51 |
|
| 52 |
segments = []
|
| 53 |
records = []
|
|
|
|
|
|
|
| 54 |
with (
|
| 55 |
open(f"data/{tag}/transcribe{suffix}.txt", 'r') as transcription_f,
|
| 56 |
open(f"data/{tag}/translate{suffix}.txt", 'r') as translate_f,
|
| 57 |
):
|
| 58 |
for transcription_line, translation_line in zip(transcription_f, translate_f):
|
| 59 |
# print("got:", transcription_line)
|
| 60 |
+
transcription_ms, transcription = transcription_line.lower().strip().split('\t')
|
| 61 |
+
translation_ms, translation = translation_line.lower().strip().split('\t')
|
| 62 |
+
# assert ms == ms2
|
| 63 |
if not is_any_prefix_of(transcription, prev_lines): # new line
|
| 64 |
segments.append(records)
|
| 65 |
prev_lines = []
|
|
|
|
| 69 |
|
| 70 |
records.append(
|
| 71 |
Event(
|
| 72 |
+
transcription_timestamp=float(transcription_ms),
|
| 73 |
+
translation_timestamp=float(translation_ms),
|
| 74 |
transcription=transcription,
|
| 75 |
translation=translation,
|
| 76 |
segment_id=len(segments)
|
|
|
|
| 114 |
|
| 115 |
words.extend([
|
| 116 |
WordTiming(
|
| 117 |
+
timestamp=getattr(rec, f"{key}_timestamp"),
|
| 118 |
word=word,
|
| 119 |
event=rec,
|
| 120 |
)
|
|
|
|
| 151 |
|
| 152 |
result = [
|
| 153 |
WordTiming(
|
| 154 |
+
timestamp=getattr(event, f"{key}_timestamp"),
|
| 155 |
word=word,
|
| 156 |
event=event
|
| 157 |
)
|
|
|
|
| 207 |
transcription_segments = [[] for _ in range(num_segments)]
|
| 208 |
for w in transcription_words:
|
| 209 |
transcription_segments[w.segment_id].append(w)
|
| 210 |
+
cur_end = WordTiming(timestamp=0,word='',event=Event(0,0,'','',0),policy_timestamp=0)
|
| 211 |
for segments in transcription_segments:
|
| 212 |
if segments:
|
| 213 |
cur_end = segments[-1]
|
|
|
|
| 243 |
cur_words += 1
|
| 244 |
return sum(latency) / len(latency)
|
| 245 |
|
| 246 |
+
def calc_translation_latency(events):
    """Return the mean translation latency over *events*.

    Latency of one event is ``translation_timestamp - transcription_timestamp``,
    i.e. how long after the transcript its translation became available
    (same units as the stored timestamps).

    Args:
        events: iterable of Event-like objects exposing the attributes
            ``translation_timestamp`` and ``transcription_timestamp``.

    Returns:
        The arithmetic mean latency as a float, or ``0.0`` when *events*
        is empty (the previous version raised ZeroDivisionError).
    """
    latencies = [
        e.translation_timestamp - e.transcription_timestamp
        for e in events
    ]
    # Guard the empty case explicitly instead of crashing with an
    # opaque ZeroDivisionError when a data file yields no events.
    if not latencies:
        return 0.0
    return sum(latencies) / len(latencies)
|
| 252 |
+
|
| 253 |
def main():
|
| 254 |
+
tag = 'gaza'
|
| 255 |
tag = 'nhk'
|
| 256 |
+
tag = 'ml2021'
|
| 257 |
+
suffix = ""
|
| 258 |
+
# suffix = "-interim"
|
| 259 |
+
segments = read_data(tag, suffix=suffix)
|
| 260 |
all_events = flatten_segments(segments)
|
| 261 |
# for e in all_events:
|
| 262 |
# input(e)
|
|
|
|
| 267 |
print("Transcribe DAL: %.2f" % calc_lagging(transcription_words, reduction='mean'))
|
| 268 |
|
| 269 |
translation_words = get_words_edit(all_events, key='translation')
|
| 270 |
+
latency = calc_translation_latency(all_events)
|
| 271 |
print("Translate Latency: %.2f" % latency)
|
| 272 |
|
| 273 |
erasure = calc_normalized_erasure(all_events)
|
src/data/ml2021/transcribe.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
4090 我 跟你 聊 聊
|
| 2 |
+
40455 來上課吧 那接下來啊,我們要講decoder,那我們上上週呢已經講了encoder,那接下來呢,我們要講decoder 那decoder呢,其實有兩種,等一下呢,會花比較多時間介紹比比較常見的這個叫做autoregressive的decoder 那這個autoregressive的decoder是怎麼運作的呢?那等一下我們是用語音變
|
| 3 |
+
41097 你 們 沒 錢
|
| 4 |
+
76099 其實是一模一樣的,你只是把輸入輸出改成不同的東西而已 好,那語音辨識是怎麼做的呢?語音辨識,你知道語音辨識就是輸入一段聲音,輸出一串文字 好,那你會把一段聲音輸入給encoder,比如說你對機器說機器學習,機器收到一段聲音訊號,聲音訊號呢進入encoder以後,輸出會是什麼呢?輸出會變成一排 那我們上週花了很多時間
|
| 5 |
+
81651 你 們 們 們 們 們
|
| 6 |
+
113117 做的事情,就是輸入一個vector sequence,輸出另外一個vector sequence 那接下來呢,就輪到decoder運作了,decoder要做的事情就是產生輸出,接下來輪到decoder產生語音辨識的結果 那decoder怎麼產生這個語音辨識的結果呢?那decoder做的事情就是把encoder的輸出先讀進去,那至於怎麼讀進去,那這個我們
|
| 7 |
+
113856 你 們 聊天 聊 聊
|
| 8 |
+
149334 等一下再處理 好,那decoder怎麼產生一段文字呢?語言辨識,機器的輸出就是一段文字,decoder怎麼產生一段文字呢?那首先呢,你要先給它一個特殊的符號,這個特殊的符號呢,代表開始,那在助教投影片裡面呢,是寫begin of sentence,所寫是bos,那我這邊會怕你知道bos是什麼啦,所以我就把它的意思明確的寫出來,就是開始,就是begin的意思
|
| 9 |
+
154918 我 聽 聽 聽 聽 聽
|
| 10 |
+
185612 在你可以本來decoder可能產生的文字裡面呢,多加一個特殊的符號,多加一個特殊的字,那這個字呢,就代表了,代表了開始這個事情 好,所以decoder呢,就吃到這個特殊的符號,那在這個機器學習裡面呢,假設你要處理nlp的問題,每一個token,你都可以把它用一個onehot的vector來表示,onehotvector就是其中一為是一,其他都是零,所以也是用onehot
|
| 11 |
+
186362 你 怎麼 樣
|
| 12 |
+
187279 上架
|
| 13 |
+
221105 什麼呢?這個vector裡面有什麼呢?這個vector的長度啊,它很長,它的長度呢,跟你的vocabulary的size是一樣的。這邊的vocabulary指的是什麼意思呢?你就先想好說你的decoder輸出的單位是什麼 假設我們今天做的是中文的語音辨識,我們decoder輸出的是中文,那你這邊的vocabulary的size啊,可能就是中文
|
| 14 |
+
221771 風 騎
|
| 15 |
+
9475 你 們 沒 錢
|
| 16 |
+
45239 字母,輸出英文的字母,但你可能會覺得字母這個單位太小了,有人可能會選擇輸出英文的詞彙,英文的詞彙是用空白作為間隔的,但如果這種詞彙當作輸出又太多了,所以你會發現剛才在字幕的影片裡面,字幕說它是用word當作英文的單位,就有一些方法可以把英文的自手字根切出來,拿自手字根當作單位。那如果中文的話呢,我覺得就比較單純
|
| 17 |
+
45890 你 怎麼 樣
|
| 18 |
+
62569 當做當位
|
| 19 |
+
65917 那時候就這樣
|
| 20 |
+
68560 你 們 沒 什 麼 樣 樣 樣
|
src/data/ml2021/translate.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
4490 I'll talk to you
|
| 2 |
+
40627 Let's go to class. Next, we are going to talk about decoders. We have already talked about encoders last week. Next, we are going to talk about decoders. There are actually two types of decoders. We will spend more time introducing the more common one called autoregressive decoder. How does the autoregressive decoder work?
|
| 3 |
+
41162 You have no money
|
| 4 |
+
76276 Actually, they are exactly the same. You just change the input and output to different things. Okay, how do you do speech recognition? Speech recognition, you know, is to input a sound and output a string of text. Okay, then you will input a sound into the encoder. For example, you tell the machine to learn machine learning. The machine receives a sound signal. After the sound signal enters the encoder, what will be the output? The output will become a row. We spent a lot of time last week.
|
| 5 |
+
81741 You guys you guys
|
| 6 |
+
113215 What the decoder does is to input a vector sequence and output another vector sequence. Then, it is the decoder's turn to operate. The decoder's job is to generate output. Then it is the decoder's turn to generate the result of speech recognition. How does the decoder generate the result of speech recognition? The decoder's job is to read the output of the encoder first. As for how to read it in, we
|
| 7 |
+
113926 You guys chat.
|
| 8 |
+
149466 I will deal with it later. Okay, how does the decoder generate a piece of text? Language recognition, the output of the machine is a piece of text. How does the decoder generate a piece of text? First of all, you have to give it a special symbol. This special symbol represents the beginning. In the teaching assistant's slide, it says begin of sentence, and the word written is bos. I am afraid that you know what bos is, so I will write its meaning clearly, which is the beginning, the meaning of begin.
|
| 9 |
+
154972 I listen, listen, listen, listen
|
| 10 |
+
185800 In the text that the decoder may generate, add a special symbol, add a special word, then this word represents the beginning of this thing. Okay, so the decoder will get this special symbol. In machine learning, assuming you want to deal with NLP problems, each token can be represented by a one-hot vector. A one-hot vector is one in which one is one and the others are zero, so it is also represented by a one-hot vector.
|
| 11 |
+
186449 How are you?
|
| 12 |
+
187304 Available
|
| 13 |
+
221264 What is it? What is in this vector? The length of this vector is very long. Its length is the same as the size of your vocabulary. What does vocabulary mean here? You should first think about the unit of your decoder output. Suppose we are doing Chinese speech recognition today, and our decoder outputs Chinese, then the size of your vocabulary here may be Chinese.
|
| 14 |
+
221882 Wind Ride
|
| 15 |
+
9546 You have no money
|
| 16 |
+
45451 Letters, output English letters, but you may think that the unit of letters is too small. Some people may choose to output English vocabulary. English vocabulary is separated by spaces, but if this kind of vocabulary is output, there will be too many. So you will find that in the subtitled video just now, the subtitles say that it uses word as the unit of English. There are some ways to cut out the English radicals and use the radicals as units. If it's Chinese, I think it's simpler.
|
| 17 |
+
45956 How are you?
|
| 18 |
+
62637 As a proper position
|
| 19 |
+
65997 That was the case at that time
|
| 20 |
+
68595 You don't have anything.
|
src/requirements.txt
CHANGED
|
@@ -3,4 +3,5 @@ google-cloud-translate
|
|
| 3 |
google-cloud-speech
|
| 4 |
Prompt
|
| 5 |
pyaudio
|
| 6 |
-
termcolor
|
|
|
|
|
|
| 3 |
google-cloud-speech
|
| 4 |
Prompt
|
| 5 |
pyaudio
|
| 6 |
+
termcolor
|
| 7 |
+
rapidfuzz
|
src/transcribe_and_translate_streaming_infinite_chirp2_0113.py
CHANGED
|
@@ -281,8 +281,10 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
|
|
| 281 |
continue
|
| 282 |
|
| 283 |
transcript = result.alternatives[0].transcript
|
|
|
|
| 284 |
#print(transcript)
|
| 285 |
translated_text = translate_text(project_id, location, transcript, target_language)
|
|
|
|
| 286 |
# translated_text = transcript
|
| 287 |
#print(translated_text)
|
| 288 |
|
|
@@ -308,7 +310,6 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
|
|
| 308 |
)
|
| 309 |
# Display interim results, but with a carriage return at the end of the
|
| 310 |
# line, so subsequent lines will overwrite them.
|
| 311 |
-
real_time = get_current_time() - stream.start_time
|
| 312 |
|
| 313 |
if result.is_final:
|
| 314 |
sys.stdout.write(GREEN)
|
|
@@ -316,10 +317,9 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
|
|
| 316 |
sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\n")
|
| 317 |
|
| 318 |
with open(f"data/{name}/transcribe.txt", "a") as f:
|
| 319 |
-
f.write(f"{
|
| 320 |
with open(f"data/{name}/translate.txt", "a") as f:
|
| 321 |
-
f.write(f"{
|
| 322 |
-
result_counter += 1
|
| 323 |
|
| 324 |
stream.is_final_end_time = stream.result_end_time
|
| 325 |
stream.last_transcript_was_final = True
|
|
@@ -337,9 +337,9 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
|
|
| 337 |
sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\r")
|
| 338 |
|
| 339 |
with open(f"data/{name}/transcribe-interim.txt", "a") as f:
|
| 340 |
-
f.write(f"{
|
| 341 |
with open(f"data/{name}/translate-interim.txt", "a") as f:
|
| 342 |
-
f.write(f"{
|
| 343 |
|
| 344 |
stream.last_transcript_was_final = False
|
| 345 |
|
|
|
|
| 281 |
continue
|
| 282 |
|
| 283 |
transcript = result.alternatives[0].transcript
|
| 284 |
+
transcript_time = get_current_time() - stream.start_time
|
| 285 |
#print(transcript)
|
| 286 |
translated_text = translate_text(project_id, location, transcript, target_language)
|
| 287 |
+
translated_time = get_current_time() - stream.start_time
|
| 288 |
# translated_text = transcript
|
| 289 |
#print(translated_text)
|
| 290 |
|
|
|
|
| 310 |
)
|
| 311 |
# Display interim results, but with a carriage return at the end of the
|
| 312 |
# line, so subsequent lines will overwrite them.
|
|
|
|
| 313 |
|
| 314 |
if result.is_final:
|
| 315 |
sys.stdout.write(GREEN)
|
|
|
|
| 317 |
sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\n")
|
| 318 |
|
| 319 |
with open(f"data/{name}/transcribe.txt", "a") as f:
|
| 320 |
+
f.write(f"{transcript_time}\t{transcript}\n")
|
| 321 |
with open(f"data/{name}/translate.txt", "a") as f:
|
| 322 |
+
f.write(f"{translated_time}\t{translated_text}\n")
|
|
|
|
| 323 |
|
| 324 |
stream.is_final_end_time = stream.result_end_time
|
| 325 |
stream.last_transcript_was_final = True
|
|
|
|
| 337 |
sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\r")
|
| 338 |
|
| 339 |
with open(f"data/{name}/transcribe-interim.txt", "a") as f:
|
| 340 |
+
f.write(f"{transcript_time}\t{transcript}\n")
|
| 341 |
with open(f"data/{name}/translate-interim.txt", "a") as f:
|
| 342 |
+
f.write(f"{translated_time}\t{translated_text}\n")
|
| 343 |
|
| 344 |
stream.last_transcript_was_final = False
|
| 345 |
|