cqchangm committed on
Commit
9cb9feb
·
1 Parent(s): bf1ebc4

change translation metric

Browse files
src/calc_metrics.py CHANGED
@@ -22,7 +22,8 @@ class TokenizerZh:
22
 
23
  @dataclass
24
  class Event:
25
- timestamp: float
 
26
  transcription: str
27
  translation: str
28
  segment_id: int
@@ -38,7 +39,7 @@ class WordTiming:
38
  def segment_id(self):
39
  return self.event.segment_id
40
 
41
- def read_data(tag):
42
  prev_lines = []
43
  def is_prefix_of(a, b):
44
  return a == b[:len(a)]
@@ -50,17 +51,15 @@ def read_data(tag):
50
 
51
  segments = []
52
  records = []
53
- suffix = "-interim"
54
- # suffix = ""
55
  with (
56
  open(f"data/{tag}/transcribe{suffix}.txt", 'r') as transcription_f,
57
  open(f"data/{tag}/translate{suffix}.txt", 'r') as translate_f,
58
  ):
59
  for transcription_line, translation_line in zip(transcription_f, translate_f):
60
  # print("got:", transcription_line)
61
- ms, transcription = transcription_line.lower().strip().split('\t')
62
- ms2, translation = translation_line.lower().strip().split('\t')
63
- assert ms == ms2
64
  if not is_any_prefix_of(transcription, prev_lines): # new line
65
  segments.append(records)
66
  prev_lines = []
@@ -70,7 +69,8 @@ def read_data(tag):
70
 
71
  records.append(
72
  Event(
73
- timestamp=float(ms),
 
74
  transcription=transcription,
75
  translation=translation,
76
  segment_id=len(segments)
@@ -114,7 +114,7 @@ def get_words_stable(records, key='transcription'):
114
 
115
  words.extend([
116
  WordTiming(
117
- timestamp=float(rec.timestamp),
118
  word=word,
119
  event=rec,
120
  )
@@ -151,7 +151,7 @@ def get_words_edit(records, key='transcription'):
151
 
152
  result = [
153
  WordTiming(
154
- timestamp=event.timestamp,
155
  word=word,
156
  event=event
157
  )
@@ -207,7 +207,7 @@ def calc_retranslation_latency(transcription_words, translation_words):
207
  transcription_segments = [[] for _ in range(num_segments)]
208
  for w in transcription_words:
209
  transcription_segments[w.segment_id].append(w)
210
- cur_end = WordTiming(timestamp=0,word='',event=Event(0,'','',0),policy_timestamp=0)
211
  for segments in transcription_segments:
212
  if segments:
213
  cur_end = segments[-1]
@@ -243,10 +243,20 @@ def calc_retranslation_latency(transcription_words, translation_words):
243
  cur_words += 1
244
  return sum(latency) / len(latency)
245
 
 
 
 
 
 
 
 
246
  def main():
247
- # tag = 'gaza'
248
  tag = 'nhk'
249
- segments = read_data(tag)
 
 
 
250
  all_events = flatten_segments(segments)
251
  # for e in all_events:
252
  # input(e)
@@ -257,7 +267,7 @@ def main():
257
  print("Transcribe DAL: %.2f" % calc_lagging(transcription_words, reduction='mean'))
258
 
259
  translation_words = get_words_edit(all_events, key='translation')
260
- latency = calc_retranslation_latency(transcription_words, translation_words)
261
  print("Translate Latency: %.2f" % latency)
262
 
263
  erasure = calc_normalized_erasure(all_events)
 
22
 
23
  @dataclass
24
  class Event:
25
+ transcription_timestamp: float
26
+ translation_timestamp: float
27
  transcription: str
28
  translation: str
29
  segment_id: int
 
39
  def segment_id(self):
40
  return self.event.segment_id
41
 
42
+ def read_data(tag, suffix):
43
  prev_lines = []
44
  def is_prefix_of(a, b):
45
  return a == b[:len(a)]
 
51
 
52
  segments = []
53
  records = []
 
 
54
  with (
55
  open(f"data/{tag}/transcribe{suffix}.txt", 'r') as transcription_f,
56
  open(f"data/{tag}/translate{suffix}.txt", 'r') as translate_f,
57
  ):
58
  for transcription_line, translation_line in zip(transcription_f, translate_f):
59
  # print("got:", transcription_line)
60
+ transcription_ms, transcription = transcription_line.lower().strip().split('\t')
61
+ translation_ms, translation = translation_line.lower().strip().split('\t')
62
+ # transcription_ms and translation_ms are recorded separately and may differ, so they are no longer asserted equal
63
  if not is_any_prefix_of(transcription, prev_lines): # new line
64
  segments.append(records)
65
  prev_lines = []
 
69
 
70
  records.append(
71
  Event(
72
+ transcription_timestamp=float(transcription_ms),
73
+ translation_timestamp=float(translation_ms),
74
  transcription=transcription,
75
  translation=translation,
76
  segment_id=len(segments)
 
114
 
115
  words.extend([
116
  WordTiming(
117
+ timestamp=getattr(rec, f"{key}_timestamp"),
118
  word=word,
119
  event=rec,
120
  )
 
151
 
152
  result = [
153
  WordTiming(
154
+ timestamp=getattr(event, f"{key}_timestamp"),
155
  word=word,
156
  event=event
157
  )
 
207
  transcription_segments = [[] for _ in range(num_segments)]
208
  for w in transcription_words:
209
  transcription_segments[w.segment_id].append(w)
210
+ cur_end = WordTiming(timestamp=0,word='',event=Event(0,0,'','',0),policy_timestamp=0)
211
  for segments in transcription_segments:
212
  if segments:
213
  cur_end = segments[-1]
 
243
  cur_words += 1
244
  return sum(latency) / len(latency)
245
 
246
def calc_translation_latency(events):
    """Return the mean delay between transcription and translation.

    The latency of one event is ``translation_timestamp -
    transcription_timestamp``; the result is the arithmetic mean of that
    difference over all events.

    Args:
        events: Iterable of objects exposing numeric
            ``transcription_timestamp`` and ``translation_timestamp``
            attributes.

    Returns:
        The mean latency, in the same unit as the timestamps, or ``nan``
        when ``events`` is empty.
    """
    latencies = [
        e.translation_timestamp - e.transcription_timestamp
        for e in events
    ]
    if not latencies:
        # A mean over zero events is undefined; report NaN rather than
        # crashing with ZeroDivisionError (e.g. on an empty data file).
        return float("nan")
    return sum(latencies) / len(latencies)
252
+
253
  def main():
254
+ tag = 'gaza'
255
  tag = 'nhk'
256
+ tag = 'ml2021'
257
+ suffix = ""
258
+ # suffix = "-interim"
259
+ segments = read_data(tag, suffix=suffix)
260
  all_events = flatten_segments(segments)
261
  # for e in all_events:
262
  # input(e)
 
267
  print("Transcribe DAL: %.2f" % calc_lagging(transcription_words, reduction='mean'))
268
 
269
  translation_words = get_words_edit(all_events, key='translation')
270
+ latency = calc_translation_latency(all_events)
271
  print("Translate Latency: %.2f" % latency)
272
 
273
  erasure = calc_normalized_erasure(all_events)
src/data/ml2021/transcribe.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 4090 我 跟你 聊 聊
2
+ 40455 來上課吧 那接下來啊,我們要講decoder,那我們上上週呢已經講了encoder,那接下來呢,我們要講decoder 那decoder呢,其實有兩種,等一下呢,會花比較多時間介紹比比較常見的這個叫做autoregressive的decoder 那這個autoregressive的decoder是怎麼運作的呢?那等一下我們是用語音變
3
+ 41097 你 們 沒 錢
4
+ 76099 其實是一模一樣的,你只是把輸入輸出改成不同的東西而已 好,那語音辨識是怎麼做的呢?語音辨識,你知道語音辨識就是輸入一段聲音,輸出一串文字 好,那你會把一段聲音輸入給encoder,比如說你對機器說機器學習,機器收到一段聲音訊號,聲音訊號呢進入encoder以後,輸出會是什麼呢?輸出會變成一排 那我們上週花了很多時間
5
+ 81651 你 們 們 們 們 們
6
+ 113117 做的事情,就是輸入一個vector sequence,輸出另外一個vector sequence 那接下來呢,就輪到decoder運作了,decoder要做的事情就是產生輸出,接下來輪到decoder產生語音辨識的結果 那decoder怎麼產生這個語音辨識的結果呢?那decoder做的事情就是把encoder的輸出先讀進去,那至於怎麼讀進去,那這個我們
7
+ 113856 你 們 聊天 聊 聊
8
+ 149334 等一下再處理 好,那decoder怎麼產生一段文字呢?語言辨識,機器的輸出就是一段文字,decoder怎麼產生一段文字呢?那首先呢,你要先給它一個特殊的符號,這個特殊的符號呢,代表開始,那在助教投影片裡面呢,是寫begin of sentence,所寫是bos,那我這邊會怕你知道bos是什麼啦,所以我就把它的意思明確的寫出來,就是開始,就是begin的意思
9
+ 154918 我 聽 聽 聽 聽 聽
10
+ 185612 在你可以本來decoder可能產生的文字裡面呢,多加一個特殊的符號,多加一個特殊的字,那這個字呢,就代表了,代表了開始這個事情 好,所以decoder呢,就吃到這個特殊的符號,那在這個機器學習裡面呢,假設你要處理nlp的問題,每一個token,你都可以把它用一個onehot的vector來表示,onehotvector就是其中一為是一,其他都是零,所以也是用onehot
11
+ 186362 你 怎麼 樣
12
+ 187279 上架
13
+ 221105 什麼呢?這個vector裡面有什麼呢?這個vector的長度啊,它很長,它的長度呢,跟你的vocabulary的size是一樣的。這邊的vocabulary指的是什麼意思呢?你就先想好說你的decoder輸出的單位是什麼 假設我們今天做的是中文的語音辨識,我們decoder輸出的是中文,那你這邊的vocabulary的size啊,可能就是中文
14
+ 221771 風 騎
15
+ 9475 你 們 沒 錢
16
+ 45239 字母,輸出英文的字母,但你可能會覺得字母這個單位太小了,有人可能會選擇輸出英文的詞彙,英文的詞彙是用空白作為間隔的,但如果這種詞彙當作輸出又太多了,所以你會發現剛才在字幕的影片裡面,字幕說它是用word當作英文的單位,就有一些方法可以把英文的自手字根切出來,拿自手字根當作單位。那如果中文的話呢,我覺得就比較單純
17
+ 45890 你 怎麼 樣
18
+ 62569 當做當位
19
+ 65917 那時候就這樣
20
+ 68560 你 們 沒 什 麼 樣 樣 樣
src/data/ml2021/translate.txt ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 4490 I'll talk to you
2
+ 40627 Let's go to class. Next, we are going to talk about decoders. We have already talked about encoders last week. Next, we are going to talk about decoders. There are actually two types of decoders. We will spend more time introducing the more common one called autoregressive decoder. How does the autoregressive decoder work?
3
+ 41162 You have no money
4
+ 76276 Actually, they are exactly the same. You just change the input and output to different things. Okay, how do you do speech recognition? Speech recognition, you know, is to input a sound and output a string of text. Okay, then you will input a sound into the encoder. For example, you tell the machine to learn machine learning. The machine receives a sound signal. After the sound signal enters the encoder, what will be the output? The output will become a row. We spent a lot of time last week.
5
+ 81741 You guys you guys
6
+ 113215 What the decoder does is to input a vector sequence and output another vector sequence. Then, it is the decoder's turn to operate. The decoder's job is to generate output. Then it is the decoder's turn to generate the result of speech recognition. How does the decoder generate the result of speech recognition? The decoder's job is to read the output of the encoder first. As for how to read it in, we
7
+ 113926 You guys chat.
8
+ 149466 I will deal with it later. Okay, how does the decoder generate a piece of text? Language recognition, the output of the machine is a piece of text. How does the decoder generate a piece of text? First of all, you have to give it a special symbol. This special symbol represents the beginning. In the teaching assistant's slide, it says begin of sentence, and the word written is bos. I am afraid that you know what bos is, so I will write its meaning clearly, which is the beginning, the meaning of begin.
9
+ 154972 I listen, listen, listen, listen
10
+ 185800 In the text that the decoder may generate, add a special symbol, add a special word, then this word represents the beginning of this thing. Okay, so the decoder will get this special symbol. In machine learning, assuming you want to deal with NLP problems, each token can be represented by a one-hot vector. A one-hot vector is one in which one is one and the others are zero, so it is also represented by a one-hot vector.
11
+ 186449 How are you?
12
+ 187304 Available
13
+ 221264 What is it? What is in this vector? The length of this vector is very long. Its length is the same as the size of your vocabulary. What does vocabulary mean here? You should first think about the unit of your decoder output. Suppose we are doing Chinese speech recognition today, and our decoder outputs Chinese, then the size of your vocabulary here may be Chinese.
14
+ 221882 Wind Ride
15
+ 9546 You have no money
16
+ 45451 Letters, output English letters, but you may think that the unit of letters is too small. Some people may choose to output English vocabulary. English vocabulary is separated by spaces, but if this kind of vocabulary is output, there will be too many. So you will find that in the subtitled video just now, the subtitles say that it uses word as the unit of English. There are some ways to cut out the English radicals and use the radicals as units. If it's Chinese, I think it's simpler.
17
+ 45956 How are you?
18
+ 62637 As a proper position
19
+ 65997 That was the case at that time
20
+ 68595 You don't have anything.
src/requirements.txt CHANGED
@@ -3,4 +3,5 @@ google-cloud-translate
3
  google-cloud-speech
4
  Prompt
5
  pyaudio
6
- termcolor
 
 
3
  google-cloud-speech
4
  Prompt
5
  pyaudio
6
+ termcolor
7
+ rapidfuzz
src/transcribe_and_translate_streaming_infinite_chirp2_0113.py CHANGED
@@ -281,8 +281,10 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
281
  continue
282
 
283
  transcript = result.alternatives[0].transcript
 
284
  #print(transcript)
285
  translated_text = translate_text(project_id, location, transcript, target_language)
 
286
  # translated_text = transcript
287
  #print(translated_text)
288
 
@@ -308,7 +310,6 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
308
  )
309
  # Display interim results, but with a carriage return at the end of the
310
  # line, so subsequent lines will overwrite them.
311
- real_time = get_current_time() - stream.start_time
312
 
313
  if result.is_final:
314
  sys.stdout.write(GREEN)
@@ -316,10 +317,9 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
316
  sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\n")
317
 
318
  with open(f"data/{name}/transcribe.txt", "a") as f:
319
- f.write(f"{real_time}\t{transcript}\n")
320
  with open(f"data/{name}/translate.txt", "a") as f:
321
- f.write(f"{real_time}\t{translated_text}\n")
322
- result_counter += 1
323
 
324
  stream.is_final_end_time = stream.result_end_time
325
  stream.last_transcript_was_final = True
@@ -337,9 +337,9 @@ def listen_print_loop(responses: object, stream: object, location:str, target_la
337
  sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\r")
338
 
339
  with open(f"data/{name}/transcribe-interim.txt", "a") as f:
340
- f.write(f"{real_time}\t{transcript}\n")
341
  with open(f"data/{name}/translate-interim.txt", "a") as f:
342
- f.write(f"{real_time}\t{translated_text}\n")
343
 
344
  stream.last_transcript_was_final = False
345
 
 
281
  continue
282
 
283
  transcript = result.alternatives[0].transcript
284
+ transcript_time = get_current_time() - stream.start_time
285
  #print(transcript)
286
  translated_text = translate_text(project_id, location, transcript, target_language)
287
+ translated_time = get_current_time() - stream.start_time
288
  # translated_text = transcript
289
  #print(translated_text)
290
 
 
310
  )
311
  # Display interim results, but with a carriage return at the end of the
312
  # line, so subsequent lines will overwrite them.
 
313
 
314
  if result.is_final:
315
  sys.stdout.write(GREEN)
 
317
  sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\n")
318
 
319
  with open(f"data/{name}/transcribe.txt", "a") as f:
320
+ f.write(f"{transcript_time}\t{transcript}\n")
321
  with open(f"data/{name}/translate.txt", "a") as f:
322
+ f.write(f"{translated_time}\t{translated_text}\n")
 
323
 
324
  stream.is_final_end_time = stream.result_end_time
325
  stream.last_transcript_was_final = True
 
337
  sys.stdout.write(str(corrected_time) + ": " + transcript + "=> translation:" + translated_text + "\r")
338
 
339
  with open(f"data/{name}/transcribe-interim.txt", "a") as f:
340
+ f.write(f"{transcript_time}\t{transcript}\n")
341
  with open(f"data/{name}/translate-interim.txt", "a") as f:
342
+ f.write(f"{translated_time}\t{translated_text}\n")
343
 
344
  stream.last_transcript_was_final = False
345