Spaces:
Paused
Paused
| import numpy as np | |
def mask_func(tokenized_sen, mask_prob=0.5, span_lambda=3, max_consecutive_masks=8):
    """Randomly replace spans of tokens with ``'<mask>'`` placeholders.

    The ``.text`` of every item in ``tokenized_sen`` is split on single
    spaces and the resulting tokens are concatenated into one stream. The
    stream is then walked left to right:

    * a token containing ``.``, ``(``, ``)``, ``[`` or ``]`` is always kept;
    * otherwise a span length is drawn from ``Poisson(span_lambda)`` and,
      with probability ``mask_prob`` (and only if the length is positive),
      that many tokens are skipped and a single ``'<mask>'`` is emitted in
      their place;
    * otherwise the token is kept.

    Args:
        tokenized_sen: Sized collection of objects exposing a ``.text``
            string attribute (presumably spaCy sentence spans -- confirm
            against the caller).
        mask_prob: Probability of masking at a maskable position.
        span_lambda: Mean of the Poisson distribution for span lengths.
        max_consecutive_masks: Maximum number of ``'<mask>'`` tokens emitted
            between two kept tokens.

    Returns:
        ``[]`` when ``tokenized_sen`` is empty, otherwise a one-element list
        holding the space-joined masked text.
    """
    if not tokenized_sen:
        return []

    token_list = []
    for sen in tokenized_sen:
        token_list.extend(sen.text.split(' '))

    ret_list = []
    i = 0
    mask_num = 0  # '<mask>' tokens emitted since the last kept token
    while i < len(token_list):
        t = token_list[i]

        # Tokens carrying sentence/bracket punctuation are never masked.
        if any(ch in t for ch in '.()[]'):
            ret_list.append(t)
            i += 1
            mask_num = 0
            continue

        # Draw order (poisson, then rand) is kept so seeded runs reproduce.
        length = np.random.poisson(span_lambda)
        if np.random.rand() < mask_prob and length > 0:
            if mask_num < max_consecutive_masks:
                ret_list.append('<mask>')
                mask_num += 1
            # NOTE(review): the span is skipped even when the mask cap has
            # been reached, so those tokens vanish without a placeholder.
            # The nesting of this statement was reconstructed from a
            # flattened listing -- confirm against the original file.
            i += length
        else:
            ret_list.append(t)
            i += 1
            mask_num = 0

    return [' '.join(ret_list)]
def find_mini_span(vec, words, check_set):
    """Locate the shortest flagged window of ``words`` covering ``check_set``.

    A window ``words[start:stop]`` is a candidate when both its first and
    last positions are flagged in ``vec`` and its space-joined text contains
    (by substring test) as many ``check_set`` entries as the full joined
    text does. For each flagged start only the earliest qualifying stop is
    considered; among those the shortest window wins, ties going to the
    leftmost start.

    Args:
        vec: Indexable, mutable sequence of flags aligned with ``words``;
            a position counts as flagged when it compares ``== True``.
        words: Sequence of word strings.
        check_set: Iterable of strings to look for as substrings.

    Returns:
        ``(vec, span)``. ``vec`` is the same object that was passed in, with
        every position inside the winning window set to ``True``; ``span``
        is the window's space-joined text, or ``''`` if no window qualified.
    """
    def count_hits(fragment):
        # Number of check_set entries occurring as substrings of fragment.
        return sum(1 for item in check_set if item in fragment)

    target = count_hits(' '.join(words))
    size = len(vec)

    best_len = 10000000
    best_span = ''
    best_bounds = None

    for start in range(size):
        # Explicit equality test (not truthiness) mirrors how flags are read.
        if not vec[start] == True:
            continue

        # Earliest flagged end position whose window reaches the target count.
        stop = -1
        for end in range(start + 1, size + 1):
            if not vec[end - 1] == True:
                continue
            if count_hits(' '.join(words[start:end])) == target:
                stop = end
                break

        if stop > 0 and (stop - start) < best_len:
            best_len = stop - start
            best_span = ' '.join(words[start:stop])
            best_bounds = (start, stop)

    if best_bounds is not None:
        lo, hi = best_bounds
        for idx in range(lo, hi):
            vec[idx] = True

    return vec, best_span
def process(text):
    """Tidy spacing around punctuation in space-tokenized text.

    Steps, applied in this order:

    1. Insert a space between a period and an immediately following
       uppercase ASCII letter (``'end.Next'`` -> ``'end. Next'``).
    2. Drop the space just inside opening ``( [ {`` and closing ``) ] }``.
    3. Join a digit to a following percent sign (``'5 %'`` -> ``'5%'``).
    4. Drop the space before ``. , ? ! : ;``.
    5. Replace each pair of consecutive spaces with one space (single pass).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    for code in range(ord('A'), ord('Z') + 1):
        letter = chr(code)
        text = text.replace(f'.{letter}', f'. {letter}')

    for opener in ('(', '[', '{'):
        text = text.replace(opener + ' ', opener)
    for closer in (')', ']', '}'):
        text = text.replace(' ' + closer, closer)

    for digit in range(10):
        text = text.replace(f'{digit} %', f'{digit}%')

    for mark in ('.', ',', '?', '!', ':', ';'):
        text = text.replace(' ' + mark, mark)

    # The final step previously replaced a single space with a single space,
    # which did nothing; collapsing a doubled space is the evident intent.
    text = text.replace('  ', ' ')
    return text