| import gradio as gr |
| import io |
| import numpy as np |
| from tok import Tokenizer |
|
|
| |
def load_vectors(fname):
    """Load fastText-style word vectors from a ``.vec`` text file.

    Each data line is ``"<word> <v1> <v2> ..."``. fastText ``.vec`` files
    begin with a ``"<word_count> <dimension>"`` header line; when present it
    is skipped so it is not stored as a bogus one-dimensional "word" (the
    original code omitted the header read from the reference snippet).

    Returns:
        tuple: (dict mapping word -> np.ndarray vector,
                list of words sorted longest-first).
    """
    data = {}
    # `with` guarantees the handle is closed; the original only `del`-ed it.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        first = fin.readline()
        fields = first.rstrip().split(' ')
        # Two purely-numeric fields => the fastText count/dimension header.
        if first and not (len(fields) == 2 and all(f.isdigit() for f in fields)):
            data[fields[0]] = np.array(list(map(float, fields[1:])))
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data, sorted(data.keys(), key=len, reverse=True)
| vectors, sorted_vector = load_vectors('wiki-news-300d-1M.vec') |
|
|
| |
| tokenizer = Tokenizer(protected_words=sorted_vector) |
def tokenize(text):
    """Split *text* into word tokens via the shared module-level Tokenizer."""
    words = tokenizer.word_tokenize(text)
    return words
|
|
| |
def onInput(paragraph, progress = gr.Progress()):
    """Return the mean embedding of *paragraph* as a 300-element list.

    Tokenizes the input, sums the known word vectors, and divides by the
    total token count. Progress is reported via Gradio's tracker (the
    `gr.Progress()` default is the documented Gradio injection pattern).

    Returns a 300-element list of floats; an all-zero vector when the
    paragraph yields no tokens.
    """
    progress(0, "Tokenizing...")
    tokens = tokenize(paragraph)

    progress(0.1, "Initializing merged vector...")
    if not tokens:
        return np.zeros(300).tolist()

    merged_vector = np.zeros(300)

    totalTokens = len(tokens)
    for ind, token in enumerate(tokens):
        completion = 0.7*((ind+1)/totalTokens)
        # Use the enumerate index directly: the original called
        # tokens.index(token) here, which is O(n) per iteration (O(n^2)
        # overall) and reports the wrong position for repeated tokens.
        progress(0.1 + completion, f"Merging {token}, Token #{ind+1}/{totalTokens}")

        # Skip out-of-vocabulary tokens instead of raising KeyError; the
        # tokenizer protects known words but can still emit unknown ones
        # (punctuation, novel words).
        vector = vectors.get(token)
        if vector is not None:
            merged_vector += vector

    progress(0.9, "Normalizing...")
    # Divide by the total token count — identical to the original result
    # whenever every token is in-vocabulary.
    merged_vector /= totalTokens

    progress(1, "Converting to list...")
    return merged_vector.tolist()
|
|
# Single-textbox UI: the paragraph goes in, the stringified mean-embedding
# list comes out. Kept as a top-level `demo` variable — NOTE(review):
# gradio's reload mode expects this exact name; do not rename.
demo = gr.Interface(fn=onInput, inputs="text", outputs="text")
demo.launch()