Spaces: Sleeping
| import gradio as gr | |
| import io | |
| import numpy as np | |
| import ctypes | |
| # Vector Loader | |
def load_vectors(fname):
    """Load word vectors from a space-separated text file.

    Every data line has the form ``<token> <float> <float> ...``. If the
    first line consists of exactly two integer fields (the
    ``<vocabulary size> <dimension>`` header that fastText ``.vec`` files
    start with), it is skipped instead of being stored as a bogus token
    with a one-element vector. Blank lines are skipped as well.

    Args:
        fname: Path of the vector file to read.

    Returns:
        A dict mapping each token (``str``) to a 1-D NumPy float array
        holding that line's components.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    data = {}
    # The context manager closes the handle even when parsing raises;
    # the previous ``del fin`` only dropped the name without closing it.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line_no, line in enumerate(fin):
            fields = line.rstrip().split(' ')
            if fields == ['']:
                # Blank line: there is no token to store.
                continue
            if line_no == 0 and len(fields) == 2 and all(f.isdigit() for f in fields):
                # Header line ("<count> <dim>"), not a word vector.
                continue
            data[fields[0]] = np.array([float(value) for value in fields[1:]])
    return data
# Word-vector table, loaded once at import time.
vectors = load_vectors('wiki-news-300d-1M.vec')
# Vocabulary as UTF-8 byte strings, the form handed to the native tokenizer.
tokens = [word.encode('utf-8') for word in vectors]

# Tokenizer
# Shared library exposing ``tokenize``; the prototype below declares the
# argument and return types ctypes should use when calling it.
lib = ctypes.CDLL('./tokenizer.so')
lib.tokenize.restype = ctypes.POINTER(ctypes.c_char_p)
lib.tokenize.argtypes = [
    ctypes.c_char_p,                  # text to tokenize
    ctypes.POINTER(ctypes.c_char_p),  # vocabulary array
    ctypes.c_int,                     # number of vocabulary entries
    ctypes.POINTER(ctypes.c_int),     # out: number of tokens returned
]
def tokenize(text):
    """Split *text* into tokens using the native ``lib.tokenize`` routine.

    The text is UTF-8 encoded and passed to the shared library together
    with the module-level vocabulary ``tokens``. The returned C strings are
    decoded back to ``str`` and the native buffer is handed to
    ``lib.free_tokens``.

    Args:
        text: The string to tokenize.

    Returns:
        A list of decoded token strings; empty when the library returns a
        NULL pointer.

    Raises:
        UnicodeDecodeError: If a returned token is not valid UTF-8.
    """
    encoded = text.encode('utf-8')
    vocab_size = len(tokens)
    # NOTE(review): this array is rebuilt from the full vocabulary on every
    # call; consider caching it if the library does not modify it - confirm
    # against the C implementation.
    vocab_array = (ctypes.c_char_p * vocab_size)(*tokens)
    result_size = ctypes.c_int()
    result = lib.tokenize(encoded, vocab_array, vocab_size, ctypes.byref(result_size))
    if not result:
        # NULL pointer: nothing to read and nothing to free.
        return []
    try:
        return [result[i].decode('utf-8') for i in range(result_size.value)]
    finally:
        # Always release the native buffer, even if decoding failed above.
        lib.free_tokens(result, result_size.value)
| # Interface | |
def onInput(paragraph):
    """Turn a paragraph into a single 300-component vector.

    The paragraph is tokenized with ``tokenize``; the vectors of all tokens
    present in the module-level ``vectors`` table are summed and the sum is
    divided by the total number of tokens returned by the tokenizer.

    Args:
        paragraph: Input text. ``None`` or an empty string yields a zero
            vector without invoking the tokenizer.

    Returns:
        A list of 300 floats; all zeros when there is no input or no
        tokens were produced.
    """
    dim = 300  # Assumes the loaded vectors are 300-dimensional.
    if not paragraph:
        # Nothing to tokenize (also guards against None reaching .encode()).
        return np.zeros(dim).tolist()

    tokens = tokenize(paragraph)
    if not tokens:  # Handle case with no tokens found
        return np.zeros(dim).tolist()

    merged_vector = np.zeros(dim)
    for token in tokens:
        vector = vectors.get(token)
        if vector is not None:
            merged_vector += vector

    # NOTE(review): the divisor counts every token, including ones that had
    # no vector; kept as-is to preserve existing output - confirm whether a
    # mean over matched tokens was intended.
    merged_vector /= len(tokens)
    return merged_vector.tolist()  # Convert back to list for output
# Text-in / text-out web interface around onInput; launched on import.
demo = gr.Interface(
    fn=onInput,
    inputs="text",
    outputs="text",
)
demo.launch()