import gradio as gr
import numpy as np
import json
import pickle as pkl
from transformers import AutoTokenizer
import re

# Vector loader: a pickled dict mapping words to their embedding vectors
with open("vectors.pkl", "rb") as f:
    vectors = pkl.load(f)
vocab = set(word.lower() for word in vectors.keys())
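# Note (an assumption inferred from the merge step below, not stated here):
# each value in `vectors` is expected to be a 300-dimensional NumPy array,
# since merging accumulates into np.zeros(300).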
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def make_alphanumeric(input_string):
    # Strip everything except letters, digits, and spaces
    return re.sub(r'[^a-zA-Z0-9 ]', '', input_string)
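# For example (illustrative):
#     make_alphanumeric("Hello, world!")  ->  "Hello world"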
def tokenize(text):
    # Validate input; gr.Error must be raised (not just constructed) to surface in the UI
    if len(text) == 0:
        raise gr.Error("No text provided.")
    elif len(text) > 4096:
        raise gr.Error("Text too long.")
    # Filter to lowercase alphanumeric text
    text = make_alphanumeric(text.lower())
    # Split into words with BERT's pre-tokenizer (offsets are discarded)
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    # Keep only words that exist in the vocabulary
    tokens = [word for word in pre_tokenized_text if word in vocab]
    return tokens
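# Illustrative example, assuming "the", "quick", "brown", and "fox" are all in vocab:
#     tokenize("The quick, brown fox!")  ->  ["the", "quick", "brown", "fox"]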
# Interface
def onInput(paragraph, progress=gr.Progress()):
    tokens = tokenize(paragraph)
    if not tokens:
        # No known tokens found: return a zero vector and an empty token list,
        # matching the two outputs declared on the interface
        return np.zeros(300).tolist(), json.dumps([])
    merged_vector = np.zeros(300)  # Assuming vectors are 300-dimensional
    # Merge vectors by summing with NumPy
    total_tokens = len(tokens)
    for ind, token in enumerate(tokens):
        completion = 0.2 * ((ind + 1) / total_tokens)
        # Use the loop index rather than tokens.index(token), which is wrong
        # for duplicate tokens and costs an extra O(n) scan per iteration
        progress(0.6 + completion, f"Merging {token}, Token #{ind + 1}/{total_tokens}")
        if token not in vectors:
            continue
        merged_vector += vectors[token]
    # Normalize by token count
    merged_vector /= total_tokens
    return merged_vector.tolist(), json.dumps(tokens)
demo = gr.Interface(fn=onInput, inputs="text", outputs=["text", "json"])
demo.launch()
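# A minimal sketch of how a client might query this app once it is running,
# using the gradio_client package. The URL is a placeholder for wherever the
# app is served (demo.launch() blocks, so run this from a separate process):
#
#     from gradio_client import Client
#
#     client = Client("http://127.0.0.1:7860")  # default local launch address
#     vector_text, tokens_json = client.predict("the quick brown fox")
#     print(tokens_json)  # JSON-encoded list of vocabulary tokens that were found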