Spaces:
Sleeping
Sleeping
import gradio as gr
import json

# BPE merge table: maps "left_right" pair keys (byte/token ids joined by "_")
# to the merged token id. Loaded once at startup.
# encoding='utf-8' is explicit so decoding does not depend on the platform
# locale (the vocab contains emoji and other non-ASCII text).
with open('merge.json', 'r', encoding='utf-8') as json_file:
    merge = json.load(json_file)

# Vocabulary: maps token id (stored as a string key) to its text piece.
with open('vocab.json', 'r', encoding='utf-8') as json_file:
    vocab = json.load(json_file)
def get_counts(text):
    """Count adjacent token pairs in *text*.

    Returns a dict mapping "left_right" string keys to the number of
    times that pair appears consecutively in the sequence.
    """
    pair_counts = {}
    for left, right in zip(text, text[1:]):
        key = f"{left}_{right}"
        pair_counts[key] = pair_counts.get(key, 0) + 1
    return pair_counts
def merge_token(token_pattern, text, symbol):
    """Replace every occurrence of a token pair with a single symbol.

    *token_pattern* is a "left_right" key (as produced by get_counts);
    each non-overlapping occurrence of that pair in *text* is replaced
    by *symbol*. Scans left to right, so overlapping matches consume
    the leftmost pair first.
    """
    first, second = (int(part) for part in token_pattern.split("_"))
    merged = []
    i = 0
    n = len(text)
    while i < n:
        if i + 1 < n and text[i] == first and text[i + 1] == second:
            merged.append(symbol)
            i += 2  # consumed both halves of the pair
        else:
            merged.append(text[i])
            i += 1
    return merged
def encode_sequence(sequence):
    """Encode a string into BPE token ids.

    Starts from the raw UTF-8 bytes and repeatedly applies the
    lowest-ranked learned merge (per the module-level ``merge`` table)
    until no adjacent pair in the sequence is mergeable.
    """
    tokens = list(sequence.encode('utf-8'))
    while len(tokens) >= 2:
        pair_counts = get_counts(tokens)
        # Pick the pair that was learned earliest; unknown pairs rank +inf.
        best = min(pair_counts, key=lambda p: merge.get(p, float('inf')))
        if best not in merge:
            # Nothing left to merge.
            break
        tokens = merge_token(best, tokens, int(merge[best]))
    return tokens
def decode_sequence(sequence):
    """Decode a list of token ids back into text.

    Looks each id up in the module-level ``vocab`` table, joins the
    UTF-8 byte pieces, and decodes; invalid byte runs become U+FFFD
    via ``errors='replace'``.
    """
    pieces = [vocab[str(token)].encode('utf-8') for token in sequence]
    return b"".join(pieces).decode('utf-8', errors='replace')
def tokenize(input):
    """Gradio handler: tokenize *input* with the learned BPE merges.

    Returns a 3-tuple matching the interface's three outputs:
    (token ids, decoded text piece per token, compression ratio
    len(input) / len(tokens)).

    Bug fix: the empty-input branch previously returned only two
    values ("", 0), which mismatched the three declared outputs and
    broke the Gradio interface on empty text.
    """
    if not input:
        return "", "", 0
    encoded = encode_sequence(input)
    pieces = [decode_sequence([token]) for token in encoded]
    return encoded, pieces, len(input) / len(encoded)
# Sample inputs shown under the Gradio interface.
examples = [
    "ayyyy whats up 👋",
    "Okay now picture little Bobby just a youngin' runnin' round",
    "Peace is when you leave it in the past, let it heal like a cast;When enough time pass, then you blast;Kinda like John Wick, bars like a convict;Fuck around and you don't wanna start shit, woo!",
]

# Three outputs: token ids, per-token decoded pieces, compression ratio.
intf = gr.Interface(
    fn=tokenize,
    inputs="text",
    outputs=["text", "text", gr.components.Number()],
    examples=examples,
    title="Logic Tokenizer",
    description="Logic Tokenizer tokenizes your text based on BPE run on the top 10 songs by logic. The vocab size is 1024, and expanded from an original 256 from utf-8. The float output is the compression ratio of len(input)/len(encoded), and the array of integers are the tokens the model learned.",
)
intf.launch(inline=True)