Spaces:
Sleeping
Sleeping
| from collections import defaultdict, Counter | |
| import re | |
| import gradio as gr | |
| def tokenize(text): | |
| text = text.lower() | |
| return re.findall(r'\b\w+\b', text) | |
def build_ngram(tokens, n):
    """Count n-gram continuations over *tokens*.

    Returns a defaultdict mapping each (n-1)-token prefix tuple to a
    Counter of the words observed immediately after it.
    """
    counts = defaultdict(Counter)
    history = n - 1
    # Slide the window by its end index; the prefix is the `history`
    # tokens preceding each position.
    for end in range(history, len(tokens)):
        context = tuple(tokens[end - history:end])
        counts[context][tokens[end]] += 1
    return counts
def calc_prob(model):
    """Normalize each prefix's follower counts into probabilities.

    Takes a mapping of prefix -> Counter of next-word counts and
    returns a plain dict of prefix -> {word: relative frequency}.
    """
    distributions = {}
    for context, followers in model.items():
        denominator = sum(followers.values())
        dist = {}
        for word, count in followers.items():
            dist[word] = count / denominator
        distributions[context] = dist
    return distributions
def fine_tune(base_model, new_tokens, n, alpha=1.0):
    """Merge counts from *new_tokens* into *base_model*, scaled by *alpha*.

    Mutates *base_model* in place (it must support defaultdict-style
    missing-prefix access) and also returns it for convenience.
    """
    delta = build_ngram(new_tokens, n)
    for context, followers in delta.items():
        for word, count in followers.items():
            base_model[context][word] += alpha * count
    return base_model
def predict(prefix_words, prob_model):
    """Return the most probable next word after *prefix_words*.

    Falls back to the literal string "No prediction" when the prefix
    was never observed.
    """
    distribution = prob_model.get(tuple(prefix_words))
    if distribution is None:
        return "No prediction"
    return max(distribution, key=distribution.get)
# Seed the model with a tiny built-in corpus at import time; run_model
# reads and rebinds `model`/`prob` below via `global`.
corpus = "this is a simple language model this is a test"
tokens = tokenize(corpus)
n = 2  # bigram model: prefixes are single words
model = build_ngram(tokens, n)
prob = calc_prob(model)
def run_model(prefix, finetune_text):
    """Gradio handler: optionally fine-tune on new text, then predict.

    If *finetune_text* is non-blank, its tokens are merged into the
    module-level model before prediction. The *prefix* must tokenize
    to exactly n-1 words.
    """
    global model, prob
    if finetune_text.strip():
        model = fine_tune(model, tokenize(finetune_text), n)
        prob = calc_prob(model)
    prefix_words = tokenize(prefix)
    if len(prefix_words) != n - 1:
        return "Prefix length error"
    return predict(prefix_words, prob)
# Two-textbox Gradio UI: a prefix to complete, plus optional
# fine-tuning text that is folded into the model on each call.
demo = gr.Interface(
    fn=run_model,
    inputs=[
        gr.Textbox(label="Prefix"),
        gr.Textbox(label="Fine-tuning Text"),
    ],
    outputs="text",
    title="N-gram Language Model",
)

if __name__ == "__main__":
    demo.launch()