"""Tiny n-gram language model with a Gradio front-end.

Builds a bigram model from a hard-coded seed corpus, optionally
"fine-tunes" it by merging counts from user-supplied text, and predicts
the most likely next word for a given prefix.
"""

from collections import defaultdict, Counter
import re

import gradio as gr


def tokenize(text):
    """Lowercase *text* and split it into word tokens.

    Returns a list of alphanumeric word strings (``\\b\\w+\\b`` matches),
    so punctuation is discarded.
    """
    text = text.lower()
    return re.findall(r'\b\w+\b', text)


def build_ngram(tokens, n):
    """Count n-gram continuations over *tokens*.

    Returns a ``defaultdict(Counter)`` mapping each (n-1)-token prefix
    tuple to a Counter of the words that followed it.
    """
    model = defaultdict(Counter)
    # Slide a window of length n; the first n-1 tokens are the prefix,
    # the last one is the continuation being counted.
    for i in range(len(tokens) - n + 1):
        prefix = tuple(tokens[i:i + n - 1])
        target = tokens[i + n - 1]
        model[prefix][target] += 1
    return model


def calc_prob(model):
    """Normalize raw counts into per-prefix probability distributions.

    Returns ``{prefix: {word: probability}}`` where each prefix's
    probabilities sum to 1.
    """
    prob = {}
    for prefix, counter in model.items():
        total = sum(counter.values())
        prob[prefix] = {w: c / total for w, c in counter.items()}
    return prob


def fine_tune(base_model, new_tokens, n, alpha=1.0):
    """Merge counts from *new_tokens* into *base_model* in place.

    *alpha* weights the new counts relative to the base counts.
    NOTE: with the float default ``alpha=1.0`` the merged counts become
    floats; ``calc_prob`` handles that transparently.  Returns the
    mutated *base_model* for convenience.
    """
    new_model = build_ngram(new_tokens, n)
    for prefix in new_model:
        for word in new_model[prefix]:
            base_model[prefix][word] += alpha * new_model[prefix][word]
    return base_model


def predict(prefix_words, prob_model):
    """Return the most probable next word for *prefix_words*.

    *prefix_words* is a sequence of tokens; if the prefix was never seen
    the sentinel string ``"No prediction"`` is returned.
    """
    prefix = tuple(prefix_words)
    if prefix in prob_model:
        # Ties are broken by dict insertion order (first-counted word wins).
        return max(prob_model[prefix], key=prob_model[prefix].get)
    return "No prediction"


# --- Module-level model state: seeded once, mutated by fine-tuning. ---
corpus = "this is a simple language model this is a test"
tokens = tokenize(corpus)
n = 2
model = build_ngram(tokens, n)
prob = calc_prob(model)


def run_model(prefix, finetune_text):
    """Gradio callback: optionally fine-tune, then predict from *prefix*.

    Mutates the module-level ``model``/``prob`` so fine-tuning persists
    across calls for the lifetime of the process.
    """
    global model, prob
    if finetune_text.strip():
        new_tokens = tokenize(finetune_text)
        model = fine_tune(model, new_tokens, n)
        prob = calc_prob(model)
    prefix_words = tokenize(prefix)
    # A bigram model needs exactly n-1 (= 1) prefix token(s).
    if len(prefix_words) != n - 1:
        return "Prefix length error"
    return predict(prefix_words, prob)


demo = gr.Interface(
    fn=run_model,
    inputs=[
        gr.Textbox(label="Prefix"),
        gr.Textbox(label="Fine-tuning Text"),
    ],
    outputs="text",
    title="N-gram Language Model",
)

if __name__ == "__main__":
    demo.launch()