File size: 1,800 Bytes
0ed38d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from collections import defaultdict, Counter
import re
import gradio as gr

def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    The text is lower-cased first, then every maximal run of word
    characters (``\\w``) is returned as one token, in order of
    appearance. Punctuation and whitespace act purely as separators.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings; empty when ``text`` has no word characters.
    """
    word_pattern = r'\b\w+\b'
    lowered = text.lower()
    return re.findall(word_pattern, lowered)

def build_ngram(tokens, n):
    """Count which token follows each (n-1)-token prefix.

    A window of ``n`` tokens is slid over ``tokens``; the first ``n - 1``
    tokens of each window form the prefix and the last token is recorded
    as a continuation of that prefix.

    Args:
        tokens: Sequence of tokens (must support ``len`` and slicing).
        n: Order of the model; must be at least 1. With ``n == 1`` every
            token is counted under the empty-tuple prefix.

    Returns:
        A ``defaultdict(Counter)`` mapping each prefix tuple to a Counter
        of the tokens seen directly after it. Empty when ``tokens`` holds
        fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # For n == 0 the slice/index arithmetic below wraps around to
        # tokens[-1] and would silently count meaningless pairs.
        raise ValueError("n must be at least 1")

    model = defaultdict(Counter)
    context_len = n - 1
    for start in range(len(tokens) - context_len):
        prefix = tuple(tokens[start:start + context_len])
        target = tokens[start + context_len]
        model[prefix][target] += 1
    return model

def calc_prob(model):
    """Turn raw continuation counts into relative frequencies.

    Args:
        model: Mapping of prefix -> mapping of word -> count (for example
            the ``defaultdict(Counter)`` returned by ``build_ngram``).

    Returns:
        A plain dict mapping each prefix to ``{word: count / total}``,
        where ``total`` is the sum of that prefix's counts. Prefixes whose
        counts do not add up to a positive total are left out, because no
        probability distribution can be derived from them.
    """
    prob = {}
    for prefix, counter in model.items():
        total = sum(counter.values())
        if total <= 0:
            # Zero-weight entries would divide by zero, and an empty
            # distribution would make a later max() over it fail.
            continue
        prob[prefix] = {w: c / total for w, c in counter.items()}
    return prob

def fine_tune(base_model, new_tokens, n, alpha=1.0):
    """Fold n-gram counts from ``new_tokens`` into ``base_model``.

    An n-gram count table is built from ``new_tokens`` and every count in
    it, multiplied by ``alpha``, is added to the matching entry of
    ``base_model``. ``base_model`` is modified in place.

    Args:
        base_model: Existing prefix -> word -> count table to update.
        new_tokens: Tokens of the additional text.
        n: Order of the n-gram model used for the new counts.
        alpha: Weight applied to each new count before it is added.

    Returns:
        The same ``base_model`` object that was passed in.
    """
    delta = build_ngram(new_tokens, n)
    for prefix, continuations in delta.items():
        bucket = base_model[prefix]
        for word, count in continuations.items():
            bucket[word] += alpha * count
    return base_model

def predict(prefix_words, prob_model):
    """Return the most probable next word for a prefix.

    Args:
        prefix_words: Iterable of prefix tokens; converted to a tuple to
            form the lookup key.
        prob_model: Mapping of prefix tuple -> {word: probability}.

    Returns:
        The word with the highest probability for the prefix, or the
        string "No prediction" when the prefix is not in ``prob_model``.
    """
    key = tuple(prefix_words)
    if key not in prob_model:
        return "No prediction"
    distribution = prob_model[key]
    return max(distribution, key=distribution.get)

# Seed text from which the initial model is built when the module is loaded.
corpus = "this is a simple language model this is a test"
tokens = tokenize(corpus)
# Model order: with n = 2 each prefix consists of exactly one word (bigram model).
n = 2
# Raw continuation counts; run_model rebinds this global when fine-tuning text is given.
model = build_ngram(tokens, n)
# Relative-frequency table derived from `model`; this is what prediction reads.
prob = calc_prob(model)

def run_model(prefix, finetune_text):
    """Optionally fine-tune the global model, then predict the next word.

    When ``finetune_text`` contains anything besides whitespace, it is
    tokenized and merged into the module-level ``model``, and ``prob`` is
    recomputed. This happens before the prefix is validated, so the
    update is kept even if the prefix turns out to be invalid.

    Args:
        prefix: Text whose tokens form the prediction prefix; it must
            tokenize to exactly ``n - 1`` words.
        finetune_text: Extra training text; ignored when blank.

    Returns:
        The predicted word, "No prediction" for an unseen prefix, or
        "Prefix length error" when the prefix has the wrong token count.
    """
    global model, prob

    has_training_text = bool(finetune_text.strip())
    if has_training_text:
        model = fine_tune(model, tokenize(finetune_text), n)
        prob = calc_prob(model)

    context = tokenize(prefix)
    expected_len = n - 1
    if len(context) == expected_len:
        return predict(context, prob)
    return "Prefix length error"

# Gradio front end: run_model is the callback, fed by two text boxes, and its
# string result is rendered as text.
demo = gr.Interface(
    fn=run_model,
    inputs=[
        # NOTE(review): inputs presumably map to run_model's parameters in
        # order (prefix, finetune_text) — confirm against the Gradio docs.
        gr.Textbox(label="Prefix"),
        gr.Textbox(label="Fine-tuning Text")
    ],
    outputs="text",
    title="N-gram Language Model"
)

# Start the web UI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()