# app.py — tiny n-gram language model demo served with Gradio
from collections import defaultdict, Counter
import re
import gradio as gr
def tokenize(text):
    """Lowercase *text* and split it into word tokens.

    Tokens are maximal ``\\w+`` runs (letters, digits, underscore),
    so punctuation is discarded.
    """
    return re.findall(r'\b\w+\b', text.lower())
def build_ngram(tokens, n):
    """Count n-gram continuations in *tokens*.

    Returns a ``defaultdict(Counter)`` mapping each (n-1)-token prefix
    tuple to a Counter of the words observed immediately after it.
    """
    counts = defaultdict(Counter)
    for start in range(len(tokens) - n + 1):
        *context, nxt = tokens[start:start + n]
        counts[tuple(context)][nxt] += 1
    return counts
def calc_prob(model):
    """Normalize raw counts into per-prefix probability distributions.

    Each prefix maps to ``{word: count / total}`` where *total* is the
    number of continuations observed for that prefix.
    """
    distributions = {}
    for prefix, counts in model.items():
        denom = sum(counts.values())
        distributions[prefix] = {word: freq / denom for word, freq in counts.items()}
    return distributions
def fine_tune(base_model, new_tokens, n, alpha=1.0):
    """Blend n-gram counts from *new_tokens* into *base_model* in place.

    Every count extracted from the new tokens is scaled by *alpha*
    before being added, and the mutated base model is returned.
    """
    delta = build_ngram(new_tokens, n)
    for prefix, counter in delta.items():
        for word, count in counter.items():
            base_model[prefix][word] += alpha * count
    return base_model
def predict(prefix_words, prob_model):
    """Return the highest-probability next word for *prefix_words*.

    Falls back to the literal string "No prediction" when the prefix
    was never observed during training.
    """
    distribution = prob_model.get(tuple(prefix_words))
    if distribution is None:
        return "No prediction"
    return max(distribution, key=distribution.get)
# Train the base bigram model on a tiny seed corpus; `model`, `prob`
# and `n` are module-level globals that run_model reads and rebinds.
corpus = "this is a simple language model this is a test"
tokens = tokenize(corpus)
n = 2  # bigram: predictions condition on exactly one preceding word
model = build_ngram(tokens, n)
prob = calc_prob(model)
def run_model(prefix, finetune_text):
    """Gradio handler: optionally fine-tune, then predict the next word.

    Parameters
    ----------
    prefix : str | None
        Space-separated context; must tokenize to exactly n-1 words.
    finetune_text : str | None
        Optional extra training text blended into the global model.

    Returns
    -------
    str
        The predicted next word, "Prefix length error" when the prefix
        has the wrong length, or "No prediction" for unseen prefixes.
    """
    global model, prob
    # Guard against None: Gradio can deliver None for a cleared textbox,
    # and the original .strip() call would raise AttributeError.
    if finetune_text and finetune_text.strip():
        new_tokens = tokenize(finetune_text)
        model = fine_tune(model, new_tokens, n)
        prob = calc_prob(model)  # re-normalize after updating counts
    prefix_words = tokenize(prefix or "")
    if len(prefix_words) != n - 1:
        return "Prefix length error"
    return predict(prefix_words, prob)
# Wire the handler into a simple two-textbox Gradio UI: the first box
# supplies the prediction prefix, the second optional fine-tuning text.
demo = gr.Interface(
    fn=run_model,
    inputs=[
        gr.Textbox(label="Prefix"),
        gr.Textbox(label="Fine-tuning Text")
    ],
    outputs="text",
    title="N-gram Language Model"
)
# Launch the web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()