|
|
# Third-party imports: Streamlit UI, numerics, tabular display, plotting,
# and NLTK tokenization.
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from nltk.tokenize import word_tokenize

# word_tokenize requires the 'punkt' sentence/word tokenizer model;
# download is a cached no-op after the first run. quiet=True keeps the
# download progress out of the app's console output.
nltk.download('punkt', quiet=True)
|
|
|
|
|
st.title("Bayesian Token Co-occurrence Simulator")

# Default corpus: a few toy sentences over a tiny shared vocabulary so the
# resulting co-occurrence heatmap stays small enough to read.
user_input = st.text_area(
    "Enter your training sentences (one per line):",
    "\n".join(
        [
            "fido loves the red ball",
            "timmy and fido go to the park",
            "fido and timmy love to play",
            "the red ball is timmy's favorite toy",
        ]
    ),
)
|
|
|
|
|
# Split the input into lines, tokenize each non-blank line into lowercase
# word tokens, and build the vocabulary plus both index mappings used by
# the co-occurrence matrix below.
sentences = user_input.strip().split('\n')
tokenized = [word_tokenize(s.lower()) for s in sentences if s.strip()]
vocab = sorted(set(word for sentence in tokenized for word in sentence))
token2idx = {word: i for i, word in enumerate(vocab)}
idx2token = {i: word for word, i in token2idx.items()}
|
|
|
|
|
|
|
|
def build_cooccurrence(tokenized, token2idx, window_size=2):
    """Count symmetric-window token co-occurrences.

    Args:
        tokenized: list of token lists, one per sentence.
        token2idx: mapping from token string to matrix row/column index.
        window_size: how many neighbors on each side count as co-occurring.

    Returns:
        A (V, V) float ndarray where entry [a, b] is the number of times
        token b appeared within ``window_size`` positions of token a
        inside the same sentence.
    """
    size = len(token2idx)
    counts = np.zeros((size, size))
    for sentence in tokenized:
        for i, word in enumerate(sentence):
            lo = max(0, i - window_size)
            hi = min(len(sentence), i + window_size + 1)
            for j in range(lo, hi):
                if j != i:  # a position does not co-occur with itself
                    counts[token2idx[word]][token2idx[sentence[j]]] += 1
    return counts


window_size = 2
matrix = build_cooccurrence(tokenized, token2idx, window_size)
|
|
|
|
|
# Bayesian smoothing: add a symmetric pseudo-count (Dirichlet prior alpha)
# to every cell so unseen token pairs keep nonzero probability mass.
alpha = st.slider("Set Bayesian prior (α smoothing)", 0.0, 2.0, 0.1)
posterior = matrix + alpha

# Render the smoothed count matrix as a token-labeled heatmap.
df = pd.DataFrame(posterior, index=vocab, columns=vocab)
st.subheader("Co-occurrence Heatmap")
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df, annot=True, cmap="Blues", fmt=".1f", ax=ax)
st.pyplot(fig)
|
|
|
|
|
|
|
|
def _next_token_probs(row):
    """Normalize a posterior row into a probability vector.

    Falls back to a uniform distribution when the row sums to zero, which
    can happen when alpha == 0 and the selected token never co-occurred
    with anything; without this guard the division produces NaNs and
    np.random.choice raises.
    """
    total = row.sum()
    if total == 0:
        return np.full(len(row), 1.0 / len(row))
    return row / total


# Sample the next token from the selected token's (smoothed) co-occurrence
# distribution.
selected_word = st.selectbox("Predict next token after:", vocab)
row = posterior[token2idx[selected_word]]
probs = _next_token_probs(row)
prediction = np.random.choice(vocab, p=probs)

st.markdown(f"**Predicted next token:** `{prediction}`")
|
|
|
|
|
|
|
|
|