"""Streamlit app: Bayesian token co-occurrence simulator.

Builds a windowed co-occurrence matrix from user-supplied sentences,
applies additive (Dirichlet / alpha) smoothing as a Bayesian prior,
renders the posterior counts as a heatmap, and samples a "next token"
for a selected word from its (normalized) posterior row.
"""
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
import nltk

# Quiet download: no-op if the tokenizer data is already present, and
# avoids spamming the console on every Streamlit rerun.
nltk.download('punkt', quiet=True)

st.title("📊 Bayesian Token Co-occurrence Simulator")

# User input: one training sentence per line.
user_input = st.text_area(
    "✍️ Enter your training sentences (one per line):",
    """
fido loves the red ball
timmy and fido go to the park
fido and timmy love to play
the red ball is timmy's favorite toy
""",
)

sentences = user_input.strip().split('\n')
tokenized = [word_tokenize(s.lower()) for s in sentences if s.strip()]
vocab = sorted(set(word for sentence in tokenized for word in sentence))

# Guard: with no usable input there is nothing to plot or predict;
# without this, st.selectbox returns None and token2idx[None] raises.
if not vocab:
    st.warning("Please enter at least one sentence to build a vocabulary.")
    st.stop()

token2idx = {word: i for i, word in enumerate(vocab)}
idx2token = {i: word for word, i in token2idx.items()}

# Symmetric co-occurrence counts within a fixed window around each token.
window_size = 2
matrix = np.zeros((len(vocab), len(vocab)))
for sentence in tokenized:
    for i, word in enumerate(sentence):
        lo = max(0, i - window_size)
        hi = min(len(sentence), i + window_size + 1)
        for j in range(lo, hi):
            if i != j:
                # NumPy tuple indexing (a, b) instead of chained [a][b].
                matrix[token2idx[word], token2idx[sentence[j]]] += 1

# alpha acts as a symmetric Dirichlet prior (add-alpha smoothing).
alpha = st.slider("🔧 Set Bayesian Prior (α smoothing)", 0.0, 2.0, 0.1)
posterior = matrix + alpha

df = pd.DataFrame(posterior, index=vocab, columns=vocab)

st.subheader("📈 Co-occurrence Heatmap")
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df, annot=True, cmap="Blues", fmt=".1f", ax=ax)
st.pyplot(fig)

# Next-token prediction: sample from the selected word's posterior row.
selected_word = st.selectbox("🔮 Predict next token after:", vocab)
row = posterior[token2idx[selected_word]]
total = row.sum()
if total > 0:
    probs = row / total
else:
    # alpha == 0 and the word never co-occurred (e.g. a one-word
    # sentence): a zero row would yield NaN probabilities and crash
    # np.random.choice, so fall back to a uniform distribution.
    probs = np.full(len(vocab), 1.0 / len(vocab))
prediction = np.random.choice(vocab, p=probs)

st.markdown(f"**Predicted next token:** `{prediction}`")