# Streamlit app: Bayesian token co-occurrence simulator.
# NOTE: the original capture began with Hugging Face Spaces page chrome
# ("Spaces: Sleeping") — scrape residue, not part of the program.
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer models needed by word_tokenize. quiet=True keeps
# the download log from being re-printed on every Streamlit rerun (this whole
# script re-executes on each widget interaction).
nltk.download('punkt', quiet=True)
st.title("📊 Bayesian Token Co-occurrence Simulator")

# --- User input -------------------------------------------------------------
user_input = st.text_area(
    "✍️ Enter your training sentences (one per line):",
    """
fido loves the red ball
timmy and fido go to the park
fido and timmy love to play
the red ball is timmy's favorite toy
""",
)

# One sentence per non-blank line; lowercase before tokenizing so "Fido" and
# "fido" collapse into a single vocabulary entry.
sentences = user_input.strip().split('\n')
tokenized = [word_tokenize(s.lower()) for s in sentences if s.strip()]

# Vocabulary and index maps (sorted for a stable, reproducible ordering).
vocab = sorted(set(word for sentence in tokenized for word in sentence))
if not vocab:
    # The user cleared the text area: stop here instead of crashing later on
    # an empty co-occurrence matrix / empty selectbox.
    st.warning("Please enter at least one sentence.")
    st.stop()
token2idx = {word: i for i, word in enumerate(vocab)}
idx2token = {i: word for word, i in token2idx.items()}
# --- Co-occurrence counts ---------------------------------------------------
# matrix[a, b] = number of times token b appears within `window_size` tokens
# of token a (both directions), summed over all sentences.
window_size = 2
matrix = np.zeros((len(vocab), len(vocab)))
for sentence in tokenized:
    for i, word in enumerate(sentence):
        # Hoist the row index: it is invariant over the inner window loop.
        row_idx = token2idx[word]
        lo = max(0, i - window_size)
        hi = min(len(sentence), i + window_size + 1)
        for j in range(lo, hi):
            if i != j:
                # Single-tuple indexing (matrix[a, b]) is the idiomatic numpy
                # form; chained matrix[a][b] builds a temporary row view on
                # every increment.
                matrix[row_idx, token2idx[sentence[j]]] += 1
# --- Bayesian smoothing -----------------------------------------------------
# Add a uniform prior pseudo-count alpha to every cell (Laplace/Dirichlet
# smoothing) so unseen pairs retain non-zero probability mass when alpha > 0.
alpha = st.slider("🔧 Set Bayesian Prior (α smoothing)", 0.0, 2.0, 0.1)
posterior = matrix + alpha
df = pd.DataFrame(posterior, index=vocab, columns=vocab)

st.subheader("📈 Co-occurrence Heatmap")
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df, annot=True, cmap="Blues", fmt=".1f", ax=ax)
st.pyplot(fig)

# --- Next-token prediction --------------------------------------------------
selected_word = st.selectbox("🔮 Predict next token after:", vocab)
row = posterior[token2idx[selected_word]]
total = row.sum()
if total > 0:
    probs = row / total
else:
    # BUG FIX: with alpha == 0.0 (the slider minimum) a token that never
    # co-occurred with anything (e.g. a one-word sentence) has row.sum() == 0,
    # so row / row.sum() would produce NaNs and np.random.choice would raise
    # "probabilities contain NaN". Fall back to a uniform distribution.
    probs = np.full(len(vocab), 1.0 / len(vocab))
prediction = np.random.choice(vocab, p=probs)
st.markdown(f"**Predicted next token:** `{prediction}`")