Spaces:

shrestha-prabin
/

bayesian-classifier-app

Sleeping

App Files Community

shrestha-prabin committed on May 24, 2025

Commit

4e977d5

verified ·

1 Parent(s): 06ca9ef

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +55 -37

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,58 @@
-import altair as alt
 import numpy as np
 import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import matplotlib.pyplot as plt
+import nltk
 import numpy as np
 import pandas as pd
+import seaborn as sns
 import streamlit as st
+from nltk.tokenize import word_tokenize
+nltk.download("punkt")
+nltk.download("punkt_tab")
+st.title("📊 Bayesian Token Co-occurrence Simulator")
+# User input
+user_input = st.text_area(
+    "✍️ Enter your training sentences (one per line):",
+    """
+fido loves the red ball
+timmy and fido go to the park
+fido and timmy love to play
+the red ball is timmy's favorite toy
+""",
+)
+sentences = user_input.strip().split("\n")
+tokenized = [word_tokenize(s.lower()) for s in sentences if s.strip()]
+vocab = sorted(set(word for sentence in tokenized for word in sentence))
+token2idx = {word: i for i, word in enumerate(vocab)}
+idx2token = {i: word for word, i in token2idx.items()}
+# Co-occurrence matrix
+window_size = 2
+matrix = np.zeros((len(vocab), len(vocab)))
+for sentence in tokenized:
+    for i, word in enumerate(sentence):
+        for j in range(
+            max(0, i - window_size), min(len(sentence), i + window_size + 1)
+        ):
+            if i != j:
+                matrix[token2idx[word]][token2idx[sentence[j]]] += 1
+alpha = st.slider("🔧 Set Bayesian Prior (α smoothing)", 0.0, 2.0, 0.1)
+posterior = matrix + alpha
+df = pd.DataFrame(posterior, index=vocab, columns=vocab)
+st.subheader("📈 Co-occurrence Heatmap")
+fig, ax = plt.subplots(figsize=(10, 8))
+sns.heatmap(df, annot=True, cmap="Blues", fmt=".1f", ax=ax)
+st.pyplot(fig)
+# Next-token prediction
+selected_word = st.selectbox("🔮 Predict next token after:", vocab)
+row = posterior[token2idx[selected_word]]
+probs = row / row.sum()
+prediction = np.random.choice(vocab, p=probs)
+st.markdown(f"**Predicted next token:** `{prediction}`")