# Streamlit app: Bayesian token co-occurrence simulator.
# NOTE: the original capture began with Hugging Face Spaces page chrome
# ("Spaces: Sleeping") — scrape residue, not part of the program.
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer models needed by word_tokenize. quiet=True keeps
# the download log from being re-printed on every Streamlit rerun (this whole
# script re-executes on each widget interaction).
nltk.download('punkt', quiet=True)
st.title("📊 Bayesian Token Co-occurrence Simulator")

# --- User input -------------------------------------------------------------
user_input = st.text_area(
    "✍️ Enter your training sentences (one per line):",
    """
fido loves the red ball
timmy and fido go to the park
fido and timmy love to play
the red ball is timmy's favorite toy
""",
)

# One sentence per non-blank line; lowercase before tokenizing so "Fido" and
# "fido" collapse into a single vocabulary entry.
sentences = user_input.strip().split('\n')
tokenized = [word_tokenize(s.lower()) for s in sentences if s.strip()]

# Vocabulary and index maps (sorted for a stable, reproducible ordering).
vocab = sorted(set(word for sentence in tokenized for word in sentence))
if not vocab:
    # The user cleared the text area: stop here instead of crashing later on
    # an empty co-occurrence matrix / empty selectbox.
    st.warning("Please enter at least one sentence.")
    st.stop()
token2idx = {word: i for i, word in enumerate(vocab)}
idx2token = {i: word for word, i in token2idx.items()}
# --- Co-occurrence counts ---------------------------------------------------
# matrix[a, b] = number of times token b appears within `window_size` tokens
# of token a (both directions), summed over all sentences.
window_size = 2
matrix = np.zeros((len(vocab), len(vocab)))
for sentence in tokenized:
    for i, word in enumerate(sentence):
        # Hoist the row index: it is invariant over the inner window loop.
        row_idx = token2idx[word]
        lo = max(0, i - window_size)
        hi = min(len(sentence), i + window_size + 1)
        for j in range(lo, hi):
            if i != j:
                # Single-tuple indexing (matrix[a, b]) is the idiomatic numpy
                # form; chained matrix[a][b] builds a temporary row view on
                # every increment.
                matrix[row_idx, token2idx[sentence[j]]] += 1
# --- Bayesian smoothing -----------------------------------------------------
# Add a uniform prior pseudo-count alpha to every cell (Laplace/Dirichlet
# smoothing) so unseen pairs retain non-zero probability mass when alpha > 0.
alpha = st.slider("🔧 Set Bayesian Prior (α smoothing)", 0.0, 2.0, 0.1)
posterior = matrix + alpha
df = pd.DataFrame(posterior, index=vocab, columns=vocab)

st.subheader("📈 Co-occurrence Heatmap")
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df, annot=True, cmap="Blues", fmt=".1f", ax=ax)
st.pyplot(fig)

# --- Next-token prediction --------------------------------------------------
selected_word = st.selectbox("🔮 Predict next token after:", vocab)
row = posterior[token2idx[selected_word]]
total = row.sum()
if total > 0:
    probs = row / total
else:
    # BUG FIX: with alpha == 0.0 (the slider minimum) a token that never
    # co-occurred with anything (e.g. a one-word sentence) has row.sum() == 0,
    # so row / row.sum() would produce NaNs and np.random.choice would raise
    # "probabilities contain NaN". Fall back to a uniform distribution.
    probs = np.full(len(vocab), 1.0 / len(vocab))
prediction = np.random.choice(vocab, p=probs)
st.markdown(f"**Predicted next token:** `{prediction}`")