# src/streamlit_app.py — TeapotAI chat demo (tinyteapot model, Streamlit UI).
import os
import re
import threading
import time

import requests
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
# Optional LangSmith (trace + feedback).
# The app must run without the langsmith package installed, so import failures
# degrade to None sentinels that the rest of the file feature-detects against.
try:
    from langsmith import Client as LangSmithClient
    from langsmith import traceable
    from langsmith.run_helpers import get_current_run_tree
except Exception:
    # langsmith missing or broken — tracing and feedback become no-ops.
    LangSmithClient = None
    traceable = None
    get_current_run_tree = None
# =========================
# CONFIG
# =========================
MODEL_NAME = "teapotai/tinyteapot"  # HF hub id of the seq2seq model
MAX_INPUT_TOKENS = 512              # prompt token budget (context is tail-truncated to fit)
MAX_NEW_TOKENS = 192                # generation length cap
TOP_K_SEARCH = 3                    # number of Brave web-search snippets to keep
LOGO_URL = "https://teapotai.com/assets/logo.gif"
# Must be the first Streamlit call in the script.
st.set_page_config(page_title="TeapotAI Chat", page_icon="🫖", layout="centered")
# =========================
# LOAD MODEL (CACHED)
# =========================
@st.cache_resource
def load_model():
    """Load the tokenizer and model once per process.

    Returns a ``(tokenizer, model, device)`` triple; the model is moved to
    CUDA when available (CPU otherwise) and switched to eval mode.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    seq2seq = seq2seq.to(device_name)
    seq2seq.eval()
    return tok, seq2seq, device_name
tokenizer, model, device = load_model()
# =========================
# LANGSMITH (OPTIONAL)
# =========================
@st.cache_resource
def get_langsmith():
    """Return a cached LangSmith client, or None when not configured.

    Requires both an API key (LANGCHAIN_API_KEY or LANGSMITH_API_KEY) and a
    successfully imported langsmith SDK.
    """
    api_key = os.getenv("LANGCHAIN_API_KEY") or os.getenv("LANGSMITH_API_KEY")
    if not api_key or not LangSmithClient:
        return None
    return LangSmithClient()
ls_client = get_langsmith()
# =========================
# SAMPLE SEED (with full debug fields)
# =========================
# A complete example exchange used to seed the chat on first load, carrying
# every debug field the assistant-message renderer expects.
SAMPLE_QUESTION = "Who are you?"
DEFAULT_SYSTEM_PROMPT = (
    "You are Teapot, an open-source AI assistant optimized for running on low-end cpu devices, "
    "providing short, accurate responses without hallucinating while excelling at "
    "information extraction and text summarization. "
    "If the context does not answer the question, reply exactly: "
    "'I am sorry but I don't have any information on that'."
)
SAMPLE_SYSTEM_PROMPT = DEFAULT_SYSTEM_PROMPT
SAMPLE_CONTEXT = "Teapot is an open-source AI assistant optimized for running on low-end cpu devices."
SAMPLE_ANSWER = "I am Teapot, an open-source AI assistant optimized for running on low-end cpu devices."
SAMPLE_PROMPT = f"{SAMPLE_CONTEXT}\n{SAMPLE_SYSTEM_PROMPT}\n{SAMPLE_QUESTION}\n"
SAMPLE_USER_MSG = {"role": "user", "content": SAMPLE_QUESTION}
SAMPLE_ASSISTANT_MSG = {
    "role": "assistant",
    "content": SAMPLE_ANSWER,
    "context": SAMPLE_CONTEXT,
    "system_prompt": SAMPLE_SYSTEM_PROMPT,
    "question": SAMPLE_QUESTION,
    "prompt": SAMPLE_PROMPT,
    # Representative metrics for the seeded turn (not measured live).
    "search_time": 0.37,
    "gen_time": 0.67,
    "input_tokens": 245,
    "output_tokens": 24,
    "tps": 35.9,
    "trace_id": None,  # seeded example has no LangSmith trace
    "feedback": None,
}
# =========================
# SESSION STATE
# =========================
# Per-session chat history: list of {"role": ..., "content": ..., ...} dicts.
if "messages" not in st.session_state:
    st.session_state.messages = []
# Tracks whether the sample conversation has already been inserted.
if "seeded" not in st.session_state:
    st.session_state.seeded = False
# Seed exactly once on first load
if (not st.session_state.seeded) and (len(st.session_state.messages) == 0):
    st.session_state.messages = [SAMPLE_USER_MSG, SAMPLE_ASSISTANT_MSG]
    st.session_state.seeded = True
# =========================
# HEADER
# =========================
# Header row: logo on the left, app title on the right, then a tagline.
col1, col2 = st.columns([1, 7], vertical_alignment="center")
with col1:
    st.image(LOGO_URL, width=56)
with col2:
    st.markdown("## TeapotAI Chat")
st.caption(
    "Teapot is a 77M-parameter LLM optimized for fast CPU inference that only generates answers "
    "from the provided context to minimize hallucinations."
)
# =========================
# SIDEBAR
# =========================
with st.sidebar:
    st.markdown("### Settings")
    # Editable system prompt, defaulting to the anti-hallucination instructions.
    system_prompt = st.text_area(
        "System prompt",
        value=DEFAULT_SYSTEM_PROMPT,
        height=160,
    )
    # Free-form context appended after any web-search snippets.
    local_context = st.text_area(
        "Local context (optional)",
        height=140,
        placeholder="Extra context appended after web snippets…",
    )
    if st.button("Clear chat"):
        st.session_state.messages = []
        # Keep seeded=True so the sample exchange is not re-inserted.
        st.session_state.seeded = True
        st.rerun()
# =========================
# WEB SEARCH (ALWAYS ON)
# =========================
def web_search_snippets(query: str):
    """Fetch up to TOP_K_SEARCH result snippets from the Brave Search API.

    Args:
        query: the user's question, sent verbatim as the search query.

    Returns:
        A ``(snippets_text, elapsed_seconds)`` tuple. ``snippets_text`` is a
        blank-line-joined string of result descriptions ("" when no key is
        configured or the request fails); ``elapsed_seconds`` is the HTTP
        round-trip time (0.0 when no request was made).
    """
    api_key = os.getenv("BRAVE_API_KEY")
    if not api_key:
        # st.secrets raises when no secrets.toml exists at all; treat that as
        # "no key configured" instead of crashing the app.
        try:
            api_key = st.secrets.get("BRAVE_API_KEY", None)
        except Exception:
            api_key = None
    if not api_key:
        return "", 0.0
    headers = {"X-Subscription-Token": api_key, "Accept": "application/json"}
    params = {"q": query, "count": TOP_K_SEARCH}
    t0 = time.perf_counter()
    try:
        r = requests.get(
            "https://api.search.brave.com/res/v1/web/search",
            headers=headers,
            params=params,
            timeout=6,
        )
        # Surface HTTP errors (401/429/...) instead of parsing an error body.
        r.raise_for_status()
        data = r.json()
    except Exception:
        # Best-effort: search failures degrade to "no web context", but the
        # time actually spent is still reported for the metrics row.
        return "", time.perf_counter() - t0
    elapsed = time.perf_counter() - t0
    snippets = []
    for item in data.get("web", {}).get("results", [])[:TOP_K_SEARCH]:
        # Brave wraps match terms in inline HTML (e.g. <strong>); strip all tags.
        desc = re.sub(r"<[^>]+>", "", item.get("description") or "").strip()
        if desc:
            snippets.append(desc)
    return "\n\n".join(snippets), elapsed
# =========================
# CONTEXT TRUNCATION (TAIL)
# =========================
def truncate_context(web_ctx: str, local_ctx: str, system: str, question: str) -> str:
    """Tail-truncate the combined context so the full prompt fits the budget.

    Web and local context are joined, then trimmed from the *front* (keeping
    the most recent tail tokens) so that context + system + question stays
    within MAX_INPUT_TOKENS. Returns "" when there is no room or no context.
    """
    fixed_part = f"\n{system}\n{question}\n"
    budget = MAX_INPUT_TOKENS - len(tokenizer.encode(fixed_part))
    combined = f"{web_ctx}\n\n{local_ctx}".strip()
    if budget <= 0 or not combined:
        return ""
    token_ids = tokenizer.encode(combined)
    if len(token_ids) > budget:
        # Keep the last `budget` tokens — the tail is assumed most relevant.
        return tokenizer.decode(token_ids[-budget:], skip_special_tokens=True)
    return combined
def count_tokens(text: str) -> int:
    """Return the tokenizer token count of *text* (0 for empty/None)."""
    if not text:
        return 0
    return len(tokenizer.encode(text))
def get_trace_id_if_available() -> str | None:
    """Return the current LangSmith run id as a string, or None.

    None is returned when langsmith is not installed, there is no active run
    tree, or the run carries no id.
    """
    if not get_current_run_tree:
        return None
    try:
        active_run = get_current_run_tree()
        if active_run and getattr(active_run, "id", None):
            return str(active_run.id)
    except Exception:
        pass
    return None
# =========================
# FEEDBACK HANDLER (attached to trace_id)
# =========================
def handle_feedback(idx: int):
    """on_change callback for the thumbs widget attached to message *idx*.

    Persists the rating on the message dict and, when LangSmith is configured
    and the message has a trace_id, records the rating on that trace.
    """
    val = st.session_state.get(f"fb_{idx}")
    st.session_state.messages[idx]["feedback"] = val
    msg = st.session_state.messages[idx]
    trace_id = msg.get("trace_id")
    # Skip cleared ratings (None) — only send feedback for an actual vote.
    if ls_client and trace_id and val is not None:
        # BUGFIX: st.feedback("thumbs") returns an int index (1 = thumbs up,
        # 0 = thumbs down), never the "👍" string — the old comparison
        # (val == "👍") scored every vote as thumbs-down.
        score = 1 if val == 1 else 0
        try:
            ls_client.create_feedback(
                trace_id=trace_id,
                key="thumb_rating",
                score=score,
                comment="thumbs_up" if score else "thumbs_down",
            )
        except Exception:
            # Feedback delivery is best-effort; never break the UI.
            pass
# =========================
# STREAMING + LANGSMITH FIX
# - We do NOT trace a generator.
# - We stream to UI while returning a SINGLE final string.
# =========================
# Per-request UI hook: the active request stores its st.empty() placeholder
# here so _generate_with_streamer can push streamed chunks into it.
_UI_STREAM = {"placeholder": None}  # set per-request
def _generate_with_streamer(prompt: str) -> str:
    """
    Runs model.generate with a TextIteratorStreamer and updates a Streamlit placeholder
    as chunks arrive. Returns the final full text.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Streamer yields decoded text pieces as generate() emits tokens.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=False,  # greedy decoding for deterministic answers
        num_beams=1,
        streamer=streamer,
    )
    # generate() blocks, so run it on a daemon thread and consume the streamer
    # on this (UI) thread.
    # NOTE(review): if generate() raises before closing the streamer, the loop
    # below could block indefinitely — consider a streamer timeout; confirm.
    t = threading.Thread(target=model.generate, kwargs=gen_kwargs, daemon=True)
    t.start()
    buf = ""
    ph = _UI_STREAM.get("placeholder")
    if ph is not None:
        ph.markdown("")  # ensure element exists before first chunk
    for chunk in streamer:
        buf += chunk
        if ph is not None:
            ph.markdown(buf)  # re-render the growing answer in place
    return buf
def _answer_streaming(context: str, system_prompt: str, question: str) -> str:
    """Build the Teapot prompt (context, system, question) and stream an answer.

    Returns the complete generated text; streaming to the UI happens as a
    side effect inside _generate_with_streamer.
    """
    prompt = f"{context}\n{system_prompt}\n{question}\n"
    return _generate_with_streamer(prompt)

# Wrap with LangSmith tracing when available. Defining the function once and
# decorating conditionally removes the duplicated body of the previous version
# while keeping the public name and signature identical.
if traceable:
    traced_answer_streaming = traceable(name="teapot_answer")(_answer_streaming)
else:
    traced_answer_streaming = _answer_streaming
# =========================
# INPUT FIRST (so latest user msg renders immediately)
# =========================
# Read input before rendering history so a just-submitted user message is
# appended first and therefore rendered within the same script run.
query = st.chat_input("Ask a question...")
if query:
    st.session_state.messages.append({"role": "user", "content": query})
# =========================
# RENDER HISTORY
# Row 1: message + feedback
# Row 2: inspect + debug metrics
# =========================
# Render the conversation. User messages are plain markdown; assistant messages
# get two rows: (1) content + thumbs feedback, (2) latency/token metrics plus
# an inspect popover showing the debug fields stored on the message.
for i, msg in enumerate(st.session_state.messages):
    with st.chat_message(msg["role"]):
        if msg["role"] == "user":
            st.markdown(msg["content"])
            continue
        # Row 1
        msg_col, fb_col = st.columns([12, 1], vertical_alignment="center")
        with msg_col:
            st.markdown(msg.get("content", ""))
        with fb_col:
            key = f"fb_{i}"
            # Pre-seed widget state from the stored rating so a past vote shows.
            st.session_state.setdefault(key, msg.get("feedback"))
            st.feedback(
                "thumbs",
                key=key,
                disabled=msg.get("feedback") is not None,  # one vote per message
                on_change=handle_feedback,
                args=(i,),
            )
        # Row 2
        inspect_col, metrics_col = st.columns([12, 1], vertical_alignment="center")
        with inspect_col:
            st.caption(
                f"🔎 {msg.get('search_time', 0.0):.2f}s (search) "
                f"🧠 {msg.get('gen_time', 0.0):.2f}s (generation) "
                f"⚡ {msg.get('tps', 0.0):.1f} tok/s "
                f"🧾 {msg.get('input_tokens', 0)} input tokens • {msg.get('output_tokens', 0)} output tokens"
            )
        with metrics_col:
            with st.popover("ℹ️", help="Inspect"):
                st.markdown("**Context**")
                st.code(msg.get("context", ""), language="text")
                st.markdown("**System Prompt**")
                st.code(msg.get("system_prompt", ""), language="text")
                st.markdown("**Question**")
                st.code(msg.get("question", ""), language="text")
# =========================
# GENERATE ONLY IF THIS RUN RECEIVED A NEW QUERY
# =========================
# Generate a reply only when this rerun carried a fresh user query.
if query:
    question = query
    # Web search (always attempted; no-op without an API key).
    web_ctx, search_time = web_search_snippets(question)
    # Merge web + sidebar context and tail-truncate to the token budget.
    context = truncate_context(web_ctx, local_context, system_prompt, question)
    prompt = f"{context}\n{system_prompt}\n{question}\n"
    input_tokens = count_tokens(prompt)
    # Assistant response (stream to UI, return full string for LangSmith)
    with st.chat_message("assistant"):
        # Row 1: message + feedback (disabled live)
        msg_col, fb_col = st.columns([14, 1], vertical_alignment="center")
        with msg_col:
            placeholder = st.empty()
        with fb_col:
            st.feedback("thumbs", key="live_fb", disabled=True)
        # Hand the placeholder to the streamer via the module-level slot.
        _UI_STREAM["placeholder"] = placeholder
        start = time.perf_counter()
        answer = traced_answer_streaming(context, system_prompt, question)
        # NOTE(review): read after the traced call returns — confirm the run
        # tree is still current here; trace_id may come back None otherwise.
        trace_id = get_trace_id_if_available()
        gen_time = time.perf_counter() - start
        _UI_STREAM["placeholder"] = None  # cleanup
        output_tokens = count_tokens(answer)
        tps = output_tokens / gen_time if gen_time > 0 else 0.0
        # Row 2: inspect + metrics (mirrors the history renderer)
        inspect_col, metrics_col = st.columns([12, 1], vertical_alignment="center")
        with inspect_col:
            st.caption(
                f"🔎 {search_time:.2f}s (search) "
                f"🧠 {gen_time:.2f}s (generation) "
                f"⚡ {tps:.1f} tok/s "
                f"🧾 {input_tokens} input tokens • {output_tokens} output tokens"
            )
        with metrics_col:
            with st.popover("ℹ️", help="Inspect"):
                st.markdown("**Context**")
                st.code(context, language="text")
                st.markdown("**System**")
                st.code(system_prompt, language="text")
                st.markdown("**Question**")
                st.code(question, language="text")
                st.markdown("**Prompt**")
                st.code(prompt, language="text")
    # Persist assistant message with all debug fields for later rendering.
    st.session_state.messages.append(
        {
            "role": "assistant",
            "content": answer,
            "context": context,
            "system_prompt": system_prompt,
            "question": question,
            "prompt": prompt,
            "search_time": search_time,
            "gen_time": gen_time,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "tps": tps,
            "trace_id": trace_id,
            "feedback": None,
        }
    )