zakerytclarke commited on
Commit
4c83d38
·
verified ·
1 Parent(s): 8f8133e

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +334 -108
src/streamlit_app.py CHANGED
@@ -1,28 +1,26 @@
1
- # streamlit_app.py
2
  import os
3
  import re
4
  import time
5
- import warnings
6
- from typing import List, Dict, Optional
7
 
8
  import requests
9
  import streamlit as st
10
  import torch
11
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
12
 
13
- from teapotai import TeapotAI
 
 
 
 
14
 
15
 
16
  # -----------------------
17
- # Optional: quiet noisy warnings from deps
18
  # -----------------------
19
- warnings.filterwarnings("ignore", message="pkg_resources is deprecated as an API.*")
20
- warnings.filterwarnings("ignore", message='Field name "schema" in "TeapotTool" shadows.*')
21
-
22
 
23
- # -----------------------
24
- # Config
25
- # -----------------------
26
  TEAPOT_LOGO_GIF = "https://teapotai.com/assets/logo.gif"
27
 
28
  SUGGESTED_QUERIES = [
@@ -41,46 +39,54 @@ DEFAULT_SYSTEM_PROMPT = (
41
  "'I am sorry but I don't have any information on that'."
42
  )
43
 
44
- DEFAULT_DOCUMENTS = [
45
- """Teapot (Tiny Teapot) is an open-source small language model (~77 million parameters) fine-tuned on synthetic data and optimized to run locally on resource-constrained devices such as smartphones and CPUs. Teapot is trained to only answer using context from documents, reducing hallucinations. Teapot can perform a variety of tasks, including hallucination-resistant Question Answering (QnA), Retrieval-Augmented Generation (RAG), and JSON extraction. TeapotLLM is a fine tune of flan-t5-large that was trained on synthetic data generated by Deepseek v3 TeapotLLM can be hosted on low-power devices with as little as 2GB of CPU RAM such as a Raspberry Pi. Teapot is a model built by and for the community."""
46
- ]
47
-
48
- # Brave Search
49
- BRAVE_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
50
  TOP_K = 3
51
  TIMEOUT_SECS = 15
52
 
53
-
54
- # -----------------------
55
- # Streamlit setup (no custom theming)
56
- # -----------------------
57
- st.set_page_config(page_title="TeapotAI Chat", page_icon="🫖", layout="centered")
58
 
59
 
60
  # -----------------------
61
- # Helpers
62
  # -----------------------
63
  def st_image_full_width(img_url: str):
64
- # Streamlit API varies across builds
65
  try:
66
  st.image(img_url, use_container_width=True)
67
  except TypeError:
68
  st.image(img_url, use_column_width=True)
69
 
70
 
71
- def get_brave_key() -> Optional[str]:
72
- # HF Spaces secrets are commonly env vars; support st.secrets too
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  return os.getenv("BRAVE_API_KEY") or (st.secrets.get("BRAVE_API_KEY") if hasattr(st, "secrets") else None)
74
 
75
 
76
- def brave_search_snippets(query: str, top_k: int = 3) -> List[Dict[str, str]]:
77
- key = get_brave_key()
78
  if not key:
79
- raise RuntimeError("Missing BRAVE_API_KEY (set as a Space secret / env var).")
80
 
81
  headers = {"Accept": "application/json", "X-Subscription-Token": key}
82
  params = {"q": query, "count": top_k}
83
- r = requests.get(BRAVE_ENDPOINT, headers=headers, params=params, timeout=TIMEOUT_SECS)
84
  r.raise_for_status()
85
  data = r.json()
86
 
@@ -97,6 +103,9 @@ def brave_search_snippets(query: str, top_k: int = 3) -> List[Dict[str, str]]:
97
 
98
 
99
  def format_context_from_results(results: List[Dict[str, str]]) -> str:
 
 
 
100
  if not results:
101
  return ""
102
 
@@ -106,83 +115,194 @@ def format_context_from_results(results: List[Dict[str, str]]) -> str:
106
  url = re.sub(r"\s+", " ", r.get("url", "")).strip()
107
  snippet = re.sub(r"\s+", " ", r.get("snippet", "")).strip()
108
 
109
- # per your requirement: strip <strong> tags
110
  title = title.replace("<strong>", "").replace("</strong>", "")
111
  snippet = snippet.replace("<strong>", "").replace("</strong>", "")
112
 
113
  blocks.append(f"[{i}] {title}\nURL: {url}\nSnippet: {snippet}")
114
-
115
  return "\n\n".join(blocks)
116
 
117
 
118
  def count_tokens(tokenizer: AutoTokenizer, text: str) -> int:
119
  if not text:
120
  return 0
121
- try:
122
- return len(tokenizer.encode(text))
123
- except Exception:
124
- return 0
125
 
126
 
127
- def render_sources_popover(sources: List[Dict[str, str]], context: str):
 
 
 
 
 
 
 
 
 
 
 
128
  """
129
- Renders ℹ️ popover if available; otherwise uses expander.
 
130
  """
131
- def _body():
132
- st.markdown("**Sources**")
133
- if sources:
134
- for j, s in enumerate(sources, start=1):
135
- title = (s.get("title") or "").strip() or f"Result {j}"
136
- url = (s.get("url") or "").strip()
137
- snippet = (s.get("snippet") or "").strip()
138
- if url:
139
- st.markdown(f"- [{title}]({url})")
140
- else:
141
- st.markdown(f"- {title}")
142
- if snippet:
143
- st.caption(snippet)
144
- else:
145
- st.caption("(No sources returned.)")
146
 
147
- st.markdown("**Full context**")
148
- if context.strip():
149
- st.code(context)
150
- else:
151
- st.caption("(Empty context.)")
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  try:
154
- with st.popover("ℹ️"):
155
- _body()
156
  except Exception:
157
- with st.expander("ℹ️ Sources / Context"):
158
- _body()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
 
161
  # -----------------------
162
- # Load model + TeapotAI (cached)
163
  # -----------------------
164
  @st.cache_resource
165
- def load_teapot_ai_and_tokenizer():
166
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
167
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
168
 
169
  device = "cuda" if torch.cuda.is_available() else "cpu"
170
  model.to(device)
171
  model.eval()
 
172
 
173
- teapot_ai = TeapotAI(
174
- tokenizer=tokenizer,
175
- model=model,
176
- documents=DEFAULT_DOCUMENTS,
177
- )
178
- return teapot_ai, tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
 
180
 
181
  # -----------------------
182
  # Session state
183
  # -----------------------
184
  if "messages" not in st.session_state:
185
- # Each assistant message includes: sources/context + timing/tokens
 
 
186
  st.session_state.messages = []
187
  if "pending_query" not in st.session_state:
188
  st.session_state.pending_query = None
@@ -200,7 +320,7 @@ with c2:
200
 
201
 
202
  # -----------------------
203
- # Sidebar (ONLY: system prompt + web search toggle)
204
  # -----------------------
205
  with st.sidebar:
206
  st.markdown("### Settings")
@@ -208,12 +328,15 @@ with st.sidebar:
208
  use_web_search = st.checkbox("Use web search", value=True)
209
 
210
 
211
- # Load tiny model on startup
212
- teapot_ai, hf_tokenizer = load_teapot_ai_and_tokenizer()
 
 
 
213
 
214
 
215
  # -----------------------
216
- # Suggested queries on empty chat
217
  # -----------------------
218
  if len(st.session_state.messages) == 0 and st.session_state.pending_query is None:
219
  st.markdown("#### Suggested")
@@ -228,7 +351,38 @@ if len(st.session_state.messages) == 0 and st.session_state.pending_query is Non
228
  # -----------------------
229
  # Render chat history
230
  # -----------------------
231
- for m in st.session_state.messages:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  if m["role"] == "user":
233
  with st.chat_message("user"):
234
  st.markdown(m["content"])
@@ -236,25 +390,33 @@ for m in st.session_state.messages:
236
  with st.chat_message("assistant"):
237
  st.markdown(m["content"])
238
 
239
- # metadata row
240
- meta_cols = st.columns([1, 3, 3, 5])
241
- with meta_cols[0]:
242
  render_sources_popover(m.get("sources", []), m.get("context", ""))
243
 
244
- # tokens/sec, and token counts
245
- tps = m.get("tps", None)
246
- out_toks = m.get("output_tokens", None)
247
- secs = m.get("seconds", None)
248
-
249
- with meta_cols[1]:
250
- if tps is not None:
251
- st.caption(f"⚡ {tps:.1f} tokens/s")
252
- with meta_cols[2]:
253
- if out_toks is not None:
254
- st.caption(f"🧮 {out_toks} output tokens")
255
- with meta_cols[3]:
256
- if secs is not None:
257
- st.caption(f"⏱️ {secs:.2f}s")
 
 
 
 
 
 
 
 
 
258
 
259
 
260
  # -----------------------
@@ -267,41 +429,105 @@ if st.session_state.pending_query and not user_input:
267
  st.session_state.pending_query = None
268
 
269
  if user_input:
 
270
  st.session_state.messages.append({"role": "user", "content": user_input})
271
 
 
272
  sources: List[Dict[str, str]] = []
273
- context = ""
274
 
275
  if use_web_search:
276
  try:
277
- sources = brave_search_snippets(user_input, top_k=TOP_K)
278
- context = format_context_from_results(sources)
279
  except Exception:
280
  sources = []
281
- context = ""
282
 
283
- # Teapot inference + timing
284
- t0 = time.perf_counter()
285
- answer = teapot_ai.query(
286
- query=user_input,
 
 
 
 
 
 
 
 
 
287
  context=context,
288
  system_prompt=system_prompt,
 
 
289
  )
290
- t1 = time.perf_counter()
291
 
292
- elapsed = max(t1 - t0, 1e-6)
293
- output_tokens = count_tokens(hf_tokenizer, answer)
294
- tps = output_tokens / elapsed if output_tokens > 0 else 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
 
296
  st.session_state.messages.append(
297
  {
298
  "role": "assistant",
299
- "content": answer,
300
  "sources": sources,
301
  "context": context,
 
302
  "seconds": elapsed,
303
- "output_tokens": output_tokens,
304
  "tps": tps,
 
305
  }
306
  )
 
307
  st.rerun()
 
 
1
  import os
2
  import re
3
  import time
4
+ import threading
5
+ from typing import List, Dict, Optional, Iterable, Tuple
6
 
7
  import requests
8
  import streamlit as st
9
  import torch
10
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
11
 
12
+ # LangSmith
13
+ try:
14
+ from langsmith import Client as LangSmithClient
15
+ except Exception:
16
+ LangSmithClient = None
17
 
18
 
19
  # -----------------------
20
+ # App config
21
  # -----------------------
22
+ st.set_page_config(page_title="TeapotAI Chat", page_icon="🫖", layout="centered")
 
 
23
 
 
 
 
24
  TEAPOT_LOGO_GIF = "https://teapotai.com/assets/logo.gif"
25
 
26
  SUGGESTED_QUERIES = [
 
39
  "'I am sorry but I don't have any information on that'."
40
  )
41
 
42
+ # Search provider (kept internal; UI says “web search”)
43
+ SEARCH_ENDPOINT = "https://api.search.brave.com/res/v1/web/search"
 
 
 
 
44
  TOP_K = 3
45
  TIMEOUT_SECS = 15
46
 
47
+ # Model input budget
48
+ MAX_INPUT_TOKENS = 512
49
+ MAX_NEW_TOKENS = 192 # output cap
 
 
50
 
51
 
52
  # -----------------------
53
+ # Utilities
54
  # -----------------------
55
  def st_image_full_width(img_url: str):
 
56
  try:
57
  st.image(img_url, use_container_width=True)
58
  except TypeError:
59
  st.image(img_url, use_column_width=True)
60
 
61
 
62
+ def autoscroll_to_bottom():
63
+ st.markdown(
64
+ """
65
+ <script>
66
+ (function() {
67
+ const doc = window.parent.document;
68
+ const el = doc.documentElement || doc.body;
69
+ el.scrollTo({ top: el.scrollHeight, behavior: "smooth" });
70
+ })();
71
+ </script>
72
+ """,
73
+ unsafe_allow_html=True,
74
+ )
75
+
76
+
77
+ def get_search_key() -> Optional[str]:
78
+ # Reuse the existing BRAVE_API_KEY secret name for backward compatibility
79
  return os.getenv("BRAVE_API_KEY") or (st.secrets.get("BRAVE_API_KEY") if hasattr(st, "secrets") else None)
80
 
81
 
82
+ def search_top_snippets(query: str, top_k: int = 3) -> List[Dict[str, str]]:
83
+ key = get_search_key()
84
  if not key:
85
+ raise RuntimeError("Missing BRAVE_API_KEY (Space secret / env var).")
86
 
87
  headers = {"Accept": "application/json", "X-Subscription-Token": key}
88
  params = {"q": query, "count": top_k}
89
+ r = requests.get(SEARCH_ENDPOINT, headers=headers, params=params, timeout=TIMEOUT_SECS)
90
  r.raise_for_status()
91
  data = r.json()
92
 
 
103
 
104
 
105
  def format_context_from_results(results: List[Dict[str, str]]) -> str:
106
+ """
107
+ Stable formatting + strip <strong> tags.
108
+ """
109
  if not results:
110
  return ""
111
 
 
115
  url = re.sub(r"\s+", " ", r.get("url", "")).strip()
116
  snippet = re.sub(r"\s+", " ", r.get("snippet", "")).strip()
117
 
 
118
  title = title.replace("<strong>", "").replace("</strong>", "")
119
  snippet = snippet.replace("<strong>", "").replace("</strong>", "")
120
 
121
  blocks.append(f"[{i}] {title}\nURL: {url}\nSnippet: {snippet}")
 
122
  return "\n\n".join(blocks)
123
 
124
 
125
  def count_tokens(tokenizer: AutoTokenizer, text: str) -> int:
126
  if not text:
127
  return 0
128
+ return len(tokenizer.encode(text))
 
 
 
129
 
130
 
131
+ def build_prompt(context: str, system_prompt: str, question: str) -> str:
132
+ # Exact prompt format relied on by the model: context, system prompt, question, each newline-terminated
133
+ return f"{context}\n{system_prompt}\n{question}\n"
134
+
135
+
136
+ def truncate_context_to_fit(
137
+ tokenizer: AutoTokenizer,
138
+ context: str,
139
+ system_prompt: str,
140
+ question: str,
141
+ max_input_tokens: int = 512,
142
+ ) -> str:
143
  """
144
+ Keep the *most recent* context while ensuring total prompt <= max_input_tokens.
145
+ We right-truncate by tokens (keep tail).
146
  """
147
+ # Tokenize fixed parts (system + question + newlines)
148
+ fixed_prompt = build_prompt("", system_prompt, question)
149
+ fixed_tokens = tokenizer.encode(fixed_prompt)
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ # Reserve at least 0 for context
152
+ budget = max_input_tokens - len(fixed_tokens)
153
+ if budget <= 0:
154
+ return "" # no room for context at all
 
155
 
156
+ ctx_tokens = tokenizer.encode(context)
157
+ if len(ctx_tokens) <= budget:
158
+ return context
159
+
160
+ # Keep the most recent tokens (tail)
161
+ kept = ctx_tokens[-budget:]
162
+ truncated = tokenizer.decode(kept, skip_special_tokens=True)
163
+ return truncated
164
+
165
+
166
+ # -----------------------
167
+ # LangSmith integration
168
+ # -----------------------
169
+ @st.cache_resource
170
+ def get_langsmith_client() -> Optional["LangSmithClient"]:
171
+ if LangSmithClient is None:
172
+ return None
173
+
174
+ # LangSmith typically uses these env vars; if no key, no-op.
175
+ api_key = os.getenv("LANGCHAIN_API_KEY") or os.getenv("LANGSMITH_API_KEY")
176
+ if not api_key:
177
+ return None
178
  try:
179
+ return LangSmithClient()
 
180
  except Exception:
181
+ return None
182
+
183
+
184
+ def ls_create_run(
185
+ client: Optional["LangSmithClient"],
186
+ *,
187
+ context: str,
188
+ system_prompt: str,
189
+ question: str,
190
+ model_name: str,
191
+ ) -> Optional[str]:
192
+ if client is None:
193
+ return None
194
+
195
+ project = os.getenv("LANGCHAIN_PROJECT") or "teapot-chat"
196
+ try:
197
+ run = client.create_run(
198
+ name="teapot_chat_turn",
199
+ run_type="llm",
200
+ project_name=project,
201
+ inputs={
202
+ "context": context,
203
+ "system_prompt": system_prompt,
204
+ "question": question,
205
+ "model": model_name,
206
+ },
207
+ tags=["teapot", "streamlit"],
208
+ )
209
+ # create_run returns a Run-like object; the id property name can vary
210
+ return getattr(run, "id", None) or getattr(run, "run_id", None)
211
+ except Exception:
212
+ return None
213
+
214
+
215
+ def ls_end_run(
216
+ client: Optional["LangSmithClient"],
217
+ run_id: Optional[str],
218
+ *,
219
+ answer: str,
220
+ meta: Dict[str, object],
221
+ ):
222
+ if client is None or not run_id:
223
+ return
224
+ try:
225
+ client.update_run(
226
+ run_id,
227
+ outputs={"answer": answer, **meta},
228
+ )
229
+ except Exception:
230
+ pass
231
+
232
+
233
+ def ls_feedback(
234
+ client: Optional["LangSmithClient"],
235
+ run_id: Optional[str],
236
+ *,
237
+ score: int,
238
+ comment: str = "",
239
+ ):
240
+ if client is None or not run_id:
241
+ return
242
+ try:
243
+ client.create_feedback(
244
+ run_id=run_id,
245
+ key="user_feedback",
246
+ score=float(score),
247
+ comment=comment or None,
248
+ )
249
+ except Exception:
250
+ pass
251
 
252
 
253
  # -----------------------
254
+ # Model loading
255
  # -----------------------
256
  @st.cache_resource
257
+ def load_model_and_tokenizer():
258
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
259
  model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
260
 
261
  device = "cuda" if torch.cuda.is_available() else "cpu"
262
  model.to(device)
263
  model.eval()
264
+ return tokenizer, model, device
265
 
266
+
267
+ def generate_stream(
268
+ tokenizer: AutoTokenizer,
269
+ model: AutoModelForSeq2SeqLM,
270
+ device: str,
271
+ prompt: str,
272
+ max_new_tokens: int = 192,
273
+ ) -> Iterable[str]:
274
+ """
275
+ True streaming via TextIteratorStreamer.
276
+ Yields progressively longer partial outputs.
277
+ """
278
+ inputs = tokenizer(prompt, return_tensors="pt").to(device)
279
+ streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
280
+
281
+ def _run():
282
+ model.generate(
283
+ **inputs,
284
+ do_sample=False,
285
+ num_beams=1,
286
+ max_new_tokens=int(max_new_tokens),
287
+ streamer=streamer,
288
+ )
289
+
290
+ t = threading.Thread(target=_run, daemon=True)
291
+ t.start()
292
+
293
+ partial = ""
294
+ for piece in streamer:
295
+ partial += piece
296
+ yield partial
297
 
298
 
299
  # -----------------------
300
  # Session state
301
  # -----------------------
302
  if "messages" not in st.session_state:
303
+ # message schema:
304
+ # user: {"role":"user","content":...}
305
+ # assistant: {"role":"assistant","content":..., "sources":[...], "context":..., "run_id":..., "tps":..., "output_tokens":..., "seconds":..., "feedback": None/1/-1}
306
  st.session_state.messages = []
307
  if "pending_query" not in st.session_state:
308
  st.session_state.pending_query = None
 
320
 
321
 
322
  # -----------------------
323
+ # Sidebar (ONLY system prompt + web search)
324
  # -----------------------
325
  with st.sidebar:
326
  st.markdown("### Settings")
 
328
  use_web_search = st.checkbox("Use web search", value=True)
329
 
330
 
331
+ # Load model
332
+ tokenizer, model, device = load_model_and_tokenizer()
333
+
334
+ # LangSmith client (optional)
335
+ ls_client = get_langsmith_client()
336
 
337
 
338
  # -----------------------
339
+ # Suggested queries when empty
340
  # -----------------------
341
  if len(st.session_state.messages) == 0 and st.session_state.pending_query is None:
342
  st.markdown("#### Suggested")
 
351
  # -----------------------
352
  # Render chat history
353
  # -----------------------
354
+ def render_sources_popover(sources: List[Dict[str, str]], context: str):
355
+ def _body():
356
+ st.markdown("**Sources**")
357
+ if sources:
358
+ for j, s in enumerate(sources, start=1):
359
+ title = (s.get("title") or "").strip() or f"Result {j}"
360
+ url = (s.get("url") or "").strip()
361
+ snippet = (s.get("snippet") or "").strip()
362
+ if url:
363
+ st.markdown(f"- [{title}]({url})")
364
+ else:
365
+ st.markdown(f"- {title}")
366
+ if snippet:
367
+ st.caption(snippet)
368
+ else:
369
+ st.caption("(No sources returned.)")
370
+
371
+ st.markdown("**Full context**")
372
+ if context.strip():
373
+ st.code(context)
374
+ else:
375
+ st.caption("(Empty context.)")
376
+
377
+ try:
378
+ with st.popover("ℹ️"):
379
+ _body()
380
+ except Exception:
381
+ with st.expander("ℹ️ Sources / Context"):
382
+ _body()
383
+
384
+
385
+ for idx, m in enumerate(st.session_state.messages):
386
  if m["role"] == "user":
387
  with st.chat_message("user"):
388
  st.markdown(m["content"])
 
390
  with st.chat_message("assistant"):
391
  st.markdown(m["content"])
392
 
393
+ meta = st.columns([1, 2.2, 2.2, 2.2, 2.2])
394
+ with meta[0]:
 
395
  render_sources_popover(m.get("sources", []), m.get("context", ""))
396
 
397
+ with meta[1]:
398
+ st.caption(f"⚡ {m.get('tps', 0.0):.1f} tok/s")
399
+ with meta[2]:
400
+ st.caption(f"🧮 {m.get('output_tokens', 0)} toks")
401
+ with meta[3]:
402
+ st.caption(f"⏱️ {m.get('seconds', 0.0):.2f}s")
403
+
404
+ # Feedback buttons wired to LangSmith
405
+ feedback = m.get("feedback", None)
406
+ run_id = m.get("run_id", None)
407
+ btn_cols = st.columns([1, 1, 6])
408
+ with btn_cols[0]:
409
+ up_disabled = feedback is not None
410
+ if st.button("👍", key=f"fb_up_{idx}", disabled=up_disabled):
411
+ st.session_state.messages[idx]["feedback"] = 1
412
+ ls_feedback(ls_client, run_id, score=1)
413
+ st.rerun()
414
+ with btn_cols[1]:
415
+ down_disabled = feedback is not None
416
+ if st.button("👎", key=f"fb_down_{idx}", disabled=down_disabled):
417
+ st.session_state.messages[idx]["feedback"] = -1
418
+ ls_feedback(ls_client, run_id, score=-1)
419
+ st.rerun()
420
 
421
 
422
  # -----------------------
 
429
  st.session_state.pending_query = None
430
 
431
  if user_input:
432
+ # Add user message
433
  st.session_state.messages.append({"role": "user", "content": user_input})
434
 
435
+ # Build context (optional web search)
436
  sources: List[Dict[str, str]] = []
437
+ raw_context = ""
438
 
439
  if use_web_search:
440
  try:
441
+ sources = search_top_snippets(user_input, top_k=TOP_K)
442
+ raw_context = format_context_from_results(sources)
443
  except Exception:
444
  sources = []
445
+ raw_context = ""
446
 
447
+ # Truncate context to fit 512 tokens total prompt, keeping most recent
448
+ context = truncate_context_to_fit(
449
+ tokenizer=tokenizer,
450
+ context=raw_context,
451
+ system_prompt=system_prompt,
452
+ question=user_input,
453
+ max_input_tokens=MAX_INPUT_TOKENS,
454
+ )
455
+ prompt = build_prompt(context, system_prompt, user_input)
456
+
457
+ # Create LangSmith run now (inputs)
458
+ run_id = ls_create_run(
459
+ ls_client,
460
  context=context,
461
  system_prompt=system_prompt,
462
+ question=user_input,
463
+ model_name=MODEL_NAME,
464
  )
 
465
 
466
+ # Stream generation into the UI
467
+ with st.chat_message("assistant"):
468
+ placeholder = st.empty()
469
+
470
+ t0 = time.perf_counter()
471
+ final_text = ""
472
+
473
+ for partial in generate_stream(tokenizer, model, device, prompt, max_new_tokens=MAX_NEW_TOKENS):
474
+ final_text = partial
475
+ placeholder.markdown(final_text)
476
+ autoscroll_to_bottom()
477
+
478
+ t1 = time.perf_counter()
479
+ elapsed = max(t1 - t0, 1e-6)
480
+
481
+ out_tokens = count_tokens(tokenizer, final_text)
482
+ tps = (out_tokens / elapsed) if out_tokens > 0 else 0.0
483
+
484
+ # Metadata row + feedback buttons (live)
485
+ meta = st.columns([1, 2.2, 2.2, 2.2, 2.2])
486
+ with meta[0]:
487
+ render_sources_popover(sources, context)
488
+ with meta[1]:
489
+ st.caption(f"⚡ {tps:.1f} tok/s")
490
+ with meta[2]:
491
+ st.caption(f"🧮 {out_tokens} toks")
492
+ with meta[3]:
493
+ st.caption(f"⏱️ {elapsed:.2f}s")
494
+
495
+ btn_cols = st.columns([1, 1, 6])
496
+ with btn_cols[0]:
497
+ if st.button("👍", key=f"fb_up_live_{len(st.session_state.messages)}"):
498
+ ls_feedback(ls_client, run_id, score=1)
499
+ with btn_cols[1]:
500
+ if st.button("👎", key=f"fb_down_live_{len(st.session_state.messages)}"):
501
+ ls_feedback(ls_client, run_id, score=-1)
502
+
503
+ # End LangSmith run (outputs)
504
+ ls_end_run(
505
+ ls_client,
506
+ run_id,
507
+ answer=final_text,
508
+ meta={
509
+ "seconds": elapsed,
510
+ "output_tokens": out_tokens,
511
+ "tokens_per_second": tps,
512
+ "used_web_search": bool(use_web_search),
513
+ "max_input_tokens": MAX_INPUT_TOKENS,
514
+ "max_new_tokens": MAX_NEW_TOKENS,
515
+ },
516
+ )
517
 
518
+ # Persist assistant message for history (feedback state stored)
519
  st.session_state.messages.append(
520
  {
521
  "role": "assistant",
522
+ "content": final_text,
523
  "sources": sources,
524
  "context": context,
525
+ "run_id": run_id,
526
  "seconds": elapsed,
527
+ "output_tokens": out_tokens,
528
  "tps": tps,
529
+ "feedback": None,
530
  }
531
  )
532
+
533
  st.rerun()