Update src/streamlit_app.py
src/streamlit_app.py  +88 -82  CHANGED
@@ -12,7 +12,6 @@ try:
 except:
     LangSmithClient = None
 
-
 # =========================
 # CONFIG
 # =========================

@@ -22,11 +21,7 @@ MAX_NEW_TOKENS = 192
 TOP_K_SEARCH = 3
 LOGO_URL = "https://teapotai.com/assets/logo.gif"
 
-st.set_page_config(
-    page_title="TeapotAI Chat",
-    page_icon="🫖",
-    layout="centered"
-)
 
 # =========================
 # LOAD MODEL (CACHED)

@@ -36,42 +31,40 @@ def load_model():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model.to(device)
-    model.eval()
     return tokenizer, model, device
 
 tokenizer, model, device = load_model()
 
-
 # =========================
-# LANGSMITH
 # =========================
 @st.cache_resource
 def get_langsmith():
-
-    if
         return LangSmithClient()
     return None
 
 ls_client = get_langsmith()
 
-
 # =========================
 # SESSION STATE
 # =========================
 if "messages" not in st.session_state:
     st.session_state.messages = []
 
 # =========================
-# HEADER (LOGO
 # =========================
 col1, col2 = st.columns([1, 6])
 with col1:
     st.image(LOGO_URL, use_column_width=True)
 with col2:
     st.markdown("## TeapotAI Chat")
-    st.caption("Fast
-
 
 # =========================
 # SIDEBAR SETTINGS

@@ -88,32 +81,27 @@ with st.sidebar:
             "If the context does not answer the question, reply exactly: "
             "'I am sorry but I don't have any information on that'."
         ),
-        height=180
     )
 
-    st.markdown("###
-
-        "Paste context
-        height=
-        placeholder="
     )
 
     use_web = st.checkbox("Use web search", value=True)
 
-
 # =========================
-# WEB SEARCH (
 # =========================
-def web_search(query: str):
     api_key = os.getenv("BRAVE_API_KEY") or st.secrets.get("BRAVE_API_KEY", None)
     if not api_key:
         return "", 0.0
 
-    headers = {
-        "X-Subscription-Token": api_key,
-        "Accept": "application/json"
-    }
-
     params = {"q": query, "count": TOP_K_SEARCH}
 
     t0 = time.perf_counter()

@@ -129,35 +117,37 @@ def web_search(query: str):
         return "", 0.0
     t1 = time.perf_counter()
 
-
-    for
-        title = item.get("title", "")
-        url = item.get("url", "")
         desc = item.get("description", "")
-        desc = desc.replace("<strong>", "").replace("</strong>", "")
-
-
-    context = "\n\n".join(blocks)
-    return context, (t1 - t0)
 
 
 # =========================
 # TRUNCATE TO LAST 512 TOKENS
 # =========================
-def
-
-
     budget = MAX_INPUT_TOKENS - len(base_tokens)
 
-    ctx_tokens = tokenizer.encode(
     if len(ctx_tokens) <= budget:
-        return
 
     # Keep MOST RECENT tokens (tail truncation)
     truncated = ctx_tokens[-budget:]
     return tokenizer.decode(truncated, skip_special_tokens=True)
 
-
 # =========================
 # STREAM GENERATION
 # =========================

@@ -182,17 +172,12 @@ def stream_generate(prompt: str):
         text += chunk
         yield text
 
-
 # =========================
-#
 # =========================
 def handle_feedback(idx: int):
     val = st.session_state[f"feedback_{idx}"]
     msg = st.session_state.messages[idx]
-
-    if val is None:
-        return
-
     msg["feedback"] = val
 
     if ls_client and msg.get("run_id"):

@@ -204,18 +189,18 @@ def handle_feedback(idx: int):
                 score=score,
                 comment="thumbs_up" if score else "thumbs_down",
             )
-        except
-
-
 
 # =========================
-# RENDER CHAT
 # =========================
 for i, msg in enumerate(st.session_state.messages):
     with st.chat_message(msg["role"]):
         st.markdown(msg["content"])
 
         if msg["role"] == "assistant":
             st.caption(
                 f"🔎 {msg['search_time']:.2f}s search • "
                 f"🧠 {msg['gen_time']:.2f}s generate • "

@@ -223,48 +208,63 @@ for i, msg in enumerate(st.session_state.messages):
                 f"🧮 {msg['tokens']} tokens"
             )
 
-
-            st.
-
             st.feedback(
                 "thumbs",
-                key=
                 disabled=msg.get("feedback") is not None,
                 on_change=handle_feedback,
                 args=(i,),
             )
 
-
 # =========================
-#
 # =========================
 query = st.chat_input("Ask a question...")
 
 if query:
     st.session_state.messages.append({"role": "user", "content": query})
 
-
-
     search_time = 0.0
     if use_web:
-
-
-    # ---- COMBINED CONTEXT (WEB + USER BOX) ----
-    combined_context = ""
-    if user_context:
-        combined_context += user_context.strip() + "\n\n"
-    if web_context:
-        combined_context += web_context
 
-
-
         system_prompt,
-        query
     )
 
-    prompt = f"{
 
-    #
     run_id = None
     if ls_client:
         try:

@@ -272,26 +272,28 @@ if query:
                 name="teapot_chat",
                 run_type="llm",
                 inputs={
-                    "
                     "system_prompt": system_prompt,
                     "question": query,
                 },
             )
             run_id = run.id
         except:
             pass
 
-    # ---
     with st.chat_message("assistant"):
         placeholder = st.empty()
-
-        gen_start = time.perf_counter()
        final_text = ""
         for partial in stream_generate(prompt):
             final_text = partial
             placeholder.markdown(final_text)
-        gen_time = time.perf_counter() - gen_start
 
         tokens = len(tokenizer.encode(final_text))
         tps = tokens / gen_time if gen_time > 0 else 0.0

@@ -312,13 +314,17 @@ if query:
         {
             "role": "assistant",
             "content": final_text,
             "search_time": search_time,
             "gen_time": gen_time,
-            "tps": tps,
             "tokens": tokens,
             "run_id": run_id,
             "feedback": None,
         }
     )
 
     st.rerun()
 except:
     LangSmithClient = None
 
 # =========================
 # CONFIG
 # =========================

 TOP_K_SEARCH = 3
 LOGO_URL = "https://teapotai.com/assets/logo.gif"
 
+st.set_page_config(page_title="TeapotAI Chat", page_icon="🫖", layout="centered")
 
 # =========================
 # LOAD MODEL (CACHED)

     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
     model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device).eval()
     return tokenizer, model, device
 
 tokenizer, model, device = load_model()
 
 # =========================
+# LANGSMITH (OPTIONAL)
 # =========================
 @st.cache_resource
 def get_langsmith():
+    key = os.getenv("LANGCHAIN_API_KEY") or os.getenv("LANGSMITH_API_KEY")
+    if key and LangSmithClient:
         return LangSmithClient()
     return None
 
 ls_client = get_langsmith()
 
 # =========================
 # SESSION STATE
 # =========================
 if "messages" not in st.session_state:
     st.session_state.messages = []
+if "pending_response" not in st.session_state:
+    st.session_state.pending_response = None
 
 # =========================
+# HEADER (LOGO)
 # =========================
 col1, col2 = st.columns([1, 6])
 with col1:
     st.image(LOGO_URL, use_column_width=True)
 with col2:
     st.markdown("## TeapotAI Chat")
+    st.caption("Fast grounded answers with clean web context")
 
 # =========================
 # SIDEBAR SETTINGS
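A side note on the cached factories above: get_langsmith() is wrapped in @st.cache_resource (and load_model() presumably is as well, though its decorator is outside the diff), so these objects are built once per process and reused across the frequent reruns this change introduces. A minimal, self-contained illustration of that caching behaviour, with illustrative names that are not from the app:

import streamlit as st

@st.cache_resource  # the body runs once per process; later reruns get the cached object
def get_expensive_resource():
    return object()  # stand-in for a model or an API client

a = get_expensive_resource()
b = get_expensive_resource()
assert a is b  # the same instance is returned until the cache is cleared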
             "If the context does not answer the question, reply exactly: "
             "'I am sorry but I don't have any information on that'."
         ),
+        height=180,
     )
 
+    st.markdown("### Local Context (Appended)")
+    local_context = st.text_area(
+        "Paste additional context (optional)",
+        height=160,
+        placeholder="This will be appended after web content..."
     )
 
     use_web = st.checkbox("Use web search", value=True)
 
 # =========================
+# WEB SEARCH (SNIPPETS ONLY)
 # =========================
+def web_search_snippets(query: str):
     api_key = os.getenv("BRAVE_API_KEY") or st.secrets.get("BRAVE_API_KEY", None)
     if not api_key:
         return "", 0.0
 
+    headers = {"X-Subscription-Token": api_key, "Accept": "application/json"}
     params = {"q": query, "count": TOP_K_SEARCH}
 
     t0 = time.perf_counter()
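The HTTP call that produces the `data` consumed by the loop below sits in unchanged lines (new 108-116) that the diff does not show. A hypothetical, self-contained sketch of that elided step, assuming Brave's public Search API endpoint, a timeout, and simple error handling; none of this is copied from the file:

import requests

def fetch_brave_results(headers: dict, params: dict):
    # Hypothetical helper, not from the diff: fetch the JSON payload that the
    # snippet loop below reads via data.get("web", {}).get("results", []).
    try:
        resp = requests.get(
            "https://api.search.brave.com/res/v1/web/search",  # assumed endpoint
            headers=headers,
            params=params,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()
    except Exception:
        return None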
         return "", 0.0
     t1 = time.perf_counter()
 
+    snippets = []
+    for item in data.get("web", {}).get("results", [])[:TOP_K_SEARCH]:
         desc = item.get("description", "")
+        desc = desc.replace("<strong>", "").replace("</strong>", "").strip()
+        if desc:
+            snippets.append(desc)
 
+    # Paragraph-separated ONLY (no title, no URL)
+    clean_context = "\n\n".join(snippets)
+    return clean_context, (t1 - t0)
 
 # =========================
 # TRUNCATE TO LAST 512 TOKENS
 # =========================
+def truncate_context(web_ctx, local_ctx, system, question):
+    ordered_context = (
+        f"{web_ctx}\n\n{local_ctx}".strip()
+    )
+
+    base = f"\n{system}\n{question}\n"
+    base_tokens = tokenizer.encode(base)
     budget = MAX_INPUT_TOKENS - len(base_tokens)
 
+    ctx_tokens = tokenizer.encode(ordered_context)
     if len(ctx_tokens) <= budget:
+        return ordered_context
 
     # Keep MOST RECENT tokens (tail truncation)
     truncated = ctx_tokens[-budget:]
     return tokenizer.decode(truncated, skip_special_tokens=True)
 
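One consequence of this tail truncation worth spelling out: truncate_context() places the web snippets first and the pasted local context last, and the budget keeps the most recent tokens, so web snippets are trimmed away before the local context is. A tiny illustration with made-up token counts, not taken from the app:

# Made-up numbers for illustration only.
web_tokens = list(range(500))            # pretend the web snippets encode to 500 tokens
local_tokens = list(range(500, 600))     # and the pasted local context to 100 tokens
budget = 400                             # tokens left after system prompt + question

ctx_tokens = web_tokens + local_tokens   # same order as f"{web_ctx}\n\n{local_ctx}"
kept = ctx_tokens[-budget:]              # tail truncation, as in the function above

assert kept[-100:] == local_tokens                 # the local context survives in full
assert len([t for t in kept if t < 500]) == 300    # only the last 300 web tokens remain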
 # =========================
 # STREAM GENERATION
 # =========================
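The body of stream_generate() is unchanged and elided by the diff; only its last two lines appear in the next hunk. A plausible implementation for this kind of cumulative streaming with a Hugging Face seq2seq model, written as an assumption rather than copied from the file, pairs TextIteratorStreamer with a background generation thread:

# Hypothetical sketch of a stream_generate() compatible with the two lines shown
# below (it re-yields the cumulative text after each chunk); not from the file.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_generate(prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=model.generate,
        kwargs=dict(**inputs, max_new_tokens=MAX_NEW_TOKENS, streamer=streamer),
    )
    thread.start()

    text = ""
    for chunk in streamer:
        text += chunk
        yield text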
         text += chunk
         yield text
 
 # =========================
+# FEEDBACK HANDLER (Native st.feedback)
 # =========================
 def handle_feedback(idx: int):
     val = st.session_state[f"feedback_{idx}"]
     msg = st.session_state.messages[idx]
     msg["feedback"] = val
 
     if ls_client and msg.get("run_id"):
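The first half of the LangSmith call that the score=/comment= arguments below belong to is unchanged and not shown (new lines 184-188). A hypothetical, self-contained sketch of what that step presumably does; the feedback key name and the thumbs-to-score mapping are assumptions:

def send_thumb_feedback(client, run_id, val):
    # Hypothetical helper, not from the diff. Assumes st.feedback("thumbs")
    # yields 1 for thumbs-up and 0 for thumbs-down.
    score = 1 if val == 1 else 0
    client.create_feedback(
        run_id=run_id,
        key="user_rating",   # assumed key name
        score=score,
        comment="thumbs_up" if score else "thumbs_down",
    )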
                 score=score,
                 comment="thumbs_up" if score else "thumbs_down",
             )
+        except:
+            pass
 
 # =========================
+# RENDER CHAT HISTORY
 # =========================
 for i, msg in enumerate(st.session_state.messages):
     with st.chat_message(msg["role"]):
         st.markdown(msg["content"])
 
         if msg["role"] == "assistant":
+            # Metrics row
             st.caption(
                 f"🔎 {msg['search_time']:.2f}s search • "
                 f"🧠 {msg['gen_time']:.2f}s generate • "
                 f"🧮 {msg['tokens']} tokens"
             )
 
+            # Inspectable context (clean UX)
+            with st.expander("🔍 Inspect Context Used"):
+                st.markdown("**Web Content:**")
+                st.write(msg["web_context"] or "_None_")
+                st.markdown("**Local Context:**")
+                st.write(msg["local_context"] or "_None_")
+                st.markdown("**Final Truncated Context (512 tokens tail):**")
+                st.write(msg["final_context"])
+
+            # Native thumbs feedback
+            key = f"feedback_{i}"
+            st.session_state.setdefault(key, msg.get("feedback"))
             st.feedback(
                 "thumbs",
+                key=key,
                 disabled=msg.get("feedback") is not None,
                 on_change=handle_feedback,
                 args=(i,),
             )
 
 # =========================
+# USER INPUT (FIXED ORDER)
 # =========================
 query = st.chat_input("Ask a question...")
 
 if query:
+    # 1️⃣ Immediately show user message FIRST (fix streaming race)
     st.session_state.messages.append({"role": "user", "content": query})
+    st.rerun()
 
+# =========================
+# GENERATE AFTER RERUN (Prevents premature streaming)
+# =========================
+if (
+    st.session_state.messages
+    and st.session_state.messages[-1]["role"] == "user"
+    and st.session_state.pending_response is None
+):
+    query = st.session_state.messages[-1]["content"]
+
+    # --- Web Search ---
+    web_ctx = ""
     search_time = 0.0
     if use_web:
+        web_ctx, search_time = web_search_snippets(query)
 
+    # --- Strict Order Context ---
+    final_context = truncate_context(
+        web_ctx,
+        local_context,
         system_prompt,
+        query,
     )
 
+    prompt = f"{final_context}\n{system_prompt}\n{query}\n"
 
+    # LangSmith run
     run_id = None
     if ls_client:
         try:
                 name="teapot_chat",
                 run_type="llm",
                 inputs={
+                    "web_content": web_ctx,
+                    "local_context": local_context,
                     "system_prompt": system_prompt,
                     "question": query,
+                    "final_context": final_context,
                 },
             )
             run_id = run.id
         except:
             pass
 
+    # --- Stream UI ---
     with st.chat_message("assistant"):
         placeholder = st.empty()
+        start = time.perf_counter()
         final_text = ""
+
         for partial in stream_generate(prompt):
             final_text = partial
             placeholder.markdown(final_text)
 
+        gen_time = time.perf_counter() - start
         tokens = len(tokenizer.encode(final_text))
         tps = tokens / gen_time if gen_time > 0 else 0.0
 
         {
             "role": "assistant",
             "content": final_text,
+            "web_context": web_ctx,
+            "local_context": local_context,
+            "final_context": final_context,
             "search_time": search_time,
             "gen_time": gen_time,
             "tokens": tokens,
+            "tps": tps,
             "run_id": run_id,
             "feedback": None,
         }
     )
 
+    st.session_state.pending_response = None
     st.rerun()