ZeroGPU-LLM-Inference

Build error

App Files Files Community

Luigi committed on Apr 10, 2025

Commit

14564aa

1 Parent(s): 5db22d5

Bugfix for `<think>` tag handling

Browse files

Files changed (1) hide show

app.py +3 -7

app.py CHANGED Viewed

@@ -116,7 +116,6 @@ def validate_or_download_model():
         cleanup_old_models()
         download_model()
-    # First load attempt
     result = try_load_model(model_path)
     if isinstance(result, str):
         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
@@ -153,7 +152,6 @@ st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 user_input = st.chat_input("Ask something...")
 if user_input:
-    # Prevent appending user message if assistant hasn't replied yet
     if len(st.session_state.chat_history) % 2 == 1:
         st.warning("Please wait for the assistant to respond before sending another message.")
     else:
@@ -162,15 +160,12 @@ if user_input:
         with st.chat_message("user"):
             st.markdown(user_input)
-        # Trim conversation history to max 8 turns (user+assistant)
         MAX_TURNS = 8
         trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
         messages = [{"role": "system", "content": system_prompt}] + trimmed_history
         with st.chat_message("assistant"):
             full_response = ""
-            response_area = st.empty()
             stream = llm.create_chat_completion(
                 messages=messages,
                 max_tokens=max_tokens,
@@ -185,8 +180,9 @@ if user_input:
                 if "choices" in chunk:
                     delta = chunk["choices"][0]["delta"].get("content", "")
                     full_response += delta
-                    visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-                    response_area.markdown(visible)
             st.session_state.chat_history.append({"role": "assistant", "content": full_response})

         cleanup_old_models()
         download_model()
     result = try_load_model(model_path)
     if isinstance(result, str):
         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
 user_input = st.chat_input("Ask something...")
 if user_input:
     if len(st.session_state.chat_history) % 2 == 1:
         st.warning("Please wait for the assistant to respond before sending another message.")
     else:
         with st.chat_message("user"):
             st.markdown(user_input)
         MAX_TURNS = 8
         trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
         messages = [{"role": "system", "content": system_prompt}] + trimmed_history
         with st.chat_message("assistant"):
             full_response = ""
             stream = llm.create_chat_completion(
                 messages=messages,
                 max_tokens=max_tokens,
                 if "choices" in chunk:
                     delta = chunk["choices"][0]["delta"].get("content", "")
                     full_response += delta
+            visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+            st.markdown(visible_response)
             st.session_state.chat_history.append({"role": "assistant", "content": full_response})