Update src/streamlit_app.py
src/streamlit_app.py (CHANGED: +22 -59)
@@ -58,19 +58,15 @@ if user_input:

     with st.spinner("Thinking..."):
         try:
-            #
-
-
-
-
-
-
-            inputs = tokenizer.apply_chat_template(
-                messages,
-                add_generation_prompt=True,
-                tokenize=True,
-                return_dict=True,
+            # Manually format the chat prompt
+            system_message = system_prompt()
+            prompt = f"<|SYSTEM|> {system_message} <|USER|> {user_input} <|ASSISTANT>"
+
+            # Tokenize the formatted prompt
+            inputs = tokenizer(
+                prompt,
                 return_tensors="pt",
+                add_special_tokens=True
             ).to(device)

             # Generate tokens
@@ -80,7 +76,8 @@ if user_input:
             # Stream tokens
             generated = inputs["input_ids"]
             outputs = model.generate(
-
+                input_ids=inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
                 max_new_tokens=200,
                 do_sample=False,
                 temperature=0.5,
@@ -92,56 +89,22 @@ if user_input:
             )
             sequence = outputs.sequences[0]

-            # Decode
+            # Decode tokens one by one, preserving spaces
             for i in range(generated.shape[-1], sequence.shape[-1]):
                 token_id = sequence[i].unsqueeze(0)
-                text = tokenizer.decode(token_id, skip_special_tokens=True)
-                if text.strip():
+                text = tokenizer.decode(token_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+                if text:
                     full_response += text
                     placeholder.markdown(full_response)

-
-
+            # Final response, decoding only new tokens
+            final_response = tokenizer.decode(
+                sequence[generated.shape[-1]:],
+                skip_special_tokens=True,
+                clean_up_tokenization_spaces=True
+            ).strip()
+            st.chat_message("assistant").markdown(final_response)
+            save_message(selected_chat_id, "assistant", final_response)

             except Exception as e:
-                st.error(f"Error: {str(e)}")
-                # Fallback to manual formatting if apply_chat_template fails
-                try:
-                    system_message = system_prompt()
-                    prompt = f"<|SYSTEM|> {system_message} <|USER|> {user_input} <|ASSISTANT>"
-                    inputs = tokenizer(
-                        prompt,
-                        return_tensors="pt",
-                        add_special_tokens=True
-                    ).to(device)
-
-                    full_response = ""
-                    placeholder = st.empty()
-
-                    generated = inputs["input_ids"]
-                    outputs = model.generate(
-                        input_ids=inputs["input_ids"],
-                        attention_mask=inputs["attention_mask"],
-                        max_new_tokens=200,
-                        do_sample=False,
-                        temperature=0.5,
-                        top_p=0.9,
-                        eos_token_id=tokenizer.eos_token_id,
-                        pad_token_id=tokenizer.eos_token_id,
-                        return_dict_in_generate=True,
-                        output_scores=False
-                    )
-                    sequence = outputs.sequences[0]
-
-                    for i in range(generated.shape[-1], sequence.shape[-1]):
-                        token_id = sequence[i].unsqueeze(0)
-                        text = tokenizer.decode(token_id, skip_special_tokens=True)
-                        if text.strip():
-                            full_response += text
-                            placeholder.markdown(full_response)
-
-                    st.chat_message("assistant").markdown(full_response)
-                    save_message(selected_chat_id, "assistant", full_response)
-
-                except Exception as fallback_e:
-                    st.error(f"Fallback Error: {str(fallback_e)}")
+                st.error(f"Error: {str(e)}")
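Net effect of the commit: the apply_chat_template path and its duplicated manual-formatting fallback collapse into a single manually formatted prompt, input_ids and attention_mask are passed to model.generate explicitly, and the saved reply is re-decoded from only the newly generated tokens. For readers who want the result in one piece, below is a minimal runnable sketch of the updated block as reconstructed from the diff; the model name, the system_prompt/save_message stubs, and selected_chat_id are placeholders, while everything inside the try block mirrors the committed code:

import torch
import streamlit as st
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative setup: the real app loads these elsewhere, and "MODEL_NAME"
# is a placeholder, not the model the commit actually uses
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("MODEL_NAME")
model = AutoModelForCausalLM.from_pretrained("MODEL_NAME").to(device)

def system_prompt():
    # Stand-in for the app's helper of the same name
    return "You are a helpful assistant."

def save_message(chat_id, role, content):
    # Stand-in; the real app persists the message to its chat store
    pass

selected_chat_id = "demo"
user_input = st.chat_input("Message")

if user_input:
    with st.spinner("Thinking..."):
        try:
            # Manually format and tokenize the chat prompt (markers copied from the diff)
            prompt = f"<|SYSTEM|> {system_prompt()} <|USER|> {user_input} <|ASSISTANT>"
            inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(device)

            full_response = ""
            placeholder = st.empty()

            generated = inputs["input_ids"]
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=200,
                do_sample=False,  # greedy decoding; temperature/top_p would have no effect
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.eos_token_id,
                return_dict_in_generate=True,
            )
            sequence = outputs.sequences[0]

            # generate() has already returned, so this loop replays the finished
            # tokens one by one rather than streaming them as they are produced
            for i in range(generated.shape[-1], sequence.shape[-1]):
                text = tokenizer.decode(sequence[i].unsqueeze(0), skip_special_tokens=True,
                                        clean_up_tokenization_spaces=True)
                if text:
                    full_response += text
                    placeholder.markdown(full_response)

            # Decode only the newly generated tokens for the saved message
            final_response = tokenizer.decode(
                sequence[generated.shape[-1]:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            ).strip()
            st.chat_message("assistant").markdown(final_response)
            save_message(selected_chat_id, "assistant", final_response)
        except Exception as e:
            st.error(f"Error: {str(e)}")

Note that the committed code keeps temperature=0.5 and top_p=0.9 alongside do_sample=False; with greedy decoding, transformers ignores those sampling parameters (and typically logs a warning), so the sketch omits them.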