Spaces:

SiennaClarke
/

ChatBoxApp

Sleeping

App Files Files Community

SiennaClarke committed on Jan 21

Commit

cd09b92

verified ·

1 Parent(s): 8fe6af1

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -27

app.py CHANGED Viewed

@@ -1,59 +1,63 @@
 import streamlit as st
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 import torch
-# 1. Page Configuration (Centered and No Sidebar)
-st.set_page_config(
-    page_title="Qwen 3 0.6B Chat",
-    page_icon="⚡",
-    layout="centered",
-    initial_sidebar_state="collapsed"
-)
-# Custom CSS to hide the sidebar toggle button entirely
 st.markdown("<style>[data-testid='collapsedControl'] { display: none; }</style>", unsafe_allow_html=True)
-# 2. Model & Tokenizer Initialization (Using your direct load logic)
-MODEL_ID = "Qwen/Qwen3-0.6B"
 @st.cache_resource
 def load_llm():
-    # Loading the tokenizer and model directly as requested
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
-        device_map="auto",
-        torch_dtype="auto"
     )
     return tokenizer, model
 tokenizer, model = load_llm()
-# 3. Chat UI Logic
-st.title("⚡ Qwen 3 0.6B")
-st.caption("Using your direct-load logic with real-time streaming.")
 if "messages" not in st.session_state:
     st.session_state.messages = []
 # Display history
 for msg in st.session_state.messages:
     with st.chat_message(msg["role"]):
         st.markdown(msg["content"])
-# 4. Input & Streaming Generation
-if prompt := st.chat_input("Ask Qwen 3..."):
-    # Store and display user message
     st.session_state.messages.append({"role": "user", "content": prompt})
     with st.chat_message("user"):
         st.markdown(prompt)
     with st.chat_message("assistant"):
-        # Initialize the streamer
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        # Using your chat template logic
         inputs = tokenizer.apply_chat_template(
             st.session_state.messages,
             add_generation_prompt=True,
@@ -62,24 +66,22 @@ if prompt := st.chat_input("Ask Qwen 3..."):
             return_tensors="pt",
         ).to(model.device)
-        # Background thread for generation
         generation_kwargs = dict(
             **inputs,
             streamer=streamer,
             max_new_tokens=512,
             do_sample=True,
             temperature=0.7,
-            top_p=0.8,
             pad_token_id=tokenizer.eos_token_id
         )
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
-        # Update the UI as tokens arrive
         placeholder = st.empty()
         full_response = ""
         for new_text in streamer:
             full_response += new_text
             placeholder.markdown(full_response + "▌")

 import streamlit as st
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 from threading import Thread
 import torch
+# UI Setup (No Sidebar as requested)
+st.set_page_config(page_title="Qwen 2.5 32B Chat", page_icon="🐘", layout="centered", initial_sidebar_state="collapsed")
 st.markdown("<style>[data-testid='collapsedControl'] { display: none; }</style>", unsafe_allow_html=True)
+# 1. Model Configuration (Quantized to fit on 24GB VRAM or 32GB RAM)
+MODEL_ID = "Qwen/Qwen2.5-32B-Instruct"
 @st.cache_resource
 def load_llm():
     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+    # Quantization settings: 4-bit NF4 weights with float16 compute. Per the original author's note this lets the ~64GB model fit in ~18-20GB of memory (figure not verifiable from this file).
+    quant_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.float16,
+        bnb_4bit_quant_type="nf4"
+    )
     model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID,
+        quantization_config=quant_config,
+        device_map="auto" # Device placement is left to the "auto" device map. NOTE(review): the original comment claimed an automatic GPU/CPU split - confirm CPU offload actually works with 4-bit bitsandbytes weights before relying on it.
     )
     return tokenizer, model
 tokenizer, model = load_llm()
+# 2. Chat Interface
+st.title("🐘 Qwen 2.5 32B")
+st.caption("Running high-parameter model with 4-bit quantization")
 if "messages" not in st.session_state:
     st.session_state.messages = []
+# Action Button
+if st.button("Clear History"):
+    st.session_state.messages = []
+    st.rerun()
 # Display history
 for msg in st.session_state.messages:
     with st.chat_message(msg["role"]):
         st.markdown(msg["content"])
+# 3. Chat Logic with your exact Template Code
+if prompt := st.chat_input("Message Qwen 2.5 32B..."):
     st.session_state.messages.append({"role": "user", "content": prompt})
     with st.chat_message("user"):
         st.markdown(prompt)
     with st.chat_message("assistant"):
+        # Setup Streamer
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        # YOUR EXACT LOGIC: Applying the chat template
         inputs = tokenizer.apply_chat_template(
             st.session_state.messages,
             add_generation_prompt=True,
             return_tensors="pt",
         ).to(model.device)
+        # Threading for live streaming
         generation_kwargs = dict(
             **inputs,
             streamer=streamer,
             max_new_tokens=512,
             do_sample=True,
             temperature=0.7,
             pad_token_id=tokenizer.eos_token_id
         )
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
+        # Word-by-word UI update
         placeholder = st.empty()
         full_response = ""
         for new_text in streamer:
             full_response += new_text
             placeholder.markdown(full_response + "▌")