SiennaClarke committed on
Commit
d7f5026
·
verified ·
1 Parent(s): 4c6dc8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -63
app.py CHANGED
@@ -1,90 +1,82 @@
1
  import streamlit as st
2
- from llama_cpp import Llama
3
- import re
 
4
 
5
- # Page configuration
6
- st.set_page_config(page_title="Qwen 3 Advanced AI", page_icon="🧠", layout="wide")
7
 
8
- # 1. Model Configuration
9
- # Qwen 3 4B Thinking is the flagship 2026 small model with deep reasoning
10
- MODEL_REPO = "unsloth/Qwen3-4B-Thinking-2507-GGUF"
11
- MODEL_FILE = "Qwen3-4B-Thinking-2507-Q4_K_M.gguf"
12
 
13
  @st.cache_resource
14
- def load_qwen():
15
- return Llama.from_pretrained(
16
- repo_id=MODEL_REPO,
17
- filename=MODEL_FILE,
18
- n_ctx=8192, # Sufficient context for long reasoning chains
19
- n_threads=4, # Optimized for standard multi-core CPUs
20
- verbose=False
21
  )
 
22
 
23
- llm = load_qwen()
24
 
25
- # 2. UI Elements
26
- st.title("🧠 Qwen 3 Reasoning Hub")
27
- st.markdown("This model uses **Native Thinking** to solve logic, math, and code.")
 
 
28
 
29
  if "messages" not in st.session_state:
30
  st.session_state.messages = []
31
 
32
- # Sidebar for Mode Toggle
33
- with st.sidebar:
34
- st.header("Settings")
35
- reasoning_on = st.toggle("Enable Deep Reasoning (/think)", value=True)
36
- if st.button("Clear Chat"):
37
  st.session_state.messages = []
38
  st.rerun()
39
 
40
- # Display Chat History
41
  for msg in st.session_state.messages:
42
  with st.chat_message(msg["role"]):
43
  st.markdown(msg["content"])
44
 
45
- # 3. Main Chat Logic
46
- if prompt := st.chat_input("Ask a difficult logic question..."):
47
  st.session_state.messages.append({"role": "user", "content": prompt})
48
  with st.chat_message("user"):
49
  st.markdown(prompt)
50
 
51
  with st.chat_message("assistant"):
52
- # Qwen 3 Template with 'Soft Switch'
53
- prefix = "/think " if reasoning_on else "/no_think "
54
- formatted_prompt = f"<|im_start|>user\n{prefix}{prompt}<|im_end|>\n<|im_start|>assistant\n"
55
 
56
- response_placeholder = st.empty()
57
- full_text = ""
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
- # Stream the response
60
- # Using Temperature 0.6 as per Qwen 3 official best practices for thinking
61
- for chunk in llm(
62
- formatted_prompt,
63
- max_tokens=2048,
64
- stream=True,
65
- stop=["<|im_end|>"],
66
- temperature=0.6,
67
- top_p=0.95
68
- ):
69
- token = chunk['choices'][0]['text']
70
- full_text += token
71
-
72
- # Format the <think> block for better UI
73
- # This hides the thinking process inside a blockquote
74
- display_text = full_text
75
- if "<think>" in display_text:
76
- parts = re.split(r'(<think>.*?</think>)', display_text, flags=re.DOTALL)
77
- clean_display = ""
78
- for part in parts:
79
- if part.startswith("<think>"):
80
- thought = part.replace("<think>", "").replace("</think>", "").strip()
81
- clean_display += f"> 💭 **Reasoning:**\n> {thought}\n\n"
82
- else:
83
- clean_display += part
84
- response_placeholder.markdown(clean_display + "▌")
85
- else:
86
- response_placeholder.markdown(display_text + "▌")
87
 
88
- # Final render without the cursor
89
- response_placeholder.markdown(clean_display if "<think>" in full_text else full_text)
90
- st.session_state.messages.append({"role": "assistant", "content": full_text})
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
3
+ from threading import Thread
4
+ import torch
5
 
6
+ # Clean, centered layout without sidebar
7
+ st.set_page_config(page_title="Qwen 3 4B Stream", page_icon="", layout="centered", initial_sidebar_state="collapsed")
8
 
9
+ # 1. Model Configuration (Qwen 3 4B - 4-bit for speed)
10
+ MODEL_ID = "unsloth/Qwen3-4B-Instruct-2507-bnb-4bit"
 
 
11
 
12
  @st.cache_resource
13
+ def load_resource():
14
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
15
+ model = AutoModelForCausalLM.from_pretrained(
16
+ MODEL_ID,
17
+ device_map="auto",
18
+ torch_dtype="auto"
 
19
  )
20
+ return tokenizer, model
21
 
22
+ tokenizer, model = load_resource()
23
 
24
+ # Custom CSS to hide the sidebar toggle
25
+ st.markdown("<style>[data-testid='collapsedControl'] { display: none; }</style>", unsafe_allow_html=True)
26
+
27
+ st.title("⚡ Qwen 3 4B Stream")
28
+ st.caption("Real-time local generation | No Sidebar")
29
 
30
  if "messages" not in st.session_state:
31
  st.session_state.messages = []
32
 
33
+ # Action Buttons
34
+ col1, col2 = st.columns([5, 1])
35
+ with col2:
36
+ if st.button("Reset"):
 
37
  st.session_state.messages = []
38
  st.rerun()
39
 
40
+ # Display chat history
41
  for msg in st.session_state.messages:
42
  with st.chat_message(msg["role"]):
43
  st.markdown(msg["content"])
44
 
45
+ # 2. Streaming Chat Input
46
+ if prompt := st.chat_input("Ask Qwen 3..."):
47
  st.session_state.messages.append({"role": "user", "content": prompt})
48
  with st.chat_message("user"):
49
  st.markdown(prompt)
50
 
51
  with st.chat_message("assistant"):
52
+ # Setup the Streamer
53
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
54
 
55
+ # Prepare input
56
+ input_text = tokenizer.apply_chat_template(st.session_state.messages, tokenize=False, add_generation_prompt=True)
57
+ inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
58
+
59
+ # 3. Generation in a separate thread
60
+ generation_kwargs = dict(
61
+ **inputs,
62
+ streamer=streamer,
63
+ max_new_tokens=1024,
64
+ do_sample=True,
65
+ temperature=0.7,
66
+ top_p=0.8,
67
+ pad_token_id=tokenizer.eos_token_id
68
+ )
69
 
70
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
71
+ thread.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ # 4. Stream to UI
74
+ placeholder = st.empty()
75
+ full_response = ""
76
+
77
+ for new_text in streamer:
78
+ full_response += new_text
79
+ placeholder.markdown(full_response + "▌")
80
+
81
+ placeholder.markdown(full_response)
82
+ st.session_state.messages.append({"role": "assistant", "content": full_response})