SiennaClarke committed on
Commit
1d5a30d
·
verified ·
1 Parent(s): 3b3aef1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -25
app.py CHANGED
@@ -3,73 +3,98 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
3
  from threading import Thread
4
  import torch
5
 
6
- # 1. Page Configuration (No Sidebar)
7
- st.set_page_config(page_title="Claude Clone", page_icon="🤖", layout="centered")
8
 
 
9
  st.markdown("""
10
  <style>
11
  [data-testid="stSidebar"] {display: none;}
12
- .stChatMessage {border-radius: 15px; padding: 10px; margin-bottom: 10px;}
 
13
  </style>
14
  """, unsafe_allow_html=True)
15
 
16
- st.title("Qwen 2.5 Coder 1.5B 🚀")
17
- st.caption("Now with real-time streaming and optimized CPU inference.")
18
 
19
  # 2. Optimized Model Loading
20
- @st.cache_resource
21
  def load_model():
22
- model_id = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
 
23
  tokenizer = AutoTokenizer.from_pretrained(model_id)
24
 
25
- # Use bfloat16 for speed on modern CPUs, or float32 for maximum compatibility
 
 
 
 
 
26
  model = AutoModelForCausalLM.from_pretrained(
27
  model_id,
28
- torch_dtype=torch.float32, # CPU-friendly
29
  device_map="auto"
30
  )
31
- return model, tokenizer
32
 
33
- model, tokenizer = load_model()
34
 
35
- # 3. Session State
36
  if "messages" not in st.session_state:
37
  st.session_state.messages = []
38
 
39
- # Display History
40
  for message in st.session_state.messages:
41
  with st.chat_message(message["role"]):
42
  st.markdown(message["content"])
43
 
44
  # 4. Chat Input & Streaming Logic
45
- if prompt := st.chat_input("Ask me anything..."):
 
46
  st.session_state.messages.append({"role": "user", "content": prompt})
 
47
  with st.chat_message("user"):
48
  st.markdown(prompt)
49
 
50
  with st.chat_message("assistant"):
51
- # Set up the streamer
52
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
53
 
54
- # Prepare the input
55
- messages = [{"role": "system", "content": "You are a helpful coding assistant."}] + st.session_state.messages
56
- inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
 
 
 
 
 
 
 
 
57
 
58
- # Run generation in a separate thread to allow UI to remain responsive
59
  generation_kwargs = dict(
60
- input_ids=inputs,
61
  streamer=streamer,
62
- max_new_tokens=512,
63
  do_sample=True,
64
  temperature=0.7,
65
- top_p=0.9,
 
66
  pad_token_id=tokenizer.eos_token_id
67
  )
68
-
 
69
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
70
  thread.start()
71
 
72
- # Stream the response to the UI
73
- full_response = st.write_stream(streamer)
 
74
 
 
 
 
 
75
  st.session_state.messages.append({"role": "assistant", "content": full_response})
 
from threading import Thread
import torch

# 1. Page Configuration
st.set_page_config(page_title="Qwen Chat", page_icon="🧠", layout="centered")

# Custom CSS for a cleaner "Claude-like" feel
st.markdown("""
<style>
[data-testid="stSidebar"] {display: none;}
.stChatMessage { border-radius: 10px; margin-bottom: 5px; }
.stChatInputContainer { padding-bottom: 20px; }
</style>
""", unsafe_allow_html=True)

st.title("Qwen 2.5 3B Chat 🚀")
st.caption("A balanced, high-performance model for local CPU/GPU inference.")

# 2. Optimized Model Loading
# FIX: decorator was misspelled `st.cache_resourced`, which raises
# AttributeError at import time; the correct name is `st.cache_resource`.
@st.cache_resource
def load_model():
    """Load the Qwen chat model and tokenizer once per server process.

    Returns:
        (model, tokenizer, device): the causal-LM model, its tokenizer,
        and the detected device string ("cuda" or "cpu").
    """
    # '3B' is the most feasible mid-point for modern laptops/PCs
    model_id = "Qwen/Qwen2.5-3B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Auto-detect device (use GPU if available, else CPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # float16 on GPU, bfloat16 on modern CPUs, to halve memory vs float32
    dtype = torch.float16 if device == "cuda" else torch.bfloat16

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=dtype,
        device_map="auto",
    )
    return model, tokenizer, device


model, tokenizer, device = load_model()

# 3. Session State for Chat History
if "messages" not in st.session_state:
    st.session_state.messages = []

# Display Chat History
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# 4. Chat Input & Streaming Logic
if prompt := st.chat_input("How can I help you today?"):
    # Add user message to history
    st.session_state.messages.append({"role": "user", "content": prompt})

    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        # Streamer yields decoded tokens as generate() produces them
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        # Format conversation using the model's chat template.
        # Limit history to the last 10 messages (~5 user/assistant turns)
        # to keep prompt length — and CPU latency — bounded.
        context_messages = st.session_state.messages[-10:]
        full_prompt = [{"role": "system", "content": "You are Qwen, a helpful and concise AI assistant."}] + context_messages

        # FIX: move inputs to model.device rather than the local `device`
        # string — with device_map="auto", accelerate decides placement and
        # model.device is the authoritative target (as the pre-diff code did).
        model_inputs = tokenizer.apply_chat_template(
            full_prompt,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(model.device)

        # Generation Arguments
        generation_kwargs = dict(
            input_ids=model_inputs,
            streamer=streamer,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.7,
            top_p=0.8,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Run generation in a background thread so the UI thread can
        # consume the streamer concurrently.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # st.write_stream drains the streamer into the UI and returns the
        # concatenated text. (Removed dead code: st.empty() container and
        # the pre-initialized full_response = "" were never used.)
        full_response = st.write_stream(streamer)

        # FIX: join the worker so generation has fully finished before the
        # response is persisted and the script reruns.
        thread.join()

    # Save assistant response to history
    st.session_state.messages.append({"role": "assistant", "content": full_response})