SiennaClarke committed on
Commit f6a9a39 · verified · 1 Parent(s): 3699c16

Update app.py

Files changed (1)
  1. app.py +58 -107
app.py CHANGED
@@ -1,121 +1,72 @@
  import streamlit as st
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
- from threading import Thread
  import torch
- import sys

- # --- UI Configuration ---
- st.set_page_config(
-     page_title="Klove AI ChatBox",
-     page_icon="🐘",
-     layout="centered",
-     initial_sidebar_state="collapsed"
- )

- # Professional CSS injection for cleaner UI
- st.markdown("""
- <style>
-     [data-testid='collapsedControl'] { display: none; }
-     .stChatMessage { border-radius: 10px; margin-bottom: 10px; }
- </style>
- """, unsafe_allow_html=True)

- # --- Model Constants ---
- MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"

- @st.cache_resource(show_spinner="Initializing Model Engine...")
- def load_llm():
-     try:
-         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-
-         # Expert Config: nf4 quantization with bfloat16 for better stability if hardware supports it
-         compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-
-         quant_config = BitsAndBytesConfig(
-             load_in_4bit=True,
-             bnb_4bit_compute_dtype=compute_dtype,
-             bnb_4bit_quant_type="nf4",
-             bnb_4bit_use_double_quant=True  # Expert addition: Saves extra VRAM
-         )
-
-         model = AutoModelForCausalLM.from_pretrained(
-             MODEL_ID,
-             quantization_config=quant_config,
-             device_map="auto",
-             trust_remote_code=True,
-             low_cpu_mem_usage=True
-         )
-         return tokenizer, model
-     except Exception as e:
-         st.error(f"Failed to load model: {e}")
-         st.stop()
-
- tokenizer, model = load_llm()
-
- # --- Chat Session State ---
  if "messages" not in st.session_state:
-     st.session_state.messages = []
-
- # --- Header ---
- st.title("🐘 Qwen 2.5 Chat")
- st.caption(f"Backend: {MODEL_ID} (4-bit NF4 Quantized)")
-
- if st.button("Clear Conversation", type="primary"):
-     st.session_state.messages = []
-     st.rerun()

- # --- Message Rendering ---
- for msg in st.session_state.messages:
-     # Handle the 'coder' role mapping to 'assistant' for UI consistency
-     role = "assistant" if msg["role"] == "coder" else msg["role"]
-     with st.chat_message(role):
-         st.markdown(msg["content"])

- # --- Generation Logic ---
- if prompt := st.chat_input("Message to Qwen..."):
-     # Append User Message
      st.session_state.messages.append({"role": "user", "content": prompt})
      with st.chat_message("user"):
          st.markdown(prompt)

-     # Generate Assistant Response
      with st.chat_message("assistant"):
-         placeholder = st.empty()
-         full_response = ""
-
-         # 1. Prepare Inputs
-         inputs = tokenizer.apply_chat_template(
-             # Filter history to only include user/coder roles for the template
-             st.session_state.messages,
-             add_generation_prompt=True,
-             tokenize=True,
-             return_dict=True,
-             return_tensors="pt"
-         ).to(model.device)
-
-         # 2. Setup Streamer
-         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-         # 3. Execution (Expert Note: use inference_mode for speed/memory)
-         generation_kwargs = dict(
-             **inputs,
-             streamer=streamer,
-             max_new_tokens=1024,  # Increased for more robust answers
-             do_sample=True,
-             temperature=0.7,
-             top_p=0.9,  # Added for higher quality sampling
-             pad_token_id=tokenizer.eos_token_id
-         )
-
-         thread = Thread(target=model.generate, kwargs=generation_kwargs)
-         thread.start()
-
-         # 4. Stream Handling
-         for new_text in streamer:
-             full_response += new_text
-             placeholder.markdown(full_response + "▌")
-
-         placeholder.markdown(full_response)
-
-         # Store as 'coder' per original logic requirement
-         st.session_state.messages.append({"role": "coder", "content": full_response})
 
  import streamlit as st
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
  import torch

+ # 1. Page Configuration
+ st.set_page_config(page_title="QwenCoder-Mini", page_icon="💻")
+ st.title("💻 Qwen2.5 Coder: Mini-Claude")
+ st.markdown("Running on **Qwen2.5-Coder-3B-Instruct** (CPU Optimized)")

+ # 2. Model Loading (Cached to prevent reloading on every click)
+ @st.cache_resource
+ def load_model():
+     model_id = "Qwen/Qwen2.5-Coder-3B-Instruct"
+
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+     # Load the model; dtype and device placement are resolved automatically.
+     # The 3B model is small enough for the 16 GB RAM limit without quantization
+     # (a sketch for re-enabling 4-bit loading follows the diff).
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         device_map="auto",
+         torch_dtype="auto",
+         trust_remote_code=True
+     )
+
+     # Create the pipeline
+     pipe = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+     )
+     return pipe

+ # Initialize the pipeline
+ generator = load_model()

+ # 3. Chat History Setup
  if "messages" not in st.session_state:
+     st.session_state.messages = [
+         {"role": "system", "content": "You are an expert software engineer like Claude. Provide complete, production-ready code with explanations."}
+     ]

+ # Display chat history
+ for message in st.session_state.messages:
+     if message["role"] != "system":
+         with st.chat_message(message["role"]):
+             st.markdown(message["content"])

+ # 4. Chat Input & Generation
+ if prompt := st.chat_input("Ask me to write some code..."):
+     # Add user message to state
      st.session_state.messages.append({"role": "user", "content": prompt})
+
      with st.chat_message("user"):
          st.markdown(prompt)

      with st.chat_message("assistant"):
+         with st.spinner("Writing code..."):
+             # Generate response
+             response = generator(
+                 st.session_state.messages,
+                 max_new_tokens=1024,
+                 do_sample=True,  # make sampling explicit so temperature/top_p apply
+                 temperature=0.7,
+                 top_p=0.9,
+                 return_full_text=False
+             )
+
+             answer = response[0]['generated_text']
+             st.markdown(answer)
+
+             # Add assistant message to state
+             st.session_state.messages.append({"role": "assistant", "content": answer})
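
Note on memory: the removed 7B version relied on 4-bit NF4 quantization, while the rewritten app loads the 3B model at its default dtype. If 4-bit loading were wanted again, a minimal sketch mirroring the removed configuration is below. This assumes a CUDA GPU and the bitsandbytes package, so it would not apply on a CPU-only Space; `model_id` here is the new app's checkpoint.

# Sketch: re-enable 4-bit NF4 loading, mirroring the removed configuration.
# Assumes a CUDA GPU with bitsandbytes installed; not applicable on CPU-only hosts.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-Coder-3B-Instruct"

# Prefer bfloat16 compute where the GPU supports it, as the removed code did
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,  # saves a little extra VRAM
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
    trust_remote_code=True,
)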
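The rewritten app also blocks inside `st.spinner` until the whole reply is ready, whereas the removed version streamed tokens as they were generated. A minimal sketch of restoring streaming on top of the new pipeline, following the removed `TextIteratorStreamer` pattern; it assumes it runs inside the app's assistant chat block and reuses the pipeline's `model` and `tokenizer` attributes.

# Sketch: token-by-token streaming with the new model, following the removed pattern.
# `generator` is the pipeline returned by load_model(); its model/tokenizer are reused.
from threading import Thread
from transformers import TextIteratorStreamer

tokenizer = generator.tokenizer
model = generator.model

# Tokenize the chat history with the model's chat template
inputs = tokenizer.apply_chat_template(
    st.session_state.messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Run generation on a background thread and stream tokens into the UI
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(target=model.generate,
       kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
                   do_sample=True, temperature=0.7, top_p=0.9)).start()

placeholder = st.empty()
full_response = ""
for new_text in streamer:
    full_response += new_text
    placeholder.markdown(full_response + "▌")
placeholder.markdown(full_response)

On recent Streamlit versions, `st.write_stream(streamer)` could replace the manual placeholder loop, since the streamer is itself an iterator of text chunks.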