Update app.py
app.py
CHANGED
@@ -1,121 +1,72 @@
 import streamlit as st
-from transformers import AutoModelForCausalLM, AutoTokenizer,
-from threading import Thread
 import torch
-import sys
 
-#
-st.set_page_config(
-
-
-    layout="centered",
-    initial_sidebar_state="collapsed"
-)
 
-#
-st.
-
-
-
-
-
 
-#
-
 
-
-def load_llm():
-    try:
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
-
-        # Expert Config: nf4 quantization with bfloat16 for better stability if hardware supports it
-        compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-
-        quant_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=compute_dtype,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True # Expert addition: Saves extra VRAM
-        )
-
-        model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            quantization_config=quant_config,
-            device_map="auto",
-            trust_remote_code=True,
-            low_cpu_mem_usage=True
-        )
-        return tokenizer, model
-    except Exception as e:
-        st.error(f"Failed to load model: {e}")
-        st.stop()
-
-tokenizer, model = load_llm()
-
-# --- Chat Session State ---
 if "messages" not in st.session_state:
-    st.session_state.messages = [
-
-
-st.title("🐘 Qwen 2.5 Chat")
-st.caption(f"Backend: {MODEL_ID} (4-bit NF4 Quantized)")
-
-if st.button("Clear Conversation", type="primary"):
-    st.session_state.messages = []
-    st.rerun()
 
-#
-for
-
-
-
-        st.markdown(msg["content"])
 
-#
-if prompt := st.chat_input("
-    #
     st.session_state.messages.append({"role": "user", "content": prompt})
     with st.chat_message("user"):
         st.markdown(prompt)
 
-    # Generate Assistant Response
     with st.chat_message("assistant"):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # 3. Execution (Expert Note: use inference_mode for speed/memory)
-        generation_kwargs = dict(
-            **inputs,
-            streamer=streamer,
-            max_new_tokens=1024, # Increased for more robust answers
-            do_sample=True,
-            temperature=0.7,
-            top_p=0.9, # Added for higher quality sampling
-            pad_token_id=tokenizer.eos_token_id
-        )
-
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        # 4. Stream Handling
-        for new_text in streamer:
-            full_response += new_text
-            placeholder.markdown(full_response + "▌")
-
-        placeholder.markdown(full_response)
-
-        # Store as 'coder' per original logic requirement
-        st.session_state.messages.append({"role": "coder", "content": full_response})
 import streamlit as st
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import torch
 
+# 1. Page Configuration
+st.set_page_config(page_title="QwenCoder-Mini", page_icon="💻")
+st.title("💻 Qwen2.5 Coder: Mini-Claude")
+st.markdown("Running on **Qwen2.5-Coder-3B-Instruct** (CPU Optimized)")
 
+# 2. Model Loading (Cached to prevent reloading on every click)
+@st.cache_resource
+def load_model():
+    model_id = "Qwen/Qwen2.5-Coder-3B-Instruct"
+
+    # Load tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+    # Load model with torch_dtype="auto" to save RAM (crucial for the 16GB limit)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        device_map="auto",
+        torch_dtype="auto",
+        trust_remote_code=True
+    )
+
+    # Create the pipeline
+    pipe = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+    )
+    return pipe
 
+# Initialize the pipeline
+generator = load_model()
 
+# 3. Chat History Setup
 if "messages" not in st.session_state:
+    st.session_state.messages = [
+        {"role": "system", "content": "You are an expert software engineer like Claude. Provide complete, production-ready code with explanations."}
+    ]
 
+# Display chat history
+for message in st.session_state.messages:
+    if message["role"] != "system":
+        with st.chat_message(message["role"]):
+            st.markdown(message["content"])
 
+# 4. Chat Input & Generation
+if prompt := st.chat_input("Ask me to write some code..."):
+    # Add user message to state
     st.session_state.messages.append({"role": "user", "content": prompt})
+
     with st.chat_message("user"):
         st.markdown(prompt)
 
     with st.chat_message("assistant"):
+        with st.spinner("Writing code..."):
+            # Generate response
+            response = generator(
+                st.session_state.messages,
+                max_new_tokens=1024,
+                temperature=0.7,
+                top_p=0.9,
+                return_full_text=False
+            )
+
+            answer = response[0]['generated_text']
+            st.markdown(answer)
+
+            # Add assistant message to state
+            st.session_state.messages.append({"role": "assistant", "content": answer})
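Note on the generator(...) call above: on recent transformers releases, the "text-generation" pipeline accepts a chat-format list of {"role": ..., "content": ...} dicts and applies the model's chat template before generating; with return_full_text=False, the returned "generated_text" holds only the newly generated assistant reply (a plain string) rather than the prompt plus completion. A minimal standalone sketch of that behaviour (the user prompt is a made-up example):

# Sketch only, not part of app.py. Assumes a recent transformers release.
from transformers import pipeline

pipe = pipeline("text-generation", model="Qwen/Qwen2.5-Coder-3B-Instruct")

messages = [
    {"role": "system", "content": "You are an expert software engineer."},
    {"role": "user", "content": "Write a function that reverses a string."},  # hypothetical prompt
]

# The pipeline applies the model's chat template internally;
# return_full_text=False -> only the new assistant text is returned.
out = pipe(messages, max_new_tokens=256, return_full_text=False)
print(out[0]["generated_text"])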
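The new version loads the model with torch_dtype="auto" because bitsandbytes 4-bit quantization requires a CUDA GPU and is not available on a CPU-only Space. For a GPU-backed deployment, a rough sketch of NF4 4-bit loading, mirroring the removed version of this file (the model id is taken from the new version, since the removed file's MODEL_ID value was not captured):

# Sketch only: 4-bit NF4 loading with bitsandbytes. Requires a CUDA device and the
# bitsandbytes package; not usable on a CPU-only Space.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Qwen/Qwen2.5-Coder-3B-Instruct"  # assumed; the removed file's MODEL_ID was not captured
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,  # double quantization saves a little extra VRAM
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,
    device_map="auto",
)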
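The removed version streamed tokens into the chat window with a TextIteratorStreamer read from a background thread, rather than waiting for the full completion. A rough sketch of that pattern as a standalone helper; the helper name, the on_delta callback, and the chat-template call are illustrative reconstructions, since the corresponding lines were not captured in the diff:

# Sketch only: streaming generation with TextIteratorStreamer, the pattern used by the
# removed version of app.py. stream_reply and on_delta are hypothetical names.
from threading import Thread

from transformers import TextIteratorStreamer


def stream_reply(model, tokenizer, messages, on_delta, max_new_tokens=1024):
    """Generate a reply for chat-format messages, calling on_delta(text_so_far) as tokens arrive."""
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )

    # model.generate blocks, so run it in a background thread and read from the streamer.
    Thread(target=model.generate, kwargs=kwargs).start()

    text = ""
    for delta in streamer:
        text += delta
        on_delta(text)
    return text

In the Streamlit app, on_delta would be something like lambda t: placeholder.markdown(t + "▌"), with placeholder = st.empty() created inside the assistant chat message, as in the removed code.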