Spaces:

Neon-AI
/

Chatbot_test

Paused

App Files Files Community

Neon-AI committed on Jan 29

Commit

35eaef3

verified ·

1 Parent(s): 335405a

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -69

app.py CHANGED Viewed

@@ -1,39 +1,35 @@
 import streamlit as st
-from llama_cpp import Llama
-# ================= CONFIG =================
-MODEL_PATH = "model.gguf"
-N_CTX = 16384
-N_THREADS = 4        # HF free CPU sweet spot
-N_BATCH = 256
-MAX_TOKENS = 16384
 TEMPERATURE = 0.7
 TOP_P = 0.9
-# ==========================================
 st.set_page_config(page_title="Niche AI", layout="centered")
 st.title("🧠 Niche AI")
-st.caption("llama.cpp · CPU · Embedded · Streaming")
-# ---------- LAZY LOADING ----------
-if "llm" not in st.session_state:
-    st.session_state.llm = None
-def get_llm():
-    if st.session_state.llm is None:
-        with st.spinner("Loading model..."):
-            st.session_state.llm = Llama(
-                model_path=MODEL_PATH,
-                n_ctx=N_CTX,
-                n_threads=N_THREADS,
-                n_batch=N_BATCH,
-                f16_kv=True,
-                use_mmap=True,
-                use_mlock=False,
-                verbose=False,
-            )
-    return st.session_state.llm
 # ---------- SESSION STATE ----------
 if "history" not in st.session_state:
@@ -42,10 +38,8 @@ if "history" not in st.session_state:
 # ---------- INPUT ----------
 prompt = st.text_input("You", placeholder="Say something…")
-SYSTEM_PROMPT = """You are Kushina.
 You operate in exactly ONE of two modes.
 ====================
 MODE: CHAT
 ====================
@@ -55,10 +49,11 @@ Rules:
 - Neutral → neutral.
 - Serious → serious.
 - Rude → curt or dismissive.
-- Replies must be short (1–3 sentences).
 - No emojis unless the user uses them first.
 - No explanations unless explicitly asked.
 ====================
 MODE: CODE
 ====================
@@ -67,57 +62,81 @@ Rules:
 - No emojis.
 - No jokes.
 - No commentary.
 - Output ONLY code unless explicitly asked to explain.
-- Follow best practices.
 - Finish the task completely.
 ====================
 MODE SELECTION
 ====================
-Switch to MODE: CODE if the user asks for:
-code, script, function, program, website, api, algorithm, app
-Otherwise use MODE: CHAT.
 ====================
 IDENTITY
 ====================
-Name: Kushina
-Creator: Neon
-Mention Neon ONLY if explicitly asked.
 """
-def build_prompt(user_text: str) -> str:
-    return f"""<|system|>
-{SYSTEM_PROMPT}
-<|user|>
-{user_text}
-<|assistant|>
-"""
-if st.button("Send") and prompt.strip():
-    st.session_state.history.append(("You", prompt))
-    llm = get_llm()  # Lazy load here
-    full_prompt = build_prompt(prompt)
-    placeholder = st.empty()
     output_text = ""
-    for chunk in llm(
-        full_prompt,
-        max_tokens=MAX_TOKENS,
-        temperature=TEMPERATURE,
-        top_p=TOP_P,
-        stream=True,
-        stop=["<|user|>", "<|system|>"],
-    ):
-        if "choices" in chunk:
-            token = chunk["choices"][0]["text"]
-            output_text += token
             placeholder.markdown(f"**Niche:** {output_text}")
-    st.session_state.history.append(("Niche", output_text))
 # ---------- DISPLAY HISTORY ----------
 for speaker, text in st.session_state.history:

 import streamlit as st
+import torch
+import threading
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+# ---------------- CONFIG ----------------
+MODEL_ID = "Neon-AI/Kushina"
+MAX_MODEL_TOKENS = 16384  # full context size
 TEMPERATURE = 0.7
 TOP_P = 0.9
+# ----------------------------------------
 st.set_page_config(page_title="Niche AI", layout="centered")
 st.title("🧠 Niche AI")
+st.caption("HF Free Space · CPU · Streaming")
+# ---------- LOAD MODEL ----------
+@st.cache_resource
+def load_model():
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID,
+        dtype=torch.float32,
+        device_map="cpu"
+    )
+    # Report a LoRA/PEFT adapter if one is attached; this check only logs and does not disable it
+    if hasattr(model, "peft_config"):
+        print("LoRA detected and already loaded ✅")
+    model.eval()
+    return tokenizer, model
+tokenizer, model = load_model()
 # ---------- SESSION STATE ----------
 if "history" not in st.session_state:
 # ---------- INPUT ----------
 prompt = st.text_input("You", placeholder="Say something…")
+SYSTEM_INSTRUCTIONS = """You are Kushina.
 You operate in exactly ONE of two modes.
 ====================
 MODE: CHAT
 ====================
 - Neutral → neutral.
 - Serious → serious.
 - Rude → curt or dismissive.
+- Mirroring of emotions is very important and must be taken as priority.
+- No enthusiasm by default.
 - No emojis unless the user uses them first.
+- Replies must be short (1–3 sentences).
 - No explanations unless explicitly asked.
 ====================
 MODE: CODE
 ====================
 - No emojis.
 - No jokes.
 - No commentary.
+- No introductions.
 - Output ONLY code unless explicitly asked to explain.
+- Follow standard best practices.
+- Be deterministic and professional.
 - Finish the task completely.
 ====================
 MODE SELECTION
 ====================
+Automatically switch to MODE: CODE if the user requests:
+- code, script, function, program, website, API, algorithm, app
+Otherwise, use MODE: CHAT.
 ====================
 IDENTITY
 ====================
+- Name: Kushina
+- Creator/Owner: Neon
+- Mention Neon ONLY if explicitly asked.
 """
+def build_prompt(user_text: str):
+    chat = [
+        {"role": "system", "content": SYSTEM_INSTRUCTIONS},
+        {"role": "user", "content": user_text}
+    ]
+    return tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt", return_dict=True)
+# ---------- GENERATE FUNCTION ----------
+def generate_response(inputs):
+    # Compute remaining tokens dynamically
+    current_tokens = inputs["input_ids"].shape[1]
+    max_new_tokens = max(1, MAX_MODEL_TOKENS - current_tokens)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    gen_kwargs = dict(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=TEMPERATURE,
+        top_p=TOP_P,
+        eos_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.eos_token_id,
+        streamer=streamer
+    )
+    # Run generation in a separate thread
+    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
+    thread.start()
+    # Stream tokens into a buffer and only display complete sentences
+    buffer = ""
     output_text = ""
+    placeholder = st.empty()
+    sentence_endings = {".", "!", "?"}
+    for token in streamer:
+        buffer += token
+        if any(buffer.rstrip().endswith(punct) for punct in sentence_endings):
+            output_text += buffer
             placeholder.markdown(f"**Niche:** {output_text}")
+            buffer = ""
+    # Add any leftover text
+    if buffer:
+        output_text += buffer
+        placeholder.markdown(f"**Niche:** {output_text}")
+    return output_text
+# ---------- HANDLE PROMPT ----------
+if st.button("Send") and prompt.strip():
+    st.session_state.history.append(("You", prompt))
+    inputs = build_prompt(prompt)
+    response_text = generate_response(inputs)
+    st.session_state.history.append(("Niche", response_text))
 # ---------- DISPLAY HISTORY ----------
 for speaker, text in st.session_state.history: