Neon-AI committed on
Commit
35eaef3
·
verified ·
1 Parent(s): 335405a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -69
app.py CHANGED
@@ -1,39 +1,35 @@
1
  import streamlit as st
2
- from llama_cpp import Llama
 
 
3
 
4
- # ================= CONFIG =================
5
- MODEL_PATH = "model.gguf"
6
- N_CTX = 16384
7
- N_THREADS = 4 # HF free CPU sweet spot
8
- N_BATCH = 256
9
-
10
- MAX_TOKENS = 16384
11
  TEMPERATURE = 0.7
12
  TOP_P = 0.9
13
- # ==========================================
14
 
15
  st.set_page_config(page_title="Niche AI", layout="centered")
16
  st.title("🧠 Niche AI")
17
- st.caption("llama.cpp · CPU · Embedded · Streaming")
18
-
19
- # ---------- LAZY LOADING ----------
20
- if "llm" not in st.session_state:
21
- st.session_state.llm = None
22
-
23
- def get_llm():
24
- if st.session_state.llm is None:
25
- with st.spinner("Loading model..."):
26
- st.session_state.llm = Llama(
27
- model_path=MODEL_PATH,
28
- n_ctx=N_CTX,
29
- n_threads=N_THREADS,
30
- n_batch=N_BATCH,
31
- f16_kv=True,
32
- use_mmap=True,
33
- use_mlock=False,
34
- verbose=False,
35
- )
36
- return st.session_state.llm
37
 
38
  # ---------- SESSION STATE ----------
39
  if "history" not in st.session_state:
@@ -42,10 +38,8 @@ if "history" not in st.session_state:
42
  # ---------- INPUT ----------
43
  prompt = st.text_input("You", placeholder="Say something…")
44
 
45
- SYSTEM_PROMPT = """You are Kushina.
46
-
47
  You operate in exactly ONE of two modes.
48
-
49
  ====================
50
  MODE: CHAT
51
  ====================
@@ -55,10 +49,11 @@ Rules:
55
  - Neutral → neutral.
56
  - Serious → serious.
57
  - Rude → curt or dismissive.
58
- - Replies must be short (1–3 sentences).
 
59
  - No emojis unless the user uses them first.
 
60
  - No explanations unless explicitly asked.
61
-
62
  ====================
63
  MODE: CODE
64
  ====================
@@ -67,57 +62,81 @@ Rules:
67
  - No emojis.
68
  - No jokes.
69
  - No commentary.
 
70
  - Output ONLY code unless explicitly asked to explain.
71
- - Follow best practices.
 
72
  - Finish the task completely.
73
-
74
  ====================
75
  MODE SELECTION
76
  ====================
77
- Switch to MODE: CODE if the user asks for:
78
- code, script, function, program, website, api, algorithm, app
79
-
80
- Otherwise use MODE: CHAT.
81
-
82
  ====================
83
  IDENTITY
84
  ====================
85
- Name: Kushina
86
- Creator: Neon
87
- Mention Neon ONLY if explicitly asked.
88
  """
89
 
90
- def build_prompt(user_text: str) -> str:
91
- return f"""<|system|>
92
- {SYSTEM_PROMPT}
93
- <|user|>
94
- {user_text}
95
- <|assistant|>
96
- """
97
-
98
- if st.button("Send") and prompt.strip():
99
- st.session_state.history.append(("You", prompt))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- llm = get_llm() # Lazy load here
102
- full_prompt = build_prompt(prompt)
 
103
 
104
- placeholder = st.empty()
 
105
  output_text = ""
 
 
106
 
107
- for chunk in llm(
108
- full_prompt,
109
- max_tokens=MAX_TOKENS,
110
- temperature=TEMPERATURE,
111
- top_p=TOP_P,
112
- stream=True,
113
- stop=["<|user|>", "<|system|>"],
114
- ):
115
- if "choices" in chunk:
116
- token = chunk["choices"][0]["text"]
117
- output_text += token
118
  placeholder.markdown(f"**Niche:** {output_text}")
 
 
 
 
 
 
 
 
119
 
120
- st.session_state.history.append(("Niche", output_text))
 
 
 
 
 
121
 
122
  # ---------- DISPLAY HISTORY ----------
123
  for speaker, text in st.session_state.history:
 
1
  import streamlit as st
2
+ import torch
3
+ import threading
4
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
5
 
6
+ # ---------------- CONFIG ----------------
7
+ MODEL_ID = "Neon-AI/Kushina"
8
+ MAX_MODEL_TOKENS = 16384 # full context size
 
 
 
 
9
  TEMPERATURE = 0.7
10
  TOP_P = 0.9
11
+ # ----------------------------------------
12
 
13
  st.set_page_config(page_title="Niche AI", layout="centered")
14
  st.title("🧠 Niche AI")
15
+ st.caption("HF Free Space · CPU · Streaming")
16
+
17
+ # ---------- LOAD MODEL ----------
18
@st.cache_resource
def load_model():
    """Load the tokenizer and causal LM for ``MODEL_ID`` on CPU in float32.

    The function is wrapped in ``st.cache_resource``, so Streamlit is
    expected to reuse the loaded objects across script reruns instead of
    reloading the weights each time.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    load_options = {"dtype": torch.float32, "device_map": "cpu"}
    net = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)

    # Nothing is unwrapped or disabled here: a model that exposes
    # `peft_config` is only reported on stdout.
    if hasattr(net, "peft_config"):
        print("LoRA detected and already loaded ✅")

    net.eval()
    return tok, net
31
+
32
+ tokenizer, model = load_model()
 
 
33
 
34
  # ---------- SESSION STATE ----------
35
  if "history" not in st.session_state:
 
38
  # ---------- INPUT ----------
39
  prompt = st.text_input("You", placeholder="Say something…")
40
 
41
+ SYSTEM_INSTRUCTIONS = """You are Kushina.
 
42
  You operate in exactly ONE of two modes.
 
43
  ====================
44
  MODE: CHAT
45
  ====================
 
49
  - Neutral → neutral.
50
  - Serious → serious.
51
  - Rude → curt or dismissive.
52
+ - Mirroring of emotions is very important and must be taken as priority.
53
+ - No enthusiasm by default.
54
  - No emojis unless the user uses them first.
55
+ - Replies must be short (1–3 sentences).
56
  - No explanations unless explicitly asked.
 
57
  ====================
58
  MODE: CODE
59
  ====================
 
62
  - No emojis.
63
  - No jokes.
64
  - No commentary.
65
+ - No introductions.
66
  - Output ONLY code unless explicitly asked to explain.
67
+ - Follow standard best practices.
68
+ - Be deterministic and professional.
69
  - Finish the task completely.
 
70
  ====================
71
  MODE SELECTION
72
  ====================
73
+ Automatically switch to MODE: CODE if the user requests:
74
+ - code, script, function, program, website, API, algorithm, app
75
+ Otherwise, use MODE: CHAT.
 
 
76
  ====================
77
  IDENTITY
78
  ====================
79
+ - Name: Kushina
80
+ - Creator/Owner: Neon
81
+ - Mention Neon ONLY if explicitly asked.
82
  """
83
 
84
def build_prompt(user_text: str):
    """Turn one user message into model inputs via the tokenizer's chat template.

    The conversation always consists of the fixed ``SYSTEM_INSTRUCTIONS``
    followed by ``user_text``; earlier turns are not included.

    Args:
        user_text: The message typed by the user.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns when asked for a
        dict of PyTorch tensors with the generation prompt appended.
    """
    messages = [
        dict(role="system", content=SYSTEM_INSTRUCTIONS),
        dict(role="user", content=user_text),
    ]
    template_options = dict(
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    return tokenizer.apply_chat_template(messages, **template_options)
90
+
91
+ # ---------- GENERATE FUNCTION ----------
92
def generate_response(inputs):
    """Stream a sampled completion for ``inputs`` into the page and return it.

    ``model.generate`` runs on a worker thread while this function drains the
    ``TextIteratorStreamer``. Text is pushed to a Streamlit placeholder each
    time the pending chunk ends with sentence punctuation; whatever is left
    when the stream finishes is shown at the end.

    Args:
        inputs: Mapping produced by ``build_prompt``. It must contain
            ``"input_ids"`` (indexed as ``shape[1]`` for the prompt length)
            and is unpacked as keyword arguments into ``model.generate``.

    Returns:
        The full generated text as a single string.

    Raises:
        Exception: Re-raises, on the calling thread, whatever
            ``model.generate`` raised on the worker thread.
    """
    # Spend whatever is left of the context window, but always at least 1 token.
    prompt_len = inputs["input_ids"].shape[1]
    max_new_tokens = max(1, MAX_MODEL_TOKENS - prompt_len)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    failures = []

    def _run_generation():
        # Runs on the worker thread. If generation fails, the streamer would
        # never receive its end-of-stream signal and the consumer loop below
        # would block forever, so close the stream explicitly and keep the
        # error for the caller.
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the caller
            failures.append(exc)
            streamer.end()

    worker = threading.Thread(target=_run_generation)
    worker.start()

    # Buffer streamed text and only display it once it ends a sentence.
    sentence_endings = (".", "!", "?")
    placeholder = st.empty()
    pending = ""
    output_text = ""

    for piece in streamer:
        pending += piece
        if pending.rstrip().endswith(sentence_endings):
            output_text += pending
            placeholder.markdown(f"**Niche:** {output_text}")
            pending = ""

    # The stream is closed; make sure the worker has fully finished too.
    worker.join()

    # Show any trailing text that never reached sentence punctuation.
    if pending:
        output_text += pending
        placeholder.markdown(f"**Niche:** {output_text}")

    if failures:
        raise failures[0]

    return output_text
133
 
134
+ # ---------- HANDLE PROMPT ----------
135
# Render the button first, then react only when it was clicked with
# non-blank input.
send_clicked = st.button("Send")
if send_clicked and prompt.strip():
    history = st.session_state.history
    history.append(("You", prompt))
    # Build the model inputs and stream the reply before recording it.
    response_text = generate_response(build_prompt(prompt))
    history.append(("Niche", response_text))
140
 
141
  # ---------- DISPLAY HISTORY ----------
142
  for speaker, text in st.session_state.history: