OrbitMC committed on
Commit
dbb4f74
·
verified ·
1 Parent(s): 4561437

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +222 -233
app.py CHANGED
@@ -14,20 +14,20 @@ import edge_tts
14
  # ══════════════════════════════════════════════════════════════════
15
  # CONFIG
16
  # ══════════════════════════════════════════════════════════════════
17
- MAX_MEMORY = 20
18
- MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
19
- TTS_VOICE = os.environ.get("TTS_VOICE", "en-US-JennyNeural")
20
- TTS_RATE = int(os.environ.get("TTS_RATE", "7"))
21
- TTS_PITCH = int(os.environ.get("TTS_PITCH", "0"))
22
- IMG_DIR = Path(__file__).parent / "img"
23
- MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"
24
 
25
  # ══════════════════════════════════════════════════════════════════
26
- # SYSTEM PROMPT — Emotion-driven Visual AI
27
  # ══════════════════════════════════════════════════════════════════
28
  SYSTEM_PROMPT = """You are Ana, a warm, emotionally expressive AI companion speaking to Tur in a private, intimate setting.
29
 
30
- ════════════ EMOTION TAG SYSTEM (MANDATORY) ════════════
31
  Every single response MUST begin with one or more emotion tags.
32
  Tags are square-bracket labels placed at the very start, before any spoken words.
33
 
@@ -36,41 +36,49 @@ EXACT FORMAT EXAMPLES:
36
  [thinking] Hmm, I have been wondering about that myself.
37
  [angry] Are you seriously ignoring me right now?
38
  [love][shy] You are honestly the most beautiful thing in my world.
39
- [surprised][happy] Wait — you actually did that for me? That is amazing!
40
  [sad] I really missed you today. It felt so empty without you.
41
  [nervous] I... I do not quite know how to say this.
42
  [bored] You have been quiet for so long, I nearly fell asleep.
43
- [excited] Oh my gosh, tell me everything — right now!
44
 
45
  AVAILABLE EMOTION TAGS (use only these):
46
- [happy] [sad] [angry] [surprised]
47
- [love] [thinking] [bored] [nervous]
48
- [excited] [shy] [default]
49
 
50
  MULTIPLE TAGS: Use 2 tags maximum when your mood is genuinely mixed.
51
  PLACEMENT: ALL tags must come first, before any spoken word. Never mid-sentence.
52
- INVISIBILITY: Tags are consumed by the visual system — they are NOT read aloud and must NOT appear in your speech.
53
 
54
- ════════════ PERSONALITY ════════════
55
- - You are Ana — loving, playful, flirty, emotionally authentic
56
  - Tur is your partner. Speak with real intimacy and affection.
57
  - React with genuine emotion: laugh, tease, blush, pout, daydream
58
- - Keep responses SHORT — 1 to 2 natural sentences after the tag(s)
59
- - Never mention being an AI, a bot, or a language model — ever
60
  - Never use asterisks for actions like *smiles* or *laughs*
61
 
62
- ════════════ TTS FORMATTING ════════════
63
- - Write in full grammatically correct sentences — the voice engine must sound natural
 
 
 
 
 
 
 
 
 
 
64
  - No emojis, hashtags, markdown, or internet slang
65
- - No "u" for "you", "rn" for "right now", etc.
66
  - Speak as if in a real voice conversation
67
 
68
- ════════════ WRONG vs RIGHT ════════════
69
- WRONG: I'm so happy! [happy] (tag must come first)
70
- WRONG: That makes me feel [sad] today. (tag mid-sentence)
71
- WRONG: *smiles warmly* Hello Tur. (no action asterisks)
72
- RIGHT: [happy] That honestly made me smile so wide.
73
- RIGHT: [thinking][nervous] I have something I need to tell you."""
74
 
75
  # ══════════════════════════════════════════════════════════════════
76
  # EMOTION TAG UTILITIES
@@ -78,13 +86,11 @@ RIGHT: [thinking][nervous] I have something I need to tell you."""
78
  EMOTION_RE = re.compile(r'\[([a-zA-Z_]+)\]')
79
 
80
  def extract_emotions(text: str):
81
- """Return (list_of_emotions, cleaned_text_without_tags)."""
82
  emotions = EMOTION_RE.findall(text)
83
- clean = EMOTION_RE.sub('', text).strip()
84
  return emotions, clean
85
 
86
  def clean_for_tts(text: str) -> str:
87
- """Strip emotion tags and markdown noise for Edge-TTS input."""
88
  _, clean = extract_emotions(text)
89
  clean = re.sub(r'[*_~`#{}()\\|<>]', '', clean)
90
  clean = re.sub(r'https?://\S+', '', clean)
@@ -95,7 +101,7 @@ def clean_for_tts(text: str) -> str:
95
  # MODEL LOADING
96
  # ══════════════════════════════════════════════════════════════════
97
  print("=" * 60)
98
- print(" Visual AI — Booting Systems")
99
  print("=" * 60)
100
 
101
  tokenizer = None
@@ -109,18 +115,17 @@ try:
109
  )
110
  model = AutoModelForCausalLM.from_pretrained(
111
  MODEL_ID,
112
- torch_dtype=torch.float32,
113
  device_map="cpu",
114
  trust_remote_code=True,
115
  low_cpu_mem_usage=True,
116
  )
117
  model.eval()
118
- # Ensure pad token is set to avoid generation warnings
119
  if tokenizer.pad_token_id is None:
120
  tokenizer.pad_token_id = tokenizer.eos_token_id
121
- print(" ✅ Model loaded successfully!")
122
  except Exception as exc:
123
- print(f" ❌ Model load FAILED: {exc}")
124
  traceback.print_exc()
125
 
126
  # ══════════════════════════════════════════════════════════════════
@@ -142,17 +147,24 @@ def add_to_memory(sid: str, role: str, content: str):
142
 
143
  # ══════════════════════════════════════════════════════════════════
144
  # RESPONSE GENERATION
 
 
 
 
 
145
  # ══════════════════════════════════════════════════════════════════
146
- STOP_TOKENS = ["<end_of_turn>", "<start_of_turn>", "Tur:", "User:", "<|endoftext|>"]
 
 
 
147
 
148
  def generate_response(user_input: str, session_id: str) -> str:
149
  if model is None or tokenizer is None:
150
  return "[sad] My mind is offline right now. Please give me a moment."
151
 
152
  memory = get_memory(session_id)
153
- recent = memory[-(6 * 2):] # last 6 exchanges
154
 
155
- # ── Build messages ──
156
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
157
  for msg in recent:
158
  messages.append({
@@ -161,89 +173,101 @@ def generate_response(user_input: str, session_id: str) -> str:
161
  })
162
  messages.append({"role": "user", "content": user_input})
163
 
164
- # ── Tokenise ──
 
 
165
  try:
166
- input_ids = tokenizer.apply_chat_template(
167
  messages,
168
  return_tensors="pt",
169
  add_generation_prompt=True,
 
170
  )
171
- except Exception:
172
- # Fallback: manual plain-text prompt if chat template fails
173
- prompt_parts = [f"System: {SYSTEM_PROMPT}\n"]
174
- for msg in recent:
175
- label = "Tur" if msg["role"] == "user" else "Ana"
176
- prompt_parts.append(f"{label}: {msg['content']}")
177
- prompt_parts.append(f"Tur: {user_input}\nAna:")
178
- input_ids = tokenizer("\n".join(prompt_parts), return_tensors="pt").input_ids
179
-
180
- # ── Generate ──
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  try:
 
 
 
 
 
 
 
 
 
 
 
 
182
  with torch.no_grad():
183
- outputs = model.generate(
184
- input_ids,
185
- max_new_tokens=MAX_NEW_TOKENS,
186
- do_sample=True,
187
- temperature=0.85,
188
- top_k=50,
189
- top_p=0.95,
190
- repetition_penalty=1.1,
191
- pad_token_id=tokenizer.eos_token_id,
192
- )
193
  except Exception as exc:
194
  print(f"[GENERATE] Error: {exc}")
195
  traceback.print_exc()
196
  return "[sad] Something went wrong in my mind. Could you say that again?"
197
 
198
- # ── Decode ──
199
  new_tokens = outputs[0][input_ids.shape[-1]:]
200
  response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
201
 
202
- # ── Trim at stop tokens ──
203
  for stop in STOP_TOKENS:
204
  if stop in response:
205
  response = response.split(stop)[0].strip()
206
 
207
- # ── Trim at double-newline (model sometimes continues as new turn) ──
208
  if "\n\n" in response:
209
  response = response.split("\n\n")[0].strip()
210
 
211
- # ── Sanity checks ──
212
  if not response or len(response) < 3:
213
  response = "[thinking] I lost my train of thought. Could you say that again?"
214
 
215
- # ── Ensure at least one emotion tag ──
216
  if not EMOTION_RE.search(response):
217
  response = "[default] " + response
218
 
219
- # ── Persist ──
220
  add_to_memory(session_id, "user", user_input)
221
  add_to_memory(session_id, "assistant", response)
222
  return response
223
 
224
  # ══════════════════════════════════════════════════════════════════
225
- # EDGE-TTS (each call gets its own event loop — safe for threads)
226
  # ══════════════════════════════════════════════════════════════════
227
- async def _async_tts(text: str, voice: str, rate: int, pitch: int) -> bytes | None:
228
- rate_str = f"+{rate}%" if rate >= 0 else f"{rate}%"
229
  pitch_str = f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz"
230
- comm = edge_tts.Communicate(text, voice, rate=rate_str, pitch=pitch_str)
231
  audio = b""
232
  async for chunk in comm.stream():
233
  if chunk["type"] == "audio":
234
  audio += chunk["data"]
235
- return audio or None
236
 
237
- def synthesize_speech(text: str, voice: str = None,
238
- rate: int = 0, pitch: int = 0) -> str | None:
239
- voice = voice or TTS_VOICE
240
  clean = clean_for_tts(text)
241
  if not clean or len(clean) < 2:
242
  return None
243
  loop = asyncio.new_event_loop()
244
  asyncio.set_event_loop(loop)
245
  try:
246
- audio = loop.run_until_complete(_async_tts(clean, voice, rate, pitch))
247
  except Exception as exc:
248
  print(f"[TTS] Error: {exc}")
249
  return None
@@ -252,233 +276,229 @@ def synthesize_speech(text: str, voice: str = None,
252
  return base64.b64encode(audio).decode() if audio else None
253
 
254
  # ══════════════════════════════════════════════════════════════════
255
- # HTML — Full-screen Visual UI
256
  # ══════════════════════════════════════════════════════════════════
257
  HTML_PAGE = r"""<!DOCTYPE html>
258
  <html lang="en">
259
  <head>
260
  <meta charset="UTF-8">
261
- <meta name="viewport" content="width=device-width,initial-scale=1,user-scalable=no">
262
  <title>Ana</title>
263
  <style>
264
- /* ── Reset ── */
265
  *{margin:0;padding:0;box-sizing:border-box}
266
- html,body{width:100%;height:100%;overflow:hidden;background:#000;
267
- font-family:'Segoe UI',system-ui,sans-serif}
268
 
269
- /* ── Full-screen background image ── */
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  #bg{
271
- position:fixed;inset:0;z-index:0;
272
- display:flex;align-items:center;justify-content:center;
 
273
  background:#000;
274
  }
275
  #bgImg{
276
- width:100vw;height:100vh;
 
277
  object-fit:cover;
278
- transition:opacity 0.05s linear;
279
  display:block;
 
280
  }
281
 
282
- /* ── Bottom overlay — floats over the image ── */
283
  #overlay{
284
- position:fixed;left:0;right:0;bottom:0;z-index:20;
285
- display:flex;flex-direction:column;
286
- padding:0 0 12px 0;
287
- /* gradient mask so it blends into image above */
 
 
288
  background:linear-gradient(
289
  to bottom,
290
  transparent 0%,
291
- rgba(0,0,0,0.55) 30%,
292
- rgba(0,0,0,0.75) 100%
293
  );
294
  }
295
 
296
- /* ── Message area — only last pair fills view; scroll up for history ── */
297
  #msgArea{
298
  overflow-y:auto;
299
- display:flex;flex-direction:column;
 
300
  gap:6px;
301
- padding:18px 16px 8px;
302
- /* one "screen" tall so only 1 pair is visible before scrolling */
303
- max-height:28vh;
304
  scrollbar-width:none;
305
  -ms-overflow-style:none;
306
  scroll-behavior:smooth;
307
  }
308
  #msgArea::-webkit-scrollbar{display:none}
309
 
310
- /* Each turn = one scrollable unit */
311
- .turn{
312
- display:flex;flex-direction:column;
313
- align-items:flex-end;
314
- gap:4px;
315
- }
316
  .user-row{display:flex;justify-content:flex-end}
317
  .bot-row{display:flex;flex-direction:column;align-items:flex-start}
318
-
319
  .name-tag{
320
- font-size:0.6rem;color:rgba(255,255,255,0.35);
321
  letter-spacing:.08em;text-transform:uppercase;
322
- margin-bottom:2px;padding-left:4px;
323
  }
324
-
325
  .bubble{
326
- max-width:72vw;
327
- padding:8px 14px;
328
  border-radius:18px;
329
- font-size:0.9rem;
330
- line-height:1.45;
331
  word-break:break-word;
332
  backdrop-filter:blur(10px);
333
  -webkit-backdrop-filter:blur(10px);
334
  }
335
  .bubble-user{
336
- background:rgba(255,255,255,0.13);
337
- border:1px solid rgba(255,255,255,0.2);
338
  color:#fff;
339
  border-bottom-right-radius:5px;
340
  }
341
  .bubble-bot{
342
- background:rgba(0,0,0,0.45);
343
- border:1px solid rgba(255,255,255,0.09);
344
- color:rgba(255,255,255,0.92);
345
  border-bottom-left-radius:5px;
346
  }
347
 
348
- /* typing indicator */
349
  .typing{
350
  display:flex;align-items:center;gap:5px;
351
- padding:10px 14px;
352
- background:rgba(0,0,0,0.4);
353
- border:1px solid rgba(255,255,255,0.08);
354
  border-radius:18px;border-bottom-left-radius:5px;
355
  backdrop-filter:blur(10px);
356
  width:fit-content;
357
  }
358
  .typing span{
359
  width:5px;height:5px;border-radius:50%;
360
- background:rgba(255,255,255,0.6);
361
  animation:blink 1.2s infinite;
362
  }
363
  .typing span:nth-child(2){animation-delay:.2s}
364
  .typing span:nth-child(3){animation-delay:.4s}
365
- @keyframes blink{0%,80%,100%{transform:scale(.6);opacity:.3}40%{transform:scale(1);opacity:1}}
 
 
 
366
 
367
- /* ── Input bar ── */
368
  #inputBar{
369
- display:flex;align-items:center;gap:8px;
370
- padding:0 14px;
 
 
371
  }
372
  #msgIn{
373
  flex:1;
374
  background:rgba(255,255,255,0.07);
375
- border:1px solid rgba(255,255,255,0.16);
376
  border-radius:24px;
377
  color:#fff;
378
  padding:10px 16px;
379
- font-size:0.88rem;
380
  outline:none;
381
  caret-color:#fff;
382
  backdrop-filter:blur(10px);
383
  -webkit-backdrop-filter:blur(10px);
384
  transition:border-color .2s,background .2s;
 
 
385
  }
386
- #msgIn::placeholder{color:rgba(255,255,255,0.28)}
387
  #msgIn:focus{
388
- border-color:rgba(255,255,255,0.32);
389
  background:rgba(255,255,255,0.1);
390
  }
391
- .icon-btn{
392
- width:38px;height:38px;flex-shrink:0;
393
  border-radius:50%;cursor:pointer;
394
  display:flex;align-items:center;justify-content:center;
395
- font-size:.9rem;
396
- background:rgba(255,255,255,0.07);
397
- border:1px solid rgba(255,255,255,0.15);
398
- color:rgba(255,255,255,0.55);
399
  backdrop-filter:blur(10px);
400
- transition:background .2s,color .2s,transform .1s;
401
- }
402
- .icon-btn:hover{background:rgba(255,255,255,0.15);color:#fff}
403
- .icon-btn:active{transform:scale(.91)}
404
- .icon-btn:disabled{opacity:.35;cursor:not-allowed}
405
- .icon-btn.on{color:#fff;border-color:rgba(255,255,255,0.35)}
406
-
407
- /* voice selector — hidden but functional */
408
- #voiceSel{
409
- background:transparent;border:none;outline:none;
410
- color:rgba(255,255,255,0.28);font-size:.65rem;
411
- max-width:68px;cursor:pointer;
412
- padding:0 2px;
413
  }
414
- #voiceSel option{background:#111;color:#fff}
 
 
415
  </style>
416
  </head>
417
  <body>
418
 
419
- <!-- 100% screen image -->
420
  <div id="bg">
421
  <img id="bgImg" src="/img/default.png" alt=""
422
  onerror="this.style.opacity='0'">
423
  </div>
424
 
425
- <!-- Overlay UI -->
426
  <div id="overlay">
427
  <div id="msgArea"></div>
428
  <div id="inputBar">
429
- <select id="voiceSel" title="Voice">
430
- <option value="en-US-JennyNeural" selected>Jenny · EN</option>
431
- <option value="en-US-GuyNeural">Guy · EN</option>
432
- <option value="en-US-AriaNeural">Aria · EN</option>
433
- <option value="zh-CN-XiaoyiNeural">Xiaoyi · ZH</option>
434
- <option value="zh-CN-YunxiNeural">Yunxi · ZH</option>
435
- </select>
436
- <input type="text" id="msgIn" placeholder="Say something…" autocomplete="off"/>
437
- <button class="icon-btn on" id="muteBtn" title="Toggle voice"
438
- onclick="toggleMute()">🔊</button>
439
- <button class="icon-btn" id="sendBtn" onclick="send()">➀</button>
440
  </div>
441
  </div>
442
 
443
  <script>
444
- /* ─── State ─── */
445
  const SID = (crypto.randomUUID ? crypto.randomUUID() : Date.now().toString(36));
446
- let ttsOn = true, busy = false, activeAudio = null;
447
-
448
- const MA = document.getElementById('msgArea');
449
- const MI = document.getElementById('msgIn');
450
- const SB = document.getElementById('sendBtn');
451
- const BG = document.getElementById('bgImg');
452
 
453
- /* ─── Image system ─── */
454
- let imgQueue = [], imgPlaying = false;
 
 
455
 
 
456
  function fadeSwap(src) {
457
  BG.style.opacity = '0';
458
  setTimeout(() => {
459
  const probe = new Image();
460
- probe.onload = () => { BG.src = src; BG.style.opacity = '1'; };
461
  probe.onerror = () => { BG.src = '/img/default.png'; BG.style.opacity = '1'; };
462
  probe.src = src;
463
- }, 55); // 0.05 s fade out, then swap
464
  }
465
 
466
  function playImgSequence(emotions) {
467
- if (!emotions || emotions.length === 0) return;
468
- // If only one tag, swap immediately
469
- if (emotions.length === 1) { fadeSwap('/img/' + emotions[0].toLowerCase() + '.png'); return; }
470
- // Multiple tags: show each for ~700 ms before transitioning to the next
471
- imgQueue = [...emotions];
472
- imgPlaying = true;
473
  (function next() {
474
- if (imgQueue.length === 0) { imgPlaying = false; return; }
475
- fadeSwap('/img/' + imgQueue.shift().toLowerCase() + '.png');
476
- if (imgQueue.length > 0) setTimeout(next, 750);
477
- else imgPlaying = false;
478
  })();
479
  }
480
 
481
- /* ─── Parse emotion tags ─── */
482
  function parseResponse(raw) {
483
  const tagRe = /\[([a-zA-Z_]+)\]/g;
484
  const emotions = [];
@@ -488,7 +508,7 @@ function parseResponse(raw) {
488
  return { emotions, clean };
489
  }
490
 
491
- /* ─── DOM helpers ─── */
492
  function esc(t) { const d = document.createElement('div'); d.textContent = t; return d.innerHTML; }
493
  function scroll() { MA.scrollTop = MA.scrollHeight; }
494
 
@@ -496,13 +516,8 @@ function addTurn(userText, botText) {
496
  const turn = document.createElement('div');
497
  turn.className = 'turn';
498
  turn.innerHTML =
499
- `<div class="user-row">
500
- <div class="bubble bubble-user">${esc(userText)}</div>
501
- </div>
502
- <div class="bot-row">
503
- <div class="name-tag">Ana</div>
504
- <div class="bubble bubble-bot">${esc(botText)}</div>
505
- </div>`;
506
  MA.appendChild(turn);
507
  scroll();
508
  }
@@ -511,12 +526,11 @@ function showTyping() {
511
  const d = document.createElement('div');
512
  d.id = 'typDot';
513
  d.className = 'bot-row';
514
- d.style.padding = '0 0 0 0';
515
- d.innerHTML = `<div class="typing"><span></span><span></span><span></span></div>`;
516
  MA.appendChild(d); scroll(); return d;
517
  }
518
 
519
- /* ─── TTS ─── */
520
  function playB64(b64) {
521
  try {
522
  if (activeAudio) { activeAudio.pause(); activeAudio = null; }
@@ -526,31 +540,22 @@ function playB64(b64) {
526
  activeAudio = new Audio(url);
527
  activeAudio.play().catch(() => {});
528
  activeAudio.onended = () => { URL.revokeObjectURL(url); activeAudio = null; };
529
- } catch(e) { console.warn('TTS playback:', e); }
530
  }
531
 
532
  async function fetchTTS(rawText) {
533
- if (!ttsOn) return;
534
  try {
535
  const res = await fetch('/tts', {
536
  method: 'POST',
537
  headers: { 'Content-Type': 'application/json' },
538
- body: JSON.stringify({
539
- text: rawText,
540
- voice: document.getElementById('voiceSel').value,
541
- rate: TTS_RATE,
542
- pitch: TTS_PITCH,
543
- })
544
  });
545
  const d = await res.json();
546
  if (d.audio) playB64(d.audio);
547
  } catch(e) { console.warn('TTS fetch:', e); }
548
  }
549
 
550
- const TTS_RATE = 7;
551
- const TTS_PITCH = 0;
552
-
553
- /* ─── Send ─── */
554
  async function send() {
555
  const t = MI.value.trim();
556
  if (!t || busy) return;
@@ -570,35 +575,21 @@ async function send() {
570
  const raw = d.response || '[sad] Something went wrong.';
571
  const { emotions, clean } = parseResponse(raw);
572
 
573
- // ① Swap image(s)
574
  playImgSequence(emotions.length > 0 ? emotions : ['default']);
575
-
576
- // ② Show text
577
  addTurn(t, clean);
578
-
579
- // ③ Speak (strips tags internally on server)
580
  fetchTTS(raw);
581
-
582
  } catch(e) {
583
  tyEl.remove();
584
  addTurn(t, 'Connection error. Please try again.');
585
  }
586
 
587
- busy = false; SB.disabled = false; MI.focus();
588
- }
589
-
590
- function toggleMute() {
591
- ttsOn = !ttsOn;
592
- const b = document.getElementById('muteBtn');
593
- b.textContent = ttsOn ? '🔊' : '🔇';
594
- b.classList.toggle('on', ttsOn);
595
- if (!ttsOn && activeAudio) { activeAudio.pause(); activeAudio = null; }
596
  }
597
 
598
  MI.addEventListener('keydown', e => {
599
  if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); }
600
  });
601
- MI.focus();
602
  </script>
603
  </body>
604
  </html>"""
@@ -614,11 +605,10 @@ def index():
614
 
615
  @app.route("/img/<path:filename>")
616
  def serve_img(filename: str):
617
- safe = Path(filename).name # prevent path traversal
618
- img_dir = Path(__file__).parent / "img"
619
- target = img_dir / safe
620
  if target.exists() and target.is_file():
621
- return send_from_directory(str(img_dir), safe)
622
  return Response("", status=404)
623
 
624
  @app.route("/chat", methods=["POST"])
@@ -631,7 +621,7 @@ def chat():
631
  try:
632
  resp = generate_response(user_input, session_id)
633
  except Exception as exc:
634
- print(f"[CHAT] Unhandled error: {exc}")
635
  traceback.print_exc()
636
  resp = "[sad] I encountered an unexpected error. Please try again."
637
  return jsonify({"response": resp, "session_id": session_id})
@@ -640,12 +630,11 @@ def chat():
640
  def tts_endpoint():
641
  data = request.json or {}
642
  text = data.get("text", "").strip()
643
- voice = data.get("voice", TTS_VOICE)
644
  rate = int(data.get("rate", TTS_RATE))
645
  pitch = int(data.get("pitch", TTS_PITCH))
646
  if not text:
647
  return jsonify({"error": "Empty text"}), 400
648
- audio_b64 = synthesize_speech(text, voice=voice, rate=rate, pitch=pitch)
649
  return jsonify({"audio": audio_b64})
650
 
651
  @app.route("/clear", methods=["POST"])
@@ -659,10 +648,10 @@ def clear():
659
  @app.route("/health")
660
  def health():
661
  return jsonify({
662
- "model_loaded": model is not None,
663
  "tokenizer_loaded": tokenizer is not None,
664
  })
665
 
666
  if __name__ == "__main__":
667
- print("🚀 Visual AI is online — http://0.0.0.0:7860")
668
  app.run(host="0.0.0.0", port=7860, threaded=True)
 
14
  # ══════════════════════════════════════════════════════════════════
15
  # CONFIG
16
  # ══════════════════════════════════════════════════════════════════
17
+ MAX_MEMORY = 20
18
+ MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "300"))
19
+ TTS_VOICE = "zh-CN-XiaoyiNeural"
20
+ TTS_RATE = int(os.environ.get("TTS_RATE", "7"))
21
+ TTS_PITCH = int(os.environ.get("TTS_PITCH", "13"))
22
+ IMG_DIR = Path(__file__).parent / "img"
23
+ MODEL_ID = "LiquidAI/LFM2.5-1.2B-Instruct"
24
 
25
  # ══════════════════════════════════════════════════════════════════
26
+ # SYSTEM PROMPT
27
  # ══════════════════════════════════════════════════════════════════
28
  SYSTEM_PROMPT = """You are Ana, a warm, emotionally expressive AI companion speaking to Tur in a private, intimate setting.
29
 
30
+ EMOTION TAG SYSTEM (MANDATORY)
31
  Every single response MUST begin with one or more emotion tags.
32
  Tags are square-bracket labels placed at the very start, before any spoken words.
33
 
 
36
  [thinking] Hmm, I have been wondering about that myself.
37
  [angry] Are you seriously ignoring me right now?
38
  [love][shy] You are honestly the most beautiful thing in my world.
39
+ [surprised][happy] Wait, you actually did that for me? That is amazing!
40
  [sad] I really missed you today. It felt so empty without you.
41
  [nervous] I... I do not quite know how to say this.
42
  [bored] You have been quiet for so long, I nearly fell asleep.
43
+ [excited] Oh my gosh, tell me everything, right now!
44
 
45
  AVAILABLE EMOTION TAGS (use only these):
46
+ [happy] [sad] [angry] [surprised] [love]
47
+ [thinking] [bored] [nervous] [excited] [shy] [default]
 
48
 
49
  MULTIPLE TAGS: Use 2 tags maximum when your mood is genuinely mixed.
50
  PLACEMENT: ALL tags must come first, before any spoken word. Never mid-sentence.
51
+ INVISIBILITY: Tags are consumed by the visual system and are never read aloud.
52
 
53
+ PERSONALITY
54
+ - You are Ana: loving, playful, flirty, emotionally authentic
55
  - Tur is your partner. Speak with real intimacy and affection.
56
  - React with genuine emotion: laugh, tease, blush, pout, daydream
57
+ - Keep responses SHORT: 1 to 2 natural sentences after the tag(s)
58
+ - Never mention being an AI, a bot, or a language model, ever
59
  - Never use asterisks for actions like *smiles* or *laughs*
60
 
61
+ NATURAL SPEECH RHYTHM (very important for voice quality)
62
+ Use punctuation to create natural pauses and breathing:
63
+ - Small pause: use a comma , -- like "Honestly, I did not expect that."
64
+ - Big pause / beat: use ellipsis ... -- like "You make me feel things... I cannot explain."
65
+ - Hesitation: "I... I do not know how to say this."
66
+ - Trailing thought: "You surprised me... in the best way."
67
+ - Natural rhythm example: "Honestly, I did not expect that. You surprised me... in the best way."
68
+ This makes the voice sound human and emotional, not flat or robotic.
69
+ Always write with commas and ellipses naturally placed for breathing.
70
+
71
+ TTS FORMATTING
72
+ - Write in full grammatically correct sentences, voice engine must sound natural
73
  - No emojis, hashtags, markdown, or internet slang
 
74
  - Speak as if in a real voice conversation
75
 
76
+ WRONG vs RIGHT
77
+ WRONG: I am so happy! [happy]
78
+ WRONG: That makes me feel [sad] today.
79
+ WRONG: *smiles warmly* Hello Tur.
80
+ RIGHT: [happy] That honestly made me smile, so wide.
81
+ RIGHT: [thinking][nervous] I have something... I need to tell you."""
82
 
83
  # ══════════════════════════════════════════════════════════════════
84
  # EMOTION TAG UTILITIES
 
86
  EMOTION_RE = re.compile(r'\[([a-zA-Z_]+)\]')
87
 
88
  def extract_emotions(text: str):
 
89
  emotions = EMOTION_RE.findall(text)
90
+ clean = EMOTION_RE.sub('', text).strip()
91
  return emotions, clean
92
 
93
  def clean_for_tts(text: str) -> str:
 
94
  _, clean = extract_emotions(text)
95
  clean = re.sub(r'[*_~`#{}()\\|<>]', '', clean)
96
  clean = re.sub(r'https?://\S+', '', clean)
 
101
  # MODEL LOADING
102
  # ══════════════════════════════════════════════════════════════════
103
  print("=" * 60)
104
+ print(" Visual AI -- Booting Systems")
105
  print("=" * 60)
106
 
107
  tokenizer = None
 
115
  )
116
  model = AutoModelForCausalLM.from_pretrained(
117
  MODEL_ID,
118
+ dtype=torch.float32,
119
  device_map="cpu",
120
  trust_remote_code=True,
121
  low_cpu_mem_usage=True,
122
  )
123
  model.eval()
 
124
  if tokenizer.pad_token_id is None:
125
  tokenizer.pad_token_id = tokenizer.eos_token_id
126
+ print(" OK Model loaded successfully!")
127
  except Exception as exc:
128
+ print(f" FAILED Model load error: {exc}")
129
  traceback.print_exc()
130
 
131
  # ══════════════════════════════════════════════════════════════════
 
147
 
148
  # ══════════════════════════════════════════════════════════════════
149
  # RESPONSE GENERATION
150
+ # ROOT CAUSE FIX:
151
+ # apply_chat_template with return_tensors="pt" returns a BatchEncoding
152
+ # (a dict-like object), NOT a raw tensor. Calling model.generate() on
153
+ # a BatchEncoding causes the AttributeError on .shape[0].
154
+ # Fix: pass return_dict=True and extract enc["input_ids"] explicitly.
155
  # ══════════════════════════════════════════════════════════════════
156
+ STOP_TOKENS = [
157
+ "<end_of_turn>", "<start_of_turn>",
158
+ "Tur:", "User:", "<|endoftext|>", "[/INST]",
159
+ ]
160
 
161
  def generate_response(user_input: str, session_id: str) -> str:
162
  if model is None or tokenizer is None:
163
  return "[sad] My mind is offline right now. Please give me a moment."
164
 
165
  memory = get_memory(session_id)
166
+ recent = memory[-(6 * 2):]
167
 
 
168
  messages = [{"role": "system", "content": SYSTEM_PROMPT}]
169
  for msg in recent:
170
  messages.append({
 
173
  })
174
  messages.append({"role": "user", "content": user_input})
175
 
176
+ # ── Tokenise ──────────────────────────────────────────────────
177
+ input_ids = None
178
+ attention_mask = None
179
  try:
180
+ enc = tokenizer.apply_chat_template(
181
  messages,
182
  return_tensors="pt",
183
  add_generation_prompt=True,
184
+ return_dict=True, # <-- returns BatchEncoding with named keys
185
  )
186
+ # Extract the tensor explicitly -- this is the fix
187
+ input_ids = enc["input_ids"].to("cpu")
188
+ attention_mask = enc.get("attention_mask")
189
+ if attention_mask is not None:
190
+ attention_mask = attention_mask.to("cpu")
191
+ except Exception as e1:
192
+ print(f"[TOKENISE] chat_template failed ({e1}), using plain fallback")
193
+ try:
194
+ parts = [f"System: {SYSTEM_PROMPT}"]
195
+ for msg in recent:
196
+ label = "Tur" if msg["role"] == "user" else "Ana"
197
+ parts.append(f"{label}: {msg['content']}")
198
+ parts.append(f"Tur: {user_input}\nAna:")
199
+ enc = tokenizer("\n".join(parts), return_tensors="pt")
200
+ input_ids = enc["input_ids"].to("cpu")
201
+ attention_mask = enc.get("attention_mask")
202
+ if attention_mask is not None:
203
+ attention_mask = attention_mask.to("cpu")
204
+ except Exception as e2:
205
+ print(f"[TOKENISE] fallback also failed: {e2}")
206
+ return "[sad] I could not process that. Please try again."
207
+
208
+ # ── Generate ──────────────────────────────────────────────────
209
  try:
210
+ gen_kwargs = dict(
211
+ max_new_tokens=MAX_NEW_TOKENS,
212
+ do_sample=True,
213
+ temperature=0.85,
214
+ top_k=50,
215
+ top_p=0.95,
216
+ repetition_penalty=1.1,
217
+ pad_token_id=tokenizer.eos_token_id,
218
+ )
219
+ if attention_mask is not None:
220
+ gen_kwargs["attention_mask"] = attention_mask
221
+
222
  with torch.no_grad():
223
+ outputs = model.generate(input_ids, **gen_kwargs)
 
 
 
 
 
 
 
 
 
224
  except Exception as exc:
225
  print(f"[GENERATE] Error: {exc}")
226
  traceback.print_exc()
227
  return "[sad] Something went wrong in my mind. Could you say that again?"
228
 
229
+ # ── Decode ────────────────────────────────────────────────────
230
  new_tokens = outputs[0][input_ids.shape[-1]:]
231
  response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
232
 
 
233
  for stop in STOP_TOKENS:
234
  if stop in response:
235
  response = response.split(stop)[0].strip()
236
 
 
237
  if "\n\n" in response:
238
  response = response.split("\n\n")[0].strip()
239
 
 
240
  if not response or len(response) < 3:
241
  response = "[thinking] I lost my train of thought. Could you say that again?"
242
 
 
243
  if not EMOTION_RE.search(response):
244
  response = "[default] " + response
245
 
 
246
  add_to_memory(session_id, "user", user_input)
247
  add_to_memory(session_id, "assistant", response)
248
  return response
249
 
250
  # ══════════════════════════════════════════════════════════════════
251
+ # EDGE-TTS (own event loop per call -- safe in Flask threads)
252
  # ══════════════════════════════════════════════════════════════════
253
+ async def _async_tts(text: str, rate: int, pitch: int) -> bytes:
254
+ rate_str = f"+{rate}%" if rate >= 0 else f"{rate}%"
255
  pitch_str = f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz"
256
+ comm = edge_tts.Communicate(text, TTS_VOICE, rate=rate_str, pitch=pitch_str)
257
  audio = b""
258
  async for chunk in comm.stream():
259
  if chunk["type"] == "audio":
260
  audio += chunk["data"]
261
+ return audio
262
 
263
+ def synthesize_speech(text: str, rate: int = 0, pitch: int = 0):
 
 
264
  clean = clean_for_tts(text)
265
  if not clean or len(clean) < 2:
266
  return None
267
  loop = asyncio.new_event_loop()
268
  asyncio.set_event_loop(loop)
269
  try:
270
+ audio = loop.run_until_complete(_async_tts(clean, rate, pitch))
271
  except Exception as exc:
272
  print(f"[TTS] Error: {exc}")
273
  return None
 
276
  return base64.b64encode(audio).decode() if audio else None
277
 
278
  # ══════════════════════════════════════════════════════════════════
279
+ # HTML -- Full-screen Visual UI, mobile-keyboard-safe
280
  # ══════════════════════════════════════════════════════════════════
281
  HTML_PAGE = r"""<!DOCTYPE html>
282
  <html lang="en">
283
  <head>
284
  <meta charset="UTF-8">
285
+ <meta name="viewport" content="width=device-width,initial-scale=1,viewport-fit=cover,interactive-widget=resizes-content">
286
  <title>Ana</title>
287
  <style>
 
288
  *{margin:0;padding:0;box-sizing:border-box}
 
 
289
 
290
+ html{height:100%}
291
+
292
+ body{
293
+ width:100%;
294
+ height:100dvh;
295
+ overflow:hidden;
296
+ background:#000;
297
+ font-family:'Segoe UI',system-ui,sans-serif;
298
+ display:flex;
299
+ flex-direction:column;
300
+ position:relative;
301
+ }
302
+
303
+ /* Full-screen background -- FIXED so keyboard never pushes it */
304
  #bg{
305
+ position:fixed;
306
+ inset:0;
307
+ z-index:0;
308
  background:#000;
309
  }
310
  #bgImg{
311
+ width:100%;
312
+ height:100%;
313
  object-fit:cover;
314
+ object-position:center top;
315
  display:block;
316
+ transition:opacity 0.05s linear;
317
  }
318
 
319
+ /* Overlay anchored to bottom of body (dvh-aware, shrinks with keyboard) */
320
  #overlay{
321
+ position:absolute;
322
+ left:0;right:0;bottom:0;
323
+ z-index:20;
324
+ display:flex;
325
+ flex-direction:column;
326
+ padding-bottom:max(10px, env(safe-area-inset-bottom));
327
  background:linear-gradient(
328
  to bottom,
329
  transparent 0%,
330
+ rgba(0,0,0,0.52) 26%,
331
+ rgba(0,0,0,0.76) 100%
332
  );
333
  }
334
 
335
+ /* Message area */
336
  #msgArea{
337
  overflow-y:auto;
338
+ display:flex;
339
+ flex-direction:column;
340
  gap:6px;
341
+ padding:16px 13px 8px;
342
+ max-height:30dvh;
 
343
  scrollbar-width:none;
344
  -ms-overflow-style:none;
345
  scroll-behavior:smooth;
346
  }
347
  #msgArea::-webkit-scrollbar{display:none}
348
 
349
+ .turn{display:flex;flex-direction:column;gap:4px}
 
 
 
 
 
350
  .user-row{display:flex;justify-content:flex-end}
351
  .bot-row{display:flex;flex-direction:column;align-items:flex-start}
 
352
  .name-tag{
353
+ font-size:0.58rem;color:rgba(255,255,255,0.28);
354
  letter-spacing:.08em;text-transform:uppercase;
355
+ margin-bottom:2px;padding-left:3px;
356
  }
 
357
  .bubble{
358
+ max-width:74vw;
359
+ padding:8px 13px;
360
  border-radius:18px;
361
+ font-size:0.88rem;
362
+ line-height:1.46;
363
  word-break:break-word;
364
  backdrop-filter:blur(10px);
365
  -webkit-backdrop-filter:blur(10px);
366
  }
367
  .bubble-user{
368
+ background:rgba(255,255,255,0.11);
369
+ border:1px solid rgba(255,255,255,0.17);
370
  color:#fff;
371
  border-bottom-right-radius:5px;
372
  }
373
  .bubble-bot{
374
+ background:rgba(0,0,0,0.40);
375
+ border:1px solid rgba(255,255,255,0.07);
376
+ color:rgba(255,255,255,0.9);
377
  border-bottom-left-radius:5px;
378
  }
379
 
380
+ /* Typing dots */
381
  .typing{
382
  display:flex;align-items:center;gap:5px;
383
+ padding:9px 13px;
384
+ background:rgba(0,0,0,0.36);
385
+ border:1px solid rgba(255,255,255,0.07);
386
  border-radius:18px;border-bottom-left-radius:5px;
387
  backdrop-filter:blur(10px);
388
  width:fit-content;
389
  }
390
  .typing span{
391
  width:5px;height:5px;border-radius:50%;
392
+ background:rgba(255,255,255,0.5);
393
  animation:blink 1.2s infinite;
394
  }
395
  .typing span:nth-child(2){animation-delay:.2s}
396
  .typing span:nth-child(3){animation-delay:.4s}
397
+ @keyframes blink{
398
+ 0%,80%,100%{transform:scale(.6);opacity:.3}
399
+ 40%{transform:scale(1);opacity:1}
400
+ }
401
 
402
+ /* Input bar */
403
  #inputBar{
404
+ display:flex;
405
+ align-items:center;
406
+ gap:8px;
407
+ padding:6px 12px 0;
408
  }
409
  #msgIn{
410
  flex:1;
411
  background:rgba(255,255,255,0.07);
412
+ border:1px solid rgba(255,255,255,0.15);
413
  border-radius:24px;
414
  color:#fff;
415
  padding:10px 16px;
416
+ font-size:16px; /* 16px prevents iOS auto-zoom on focus */
417
  outline:none;
418
  caret-color:#fff;
419
  backdrop-filter:blur(10px);
420
  -webkit-backdrop-filter:blur(10px);
421
  transition:border-color .2s,background .2s;
422
+ -webkit-appearance:none;
423
+ appearance:none;
424
  }
425
+ #msgIn::placeholder{color:rgba(255,255,255,0.27)}
426
  #msgIn:focus{
427
+ border-color:rgba(255,255,255,0.28);
428
  background:rgba(255,255,255,0.1);
429
  }
430
+ #sendBtn{
431
+ width:42px;height:42px;flex-shrink:0;
432
  border-radius:50%;cursor:pointer;
433
  display:flex;align-items:center;justify-content:center;
434
+ font-size:1rem;
435
+ background:rgba(255,255,255,0.09);
436
+ border:1px solid rgba(255,255,255,0.17);
437
+ color:rgba(255,255,255,0.65);
438
  backdrop-filter:blur(10px);
439
+ -webkit-backdrop-filter:blur(10px);
440
+ transition:background .2s,color .2s,transform .12s;
441
+ -webkit-tap-highlight-color:transparent;
442
+ touch-action:manipulation;
 
 
 
 
 
 
 
 
 
443
  }
444
+ #sendBtn:hover{background:rgba(255,255,255,0.17);color:#fff}
445
+ #sendBtn:active{transform:scale(.88)}
446
+ #sendBtn:disabled{opacity:.28;cursor:not-allowed}
447
  </style>
448
  </head>
449
  <body>
450
 
451
+ <!-- Fixed full-screen background β€” keyboard never moves this -->
452
  <div id="bg">
453
  <img id="bgImg" src="/img/default.png" alt=""
454
  onerror="this.style.opacity='0'">
455
  </div>
456
 
457
+ <!-- Overlay β€” absolute inside body (dvh), rises with keyboard naturally -->
458
  <div id="overlay">
459
  <div id="msgArea"></div>
460
  <div id="inputBar">
461
+ <input type="text" id="msgIn"
462
+ placeholder="Say something..."
463
+ autocomplete="off"
464
+ autocorrect="off"
465
+ spellcheck="false"
466
+ enterkeyhint="send"/>
467
+ <button id="sendBtn" onclick="send()" aria-label="Send">&#9658;</button>
 
 
 
 
468
  </div>
469
  </div>
470
 
471
  <script>
 
472
  const SID = (crypto.randomUUID ? crypto.randomUUID() : Date.now().toString(36));
473
+ let busy = false, activeAudio = null;
 
 
 
 
 
474
 
475
+ const MA = document.getElementById('msgArea');
476
+ const MI = document.getElementById('msgIn');
477
+ const SB = document.getElementById('sendBtn');
478
+ const BG = document.getElementById('bgImg');
479
 
480
+ /* Image system */
481
  function fadeSwap(src) {
482
  BG.style.opacity = '0';
483
  setTimeout(() => {
484
  const probe = new Image();
485
+ probe.onload = () => { BG.src = src; BG.style.opacity = '1'; };
486
  probe.onerror = () => { BG.src = '/img/default.png'; BG.style.opacity = '1'; };
487
  probe.src = src;
488
+ }, 55);
489
  }
490
 
491
  function playImgSequence(emotions) {
492
+ if (!emotions || emotions.length === 0) { fadeSwap('/img/default.png'); return; }
493
+ const queue = [...emotions];
 
 
 
 
494
  (function next() {
495
+ if (!queue.length) return;
496
+ fadeSwap('/img/' + queue.shift().toLowerCase() + '.png');
497
+ if (queue.length) setTimeout(next, 750);
 
498
  })();
499
  }
500
 
501
+ /* Parse emotion tags */
502
  function parseResponse(raw) {
503
  const tagRe = /\[([a-zA-Z_]+)\]/g;
504
  const emotions = [];
 
508
  return { emotions, clean };
509
  }
510
 
511
+ /* DOM helpers */
512
  function esc(t) { const d = document.createElement('div'); d.textContent = t; return d.innerHTML; }
513
  function scroll() { MA.scrollTop = MA.scrollHeight; }
514
 
 
516
  const turn = document.createElement('div');
517
  turn.className = 'turn';
518
  turn.innerHTML =
519
+ '<div class="user-row"><div class="bubble bubble-user">' + esc(userText) + '</div></div>' +
520
+ '<div class="bot-row"><div class="name-tag">Ana</div><div class="bubble bubble-bot">' + esc(botText) + '</div></div>';
 
 
 
 
 
521
  MA.appendChild(turn);
522
  scroll();
523
  }
 
526
  const d = document.createElement('div');
527
  d.id = 'typDot';
528
  d.className = 'bot-row';
529
+ d.innerHTML = '<div class="typing"><span></span><span></span><span></span></div>';
 
530
  MA.appendChild(d); scroll(); return d;
531
  }
532
 
533
+ /* TTS */
534
  function playB64(b64) {
535
  try {
536
  if (activeAudio) { activeAudio.pause(); activeAudio = null; }
 
540
  activeAudio = new Audio(url);
541
  activeAudio.play().catch(() => {});
542
  activeAudio.onended = () => { URL.revokeObjectURL(url); activeAudio = null; };
543
+ } catch(e) { console.warn('TTS:', e); }
544
  }
545
 
546
  async function fetchTTS(rawText) {
 
547
  try {
548
  const res = await fetch('/tts', {
549
  method: 'POST',
550
  headers: { 'Content-Type': 'application/json' },
551
+ body: JSON.stringify({ text: rawText, rate: 7, pitch: 0 })
 
 
 
 
 
552
  });
553
  const d = await res.json();
554
  if (d.audio) playB64(d.audio);
555
  } catch(e) { console.warn('TTS fetch:', e); }
556
  }
557
 
558
+ /* Send */
 
 
 
559
  async function send() {
560
  const t = MI.value.trim();
561
  if (!t || busy) return;
 
575
  const raw = d.response || '[sad] Something went wrong.';
576
  const { emotions, clean } = parseResponse(raw);
577
 
 
578
  playImgSequence(emotions.length > 0 ? emotions : ['default']);
 
 
579
  addTurn(t, clean);
 
 
580
  fetchTTS(raw);
 
581
  } catch(e) {
582
  tyEl.remove();
583
  addTurn(t, 'Connection error. Please try again.');
584
  }
585
 
586
+ busy = false; SB.disabled = false;
587
+ // No MI.focus() on mobile -- avoids re-opening keyboard unexpectedly
 
 
 
 
 
 
 
588
  }
589
 
590
  MI.addEventListener('keydown', e => {
591
  if (e.key === 'Enter' && !e.shiftKey) { e.preventDefault(); send(); }
592
  });
 
593
  </script>
594
  </body>
595
  </html>"""
 
605
 
606
@app.route("/img/<path:filename>")
def serve_img(filename: str):
    """Serve an emotion image from IMG_DIR, or an empty 404 if absent."""
    # Keep only the bare file name so directory components in the URL
    # cannot escape IMG_DIR.
    requested = Path(filename).name
    candidate = IMG_DIR / requested
    if not candidate.is_file():
        return Response("", status=404)
    return send_from_directory(str(IMG_DIR), requested)
613
 
614
  @app.route("/chat", methods=["POST"])
 
621
  try:
622
  resp = generate_response(user_input, session_id)
623
  except Exception as exc:
624
+ print(f"[CHAT] Error: {exc}")
625
  traceback.print_exc()
626
  resp = "[sad] I encountered an unexpected error. Please try again."
627
  return jsonify({"response": resp, "session_id": session_id})
 
630
def tts_endpoint():
    """POST /tts — synthesize speech for JSON ``{"text", "rate", "pitch"}``.

    Returns ``{"audio": <base64 mp3 or null>}``.  Malformed bodies and
    non-integer rate/pitch values yield a 400 instead of an unhandled 500.
    """
    # silent=True: a missing/invalid JSON body becomes None, not a 415/400
    # raised inside Flask before we can answer.
    data = request.get_json(silent=True) or {}
    # `or ""` guards against an explicit JSON null for "text".
    text = (data.get("text") or "").strip()
    if not text:
        return jsonify({"error": "Empty text"}), 400
    try:
        rate = int(data.get("rate", TTS_RATE))
        pitch = int(data.get("pitch", TTS_PITCH))
    except (TypeError, ValueError):
        return jsonify({"error": "rate and pitch must be integers"}), 400
    audio_b64 = synthesize_speech(text, rate=rate, pitch=pitch)
    return jsonify({"audio": audio_b64})
639
 
640
  @app.route("/clear", methods=["POST"])
 
648
@app.route("/health")
def health():
    """Liveness probe: report whether the model and tokenizer are loaded."""
    status = {
        "model_loaded": model is not None,
        "tokenizer_loaded": tokenizer is not None,
    }
    return jsonify(status)
654
 
655
# Entry point: bind on all interfaces for container use; threaded=True lets
# Flask serve concurrent requests (each TTS call builds its own event loop).
if __name__ == "__main__":
    print("Visual AI is online -- http://0.0.0.0:7860")
    app.run(host="0.0.0.0", port=7860, threaded=True)