Update app.py

app.py CHANGED
@@ -1,277 +1,320 @@
-# 1. SETUP PHASE (Runs during Docker build to bake models)
-# =========================================================
-if "--setup" in sys.argv:
-    print("Pre-downloading models into Docker image...")
-    from huggingface_hub import hf_hub_download
-    hf_hub_download(repo_id="unsloth/Qwen3.5-0.8B-GGUF", filename="Qwen3.5-0.8B-UD-Q2_K_XL.gguf")
-
-    from langchain_huggingface import HuggingFaceEmbeddings
-    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={"device": "cpu"})
-
-    try:
-        from kittentts import KittenTTS
-        KittenTTS("KittenML/kitten-tts-nano-0.8-fp32")
-    except Exception as e:
-        print(f"TTS Download check: {e}")
-
-    print("Setup complete. Exiting installer.")
-    sys.exit(0)
-
-# =========================================================
-# 2. RUNTIME PHASE (Runs when Space is active)
-# =========================================================
-import time
import json
-import
from pathlib import Path
-from
-from
-from
-from
-from
    d.mkdir(parents=True, exist_ok=True)

)

-faiss_path = DB_DIR / "index.faiss"
-if faiss_path.exists():
-    vector_store = FAISS.load_local(str(DB_DIR), embeddings, allow_dangerous_deserialization=True)
-else:
-    vector_store = FAISS.from_documents([Document(page_content="Ana initialized.")], embeddings)
-    vector_store.save_local(str(DB_DIR))
-
-# Initialize LLM
-model_path = hf_hub_download(
-    repo_id="unsloth/Qwen3.5-0.8B-GGUF",
-    filename="Qwen3.5-0.8B-UD-Q2_K_XL.gguf"
-)
-llm = Llama(model_path=model_path, n_ctx=2048, n_threads=os.cpu_count() or 4, verbose=False)
|
-#
[old lines 70-136: removed content not rendered in this view]
-<div class="main">
-    <div class="chat-box" id="chat_box">
-        <div class="message sys-msg">[SYSTEM] Ana Online. Awaiting input...</div>
-    </div>
-
-    <div class="input-group">
-        <input type="text" id="user_input" placeholder="Talk to Ana..." onkeypress="if(event.key==='Enter') sendMessage()" autofocus>
-        <button onclick="sendMessage()" id="send_btn">Send</button>
-    </div>
-</div>
-
-<audio id="audio_player" autoplay></audio>
-
-<script>
-let history = [];
-
-async function sendMessage() {
-    const inputField = document.getElementById("user_input");
-    const text = inputField.value.trim();
-    if (!text) return;
-
-    appendMessage("User", text, "user-msg");
-    inputField.value = "";
-    document.getElementById("send_btn").disabled = true;
-
-    const sysPrompt = document.getElementById("sys_prompt").value;
-    const temp = document.getElementById("temperature").value;
-    const useTTS = document.getElementById("use_tts").checked;
-
-    try {
-        const response = await fetch("/chat", {
-            method: "POST",
-            headers: { "Content-Type": "application/json" },
-            body: JSON.stringify({
-                message: text,
-                history: history,
-                system_prompt: sysPrompt,
-                temperature: temp,
-                use_tts: useTTS
-            })
-        });
-        const data = await response.json();
-
-        appendMessage("Ana", data.reply, "ai-msg");
-
-        history.push({ role: "user", content: text });
-        history.push({ role: "assistant", content: data.reply });
-        if (history.length > 8) history = history.slice(history.length - 8);
-
-        if (data.audio_url) {
-            const player = document.getElementById("audio_player");
-            player.src = data.audio_url;
-            player.play();
-        }
-    } catch (err) {
-        appendMessage("System", "Error connecting to backend.", "sys-msg");
-    }
-    document.getElementById("send_btn").disabled = false;
-    document.getElementById("user_input").focus();
-}
-
-function appendMessage(sender, text, className) {
-    const box = document.getElementById("chat_box");
-    const msgDiv = document.createElement("div");
-    msgDiv.className = `message ${className}`;
-    msgDiv.innerText = `[${sender}] ${text}`;
-    box.appendChild(msgDiv);
-    box.scrollTop = box.scrollHeight;
-}
-
-async function clearMemory() {
-    if (confirm("Wipe long-term Vector Memory?")) {
-        await fetch("/clear_memory", { method: "POST" });
-        history = [];
-        appendMessage("System", "Memory wiped successfully. Ana has forgotten past interactions.", "sys-msg");
-    }
-}
-</script>
-</body>
-</html>
-"""
|
@app.route("/chat", methods=["POST"])
|
| 225 |
-
def chat():
|
| 226 |
-
data = request.json
|
| 227 |
-
user_input = data.get("message", "")
|
| 228 |
-
history = data.get("history",[])
|
| 229 |
-
sys_prompt = data.get("system_prompt", "You are Ana, a concise AI assistant. Reply in 1-2 short sentences.")
|
| 230 |
-
temp = float(data.get("temperature", 0.65))
|
| 231 |
-
use_tts = data.get("use_tts", True)
|
| 232 |
-
|
| 233 |
-
context = get_context(user_input)
|
| 234 |
-
|
| 235 |
-
messages =[{"role": "system", "content": f"{sys_prompt}\n\nRelevant Memory:\n{context}"}]
|
| 236 |
-
for msg in history[-4:]:
|
| 237 |
-
messages.append(msg)
|
| 238 |
-
messages.append({"role": "user", "content": user_input})
|
| 239 |
-
|
| 240 |
-
response = llm.create_chat_completion(
|
| 241 |
-
messages=messages,
|
| 242 |
-
max_tokens=150,
|
| 243 |
-
temperature=temp,
|
| 244 |
-
top_p=0.9
|
| 245 |
)
-
-    reply_text = response["choices"][0]["message"]["content"].strip()
-
-    audio_url = None
-    if use_tts:
-        audio_url = generate_audio(reply_text)

try:
except Exception as e:
-print("
|
@app.route("/clear_memory", methods=["POST"])
|
| 270 |
-
def clear_memory():
|
| 271 |
-
global vector_store
|
| 272 |
-
vector_store = FAISS.from_documents([Document(page_content="Memory wiped.")], embeddings)
|
| 273 |
-
vector_store.save_local(str(DB_DIR))
|
| 274 |
-
return jsonify({"status": "success"})
|
| 275 |
|
| 276 |
if __name__ == "__main__":
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+"""
+J.A.R.V.I.S - FastAPI backend
+Model is loaded ONCE at startup and kept in RAM for instant responses.
+"""

+import os
import json
+import time
+import warnings
+import asyncio
from pathlib import Path
+from contextlib import asynccontextmanager
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import HTMLResponse, StreamingResponse, FileResponse
+from pydantic import BaseModel
+
+warnings.filterwarnings("ignore")
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["HF_HOME"] = "/app/cache"
+os.environ["TRANSFORMERS_CACHE"] = "/app/cache"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "/app/cache"
+
+# ── Paths ──
+VECTOR_DIR = Path("/app/database/vector_store")
+LEARN_DIR = Path("/app/database/learning_data")
+CHATS_DIR = Path("/app/database/chats_data")
+CACHE_DIR = Path("/app/cache")
+
+for d in [VECTOR_DIR, LEARN_DIR, CHATS_DIR]:
    d.mkdir(parents=True, exist_ok=True)

+# ── Global model holders (loaded once, never reloaded) ──
+LLM = None
+RETRIEVER = None
+TTS = None
+TTS_OK = False
+
+SYSTEM_PROMPT = (
+    "You are J.A.R.V.I.S, a concise and intelligent AI assistant. "
+    "Always reply in 1-2 short, direct sentences. "
+    "Never use emojis, markdown, asterisks, or filler phrases. "
+    "Be helpful, precise, and slightly formal."
)

+
+# ──────────────────────────────────────────
+# STARTUP: load everything into RAM once
+# ──────────────────────────────────────────
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    global LLM, RETRIEVER, TTS, TTS_OK

+    print("=" * 55)
+    print(" J.A.R.V.I.S - starting up")
+    print("=" * 55)
+
+    # 1. Vector store / embeddings
+    print("[1/3] Loading embeddings & vector store...", flush=True)
+    from langchain_huggingface import HuggingFaceEmbeddings
+    from langchain_community.vectorstores import FAISS
+    from langchain_text_splitters import RecursiveCharacterTextSplitter
+    from langchain_core.documents import Document
+
+    embeddings = HuggingFaceEmbeddings(
+        model_name="sentence-transformers/all-MiniLM-L6-v2",
+        model_kwargs={"device": "cpu"},
+        cache_folder=str(CACHE_DIR),
+    )
+    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
+
+    def _load_docs():
+        docs = []
+        for f in LEARN_DIR.glob("*.txt"):
+            try:
+                docs.append(Document(page_content=f.read_text(errors="ignore"),
+                                     metadata={"source": f.name}))
+            except Exception:
+                pass
+        for f in CHATS_DIR.glob("*.json"):
+            try:
+                data = json.loads(f.read_text(errors="ignore"))
+                content = "\n".join(
+                    f"{m['role']}: {m['content']}"
+                    for m in data.get("messages", [])
+                    if isinstance(m, dict) and "role" in m and "content" in m
+                )
+                if content.strip():
+                    docs.append(Document(page_content=content,
+                                         metadata={"source": f.name}))
+            except Exception:
+                pass
+        return docs
+
+    vs = None  # guard: keeps `vs` defined when no index file exists yet
+    index_file = VECTOR_DIR / "index.faiss"
+    if index_file.exists():
+        try:
+            vs = FAISS.load_local(str(VECTOR_DIR), embeddings,
+                                  allow_dangerous_deserialization=True)
+            print("   Vector store loaded from disk.")
+        except Exception:
+            vs = None
+
+    if not index_file.exists() or vs is None:
+        docs = _load_docs() or [Document(page_content="No data yet.")]
+        chunks = splitter.split_documents(docs)
+        vs = FAISS.from_documents(chunks, embeddings)
+        vs.save_local(str(VECTOR_DIR))
+        print("   Vector store built and saved.")
+
+    RETRIEVER = vs.as_retriever(search_kwargs={"k": 3})
+    print("   ✓ Vector store ready")
+
+    # 2. LLM: loaded into RAM, stays there forever
+    print("[2/3] Loading LLM into RAM (model pre-cached in image)...", flush=True)
+    from huggingface_hub import hf_hub_download
+    from llama_cpp import Llama

+    model_path = hf_hub_download(
+        repo_id="unsloth/Qwen3.5-0.8B-GGUF",
+        filename="Qwen3.5-0.8B-UD-Q2_K_XL.gguf",
+        cache_dir=str(CACHE_DIR),
+        local_files_only=True,  # never re-download; use baked image cache
+    )

+    LLM = Llama(
+        model_path=model_path,
+        n_ctx=2048,
+        n_threads=os.cpu_count() or 4,
+        n_batch=512,      # larger batch = faster prompt processing
+        use_mmap=True,    # memory-map the file: fastest cold load on CPU
+        use_mlock=True,   # lock pages in RAM: prevents swap thrashing
+        verbose=False,
+    )
+    print("   ✓ LLM ready")
+
+    # 3. TTS (optional)
+    print("[3/3] Loading TTS...", flush=True)
    try:
+        from kittentts import KittenTTS
+        TTS = KittenTTS("KittenML/kitten-tts-nano-0.8-fp32")
+        TTS_OK = True
+        print("   ✓ TTS ready (Kiki)")
    except Exception as e:
+        print(f"   ✗ TTS unavailable: {e}")
+
+    print("\n ✓ ALL SYSTEMS ONLINE - serving on :7860\n")
+    yield
+
+    # Shutdown
+    print("J.A.R.V.I.S: shutting down.")
+
+
+# ──────────────────────────────────────────
+# APP
+# ──────────────────────────────────────────
+app = FastAPI(title="J.A.R.V.I.S", lifespan=lifespan)
+app.mount("/static", StaticFiles(directory="static"), name="static")
+

+# ── Request / response schemas ──
+class ChatRequest(BaseModel):
+    message: str
+    history: list[list[str]] = []  # [[user, assistant], ...]
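+    # Illustrative payload for this schema (values are examples, not part of the API):
+    #   {"message": "What time is it?", "history": [["hi", "Hello, sir."]]}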

+class ChatResponse(BaseModel):
+    reply: str
+
+
+# ── Routes ──
+@app.get("/", response_class=HTMLResponse)
+async def root():
+    return FileResponse("static/index.html")
+
+
+@app.get("/health")
+async def health():
+    return {"status": "ok", "llm": LLM is not None, "tts": TTS_OK}
+
+
+@app.post("/chat", response_model=ChatResponse)
+async def chat(req: ChatRequest):
+    if LLM is None:
+        raise HTTPException(503, "Model not ready yet")
+
+    # Retrieve context
+    context = ""
+    try:
+        docs = RETRIEVER.invoke(req.message)
+        context = "\n".join(d.page_content for d in docs)
+    except Exception:
+        pass
+
+    # Build messages
+    system = SYSTEM_PROMPT
+    if context.strip():
+        system += f"\n\nBackground context (use only if relevant):\n{context}"
+
+    messages = [{"role": "system", "content": system}]
+    for turn in req.history[-4:]:
+        if len(turn) == 2:
+            messages.append({"role": "user", "content": turn[0]})
+            messages.append({"role": "assistant", "content": turn[1]})
+    messages.append({"role": "user", "content": req.message})
+
+    # Generate (run in a thread so we don't block the event loop)
+    loop = asyncio.get_event_loop()
+
+    def _generate():
+        result = LLM.create_chat_completion(
+            messages=messages,
+            max_tokens=150,
+            temperature=0.65,
+            top_p=0.9,
+            repeat_penalty=1.1,
+            stream=False,
+        )
+        return result["choices"][0]["message"]["content"].strip()
+
+    reply = await loop.run_in_executor(None, _generate)
+    return ChatResponse(reply=reply)
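+    # Quick smoke test for this endpoint (illustrative; assumes the default
+    # port 7860 and an empty history):
+    #   curl -X POST localhost:7860/chat -H "Content-Type: application/json" \
+    #        -d '{"message": "Hello", "history": []}'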
+
+
+@app.post("/chat/stream")
+async def chat_stream(req: ChatRequest):
+    """Server-Sent Events streaming endpoint."""
+    if LLM is None:
+        raise HTTPException(503, "Model not ready yet")
+
+    context = ""
+    try:
+        docs = RETRIEVER.invoke(req.message)
+        context = "\n".join(d.page_content for d in docs)
+    except Exception:
+        pass
+
+    system = SYSTEM_PROMPT
+    if context.strip():
+        system += f"\n\nBackground context:\n{context}"
+
+    messages = [{"role": "system", "content": system}]
+    for turn in req.history[-4:]:
+        if len(turn) == 2:
+            messages.append({"role": "user", "content": turn[0]})
+            messages.append({"role": "assistant", "content": turn[1]})
+    messages.append({"role": "user", "content": req.message})
+
+    async def event_gen():
+        loop = asyncio.get_event_loop()
+        queue = asyncio.Queue()
+
+        def _stream():
+            for chunk in LLM.create_chat_completion(
+                messages=messages,
+                max_tokens=150,
+                temperature=0.65,
+                top_p=0.9,
+                repeat_penalty=1.1,
+                stream=True,
+            ):
+                piece = chunk["choices"][0].get("delta", {}).get("content", "")
+                if piece:
+                    asyncio.run_coroutine_threadsafe(queue.put(piece), loop)
+            asyncio.run_coroutine_threadsafe(queue.put(None), loop)  # sentinel
+
+        loop.run_in_executor(None, _stream)
+
+        while True:
+            piece = await queue.get()
+            if piece is None:
+                yield "data: [DONE]\n\n"
+                break
+            yield f"data: {json.dumps(piece)}\n\n"
+
+    return StreamingResponse(event_gen(), media_type="text/event-stream")
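+    # Each SSE frame above is "data: <JSON-encoded token>" and the stream ends
+    # with "data: [DONE]". Because this is a POST endpoint, a browser client
+    # would read it with fetch() and a ReadableStream reader, not EventSource.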
+
+
+@app.post("/tts")
+async def tts_endpoint(body: dict):
+    """Return raw PCM audio bytes for the given text."""
+    if not TTS_OK:
+        raise HTTPException(503, "TTS not available")
+    text = body.get("text", "").strip()
+    if not text:
+        raise HTTPException(400, "No text provided")
+
+    loop = asyncio.get_event_loop()
+
+    def _speak():
+        return TTS.generate(text, voice="Kiki")
+
+    audio_bytes = await loop.run_in_executor(None, _speak)
+    return StreamingResponse(iter([bytes(audio_bytes)]),
+                             media_type="audio/wav")
+
+
+@app.post("/save")
+async def save_chat(body: dict):
+    history = body.get("history", [])
+    if not history:
+        return {"saved": False}
+    path = CHATS_DIR / f"session_{int(time.time())}.json"
+    messages = []
+    for turn in history:
+        if len(turn) == 2:
+            messages.append({"role": "user", "content": turn[0]})
+            messages.append({"role": "assistant", "content": turn[1]})
+    path.write_text(json.dumps({"messages": messages}, ensure_ascii=False, indent=2))
+    return {"saved": True, "path": str(path)}

if __name__ == "__main__":
+    uvicorn.run(
+        "app:app",
+        host="0.0.0.0",
+        port=int(os.environ.get("PORT", 7860)),
+        log_level="warning",
+    )