#!/usr/bin/env python3
# ============================================================
# PROPRIETARY SOFTWARE — CONFIDENTIAL
# Product: ECHOFORM™
# File: app.py
# Mode: Tokenless · Offline · HF-native
# ============================================================
"""
ECHOFORM™ — Refactored, Debugged, HF-Serious Voice Cognition Engine
Fixes applied:
- Tokenless-safe models
- Audio shape normalization
- CPU-isolated embeddings
- Deterministic inference
- Defensive TTS handling
- Prompt format aligned to Qwen
- Memory race mitigation (single-tenant safe)
- Payload size guard
- inference_mode enabled
"""
import io
import time
import uuid
import torch
import numpy as np
import soundfile as sf
from typing import List, Dict
from fastapi import FastAPI, UploadFile, HTTPException, Response
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ============================================================
# HARD DETERMINISM
# ============================================================
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# ============================================================
# CONFIG (TOKENLESS-SAFE)
# ============================================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
STT_MODEL = "distil-whisper/distil-large-v3"
LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
# NOTE: espnet/* checkpoints require espnet2 and cannot be loaded through the
# transformers "text-to-speech" pipeline; a transformers-native VITS checkpoint
# is used instead.
TTS_MODEL = "facebook/mms-tts-eng"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
MAX_TOKENS = 192
LATENT_MEMORY_LIMIT = 8
MAX_AUDIO_BYTES = 10_000_000 # 10MB cap
# ============================================================
# INIT (LOCAL ONLY)
# ============================================================
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
stt = pipeline(
"automatic-speech-recognition",
model=STT_MODEL,
device=0 if DEVICE == "cuda" else -1,
)
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
llm = AutoModelForCausalLM.from_pretrained(
LLM_MODEL,
device_map="auto",
torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
tts = pipeline(
"text-to-speech",
model=TTS_MODEL,
device=0 if DEVICE == "cuda" else -1,
)
# embeddings forced to CPU to avoid VRAM contention
embedder = SentenceTransformer(EMBED_MODEL, device="cpu")
# ============================================================
# EPHEMERAL MEMORY (SINGLE-TENANT)
# ============================================================
LATENT_MEMORY: List[Dict[str, np.ndarray]] = []
# ============================================================
# SCHEMA
# ============================================================
class AgentResponse(BaseModel):
id: str
text: str
latency_ms: int
memory_slots_used: int
# ============================================================
# CORE LOGIC
# ============================================================
def now_ms() -> int:
return int(time.time() * 1000)
def normalize_audio(pcm: np.ndarray) -> np.ndarray:
    # soundfile returns float64 and possibly multi-channel data; STT expects mono float32
    pcm = pcm.astype(np.float32)
    if pcm.ndim > 1:
        pcm = pcm.mean(axis=1)  # downmix to mono
    return pcm
def remember(text: str) -> None:
    emb = embedder.encode(text, normalize_embeddings=True)
    LATENT_MEMORY.append({"text": text, "emb": emb})
    # keep only the newest LATENT_MEMORY_LIMIT entries (no-op until the cap is hit)
    del LATENT_MEMORY[:-LATENT_MEMORY_LIMIT]
def recall(query: str) -> List[str]:
    if not LATENT_MEMORY:
        return []
    q = embedder.encode(query, normalize_embeddings=True)
    # embeddings are L2-normalized, so the dot product is cosine similarity
    ranked = sorted(
        ((np.dot(q, m["emb"]), m["text"]) for m in LATENT_MEMORY),
        reverse=True,
    )
    return [t for _, t in ranked[:3]]
def build_prompt(query: str, memories: List[str]) -> str:
    memory_block = "\n".join(memories) if memories else "(none)"
    messages = [
        {
            "role": "system",
            "content": (
                "You are a deterministic conversational intelligence. "
                "Respond concisely and precisely.\n\n"
                f"Memory:\n{memory_block}"
            ),
        },
        {"role": "user", "content": query},
    ]
    # Qwen2.5-Instruct is trained on its chat template; render it here so the
    # model answers as the assistant.
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
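# For reference, apply_chat_template renders roughly:
#   <|im_start|>system ... <|im_end|>
#   <|im_start|>user ... <|im_end|>
#   <|im_start|>assistant
# (the exact tokens come from the chat template bundled with the Qwen tokenizer)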
# ============================================================
# ENDPOINTS
# ============================================================
@app.post("/v1/voice", response_model=AgentResponse)
async def voice(audio: UploadFile):
start = now_ms()
raw = await audio.read()
if len(raw) > MAX_AUDIO_BYTES:
raise HTTPException(413, "Audio payload too large")
try:
pcm, sr = sf.read(io.BytesIO(raw))
pcm = normalize_audio(pcm)
except Exception:
raise HTTPException(400, "Invalid audio format")
    # pass raw samples plus their true sampling rate so the ASR pipeline can
    # resample to the rate the model expects
    text = stt({"raw": pcm, "sampling_rate": sr})["text"].strip()
memories = recall(text)
prompt = build_prompt(text, memories)
inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
with torch.inference_mode():
out = llm.generate(
**inputs,
max_new_tokens=MAX_TOKENS,
do_sample=False,
temperature=None,
top_p=None,
eos_token_id=tokenizer.eos_token_id,
)
    # decode only the newly generated tokens, not the echoed prompt
    new_tokens = out[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
remember(text)
remember(answer)
return AgentResponse(
id=str(uuid.uuid4()),
text=answer,
latency_ms=now_ms() - start,
memory_slots_used=len(memories),
)
@app.post("/v1/speak")
def speak(text: str):
    audio_out = tts(text)
    if isinstance(audio_out, list):
        audio_out = audio_out[0]
    if "audio" not in audio_out or "sampling_rate" not in audio_out:
        raise HTTPException(500, "TTS output malformed")
    # the pipeline may return a (1, n) batch; soundfile expects (n,) or (n, channels)
    audio = np.asarray(audio_out["audio"], dtype=np.float32).squeeze()
    buf = io.BytesIO()
    sf.write(buf, audio, audio_out["sampling_rate"], format="WAV")
    # raw WAV bytes are not JSON-serializable; return a binary audio response
    return Response(
        content=buf.getvalue(),
        media_type="audio/wav",
        headers={"X-Sample-Rate": str(audio_out["sampling_rate"])},
    )
@app.get("/health")
def health():
return {
"status": "ok",
"device": DEVICE,
"models": {
"stt": STT_MODEL,
"llm": LLM_MODEL,
"tts": TTS_MODEL,
"embed": EMBED_MODEL,
},
}
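# Minimal local entrypoint — a sketch that assumes this file is run directly
# rather than launched by an external uvicorn/Docker command; 7860 is the
# default Hugging Face Spaces port.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)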
# ============================================================
# EOF — HARDENED HF TOKENLESS REFACTOR
# ============================================================