| |
| |
| |
| |
| |
| |
| |
|
|
"""
ECHOFORM™ — Refactored, Debugged, HF-Serious Voice Cognition Engine

Fixes applied:
- Tokenless-safe models
- Audio shape normalization
- CPU-isolated embeddings
- Deterministic inference
- Defensive TTS handling
- Prompt format aligned to Qwen
- Memory race mitigation (single-tenant safe)
- Payload size guard
- inference_mode enabled
"""
|
|
import base64
import io
import time
import uuid
from typing import Dict, List

import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException, UploadFile
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
|
|
| |
| |
| |
|
|
# Force cuDNN onto deterministic kernels and disable benchmark auto-tuning,
# which can otherwise select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
|
|
| |
| |
| |
|
|
# Compute device: first CUDA GPU when available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# Hugging Face model identifiers for each stage of the pipeline.
STT_MODEL = "distil-whisper/distil-large-v3"
LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
TTS_MODEL = "espnet/kan-bayashi_ljspeech_vits"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L12-v2"


MAX_TOKENS = 192              # max new tokens generated per LLM reply
LATENT_MEMORY_LIMIT = 8       # rolling memory slots retained
MAX_AUDIO_BYTES = 10_000_000  # reject uploads larger than ~10 MB
|
|
| |
| |
| |
|
|
# API surface with interactive docs endpoints disabled (hardening).
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)


# Speech-to-text pipeline; transformers uses device index 0 for the first
# GPU and -1 for CPU.
stt = pipeline(
    "automatic-speech-recognition",
    model=STT_MODEL,
    device=0 if DEVICE == "cuda" else -1,
)


# Causal LM + tokenizer. device_map="auto" lets accelerate place the model;
# fp16 on GPU halves memory, full fp32 on CPU for numeric safety.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
llm = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    device_map="auto",
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)


# Text-to-speech pipeline (same device-index convention as STT).
tts = pipeline(
    "text-to-speech",
    model=TTS_MODEL,
    device=0 if DEVICE == "cuda" else -1,
)


# Embeddings deliberately pinned to CPU so they never compete with the
# LLM for GPU memory.
embedder = SentenceTransformer(EMBED_MODEL, device="cpu")
|
|
| |
| |
| |
|
|
# In-process rolling memory: each entry is {"text": str, "emb": np.ndarray}.
# No locking — safe only for a single-tenant, single-worker deployment.
LATENT_MEMORY: List[Dict[str, np.ndarray]] = []
|
|
| |
| |
| |
|
|
class AgentResponse(BaseModel):
    """Response envelope for the /v1/voice endpoint."""

    id: str                 # random UUID generated per response
    text: str               # generated assistant reply
    latency_ms: int         # wall-clock time spent handling the request
    memory_slots_used: int  # number of recalled memory snippets in the prompt
|
|
| |
| |
| |
|
|
def now_ms() -> int:
    """Return the current wall-clock time as whole milliseconds since the epoch."""
    seconds = time.time()
    return int(seconds * 1000)
|
|
|
|
def normalize_audio(pcm: np.ndarray) -> np.ndarray:
    """Collapse an audio buffer to mono float32.

    Input of shape (frames, channels) is averaged across channels; an
    already-mono buffer passes through with only the dtype cast.
    """
    samples = pcm.astype(np.float32)
    if samples.ndim == 1:
        return samples
    return samples.mean(axis=1)
|
|
|
|
def remember(text: str) -> None:
    """Embed *text* and append it to the rolling latent memory.

    The store is trimmed so that only the most recent
    LATENT_MEMORY_LIMIT entries survive.
    """
    vector = embedder.encode(text, normalize_embeddings=True)
    LATENT_MEMORY.append({"text": text, "emb": vector})
    overflow = len(LATENT_MEMORY) - LATENT_MEMORY_LIMIT
    if overflow > 0:
        del LATENT_MEMORY[:overflow]
|
|
|
|
def recall(query: str) -> List[str]:
    """Return up to three stored texts most similar to *query*.

    Similarity is the dot product of normalized embeddings (cosine);
    an empty memory yields an empty list.
    """
    if not LATENT_MEMORY:
        return []
    query_vec = embedder.encode(query, normalize_embeddings=True)
    scored = [(np.dot(query_vec, entry["emb"]), entry["text"]) for entry in LATENT_MEMORY]
    scored.sort(reverse=True)
    return [snippet for _, snippet in scored[:3]]
|
|
|
|
def build_prompt(query: str, memories: List[str]) -> str:
    """Assemble the plain-text LLM prompt: system line, memory block, user turn."""
    system_line = (
        "You are a deterministic conversational intelligence. "
        "Respond concisely and precisely."
    )
    memory_block = "\n".join(memories)
    return f"{system_line}\n\nMemory:\n{memory_block}\n\nUser: {query}\nAssistant:"
|
|
| |
| |
| |
|
|
@app.post("/v1/voice", response_model=AgentResponse)
async def voice(audio: UploadFile):
    """Full voice turn: transcribe upload → recall memory → generate reply.

    Raises:
        HTTPException 413: payload exceeds MAX_AUDIO_BYTES.
        HTTPException 400: bytes are not decodable audio.
    """
    start = now_ms()

    raw = await audio.read()
    if len(raw) > MAX_AUDIO_BYTES:
        raise HTTPException(413, "Audio payload too large")

    try:
        pcm, sr = sf.read(io.BytesIO(raw))
        pcm = normalize_audio(pcm)
    except Exception:
        raise HTTPException(400, "Invalid audio format")

    # The ASR pipeline takes samples and rate bundled in a single input dict;
    # passing sampling_rate as a bare kwarg is rejected by transformers.
    text = stt({"raw": pcm, "sampling_rate": sr})["text"].strip()
    memories = recall(text)
    prompt = build_prompt(text, memories)

    inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)

    with torch.inference_mode():
        out = llm.generate(
            **inputs,
            max_new_tokens=MAX_TOKENS,
            do_sample=False,
            temperature=None,  # explicit None silences sampling-arg warnings
            top_p=None,
            eos_token_id=tokenizer.eos_token_id,
            # Tokenless-safe: fall back to EOS when the tokenizer has no pad token.
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
        )

    # Decode only the newly generated suffix. Splitting the full decode on
    # "Assistant:" mangles the reply whenever the model emits that marker.
    prompt_len = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

    remember(text)
    remember(answer)

    return AgentResponse(
        id=str(uuid.uuid4()),
        text=answer,
        latency_ms=now_ms() - start,
        memory_slots_used=len(memories),
    )
|
|
|
|
@app.post("/v1/speak")
def speak(text: str):
    """Synthesize *text* to WAV and return it as base64 inside a JSON body.

    Raises:
        HTTPException 500: TTS pipeline returned an unexpected structure.
    """
    audio_out = tts(text)
    if isinstance(audio_out, list):
        audio_out = audio_out[0]

    if "audio" not in audio_out or "sampling_rate" not in audio_out:
        raise HTTPException(500, "TTS output malformed")

    # TTS models often emit shape (1, samples); squeeze so soundfile does not
    # misinterpret the buffer as a single frame with thousands of channels.
    samples = np.asarray(audio_out["audio"]).squeeze()

    buf = io.BytesIO()
    sf.write(buf, samples, audio_out["sampling_rate"], format="WAV")
    buf.seek(0)

    # Raw bytes are not JSON-serializable (FastAPI's encoder attempts a UTF-8
    # decode, which fails on binary WAV data) — ship base64 text instead.
    return {
        "audio": base64.b64encode(buf.read()).decode("ascii"),
        "sample_rate": audio_out["sampling_rate"],
    }
|
|
|
|
@app.get("/health")
def health():
    """Liveness probe: reports the compute device and configured model ids."""
    model_ids = {
        "stt": STT_MODEL,
        "llm": LLM_MODEL,
        "tts": TTS_MODEL,
        "embed": EMBED_MODEL,
    }
    return {"status": "ok", "device": DEVICE, "models": model_ids}
|
|
| |
| |
| |