#!/usr/bin/env python3
# ============================================================
# PROPRIETARY SOFTWARE — CONFIDENTIAL
# Product: ECHOFORM™
# File: app.py
# Mode: Tokenless · Offline · HF-native
# ============================================================
"""
ECHOFORM™ — Refactored, Debugged, HF-Serious Voice Cognition Engine
Fixes applied:
- Tokenless-safe models
- Audio shape normalization
- CPU-isolated embeddings
- Deterministic inference
- Defensive TTS handling
- Prompt format aligned to Qwen
- Memory race mitigation (single-tenant safe)
- Payload size guard
- inference_mode enabled
"""
import io
import time
import uuid
import torch
import numpy as np
import soundfile as sf
from typing import List, Dict
from fastapi import FastAPI, UploadFile, HTTPException, Response
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ============================================================
# HARD DETERMINISM
# ============================================================
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# ============================================================
# CONFIG (TOKENLESS-SAFE)
# ============================================================
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
STT_MODEL = "distil-whisper/distil-large-v3"
LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
# NOTE: espnet/* checkpoints require espnet2 and cannot be loaded through the
# transformers "text-to-speech" pipeline; a transformers-native VITS checkpoint
# is used instead.
TTS_MODEL = "facebook/mms-tts-eng"
EMBED_MODEL = "sentence-transformers/all-MiniLM-L12-v2"
MAX_TOKENS = 192
LATENT_MEMORY_LIMIT = 8
MAX_AUDIO_BYTES = 10_000_000 # 10MB cap
# ============================================================
# INIT (LOCAL ONLY)
# ============================================================
app = FastAPI(docs_url=None, redoc_url=None, openapi_url=None)
stt = pipeline(
"automatic-speech-recognition",
model=STT_MODEL,
device=0 if DEVICE == "cuda" else -1,
)
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
llm = AutoModelForCausalLM.from_pretrained(
LLM_MODEL,
device_map="auto",
torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
)
tts = pipeline(
"text-to-speech",
model=TTS_MODEL,
device=0 if DEVICE == "cuda" else -1,
)
# embeddings forced to CPU to avoid VRAM contention
embedder = SentenceTransformer(EMBED_MODEL, device="cpu")
# ============================================================
# EPHEMERAL MEMORY (SINGLE-TENANT)
# ============================================================
LATENT_MEMORY: List[Dict[str, np.ndarray]] = []
# ============================================================
# SCHEMA
# ============================================================
class AgentResponse(BaseModel):
id: str
text: str
latency_ms: int
memory_slots_used: int
# ============================================================
# CORE LOGIC
# ============================================================
def now_ms() -> int:
return int(time.time() * 1000)
def normalize_audio(pcm: np.ndarray) -> np.ndarray:
    # soundfile returns float64 and possibly multi-channel data; STT expects mono float32
    pcm = pcm.astype(np.float32)
    if pcm.ndim > 1:
        pcm = pcm.mean(axis=1)  # downmix to mono
    return pcm
def remember(text: str) -> None:
    emb = embedder.encode(text, normalize_embeddings=True)
    LATENT_MEMORY.append({"text": text, "emb": emb})
    # keep only the newest LATENT_MEMORY_LIMIT entries (no-op until the cap is hit)
    del LATENT_MEMORY[:-LATENT_MEMORY_LIMIT]
def recall(query: str) -> List[str]:
    if not LATENT_MEMORY:
        return []
    q = embedder.encode(query, normalize_embeddings=True)
    # embeddings are L2-normalized, so the dot product is cosine similarity
    ranked = sorted(
        ((np.dot(q, m["emb"]), m["text"]) for m in LATENT_MEMORY),
        reverse=True,
    )
    return [t for _, t in ranked[:3]]
def build_prompt(query: str, memories: List[str]) -> str:
    memory_block = "\n".join(memories) if memories else "(none)"
    messages = [
        {
            "role": "system",
            "content": (
                "You are a deterministic conversational intelligence. "
                "Respond concisely and precisely.\n\n"
                f"Memory:\n{memory_block}"
            ),
        },
        {"role": "user", "content": query},
    ]
    # Qwen2.5-Instruct is trained on its chat template; render it here so the
    # model answers as the assistant.
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
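# For reference, apply_chat_template renders roughly:
#   <|im_start|>system ... <|im_end|>
#   <|im_start|>user ... <|im_end|>
#   <|im_start|>assistant
# (the exact tokens come from the chat template bundled with the Qwen tokenizer)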
# ============================================================
# ENDPOINTS
# ============================================================
@app.post("/v1/voice", response_model=AgentResponse)
async def voice(audio: UploadFile):
start = now_ms()
raw = await audio.read()
if len(raw) > MAX_AUDIO_BYTES:
raise HTTPException(413, "Audio payload too large")
try:
pcm, sr = sf.read(io.BytesIO(raw))
pcm = normalize_audio(pcm)
except Exception:
raise HTTPException(400, "Invalid audio format")
    # pass raw samples plus their true sampling rate so the ASR pipeline can
    # resample to the rate the model expects
    text = stt({"raw": pcm, "sampling_rate": sr})["text"].strip()
memories = recall(text)
prompt = build_prompt(text, memories)
inputs = tokenizer(prompt, return_tensors="pt").to(llm.device)
with torch.inference_mode():
out = llm.generate(
**inputs,
max_new_tokens=MAX_TOKENS,
do_sample=False,
temperature=None,
top_p=None,
eos_token_id=tokenizer.eos_token_id,
)
    # decode only the newly generated tokens, not the echoed prompt
    new_tokens = out[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
remember(text)
remember(answer)
return AgentResponse(
id=str(uuid.uuid4()),
text=answer,
latency_ms=now_ms() - start,
memory_slots_used=len(memories),
)
@app.post("/v1/speak")
def speak(text: str):
    audio_out = tts(text)
    if isinstance(audio_out, list):
        audio_out = audio_out[0]
    if "audio" not in audio_out or "sampling_rate" not in audio_out:
        raise HTTPException(500, "TTS output malformed")
    # the pipeline may return a (1, n) batch; soundfile expects (n,) or (n, channels)
    audio = np.asarray(audio_out["audio"], dtype=np.float32).squeeze()
    buf = io.BytesIO()
    sf.write(buf, audio, audio_out["sampling_rate"], format="WAV")
    # raw WAV bytes are not JSON-serializable; return a binary audio response
    return Response(
        content=buf.getvalue(),
        media_type="audio/wav",
        headers={"X-Sample-Rate": str(audio_out["sampling_rate"])},
    )
@app.get("/health")
def health():
return {
"status": "ok",
"device": DEVICE,
"models": {
"stt": STT_MODEL,
"llm": LLM_MODEL,
"tts": TTS_MODEL,
"embed": EMBED_MODEL,
},
}
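# Minimal local entrypoint — a sketch that assumes this file is run directly
# rather than launched by an external uvicorn/Docker command; 7860 is the
# default Hugging Face Spaces port.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)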
# ============================================================
# EOF — HARDENED HF TOKENLESS REFACTOR
# ============================================================