Spaces:

build-small-hackathon
/

tiny-army

Running

polats Claude Opus 4.8 (1M context) committed on Jun 5

Commit

2151ea1

1 Parent(s): 476605d

Portraits step 2: /portrait engine routing + imagen facade + Settings picker

- app.py /portrait now honors an `engine` hint ('local' | 'cloud' | auto), so the
frontend can pick Z-Image-on-GPU vs cloud FLUX explicitly. Local loader uses
4-bit NF4 + a per-process VRAM cap (peak ~8 GB) so it can't crash the display.
- web/imagen.js — image facade mirroring tts.js (engines, persisted activeId,
onImageEngineChange, generatePortrait → PNG Blob).
- web/imagenServer.js — engines backed by /portrait: zimage-local (your GPU,
localhost-only) + flux / flux-dev (cloud NIM, HF fallback).
- web/imagenBar.js + settingsPanel.js — a "Portrait" provider picker section,
injected after Voice, persisted and synced like the voice picker.

Verified locally: picker renders/persists; engine:'local' generates a 1024² PNG;
engine:'cloud' cleanly 503s without keys (cloud verified in prod).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

Files changed (5) hide show

app.py +142 -0
web/imagen.js +42 -0
web/imagenBar.js +42 -0
web/imagenServer.js +50 -0
web/settingsPanel.js +4 -0

app.py CHANGED Viewed

@@ -396,6 +396,148 @@ async def qwen_tts(request: Request):
     return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
 @fastapi_app.get("/persona/status")
 def persona_status():
     return llm.status()

     return Response(wav, media_type="audio/wav", headers={"Cache-Control": "no-store"})
+# ── Persona portraits (image generation) ─────────────────────────────────────
+# Mirrors the voice path: TINY_IMAGE_MODE=local runs the OPEN WEIGHTS on your GPU
+# (Z-Image-Turbo, 6B, ~12 GB bf16 — coexists with the TTS model on a 24 GB card);
+# otherwise we proxy a cloud provider so its key stays server-side. Returns a PNG.
+IMAGE_MODE = os.environ.get("TINY_IMAGE_MODE", "").strip().lower()
+HF_TOKEN = os.environ.get("HF_TOKEN", "")
+NIM_KEY = os.environ.get("NVIDIA_NIM_API_KEY", "")
+_NIM_BASE = "https://ai.api.nvidia.com/v1/genai"
+# id -> NIM FLUX preset (same shapes woid uses: schnell fast, dev higher quality).
+_NIM_PROVIDERS = {
+    "flux-schnell": {"model": "black-forest-labs/flux.1-schnell", "steps": 4, "cfg": 0.0},
+    "flux-dev": {"model": "black-forest-labs/flux.1-dev", "steps": 28, "cfg": 3.5},
+}
+_MIN_IMAGE_BYTES = 15_000  # smaller than this = a blank/safety-blocked frame → retry
+_img_pipe = None
+_img_lock = threading.Lock()
+def _load_image_pipe():
+    import torch
+    from diffusers import ZImagePipeline
+    mid = os.environ.get("TINY_IMAGE_MODEL", "Tongyi-MAI/Z-Image-Turbo")
+    if not torch.cuda.is_available():
+        return ZImagePipeline.from_pretrained(mid, torch_dtype=torch.float32).to("cpu")
+    dt = torch.bfloat16
+    # GUARDRAIL: the 3090 also drives the desktop, so cap THIS process's VRAM — a spike can
+    # then never grab the whole card and crash the display (it OOM-errors instead). Default
+    # ~60% of 24 GB ≈ 14 GB; measured peak is ~8 GB, so there's comfortable headroom.
+    try:
+        torch.cuda.set_per_process_memory_fraction(float(os.environ.get("TINY_IMAGE_VRAM_FRAC", "0.6")), 0)
+    except Exception:  # noqa: BLE001
+        pass
+    if os.environ.get("TINY_IMAGE_QUANT", "1").lower() not in ("0", "false", "no"):
+        # 4-bit NF4 quantize the 6B transformer (~12 GB bf16 → ~3 GB). With cpu-offload the
+        # measured peak is ~8 GB (vs ~20 GB unquantized, which crashed the display), and the
+        # VRAM frees between generations. Portrait quality is effectively unchanged.
+        from diffusers import ZImageTransformer2DModel, BitsAndBytesConfig
+        quant = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=dt)
+        transformer = ZImageTransformer2DModel.from_pretrained(mid, subfolder="transformer", quantization_config=quant, torch_dtype=dt)
+        pipe = ZImagePipeline.from_pretrained(mid, transformer=transformer, torch_dtype=dt)
+    else:
+        pipe = ZImagePipeline.from_pretrained(mid, torch_dtype=dt)
+    # Non-transformer components stream CPU↔GPU per forward; VRAM returns to ~0 when idle.
+    pipe.enable_model_cpu_offload()
+    return pipe
+def _local_portrait(prompt, seed=None, width=1024, height=1024, steps=9):
+    global _img_pipe
+    import io
+    import torch
+    with _img_lock:  # one GPU model can't decode in parallel
+        if _img_pipe is None:
+            _img_pipe = _load_image_pipe()
+        gen = None
+        if seed is not None:
+            dev = "cuda" if torch.cuda.is_available() else "cpu"
+            gen = torch.Generator(dev).manual_seed(int(seed))
+        img = _img_pipe(prompt=prompt, height=height, width=width,
+                        num_inference_steps=steps, guidance_scale=0.0, generator=gen).images[0]
+    out = io.BytesIO(); img.save(out, format="PNG")
+    return out.getvalue()
+def _nim_portrait(prompt, provider="flux-schnell", width=1024, height=1024):
+    import random
+    p = _NIM_PROVIDERS.get(provider, _NIM_PROVIDERS["flux-schnell"])
+    url = f"{_NIM_BASE}/{p['model']}"
+    for _ in range(3):  # retry under the blank/safety-blocked threshold (woid's guard)
+        payload = _json.dumps({
+            "prompt": prompt, "cfg_scale": p["cfg"], "width": width, "height": height,
+            "seed": random.randint(0, 2_147_483_647), "steps": p["steps"],
+        }).encode()
+        req = urllib.request.Request(url, data=payload, method="POST", headers={
+            "Authorization": f"Bearer {NIM_KEY}", "Content-Type": "application/json", "Accept": "application/json",
+        })
+        try:
+            with urllib.request.urlopen(req, timeout=120) as r:
+                j = _json.loads(r.read().decode())
+        except urllib.error.HTTPError as e:
+            return None, f"nim image {e.code}: {e.read().decode()[:200]}"
+        except Exception as e:  # noqa: BLE001
+            return None, f"nim image error: {e}"
+        b64 = j.get("image") or (j.get("artifacts") or [{}])[0].get("base64")
+        if not b64:
+            continue
+        data = base64.b64decode(b64)
+        if len(data) >= _MIN_IMAGE_BYTES:
+            return data, None
+    return None, "image kept coming back blank — safety-blocked prompt?"
+def _hf_portrait(prompt, model="black-forest-labs/FLUX.1-schnell"):
+    # HF Inference (text-to-image) returns raw image bytes; reuses our existing HF_TOKEN.
+    url = f"https://api-inference.huggingface.co/models/{model}"
+    payload = _json.dumps({"inputs": prompt}).encode()
+    req = urllib.request.Request(url, data=payload, method="POST", headers={
+        "Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json", "Accept": "image/png",
+    })
+    try:
+        with urllib.request.urlopen(req, timeout=120) as r:
+            data = r.read()
+    except urllib.error.HTTPError as e:
+        return None, f"hf image {e.code}: {e.read().decode()[:200]}"
+    except Exception as e:  # noqa: BLE001
+        return None, f"hf image error: {e}"
+    if len(data) < _MIN_IMAGE_BYTES:
+        return None, "hf returned a tiny/blank image"
+    return data, None
+@fastapi_app.post("/portrait")
+async def portrait(request: Request):
+    body = await request.json()
+    prompt = (body.get("prompt") or "").strip()
+    seed = body.get("seed")
+    provider = body.get("provider") or ""  # cloud sub-provider hint (e.g. flux-dev)
+    engine = (body.get("engine") or "").strip().lower()  # 'local' | 'cloud' | '' = auto
+    if not prompt:
+        return Response("prompt required", status_code=400)
+    want_local = engine == "local" or (not engine and IMAGE_MODE == "local")
+    if want_local:  # in-process open weights on your GPU (dev)
+        if IMAGE_MODE != "local":
+            return Response("local image mode not enabled (run with TINY_IMAGE_MODE=local)", status_code=503)
+        try:
+            png = await asyncio.to_thread(_local_portrait, prompt, seed)
+        except Exception as e:  # noqa: BLE001 — surface a clear setup hint
+            return Response(f"local image error (pip install 'git+https://github.com/huggingface/diffusers' accelerate?): {e}", status_code=500)
+        return Response(png, media_type="image/png", headers={"Cache-Control": "no-store"})
+    # Cloud: prefer NVIDIA NIM (woid's FLUX path), else HF Inference (our HF_TOKEN).
+    if NIM_KEY:
+        png, err = await asyncio.to_thread(_nim_portrait, prompt, provider or "flux-schnell")
+    elif HF_TOKEN:
+        png, err = await asyncio.to_thread(_hf_portrait, prompt)
+    else:
+        return Response("no image provider (set NVIDIA_NIM_API_KEY / HF_TOKEN, or TINY_IMAGE_MODE=local)", status_code=503)
+    if err:
+        return Response(err, status_code=502)
+    return Response(png, media_type="image/png", headers={"Cache-Control": "no-store"})
 @fastapi_app.get("/persona/status")
 def persona_status():
     return llm.status()

web/imagen.js ADDED Viewed

	@@ -0,0 +1,42 @@

+// Image facade — mirrors tts.js. Picks the active portrait engine (local Z-Image on your
+// GPU, or cloud FLUX; in-browser SD-Turbo / Janus get added here later) and exposes one
+// generatePortrait(). The persona panel + the Settings image bar import only from here.
+import { engineLocal as zimagelocal, engineCloud as flux, engineCloudDev as fluxdev, isLocalhost } from '/web/imagenServer.js'
+const ENGINES = [zimagelocal, flux, fluxdev]
+// Default: local Z-Image on localhost (your GPU), cloud FLUX in prod. Persisted across
+// refreshes; a saved choice wins if it's still available.
+const KEY = 'tinyarmy.imageEngine'
+let activeId = (() => {
+  let saved = ''
+  try { saved = localStorage.getItem(KEY) || '' } catch { /* ignore */ }
+  const e = ENGINES.find((x) => x.id === saved)
+  return e && e.available() ? saved : (isLocalhost() ? 'zimage-local' : 'flux')
+})()
+const eng = () => ENGINES.find((e) => e.id === activeId) || ENGINES.find((e) => e.available()) || ENGINES[0]
+export const listImageEngines = () =>
+  ENGINES.map((e) => ({ id: e.id, label: e.label, available: e.available(), note: e.note || '' }))
+export const getImageEngineId = () => activeId
+const _listeners = new Set()
+export function onImageEngineChange(fn) { _listeners.add(fn); return () => _listeners.delete(fn) }
+export function setImageEngine(id) {
+  if (!ENGINES.some((e) => e.id === id) || id === activeId) return
+  activeId = id
+  try { localStorage.setItem(KEY, id) } catch { /* ignore */ }
+  for (const fn of _listeners) { try { fn(id) } catch { /* ignore */ } }
+}
+export const imageNeedsDownload = () => !!eng().needsDownload
+export const imageBackendLabel = () => eng().backendLabel()
+export const imageNetworked = () => !!eng().networked
+export async function ensureImage(onProgress) { return eng().ensure(onProgress) }
+// Generate a portrait → PNG Blob. `prompt` is the appearance description; `seed` keeps it
+// reproducible where the engine supports it.
+export async function generatePortrait(prompt, { seed } = {}) {
+  return eng().generate(prompt, { seed })
+}

web/imagenBar.js ADDED Viewed

	@@ -0,0 +1,42 @@

+// Portrait-model picker for the Settings page. Chooses the engine that draws hero
+// portraits (local Z-Image on your GPU, or cloud FLUX). Mirrors ttsBar.js: it only sets
+// the engine on the shared imagen.js facade; the persona panel reads that choice.
+import {
+  listImageEngines, getImageEngineId, setImageEngine,
+  imageBackendLabel, imageNeedsDownload, onImageEngineChange,
+} from '/web/imagen.js'
+function el(tag, props = {}, kids = []) {
+  const n = document.createElement(tag)
+  for (const [k, v] of Object.entries(props)) {
+    if (k === 'class') n.className = v
+    else if (k.startsWith('on') && typeof v === 'function') n.addEventListener(k.slice(2), v)
+    else if (v != null) n.setAttribute(k, v)
+  }
+  for (const kid of [].concat(kids)) if (kid != null) n.append(kid)
+  return n
+}
+export function mountImagenBar(host, { onChange } = {}) {
+  const engSel = el('select', { class: 'model-select engine-select' })
+  const info = el('div', { class: 'model-info' })
+  host.append(el('div', { class: 'model-bar imagen-bar' }, [
+    el('label', { class: 'persona-label' }, '🖼 Portrait model'),
+    engSel, info,
+  ]))
+  engSel.replaceChildren(...listImageEngines().map((e) =>
+    el('option', { value: e.id, ...(e.available ? {} : { disabled: 'disabled' }) },
+      `${e.label}${e.available ? '' : ' · ' + (e.note || 'n/a')}`)))
+  engSel.value = getImageEngineId()
+  function renderInfo() {
+    info.textContent = `${imageBackendLabel()}${imageNeedsDownload() ? ' · downloads on first use' : ''}`
+  }
+  engSel.addEventListener('change', () => { setImageEngine(engSel.value); renderInfo(); onChange && onChange() })
+  onImageEngineChange((id) => { engSel.value = id; renderInfo() })
+  renderInfo()
+  return { refresh: renderInfo }
+}

web/imagenServer.js ADDED Viewed

	@@ -0,0 +1,50 @@

+// Image engines backed by our /portrait route (server-side, so keys stay off the client):
+//   • zimage-local — Z-Image-Turbo on YOUR GPU (TINY_IMAGE_MODE=local). Localhost only.
+//   • flux / flux-dev — FLUX via NVIDIA NIM (or HF Inference) in the cloud.
+// Mirrors ttsQwen3.js (engine + engineLocal). generate() returns a PNG Blob.
+export const isLocalhost = () => {
+  try { return /^(localhost|127\.0\.0\.1|\[?::1\]?|0\.0\.0\.0)$/i.test(location.hostname) } catch { return false }
+}
+async function postPortrait(body) {
+  const resp = await fetch('/portrait', {
+    method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify(body),
+  })
+  if (!resp.ok) throw new Error(`portrait ${resp.status}: ${(await resp.text()).slice(0, 160)}`)
+  return resp.blob() // image/png
+}
+const common = { needsDownload: false, networked: true, ensure: async () => {} }
+// LOCAL: same-origin /portrait on a locally-run app.py (TINY_IMAGE_MODE=local → Z-Image
+// on your GPU). Only offered on localhost; shown disabled with a note in prod.
+export const engineLocal = {
+  ...common,
+  id: 'zimage-local',
+  label: 'Z-Image-Turbo · local (your GPU)',
+  available: () => isLocalhost(),
+  note: 'run the project locally',
+  generate: (prompt, { seed } = {}) => postPortrait({ prompt, seed, engine: 'local' }),
+  backendLabel: () => '🖥 local model',
+}
+// CLOUD: FLUX via the backend proxy (NVIDIA NIM, else HF Inference). `provider` picks the
+// NIM sub-model. schnell = fast (4 steps), dev = higher quality (28 steps).
+export const engineCloud = {
+  ...common,
+  id: 'flux',
+  label: 'FLUX schnell · cloud (fast)',
+  available: () => true,
+  generate: (prompt, { seed } = {}) => postPortrait({ prompt, seed, engine: 'cloud', provider: 'flux-schnell' }),
+  backendLabel: () => '☁ FLUX (NIM)',
+}
+export const engineCloudDev = {
+  ...common,
+  id: 'flux-dev',
+  label: 'FLUX dev · cloud (higher quality)',
+  available: () => true,
+  generate: (prompt, { seed } = {}) => postPortrait({ prompt, seed, engine: 'cloud', provider: 'flux-dev' }),
+  backendLabel: () => '☁ FLUX dev (NIM)',
+}

web/settingsPanel.js CHANGED Viewed

@@ -8,6 +8,7 @@
 // just won't appear (graceful no-op) and the app still runs on defaults.
 import { mountModelBar } from '/web/modelBar.js'
 import { mountTtsBar } from '/web/ttsBar.js'
 import { mountPersonaPromptBar } from '/web/personaPromptBar.js'
 import { mountQualityBar } from '/web/qualityBar.js'
@@ -51,6 +52,9 @@ export function mountSettingsPanel() {
       'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
       'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +
       'The voice belongs to the hero, so there’s no global voice to choose here.', mountTtsBar)
     injectSection(sample, 'tac-persona-prompt-settings', 'Persona Prompt',
       'The system prompt that writes each hero (name, about, quote and voice design). ' +
       'Edit it to change their style; Save uses it on the next “Recruit hero”.', mountPersonaPromptBar)

 // just won't appear (graceful no-op) and the app still runs on defaults.
 import { mountModelBar } from '/web/modelBar.js'
 import { mountTtsBar } from '/web/ttsBar.js'
+import { mountImagenBar } from '/web/imagenBar.js'
 import { mountPersonaPromptBar } from '/web/personaPromptBar.js'
 import { mountQualityBar } from '/web/qualityBar.js'
       'The provider that voices your heroes. Qwen3-TTS designs a voice from each hero’s ' +
       'description; Kokoro/Kitten run on your device with a named voice you pick per hero. ' +
       'The voice belongs to the hero, so there’s no global voice to choose here.', mountTtsBar)
+    injectSection(sample, 'tac-image-settings', 'Portrait',
+      'The model that paints hero portraits. Z-Image-Turbo runs on your GPU when you host ' +
+      'the project locally; FLUX runs in the cloud otherwise.', mountImagenBar)
     injectSection(sample, 'tac-persona-prompt-settings', 'Persona Prompt',
       'The system prompt that writes each hero (name, about, quote and voice design). ' +
       'Edit it to change their style; Save uses it on the next “Recruit hero”.', mountPersonaPromptBar)