hackathon / app_single.py
angkit007's picture
renamed
f0a208d
Raw
History Blame Contribute Delete
48.9 kB
"""
app_single.py — MiniCPM-V 4.6 · An Adventure in Thousand Token Wood
=====================================================================
A storybook playground: MiniCPM-V reads an uploaded image like a page
from an adventure, then a woodland cat performs its mood in a forest
clearing — complete with a tiny generative tune.
Pipeline:
1. Upload image → MiniCPM-V streams a description
2. Model returns a JSON dance spec (mood + 6 numeric animation params)
3. The cat performs in the clearing using those exact params — every
move is model-determined, not hardcoded.
4. A free, generative melody (Web Audio API, no audio files) plays
along — tempo and register also derived from the model's params.
Dance params returned by model:
mood : one of 10 mood words
speed : animation cycle seconds (0.3 fast … 3.0 slow)
jump : vertical bounce px (0 … 60)
sway : body rotation degrees (0 … 20)
tail_speed : tail cycle seconds (0.2 … 3.0)
tail_range : tail swing degrees (5 … 200)
ear_tilt : ear rotation degrees (0 … 25)
Two backends — switchable in the UI:
• API (default) — calls the hosted MiniCPM-V 4.6 API. Needs internet.
• Local (offline) — downloads openbmb/MiniCPM-V-4 (4.1B, Apache-2.0) once,
caches it to ./model_cache/, then runs fully offline.
Requires: pip install torch transformers accelerate
Run locally:
pip install -r requirements.txt
python app_single.py
→ open http://localhost:7860
Optional: set your own API key so you're not on the shared public quota
Windows (PowerShell): $env:MINICPM_API_KEY="sk-..."
macOS / Linux: export MINICPM_API_KEY="sk-..."
"""
import base64, io, os, json, re
import gradio as gr
from openai import OpenAI, APIStatusError, APIConnectionError
from PIL import Image
# ── Config ────────────────────────────────────────────────────────────────────
API_BASE_URL = "https://api.modelbest.cn/v1"
# NOTE(review): shared fallback key committed in source. It is used only when
# neither the MINICPM_API_KEY environment variable nor the UI field supplies a
# key (see _resolve_key). Consider moving it out of version control.
PUBLIC_API_KEY = "sk-pQ8L2zF3XmR5kY9wV4jB7hN1tC6vM0xG3aD5sH2bJ9lK4cZ8"
# UI label -> model id sent to the API.
MODELS = {
    "⚑ Instruct (fast, direct)": "MiniCPM-V-4.6-Instruct",
    "🧠 Thinking (reasons first)": "MiniCPM-V-4.6-Thinking",
}
DEFAULT_PROMPT = "Describe this image in detail."
DEFAULT_MAX_TOKENS = 512
DEFAULT_TEMPERATURE = 0.7
# JPEG quality used when an uploaded image is re-encoded for the API.
IMAGE_QUALITY = 90
# Moods the model may choose from. Every label here has a matching entry in
# MOOD_PALETTE and DEFAULT_DANCE; "depressed" was dropped because it had
# neither, so the model could pick a mood the stage cannot render.
MOOD_LABELS = ["happy", "sad", "calm", "energetic", "mysterious",
               "romantic", "tense", "nostalgic", "angry", "neutral"]
# Each inner list is one example prompt.
PROMPT_EXAMPLES = [
    ["Describe this image in detail."],
    ["List every object you can see."],
    ["What is the mood or atmosphere of this image?"],
    ["What text, if any, appears in this image?"],
    ["Explain this image to someone who cannot see it."],
]
# ── Mood palettes — each mood is a "firefly color" in the wood ────────────────
# One entry per mood, keyed by the mood word. Per-entry keys:
#   body / detail / eye / nose : hex colours read by cat_html for the cat parts
#   particle                   : character(s) scattered around the cat
#   label / caption            : text shown on the stage
#   bg / pcol                  : hex colours; not read by cat_html
# scale: semitone offsets from root (a small mode/scale per mood)
# root : MIDI-ish base note number (we map to Hz with 440 * 2^((n-69)/12))
MOOD_PALETTE = {
    "happy": {"bg":"#1a1605","body":"#FFD166","detail":"#E8A23A","eye":"#2D1B00","nose":"#FF8A3D","pcol":"#FFE08A","particle":"✦","label":"Happy","caption":"Bouncing with joy", "scale":[0,2,4,7,9,12], "root":72},
    "sad": {"bg":"#0c1116","body":"#8AA0B2","detail":"#5D7A8E","eye":"#1A2530","nose":"#B7C7D2","pcol":"#A9C8E0","particle":"Β·","label":"Sad","caption":"Slow, heavy steps", "scale":[0,3,5,7,10,12], "root":60},
    "calm": {"bg":"#0a1614","body":"#6FBFB3","detail":"#4A9C8F","eye":"#0A2018","nose":"#A8E0D6","pcol":"#BFEDE4","particle":"β—‹","label":"Calm","caption":"Drifting at ease", "scale":[0,2,5,7,9,12], "root":64},
    "energetic": {"bg":"#1a0e05","body":"#FF8A5B","detail":"#E8623A","eye":"#1a0500","nose":"#FFD1BC","pcol":"#FFCB6B","particle":"β˜…","label":"Energetic","caption":"Can't sit still", "scale":[0,2,4,5,7,9,11,12],"root":71},
    "mysterious": {"bg":"#120c1a","body":"#A98BD6","detail":"#6D4FA8","eye":"#F0B8FF","nose":"#D9C2EE","pcol":"#C7B3F0","particle":"✧","label":"Mysterious","caption":"Slipping through shadow", "scale":[0,1,4,5,7,8,11,12],"root":62},
    "romantic": {"bg":"#1a0c12","body":"#F2A0BD","detail":"#D9648D","eye":"#1a0010","nose":"#FBE0EA","pcol":"#F7B8CE","particle":"β™₯","label":"Romantic","caption":"A slow, dreamy waltz", "scale":[0,2,4,7,9,12], "root":67},
    "tense": {"bg":"#100808","body":"#F0726E","detail":"#C03C38","eye":"#FFB3AE","nose":"#F7C7C4","pcol":"#F2A6A2","particle":"|","label":"Tense","caption":"Coiled and alert", "scale":[0,1,3,6,7,10,12], "root":61},
    "nostalgic": {"bg":"#160f06","body":"#F2C083","detail":"#D98A3D","eye":"#160f06","nose":"#FBE3C7","pcol":"#F7DDB5","particle":"β—¦","label":"Nostalgic","caption":"Rocking to old memories", "scale":[0,2,3,7,9,12], "root":65},
    "angry": {"bg":"#160505","body":"#F0635E","detail":"#A8201C","eye":"#FF6961","nose":"#F7B0AC","pcol":"#F58F8A","particle":"✸","label":"Angry","caption":"Stomping, full of fire", "scale":[0,1,3,5,6,8,10,12],"root":59},
    "neutral": {"bg":"#0e0f13","body":"#A6ADB8","detail":"#727A86","eye":"#0d0d18","nose":"#D8DDE3","pcol":"#C7CDD6","particle":"Β·","label":"Neutral","caption":"Steady and unhurried", "scale":[0,2,4,7,9,12], "root":64},
}
# ── Default dance specs (fallback if model call fails) ────────────────────────
# One preset per mood, with the same six keys the model is asked to return:
#   speed      : body animation cycle, seconds
#   jump       : vertical bounce, px
#   sway       : body rotation, degrees
#   tail_speed : tail animation cycle, seconds
#   tail_range : total tail swing, degrees
#   ear_tilt   : ear rotation, degrees
# These presets are not passed through the clamping applied to model output,
# so a value may sit outside the prompted range (e.g. calm tail_speed 3.2).
DEFAULT_DANCE = {
    "happy": {"speed":0.7, "jump":50, "sway":6, "tail_speed":0.4, "tail_range":200,"ear_tilt":8},
    "sad": {"speed":2.4, "jump":2, "sway":8, "tail_speed":2.5, "tail_range":30, "ear_tilt":15},
    "calm": {"speed":2.8, "jump":10, "sway":2, "tail_speed":3.2, "tail_range":35, "ear_tilt":3},
    "energetic": {"speed":0.3, "jump":30, "sway":15, "tail_speed":0.28,"tail_range":180,"ear_tilt":15},
    "mysterious": {"speed":2.0, "jump":15, "sway":5, "tail_speed":1.8, "tail_range":100,"ear_tilt":5},
    "romantic": {"speed":1.6, "jump":12, "sway":5, "tail_speed":1.6, "tail_range":65, "ear_tilt":3},
    "tense": {"speed":0.4, "jump":3, "sway":3, "tail_speed":0.4, "tail_range":10, "ear_tilt":12},
    "nostalgic": {"speed":2.2, "jump":6, "sway":6, "tail_speed":2.0, "tail_range":65, "ear_tilt":5},
    "angry": {"speed":0.38,"jump":18, "sway":5, "tail_speed":0.32,"tail_range":160,"ear_tilt":20},
    "neutral": {"speed":2.0, "jump":8, "sway":1, "tail_speed":2.2, "tail_range":30, "ear_tilt":2},
}
# ── Helpers ───────────────────────────────────────────────────────────────────
def pil_to_data_url(image):
    """Encode a PIL image as a base64 JPEG ``data:`` URL.

    The image is converted to RGB, written as JPEG at IMAGE_QUALITY into an
    in-memory buffer, and the bytes are base64-encoded into the URL.
    """
    jpeg_buffer = io.BytesIO()
    rgb_image = image.convert("RGB")
    rgb_image.save(jpeg_buffer, format="JPEG", quality=IMAGE_QUALITY)
    encoded = base64.b64encode(jpeg_buffer.getvalue()).decode()
    return f"data:image/jpeg;base64,{encoded}"
def _resolve_key(ui_key):
    """Choose the API key to use.

    Order of precedence: the MINICPM_API_KEY environment variable, then the
    key typed in the UI, then PUBLIC_API_KEY. Each candidate is stripped of
    surrounding whitespace; a blank one falls through to the next.
    """
    from_env = os.environ.get("MINICPM_API_KEY", "").strip()
    if from_env:
        return from_env
    from_ui = (ui_key or "").strip()
    if from_ui:
        return from_ui
    return PUBLIC_API_KEY
def _client(ui_key):
    """Return an OpenAI client pointed at API_BASE_URL, using the resolved key."""
    resolved_key = _resolve_key(ui_key)
    return OpenAI(base_url=API_BASE_URL, api_key=resolved_key)
# ── Description (streaming) ───────────────────────────────────────────────────
def stream_description(image, prompt, model_label, max_tokens, temperature, api_key):
    """Stream the hosted model's answer about ``image`` as a growing string.

    Args:
        image: PIL image, or None when nothing was uploaded.
        prompt: text sent alongside the image.
        model_label: a key of MODELS selecting the model id.
        max_tokens, temperature: passed through to the completion request.
        api_key: key typed in the UI (may be empty; see _resolve_key).

    Yields:
        The accumulated text after every non-empty delta, so each yielded
        value supersedes the previous one. Failures are yielded as a single
        message string instead of being raised.
    """
    if image is None:
        yield "⚠️ Please upload an image first."
        return
    try:
        stream = _client(api_key).chat.completions.create(
            model=MODELS[model_label],
            messages=[{"role": "user", "content": [
                {"type": "image_url", "image_url": {"url": pil_to_data_url(image)}},
                {"type": "text", "text": prompt},
            ]}],
            max_tokens=max_tokens, temperature=temperature, stream=True,
        )
        result = ""
        for chunk in stream:
            # A chunk may arrive with an empty `choices` list; indexing it
            # would raise IndexError and replace the text streamed so far
            # with an error message, so skip such chunks.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content or ""
            if delta:
                result += delta
                yield result
    except APIStatusError as e:
        yield f"❌ API error {e.status_code}: {e.message}"
    except APIConnectionError:
        yield "❌ Cannot reach api.modelbest.cn"
    except Exception as e:
        # Boundary handler: surface any other failure in the UI.
        yield f"❌ {e}"
# ── Model-driven dance spec ───────────────────────────────────────────────────
# System prompt for the choreography call. It is an f-string evaluated once at
# import time: {MOOD_LABELS} is interpolated as the list's Python repr, and the
# doubled braces render as literal { } around the schema. The numeric ranges
# listed here match the clamps applied when the reply is parsed.
DANCE_SYSTEM_PROMPT = f"""You are a cat dance choreographer AI.
Given a scene description, return ONLY a valid JSON object β€” no prose, no markdown, no code fences.
JSON schema (all fields required):
{{
"mood": one of {MOOD_LABELS},
"speed": float 0.3–3.0 (animation cycle seconds; lower = faster),
"jump": int 0–60 (vertical bounce in pixels),
"sway": int 0–20 (body rotation degrees),
"tail_speed": float 0.2–3.0 (tail cycle seconds),
"tail_range": int 5–200 (tail swing degrees),
"ear_tilt": int 0–25 (ear tilt degrees)
}}
Choose values that physically match the scene mood. An energetic scene should have
low speed (fast), high jump, high sway. A calm scene should have high speed (slow),
low jump, low sway. Be creative β€” the cat's whole body expresses the image's emotion."""
def _keyword_mood(description: str) -> str:
    """Guess a mood from keywords; used as a fallback when JSON parsing fails.

    Mood groups are tried in the order listed. The first group with any
    keyword occurring as a substring of the lowercased description wins;
    "neutral" is returned when nothing matches.
    """
    keyword_table = (
        ("happy", ("happy", "joy", "celebrate", "laugh", "smile", "bright", "sunny")),
        ("sad", ("sad", "lonely", "rain", "sorrow", "grief", "cry", "gloom")),
        ("energetic", ("energetic", "vibrant", "excited", "dynamic", "rush", "active")),
        ("calm", ("calm", "peaceful", "quiet", "gentle", "serene", "still")),
        ("mysterious", ("mysterious", "dark", "eerie", "shadow", "mystic", "fog")),
        ("romantic", ("romantic", "love", "tender", "intimate", "warm", "soft")),
        ("tense", ("tense", "anxious", "fear", "alarm", "nervous", "danger")),
        ("nostalgic", ("nostalgic", "memory", "vintage", "old", "past", "retro")),
        ("angry", ("angry", "furious", "rage", "fierce", "storm")),
    )
    lowered = description.lower()
    matches = (
        label
        for label, words in keyword_table
        if any(word in lowered for word in words)
    )
    return next(matches, "neutral")
def get_dance_spec(description: str, api_key: str) -> tuple[str, dict]:
    """Ask the hosted model for a dance spec matching ``description``.

    Args:
        description: text produced by the description step. Empty text, or
            text starting with one of the warning/error markers, skips the
            model call.
        api_key: key typed in the UI (may be empty; see _resolve_key).

    Returns:
        ``(mood, dance)`` where ``mood`` is one of MOOD_LABELS and ``dance``
        holds speed, jump, sway, tail_speed, tail_range and ear_tilt, each
        clamped to the range named in DANCE_SYSTEM_PROMPT. If the call or
        the parsing fails, the mood comes from _keyword_mood and the dance
        from DEFAULT_DANCE. A fresh dict is returned in every case, so
        callers cannot mutate the shared presets.
    """
    if not description or description.startswith(("⚠️","❌")):
        return "neutral", dict(DEFAULT_DANCE["neutral"])
    try:
        resp = _client(api_key).chat.completions.create(
            model="MiniCPM-V-4.6-Instruct",
            messages=[
                {"role": "system", "content": DANCE_SYSTEM_PROMPT},
                {"role": "user", "content": f"Scene description:\n{description[:800]}"},
            ],
            max_tokens=120, temperature=0.3,
        )
        raw = (resp.choices[0].message.content or "").strip()
        # Keep only the outermost {...}. This drops markdown fences and also
        # any prose the model wraps around the object despite the prompt.
        found = re.search(r"\{.*\}", raw, re.DOTALL)
        spec = json.loads(found.group(0) if found else raw)

        # Tolerate "Happy" or " happy " rather than discarding the mood.
        mood = str(spec.get("mood", "neutral")).strip().lower()
        if mood not in MOOD_LABELS:
            mood = "neutral"

        def clamped(key, low, high, default):
            # float() also accepts numeric strings such as "1.5", which a
            # bare min()/max() comparison would reject with TypeError.
            return max(low, min(high, float(spec.get(key, default))))

        dance = {
            "speed": float(clamped("speed", 0.3, 3.0, 1.5)),
            "jump": int(clamped("jump", 0, 60, 10)),
            "sway": int(clamped("sway", 0, 20, 5)),
            "tail_speed": float(clamped("tail_speed", 0.2, 3.0, 1.5)),
            "tail_range": int(clamped("tail_range", 5, 200, 40)),
            "ear_tilt": int(clamped("ear_tilt", 0, 25, 5)),
        }
        return mood, dance
    except Exception:
        # Deliberate best-effort: any API or parsing failure degrades to the
        # keyword heuristic instead of breaking the UI.
        mood = _keyword_mood(description)
        return mood, dict(DEFAULT_DANCE[mood])
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# OFFLINE / LOCAL BACKEND
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# Runs entirely on this machine, no internet required after first download.
# Model : openbmb/MiniCPM-V-4 (4.1B params, Apache-2.0, fully public)
# Cache : ./model_cache/ (weights) + .download_complete (sentinel)
#
# Heavy deps (torch, transformers) are imported lazily — only when the
# user actually selects the Local backend — so API-only users don't need
# them installed.
from pathlib import Path
LOCAL_MODEL_ID = "openbmb/MiniCPM-V-4"
# Cache directory lives next to this source file.
LOCAL_CACHE_DIR = Path(__file__).parent / "model_cache"
# Marker file; its presence is what local_is_cached() checks.
LOCAL_SENTINEL = LOCAL_CACHE_DIR / ".download_complete"
# Module-level cache filled by the first successful _load_local_model() call.
_local_model = None
_local_tokenizer = None
def local_is_cached() -> bool:
    """Report whether the download-complete sentinel file exists."""
    sentinel_present = LOCAL_SENTINEL.exists()
    return sentinel_present
def local_cache_size_gb() -> float:
    """Return the combined size of all files under the cache dir, in 1e9-byte GB.

    Returns 0.0 when the cache directory does not exist.
    """
    if not LOCAL_CACHE_DIR.exists():
        return 0.0
    total_bytes = 0
    for entry in LOCAL_CACHE_DIR.rglob("*"):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes / 1e9
def local_status_md() -> str:
    """Build the Markdown status line describing the local model cache."""
    if not local_is_cached():
        not_downloaded = (
            f"⬇️ **Not downloaded yet** β€” `{LOCAL_MODEL_ID}` (~8 GB) will be "
            f"fetched on first use and cached in `model_cache/`. "
            f"Requires internet for this one-time download."
        )
        return not_downloaded
    ready = (
        f"βœ… **Model cached** β€” `{LOCAL_MODEL_ID}` "
        f"({local_cache_size_gb():.1f} GB) ready to run offline."
    )
    return ready
def _load_local_model():
    """
    Lazily import torch/transformers and load MiniCPM-V-4 from local cache,
    downloading once if needed. Returns (model, tokenizer).

    The loaded pair is stored in the module globals _local_model and
    _local_tokenizer, so later calls return immediately.

    Raises:
        RuntimeError: if torch or transformers cannot be imported.
    """
    global _local_model, _local_tokenizer
    # Already loaded in this process: reuse.
    if _local_model is not None:
        return _local_model, _local_tokenizer
    try:
        import torch
        import transformers
        from transformers import AutoModel, AutoTokenizer
    except ImportError as e:
        raise RuntimeError(
            "Local backend requires extra packages.\n"
            "Install with:\n"
            " pip install torch transformers accelerate\n"
            f"(original error: {e})"
        )
    # transformers v5 broke MiniCPM-V-4's custom code (all_tied_weights_keys)
    # Only the first two version components are compared.
    _tv = tuple(int(x) for x in transformers.__version__.split(".")[:2])
    if _tv >= (5, 0):
        from transformers import modeling_utils as _mu
        # Whatever __getattr__ PreTrainedModel currently resolves to (possibly
        # inherited); the replacement below delegates to it.
        _orig_getattr = getattr(_mu.PreTrainedModel, "__getattr__", None)
        def _safe_getattr(self, name):
            # Supply an empty mapping for the attribute the custom code lacks.
            if name == "all_tied_weights_keys":
                return {}
            if _orig_getattr is not None:
                return _orig_getattr(self, name)
            raise AttributeError(name)
        # Class-level patch: affects every PreTrainedModel in this process.
        _mu.PreTrainedModel.__getattr__ = _safe_getattr
    LOCAL_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # When the sentinel exists, load strictly from disk (local_files_only=True).
    local_only = local_is_cached()
    common = dict(
        trust_remote_code=True,
        cache_dir=str(LOCAL_CACHE_DIR),
        local_files_only=local_only,
    )
    _local_tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_ID, **common)
    # float16 on CUDA, float32 on CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float16 if device == "cuda" else torch.float32
    _local_model = AutoModel.from_pretrained(
        LOCAL_MODEL_ID,
        torch_dtype=dtype,
        attn_implementation="sdpa",
        device_map="auto" if device == "cuda" else None,
        low_cpu_mem_usage=True,
        **common,
    )
    # No device_map was given on CPU, so the model is moved explicitly.
    if device == "cpu":
        _local_model = _local_model.to(device)
    _local_model.eval()
    # Write the sentinel only after a load that was allowed to download, so a
    # failed download never marks the cache as complete.
    if not local_only:
        LOCAL_SENTINEL.write_text(f"{LOCAL_MODEL_ID} downloaded.\nDelete to re-download.\n")
    return _local_model, _local_tokenizer
def stream_description_local(image, prompt, max_tokens, temperature):
    """Describe ``image`` with the local model; generator with a single yield.

    Mirrors stream_description's generator interface but yields the complete
    answer once. Failures are yielded as a message string instead of raised.
    """
    if image is None:
        yield "⚠️ Please upload an image first."
        return
    try:
        model, tokenizer = _load_local_model()
        conversation = [{"role": "user", "content": [image.convert("RGB"), prompt]}]
        chat_kwargs = {
            "image": image.convert("RGB"),
            "msgs": conversation,
            "tokenizer": tokenizer,
            # Sample only for a positive temperature; the value passed on is
            # floored at 0.01.
            "sampling": (temperature > 0),
            "temperature": max(temperature, 0.01),
            "max_new_tokens": max_tokens,
        }
        yield model.chat(**chat_kwargs)
    except Exception as err:
        if isinstance(err, RuntimeError):
            # RuntimeError text is shown as-is (e.g. the missing-packages hint).
            yield f"❌ {err}"
        else:
            yield f"❌ Local inference error: {err}"
def get_dance_spec_local(description: str) -> tuple[str, dict]:
    """Local equivalent of get_dance_spec: one extra text-only local call.

    Args:
        description: text produced by the description step. Empty text, or
            text starting with one of the warning/error markers, skips the
            model call.

    Returns:
        ``(mood, dance)`` where ``mood`` is one of MOOD_LABELS and ``dance``
        holds speed, jump, sway, tail_speed, tail_range and ear_tilt, each
        clamped to the range named in DANCE_SYSTEM_PROMPT. If loading,
        inference or parsing fails, the mood comes from _keyword_mood and
        the dance from DEFAULT_DANCE. A fresh dict is returned in every case.
    """
    if not description or description.startswith(("⚠️","❌")):
        return "neutral", dict(DEFAULT_DANCE["neutral"])
    try:
        model, tokenizer = _load_local_model()
        msgs = [{"role": "user", "content": [
            DANCE_SYSTEM_PROMPT + f"\n\nScene description:\n{description[:800]}"
        ]}]
        raw = model.chat(
            image=None, msgs=msgs, tokenizer=tokenizer,
            sampling=False, max_new_tokens=150,
        )
        raw = (raw or "").strip()
        # Keep only the outermost {...}. This drops markdown fences and also
        # any prose the model wraps around the object despite the prompt.
        found = re.search(r"\{.*\}", raw, re.DOTALL)
        spec = json.loads(found.group(0) if found else raw)

        # Tolerate "Happy" or " happy " rather than discarding the mood.
        mood = str(spec.get("mood", "neutral")).strip().lower()
        if mood not in MOOD_LABELS:
            mood = "neutral"

        def clamped(key, low, high, default):
            # float() also accepts numeric strings such as "1.5", which a
            # bare min()/max() comparison would reject with TypeError.
            return max(low, min(high, float(spec.get(key, default))))

        dance = {
            "speed": float(clamped("speed", 0.3, 3.0, 1.5)),
            "jump": int(clamped("jump", 0, 60, 10)),
            "sway": int(clamped("sway", 0, 20, 5)),
            "tail_speed": float(clamped("tail_speed", 0.2, 3.0, 1.5)),
            "tail_range": int(clamped("tail_range", 5, 200, 40)),
            "ear_tilt": int(clamped("ear_tilt", 0, 25, 5)),
        }
        return mood, dance
    except Exception:
        # Deliberate best-effort: degrade to the keyword heuristic. The mood
        # is computed once and reused for the preset lookup.
        mood = _keyword_mood(description)
        return mood, dict(DEFAULT_DANCE[mood])
# ── Keyword dance for text-only tab (no API needed) ───────────────────────────
def generate_animation(text: str) -> str:
    """Render the cat stage for free text using keyword matching only.

    Mood groups are tried in the order listed; the first group with any
    keyword occurring as a substring of the lowercased text wins, otherwise
    the mood is "neutral". The stage uses that mood's DEFAULT_DANCE preset.
    A None ``text`` is treated as empty instead of raising AttributeError.
    """
    keyword_table = (
        ("happy", ("happy", "celebrate", "party", "joy", "cheerful")),
        ("sad", ("sad", "lonely", "rain", "grief", "sorrow")),
        ("energetic", ("energy", "dance", "excited", "lively")),
        ("calm", ("calm", "peace", "serene", "gentle", "quiet")),
        ("mysterious", ("mysterious", "eerie", "dark", "shadow")),
        ("romantic", ("romantic", "love", "tender", "warm")),
        ("tense", ("tense", "nervous", "anxiety", "fear")),
        ("nostalgic", ("nostalgic", "memory", "vintage", "old")),
        ("angry", ("angry", "furious", "rage", "fierce")),
    )
    lowered = (text or "").lower()
    mood = next(
        (label for label, words in keyword_table
         if any(word in lowered for word in words)),
        "neutral",
    )
    return cat_html(mood, DEFAULT_DANCE[mood])
# ── Stage chrome — shared studio frame ────────────────────────────────────────
# CSS font-family stacks interpolated into the stage templates below.
STAGE_FONT = "'Space Grotesk', 'Inter', system-ui, sans-serif"
LABEL_FONT = "'Inter', system-ui, sans-serif"
MONO_FONT = "'JetBrains Mono', 'SFMono-Regular', Consolas, monospace"
def _stage_open(spotlight_color: str, breathe_speed: float = 4.0) -> str:
    """Opening <div> + shared <style> for the emotion card, HF light style.

    Args:
        spotlight_color: CSS colour written into the ``--spot`` custom
            property on the stage element; the stylesheet reads it through
            ``var(--spot)``. It is inserted without escaping.
        breathe_speed: duration in seconds of the ``spot_breathe`` animation.

    Returns:
        HTML that opens ``<div class="stage">`` and contains the shared
        stylesheet. The div is left open; _stage_close() supplies the
        closing tag.
    """
    # Doubled braces in the f-string below are literal CSS braces.
    return f"""<div class="stage" style="--spot:{spotlight_color};">
<style>
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@500;700&family=Inter:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap');
.stage {{
position:relative; height:440px; border-radius:12px;
overflow:hidden; isolation:isolate;
background:
radial-gradient(ellipse 70% 50% at 50% 22%, color-mix(in srgb, var(--spot) 14%, transparent), transparent 70%),
#F8F9FA;
border:1px solid #E5E7EB;
display:flex; flex-direction:column; align-items:center; justify-content:center;
font-family:{STAGE_FONT};
}}
@keyframes spot_breathe {{
0%,100% {{ opacity:.7; }}
50% {{ opacity:1; }}
}}
.stage::before {{
content:''; position:absolute; inset:0; pointer-events:none;
background: radial-gradient(ellipse 45% 36% at 50% 18%, color-mix(in srgb, var(--spot) 18%, transparent), transparent 72%);
animation: spot_breathe {breathe_speed}s ease-in-out infinite;
}}
/* faint dot-grid texture, HF-card style */
.stage::after {{
content:''; position:absolute; inset:0; pointer-events:none; opacity:.5;
background-image: radial-gradient(circle, #E5E7EB 1px, transparent 1px);
background-size: 22px 22px;
}}
.stage-cue {{
position:absolute; top:16px; left:0; right:0;
display:flex; align-items:center; justify-content:center; gap:8px;
font-family:{MONO_FONT};
font-size:.68rem; letter-spacing:.16em; text-transform:uppercase;
color:#6B7280; font-weight:500; z-index:3;
}}
.stage-cue .dot {{
width:8px; height:8px; border-radius:50%;
background:var(--spot); box-shadow:0 0 0 3px color-mix(in srgb, var(--spot) 25%, transparent);
}}
.stage-cue .mood-name {{
color:#111827; font-weight:700; letter-spacing:.1em;
font-family:{MONO_FONT};
background:#FFFFFF; border:1px solid #E5E7EB;
border-radius:999px; padding:2px 10px;
}}
.stage-caption {{
position:absolute; bottom:62px; left:0; right:0; text-align:center; z-index:3;
color:#4B5563; font-size:.92rem; letter-spacing:.01em; font-style:italic;
font-family:{STAGE_FONT}; font-weight:500;
}}
.cue-sheet {{
position:absolute; bottom:14px; left:0; right:0; z-index:3;
display:flex; justify-content:center; gap:8px; flex-wrap:wrap;
padding:0 20px;
}}
.cue-chip {{
font-family:{MONO_FONT}; font-size:.64rem; letter-spacing:.03em;
color:#374151; background:#FFFFFF; border:1px solid #E5E7EB;
border-radius:999px; padding:3px 10px; white-space:nowrap;
box-shadow: 0 1px 2px rgba(0,0,0,.03);
}}
.cue-chip b {{ color:#92660C; font-weight:600; }}
/* ── music toggle button ── */
.music-toggle {{
position:absolute; top:14px; right:14px; z-index:4;
width:36px; height:36px; border-radius:50%;
background:#FFFFFF; border:1px solid #E5E7EB;
display:flex; align-items:center; justify-content:center;
cursor:pointer; font-size:1rem; color:#374151;
box-shadow: 0 1px 2px rgba(0,0,0,.04);
transition: transform .15s ease, background .15s ease, box-shadow .15s ease;
}}
.music-toggle:hover {{
transform: scale(1.06);
box-shadow: 0 2px 8px rgba(0,0,0,.08);
}}
.music-toggle.playing {{
background: #FFD21E;
border-color: #FFD21E;
color:#111827;
}}
.music-toggle .icon-play {{ display:inline; }}
.music-toggle .icon-pause {{ display:none; }}
.music-toggle.playing .icon-play {{ display:none; }}
.music-toggle.playing .icon-pause {{ display:inline; }}
</style>
"""
def _stage_close() -> str:
    """Return the closing tag for the div opened by _stage_open()."""
    closing_tag = "</div>"
    return closing_tag
# ── Cat stage β€” all parts stay inside the stage, nothing can overflow ─────────
def cat_html(mood: str, dance: dict) -> str:
    """Render the animated cat stage (HTML + CSS + JS) for a mood and dance spec.

    Args:
        mood: mood word; looked up in MOOD_PALETTE, falling back to the
            "neutral" palette for unknown values. It is also written verbatim
            into the element id and into the script.
        dance: mapping with the keys speed, jump, sway, tail_speed,
            tail_range and ear_tilt. A missing key raises KeyError.

    Returns:
        One HTML string: the stage opened by _stage_open, cat-specific
        keyframes and styles, the cat markup, a script that scatters
        particles and drives a Web Audio tune, then the closing tag.
    """
    p = MOOD_PALETTE.get(mood, MOOD_PALETTE["neutral"])
    B = p["body"]; D = p["detail"]; E = p["eye"]; N = p["nose"]
    sp = dance["speed"]; jp = dance["jump"]
    sw = dance["sway"]; tsp = dance["tail_speed"]
    tr = dance["tail_range"]; et = dance["ear_tilt"]
    # Tail swings between t0 and t1 degrees. Unary minus binds tighter than
    # //, so an odd tr floors the negative side (35 -> -18 and 17).
    t0 = -tr // 2; t1 = tr // 2
    # Spotlight breathing period: twice the dance cycle, clamped to 2-6 s.
    breathe = max(2.0, min(6.0, sp * 2))
    # NOTE(review): this id depends only on the mood and the wrapper below
    # uses the fixed id "cw". Two stages on one page would share ids, and the
    # script's getElementById lookups would hit the first — confirm the UI
    # never shows two at once.
    stage_id = f"stage_{mood}"
    # ── music params derived from dance spec ──
    scale = p["scale"]
    root = p["root"]
    # tempo: faster dance (low sp) -> faster notes. Map sp [0.3,3.0] -> note interval [140,520]ms
    note_ms = int(140 + (sp - 0.3) / (3.0 - 0.3) * (520 - 140))
    # register: higher jump -> notes climb higher (octave shift 0,1,2)
    octave_shift = 12 * min(2, jp // 25)
    note_root = root + octave_shift
    # Parameter read-out chips shown along the bottom of the stage.
    cue_chips = (
        f'<span class="cue-chip">speed <b>{sp}s</b></span>'
        f'<span class="cue-chip">jump <b>{jp}px</b></span>'
        f'<span class="cue-chip">sway <b>{sw}Β°</b></span>'
        f'<span class="cue-chip">tail <b>{tsp}s / {tr}Β°</b></span>'
        f'<span class="cue-chip">ears <b>{et}Β°</b></span>'
    )
    # Doubled braces in the template are literal CSS/JS braces; single-brace
    # fields are Python values. `scale` is interpolated as a Python list
    # repr, which the script reads as a JS array literal.
    return _stage_open(B, breathe) + f"""
<style>
@keyframes K_body {{
0%,100% {{ transform: translateY(0px) rotate(-{sw}deg); }}
50% {{ transform: translateY(-{jp}px) rotate({sw}deg); }}
}}
@keyframes K_tail {{
0%,100% {{ transform: rotate({t0}deg); }}
50% {{ transform: rotate({t1}deg); }}
}}
@keyframes K_ear {{
0%,100% {{ transform: rotate(-{et}deg); }}
50% {{ transform: rotate({et}deg); }}
}}
@keyframes K_blink {{
0%,88%,100% {{ transform: scaleY(1); }}
93% {{ transform: scaleY(0.08); }}
}}
@keyframes K_shadow {{
0%,100% {{ transform: translateX(-50%) scaleX(1); opacity:.45; }}
50% {{ transform: translateX(-50%) scaleX({max(0.4, 1 - jp/80):.2f}); opacity:.15; }}
}}
@keyframes K_part {{
0% {{ opacity:0; transform:translate(0,0) scale(.5); }}
20% {{ opacity:.9; }}
80% {{ opacity:.4; }}
100% {{ opacity:0; transform:translate(var(--px),var(--py)) scale(1.5); }}
}}
.cat-wrap {{ position:relative; width:160px; height:200px; z-index:2; }}
.cat-shadow {{
position:absolute; bottom:-4px; left:50%;
width:72px; height:11px; border-radius:50%;
background:rgba(0,0,0,.55);
animation: K_shadow {sp}s ease-in-out infinite;
}}
.cat-unit {{
position:absolute; bottom:0; left:50%;
transform-origin: center bottom;
animation: K_body {sp}s ease-in-out infinite;
}}
.c-body {{
position:absolute; bottom:0; left:-36px;
width:72px; height:62px;
border-radius:52% 52% 46% 46%;
background:{B};
box-shadow:inset -6px -5px 0 {D};
}}
.c-belly {{
position:absolute; bottom:5px; left:50%; transform:translateX(-50%);
width:40px; height:30px; border-radius:50%;
background:{D}28;
}}
.c-tail {{
position:absolute; bottom:4px; left:22px;
width:16px; height:52px;
border-radius:38% 62% 55% 45% / 28% 28% 72% 72%;
background:{B};
box-shadow:inset 3px 0 0 {D};
transform-origin:bottom center;
animation:K_tail {tsp}s ease-in-out infinite;
}}
.c-tail::after {{
content:'';
position:absolute; top:-9px; left:-5px;
width:26px; height:18px; border-radius:50%;
background:{B};
box-shadow:inset 2px -2px 0 {D};
}}
.c-paw-l,.c-paw-r {{
position:absolute; bottom:0;
width:22px; height:13px;
border-radius:50% 50% 42% 42%;
background:{B};
box-shadow:inset -2px -2px 0 {D};
}}
.c-paw-l {{ left:-34px; }}
.c-paw-r {{ left:12px; }}
.c-head {{
position:absolute; bottom:56px; left:-32px;
width:64px; height:58px; border-radius:50%;
background:{B};
box-shadow:inset -4px -3px 0 {D};
overflow:visible;
}}
.c-ear-l,.c-ear-r {{
position:absolute;
width:0; height:0;
border-left:11px solid transparent;
border-right:11px solid transparent;
border-bottom:21px solid {B};
animation:K_ear {sp}s ease-in-out infinite;
}}
.c-ear-l {{ top:-16px; left:2px; transform-origin:bottom left; }}
.c-ear-r {{ top:-16px; left:40px; transform-origin:bottom right; }}
.c-ear-l::after,.c-ear-r::after {{
content:'';position:absolute;top:5px;left:-6px;
width:0;height:0;
border-left:6px solid transparent;
border-right:6px solid transparent;
border-bottom:13px solid {D};
}}
.c-eye-l,.c-eye-r {{
position:absolute;
width:12px; height:12px; border-radius:50%;
background:{E};
animation:K_blink 3.5s ease-in-out infinite;
}}
.c-eye-l {{ top:18px; left:8px; }}
.c-eye-r {{ top:18px; left:44px; animation-delay:.2s; }}
.c-eye-l::after,.c-eye-r::after {{
content:'';position:absolute;top:2px;left:2px;
width:5px;height:5px;border-radius:50%;
background:rgba(255,255,255,.32);
}}
.c-nose {{
position:absolute; top:32px; left:27px;
width:10px; height:7px;
border-radius:50% 50% 40% 40%;
background:{N};
transform:translateX(-50%);
}}
.c-mouth-l,.c-mouth-r {{
position:absolute;
width:8px; height:5px;
border:0 solid {N};
border-bottom-width:1.5px;
border-radius:0 0 50% 50%;
top:38px;
}}
.c-mouth-l {{ left:21px; border-left-width:1.5px; transform:rotate(10deg); }}
.c-mouth-r {{ left:30px; border-right-width:1.5px; transform:rotate(-10deg); }}
.c-wl1,.c-wl2,.c-wr1,.c-wr2 {{
position:absolute; height:1.5px;
background:rgba(255,255,255,.5); border-radius:1px;
width:28px;
}}
.c-wl1 {{ top:29px; right:37px; transform:rotate(-10deg); transform-origin:right; }}
.c-wl2 {{ top:35px; right:37px; transform:rotate( 10deg); transform-origin:right; }}
.c-wr1 {{ top:29px; left:37px; transform:rotate( 10deg); transform-origin:left; }}
.c-wr2 {{ top:35px; left:37px; transform:rotate(-10deg); transform-origin:left; }}
.c-particle {{
position:absolute; pointer-events:none;
color:{D}; font-size:.9rem;
opacity:0;
animation:K_part var(--pd) var(--pde) ease-out infinite;
}}
</style>
<div class="stage-cue">
<span class="dot"></span>
<span class="mood-name">{p['label']}</span>
<span>&nbsp;Β·&nbsp;live emotion</span>
</div>
<button class="music-toggle" id="music_{stage_id}" title="Play the generated tune" aria-label="Toggle music">
<span class="icon-play">β™ͺ</span><span class="icon-pause">⏸</span>
</button>
<div class="cat-wrap" id="cw">
<div class="cat-shadow"></div>
<div class="cat-unit">
<div class="c-tail"></div>
<div class="c-body"><div class="c-belly"></div></div>
<div class="c-paw-l"></div>
<div class="c-paw-r"></div>
<div class="c-head">
<div class="c-ear-l"></div>
<div class="c-ear-r"></div>
<div class="c-eye-l"></div>
<div class="c-eye-r"></div>
<div class="c-nose"></div>
<div class="c-mouth-l"></div>
<div class="c-mouth-r"></div>
<div class="c-wl1"></div>
<div class="c-wl2"></div>
<div class="c-wr1"></div>
<div class="c-wr2"></div>
</div>
</div>
</div>
<div class="stage-caption">{p['caption']}</div>
<div class="cue-sheet">{cue_chips}</div>
<script>
(function(){{
const wrap = document.getElementById('cw');
const chars = '{p['particle']}'.split('');
for(let i=0;i<22;i++){{
const el = document.createElement('div');
el.className = 'c-particle';
el.textContent = chars[i % chars.length];
const a = Math.random()*Math.PI*2, d = 50+Math.random()*75;
el.style.setProperty('--px', (Math.cos(a)*d)+'px');
el.style.setProperty('--py', (Math.sin(a)*d-20)+'px');
el.style.setProperty('--pd', (.9+Math.random()*2).toFixed(2)+'s');
el.style.setProperty('--pde',(Math.random()*2.5).toFixed(2)+'s');
el.style.left = (55+Math.random()*50)+'px';
el.style.top = (40+Math.random()*80)+'px';
el.style.fontSize = (.55+Math.random()*.65).toFixed(2)+'rem';
wrap.appendChild(el);
}}
// ── Generative tune β€” Web Audio, no files ──
const scale = {scale};
const noteRoot= {note_root};
const noteMs = {note_ms};
const mood = "{mood}";
let ctx = null, timer = null, step = 0, master = null;
function midiToFreq(n) {{ return 440 * Math.pow(2, (n - 69) / 12); }}
function pattern(stepIdx) {{
// simple per-mood arpeggio shapes over the scale degrees
const len = scale.length;
let degree;
if (mood === 'energetic' || mood === 'angry') {{
degree = scale[stepIdx % len]; // straight run, bright
}} else if (mood === 'sad' || mood === 'nostalgic') {{
degree = scale[[0,2,1,3][stepIdx % 4] % len]; // gentle up-down
}} else if (mood === 'mysterious' || mood === 'tense') {{
degree = scale[[0,3,1,5][stepIdx % 4] % len]; // wider, uneasy leaps
}} else {{
degree = scale[[0,1,2,1][stepIdx % 4] % len]; // calm/happy/romantic/calm lilt
}}
return noteRoot + degree;
}}
function playNote() {{
if (!ctx) return;
const midi = pattern(step);
const freq = midiToFreq(midi);
const t0 = ctx.currentTime;
const osc = ctx.createOscillator();
const gain = ctx.createGain();
osc.type = (mood === 'angry' || mood === 'energetic') ? 'sawtooth'
: (mood === 'mysterious' || mood === 'tense') ? 'triangle'
: 'sine';
osc.frequency.setValueAtTime(freq, t0);
const dur = noteMs / 1000 * 0.9;
gain.gain.setValueAtTime(0.0001, t0);
gain.gain.exponentialRampToValueAtTime(0.18, t0 + 0.02);
gain.gain.exponentialRampToValueAtTime(0.0001, t0 + dur);
osc.connect(gain).connect(master);
osc.start(t0);
osc.stop(t0 + dur + 0.02);
step = (step + 1) % 16;
}}
const btn = document.getElementById('music_{stage_id}');
btn.addEventListener('click', function(){{
if (!ctx) {{
ctx = new (window.AudioContext || window.webkitAudioContext)();
master = ctx.createGain();
master.gain.value = 0.5;
master.connect(ctx.destination);
}}
if (timer) {{
clearInterval(timer); timer = null;
ctx.suspend();
btn.classList.remove('playing');
}} else {{
ctx.resume();
playNote();
timer = setInterval(playNote, {note_ms});
btn.classList.add('playing');
}}
}});
}})();
</script>""" + _stage_close()
def placeholder_html():
    """Return the idle stage shown before any image has been analysed."""
    message = f"""
<div style="text-align:center; z-index:2; color:#6B7280; font-family:{STAGE_FONT};">
<div style="font-size:2.4rem; margin-bottom:14px; opacity:.6;">🐱</div>
<div style="font-size:1.05rem; font-weight:700; letter-spacing:.01em; color:#111827; margin-bottom:8px;">
No emotion yet
</div>
<div style="font-size:.82rem; color:#6B7280; max-width:280px; margin:0 auto; line-height:1.7; font-family:{LABEL_FONT};">
Upload an image β€” the model reads its mood and the cat performs it,
tune and all.
</div>
</div>"""
    pieces = (_stage_open("#FFD21E", 6.0), message, _stage_close())
    return "".join(pieces)
def loading_html(local: bool = False) -> str:
    """Return the spinner stage shown while a request is in flight.

    ``local`` selects the wording for on-device inference; otherwise the
    wording for the hosted API is used.
    """
    if local:
        title = "Running locally…"
        caption = "on-device inference β€” first run may take a while"
    else:
        title = "Analyzing image…"
        caption = "choreographing the emotion"
    spinner = f"""
<div style="text-align:center; z-index:2; color:#6B7280; font-family:{STAGE_FONT};">
<div class="loading-spinner" style="
width:32px; height:32px; margin:0 auto 16px;
border:3px solid #E5E7EB; border-top-color:#FFD21E;
border-radius:50%; animation: spin 0.9s linear infinite;"></div>
<div style="font-size:.92rem; letter-spacing:.01em; color:#111827; font-weight:700;">
{title}
</div>
<div style="font-size:.78rem; color:#6B7280; margin-top:4px; font-family:{LABEL_FONT};">
{caption}
</div>
</div>
<style>@keyframes spin {{ to {{ transform: rotate(360deg); }} }}</style>"""
    return _stage_open("#FFD21E", 2.0) + spinner + _stage_close()
# ── Main pipeline ─────────────────────────────────────────────────────────────
def run_image_pipeline(image, prompt, model_label, max_tokens, temperature, api_key, backend):
    """Drive one image request end to end, yielding ``(description, stage_html)``.

    While the description is being produced the stage shows the loading
    card; the final yield carries the finished description and the cat stage
    built from the dance spec. The local backend yields one extra empty
    description with the loading card before inference starts.
    """
    use_local = backend == "Local (offline)"
    if use_local:
        yield "", loading_html(local=True)
        updates = stream_description_local(image, prompt, max_tokens, temperature)
    else:
        updates = stream_description(
            image, prompt, model_label, max_tokens, temperature, api_key
        )

    description = ""
    for description in updates:
        yield description, loading_html(local=use_local)

    # Model determines the full dance spec
    if use_local:
        mood, dance = get_dance_spec_local(description)
    else:
        mood, dance = get_dance_spec(description, api_key)
    yield description, cat_html(mood, dance)
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
# UI β€” Cat Dance Studio
# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@500;600;700&family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap');
:root {
--bg: #FFFFFF;
--surface: #F8F9FA;
--raised: #E5E7EB;
--text: #111827;
--text-dim: #4B5563;
--text-faint:#6B7280;
--accent: #FFD21E;
--accent-ink:#111827;
}
.gradio-container {
background: var(--bg) !important;
font-family: 'Inter', system-ui, sans-serif !important;
}
/* ── Header ────────────────────────────────────────────────────────────── */
#studio-header {
text-align:center; padding: 18px 20px 22px;
border:1px solid var(--raised); border-radius:12px;
background: var(--surface);
margin-bottom:8px;
}
#studio-header h1 {
font-family:'Space Grotesk', sans-serif !important;
font-weight:700 !important; letter-spacing:.01em;
font-size:1.9rem !important; color:var(--text) !important;
margin-bottom:6px !important;
}
#studio-header p {
color:var(--text-dim) !important; font-size:.92rem !important;
margin:0 !important;
}
#studio-header .eyebrow {
display:inline-flex; align-items:center; gap:8px;
font-family:'JetBrains Mono', monospace; font-size:.7rem;
letter-spacing:.18em; text-transform:uppercase;
color:var(--text-faint); margin-bottom:10px;
}
#studio-header .eyebrow .badge {
display:inline-flex; align-items:center; gap:5px;
background: var(--accent); color: var(--accent-ink);
border-radius:999px; padding:2px 10px;
font-weight:700; letter-spacing:.1em;
}
#studio-header .eyebrow .badge .dot {
width:6px; height:6px; border-radius:50%;
background: var(--accent-ink); opacity:.7;
}
/* ── Panels ────────────────────────────────────────────────────────────── */
.gr-form, .gr-box, .gr-panel, .gr-block.gr-box {
background: var(--bg) !important;
border: 1px solid var(--raised) !important;
border-radius: 10px !important;
}
/* Section labels */
.gradio-container label span {
font-family:'Inter', sans-serif !important;
font-size:.78rem !important; font-weight:600 !important;
letter-spacing:.02em !important; color:var(--text-dim) !important;
}
/* ── Buttons ───────────────────────────────────────────────────────────── */
#submit-img, #submit-txt {
background: var(--accent) !important;
color: var(--accent-ink) !important;
border: 1px solid #E8BD00 !important;
font-weight:700 !important;
letter-spacing:.02em !important;
font-family:'Space Grotesk', sans-serif !important;
box-shadow: 0 1px 2px rgba(0,0,0,.04) !important;
transition: transform .12s ease, box-shadow .12s ease !important;
}
#submit-img:hover, #submit-txt:hover {
transform: translateY(-1px);
box-shadow: 0 4px 12px rgba(255,210,30,.35) !important;
}
#submit-img:active, #submit-txt:active { transform: translateY(0); }
/* ── Description output ───────────────────────────────────────────────── */
#desc-output textarea {
font-family:'Inter', sans-serif !important;
font-size:.88rem !important; line-height:1.6 !important;
color:var(--text) !important;
background:var(--surface) !important;
}
/* ── Run-locally panel ─────────────────────────────────────────────────── */
#run-locally {
border:1px solid var(--raised) !important;
background: var(--surface) !important;
}
#run-locally code {
font-family:'JetBrains Mono', monospace !important;
font-size:.78rem !important;
background:var(--bg) !important;
border:1px solid var(--raised) !important;
border-radius:6px !important;
color:#92660C !important;
}
#run-locally pre {
background:var(--bg) !important;
border:1px solid var(--raised) !important;
border-radius:8px !important;
padding:10px 14px !important;
}
/* ── Tabs ──────────────────────────────────────────────────────────────── */
.tab-nav button {
font-family:'Space Grotesk', sans-serif !important;
font-weight:600 !important; letter-spacing:.01em !important;
color: var(--text-dim) !important;
}
.tab-nav button.selected {
color: var(--text) !important;
border-bottom-color: var(--accent) !important;
}
/* ── Misc ──────────────────────────────────────────────────────────────── */
footer { display:none !important; }
.gr-accordion { border-color: var(--raised) !important; }
"""
# Markdown shown inside the "Run locally" accordion. Blocks are separated by
# blank lines on purpose: a `---` line directly under a paragraph is parsed as
# a setext heading underline rather than a horizontal rule.
LOCAL_RUN_MD = """
**Run this studio on your own machine** — no install beyond Python.

```bash
pip install gradio openai pillow
python app_single.py
```

Then open **http://localhost:7860**

By default the app uses a shared public API key (rate-limited). To use your
own [modelbest.cn](https://modelbest.cn) key without typing it every time,
set an environment variable before launching:

```bash
# macOS / Linux
export MINICPM_API_KEY="sk-your-key-here"

# Windows (PowerShell)
$env:MINICPM_API_KEY="sk-your-key-here"
```

The app checks `MINICPM_API_KEY` first, then the **API Key** field below,
then falls back to the shared public key.

---

### 🔌 Fully offline mode

Select **Local (offline)** as the Backend on the Image tab to run everything
on-device — no internet needed after the first download.

```bash
pip install torch transformers accelerate
python app_single.py
```

The first time you use the Local backend, it downloads `openbmb/MiniCPM-V-4`
(4.1B params, Apache-2.0, ~8 GB) into `model_cache/` next to this file. Every
run after that loads from disk only — no network calls.

To force a fresh download, delete the `model_cache/` folder.

A GPU is recommended but not required; the app automatically uses CUDA if
available and falls back to CPU otherwise.
"""
# Build the whole Gradio UI. Components are created inside nested layout
# contexts (Tabs > TabItem > Row > Column); event handlers are attached once
# the components they reference exist.
with gr.Blocks(
    title="An Adventure in Thousand Token Wood · MiniCPM-V 4.6",
    theme=gr.themes.Soft(),
    css=CSS,
) as demo:
    gr.HTML(
        """<div id="studio-header">
  <div class="eyebrow">
    <span class="badge"><span class="dot"></span>MiniCPM-V 4.6</span>
    <span>An Adventure in Thousand Token Wood</span>
  </div>
  <h1>Emberglade - An emotion identifier that makes you HAPPY !!!</h1>
  <p>Upload an image. The model reads its mood — then a cat performs it, live, with its own tune.</p>
</div>"""
    )

    with gr.Tabs():
        # ── Tab 1: Image pipeline ─────────────────────────────────────────
        with gr.TabItem("📷 Image → emotion"):
            with gr.Row():
                with gr.Column(scale=1):
                    image_input = gr.Image(type="pil", label="Upload image", height=240)
                    prompt_input = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=2)
                    backend_sel = gr.Radio(
                        choices=["API (online)", "Local (offline)"],
                        value="API (online)",
                        label="Backend",
                    )
                    model_sel = gr.Radio(
                        choices=list(MODELS.keys()),
                        value=list(MODELS.keys())[0],
                        label="Model",
                        info="Used only for the API backend",
                    )
                    with gr.Accordion("Generation settings", open=False):
                        max_tok = gr.Slider(
                            64, 2048, value=DEFAULT_MAX_TOKENS, step=64, label="Max tokens"
                        )
                        temp = gr.Slider(
                            0.0, 1.5, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature"
                        )
                    with gr.Accordion("API key", open=False):
                        api_key = gr.Textbox(
                            label="Your key (optional)",
                            type="password",
                            placeholder="sk-… leave blank to use the shared key",
                        )
                        gr.Markdown(
                            "Get your own at [modelbest.cn](https://modelbest.cn) — "
                            "see **Run locally** below for setup."
                        )
                    with gr.Accordion("Local model (offline)", open=False, elem_id="local-model"):
                        local_status = gr.Markdown(local_status_md())
                        gr.Markdown(
                            f"Model: `{LOCAL_MODEL_ID}` · 4.1B params · Apache-2.0\n\n"
                            "Selecting **Local (offline)** above will download this model "
                            "the first time it's used (~8 GB, one-time, needs internet), "
                            "then cache it in `model_cache/` for fully offline use afterward.\n\n"
                            "Requires: `pip install torch transformers accelerate`"
                        )
                        refresh_local_btn = gr.Button("Refresh status", size="sm")
                    img_btn = gr.Button("Start emotion", variant="primary", elem_id="submit-img")
                    gr.Examples(examples=PROMPT_EXAMPLES, inputs=[prompt_input], label="Prompt ideas")
                with gr.Column(scale=1):
                    cat_out = gr.HTML(value=placeholder_html(), label="Stage")
                    desc_out = gr.Textbox(
                        label="Description (model output, streaming)",
                        lines=7,
                        placeholder="The model's description will stream in here…",
                        elem_id="desc-output",
                    )

            # Order must match run_image_pipeline's positional parameters.
            pipeline_inputs = [
                image_input, prompt_input, model_sel, max_tok, temp, api_key, backend_sel,
            ]
            # The button and submitting the prompt box both start the same pipeline.
            img_btn.click(
                fn=run_image_pipeline,
                inputs=pipeline_inputs,
                outputs=[desc_out, cat_out],
            )
            prompt_input.submit(
                fn=run_image_pipeline,
                inputs=pipeline_inputs,
                outputs=[desc_out, cat_out],
            )
            refresh_local_btn.click(fn=local_status_md, outputs=[local_status])

        # ── Tab 2: Text-only (keyword dance, no API) ──────────────────────
        with gr.TabItem("✍️ Text → emotion"):
            gr.Markdown("Type mood words for an instant emotion — no API key needed.")
            with gr.Row():
                with gr.Column(scale=1):
                    txt_input = gr.Textbox(
                        label="Describe a mood",
                        placeholder='"happy party" · "sad rain" · "energetic dance"',
                        lines=3,
                    )
                    txt_btn = gr.Button("Start emotion", variant="primary", elem_id="submit-txt")
                    gr.Examples(
                        examples=[
                            ["happy celebrate joy"],
                            ["sad lonely rain"],
                            ["energetic dance excited"],
                            ["calm peaceful"],
                            ["mysterious dark shadow"],
                            ["romantic love"],
                            ["tense nervous fear"],
                            ["nostalgic memory"],
                            ["angry rage"],
                        ],
                        inputs=[txt_input],
                        label="Quick examples",
                    )
                with gr.Column(scale=1):
                    txt_cat = gr.HTML(value=placeholder_html(), label="Stage")
            txt_btn.click(fn=generate_animation, inputs=[txt_input], outputs=[txt_cat])
            txt_input.submit(fn=generate_animation, inputs=[txt_input], outputs=[txt_cat])

    # ── Run locally ───────────────────────────────────────────────────────
    with gr.Accordion("⚙ Run locally", open=False, elem_id="run-locally"):
        gr.Markdown(LOCAL_RUN_MD)
if __name__ == "__main__":
    # Script entry point: serve the UI on port 7860, bound to all network
    # interfaces (0.0.0.0) rather than loopback only.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )