"""
app_single.py — MiniCPM-V 4.6 · An Adventure in Thousand Token Wood
=====================================================================
A storybook playground: MiniCPM-V reads an uploaded image like a page from an
adventure, then a woodland cat performs its mood in a forest clearing —
complete with a tiny generative tune.

Pipeline:
  1. Upload image → MiniCPM-V streams a description
  2. Model returns a JSON dance spec (mood + 6 numeric animation params)
  3. The cat performs in the clearing using those exact params — every move
     is model-determined, not hardcoded.
  4. A free, generative melody (Web Audio API, no audio files) plays along —
     tempo and register also derived from the model's params.

Dance params returned by model:
  mood       : one of the 11 mood labels listed in MOOD_LABELS
  speed      : animation cycle seconds (0.3 fast … 3.0 slow)
  jump       : vertical bounce px      (0 … 60)
  sway       : body rotation degrees   (0 … 20)
  tail_speed : tail cycle seconds      (0.2 … 3.0)
  tail_range : tail swing degrees      (5 … 200)
  ear_tilt   : ear rotation degrees    (0 … 25)

Two backends — switchable in the UI:
  • API (default)   — calls the hosted MiniCPM-V 4.6 API. Needs internet.
  • Local (offline) — downloads openbmb/MiniCPM-V-4 (4.1B, Apache-2.0) once,
                      caches it to ./model_cache/, then runs fully offline.
                      Requires: pip install torch transformers accelerate

Run locally:
  pip install -r requirements.txt
  python app_single.py
  → open http://localhost:7860

Optional: set your own API key so you're not on the shared public quota
  Windows (PowerShell):  $env:MINICPM_API_KEY="sk-..."
  macOS / Linux:         export MINICPM_API_KEY="sk-..."
"""
import base64, io, os, json, re
import gradio as gr
from openai import OpenAI, APIStatusError, APIConnectionError
from PIL import Image

# ── Config ────────────────────────────────────────────────────────────────────
# Endpoint of the hosted OpenAI-compatible API.
API_BASE_URL = "https://api.modelbest.cn/v1"
# Shared fallback key, used only when neither the MINICPM_API_KEY environment
# variable nor the UI key field is set (see _resolve_key).
# NOTE(review): this credential is committed to source — confirm it is meant
# to be public and rate-limited, otherwise rotate it and load it from the env.
PUBLIC_API_KEY = "sk-pQ8L2zF3XmR5kY9wV4jB7hN1tC6vM0xG3aD5sH2bJ9lK4cZ8"
# UI label → API model name.
MODELS = {
    "⚡ Instruct (fast, direct)": "MiniCPM-V-4.6-Instruct",
    "🧠 Thinking (reasons first)": "MiniCPM-V-4.6-Thinking",
}
DEFAULT_PROMPT = "Describe this image in detail."
DEFAULT_MAX_TOKENS = 512
DEFAULT_TEMPERATURE = 0.7
# JPEG quality used when an uploaded image is re-encoded for the API.
IMAGE_QUALITY = 90

# Mood labels the model may choose from (interpolated into the dance prompt).
# NOTE(review): "depressed" is listed here but has no entry in MOOD_PALETTE or
# DEFAULT_DANCE, so it renders with the "neutral" palette — confirm intended.
MOOD_LABELS = ["happy","sad","calm","energetic","mysterious","depressed",
               "romantic","tense","nostalgic","angry","neutral"]

# Clickable prompt suggestions shown under the image tab.
PROMPT_EXAMPLES = [
    ["Describe this image in detail."],
    ["List every object you can see."],
    ["What is the mood or atmosphere of this image?"],
    ["What text, if any, appears in this image?"],
    ["Explain this image to someone who cannot see it."],
]

# ── Mood palettes — each mood is a "firefly color" in the wood ────────────────
# scale: semitone offsets from root (a small mode/scale per mood)
# root : MIDI-ish base note number (we map to Hz with 440 * 2^((n-69)/12))
# The remaining keys are colors (bg/body/detail/eye/nose/pcol), the particle
# glyph, and the label/caption text shown on the stage.
MOOD_PALETTE = {
    "happy":      {"bg":"#1a1605","body":"#FFD166","detail":"#E8A23A","eye":"#2D1B00","nose":"#FF8A3D","pcol":"#FFE08A","particle":"✦","label":"Happy","caption":"Bouncing with joy",
                   "scale":[0,2,4,7,9,12], "root":72},
    "sad":        {"bg":"#0c1116","body":"#8AA0B2","detail":"#5D7A8E","eye":"#1A2530","nose":"#B7C7D2","pcol":"#A9C8E0","particle":"·","label":"Sad","caption":"Slow, heavy steps",
                   "scale":[0,3,5,7,10,12], "root":60},
    "calm":       {"bg":"#0a1614","body":"#6FBFB3","detail":"#4A9C8F","eye":"#0A2018","nose":"#A8E0D6","pcol":"#BFEDE4","particle":"○","label":"Calm","caption":"Drifting at ease",
                   "scale":[0,2,5,7,9,12], "root":64},
    "energetic":  {"bg":"#1a0e05","body":"#FF8A5B","detail":"#E8623A","eye":"#1a0500","nose":"#FFD1BC","pcol":"#FFCB6B","particle":"★","label":"Energetic","caption":"Can't sit still",
                   "scale":[0,2,4,5,7,9,11,12],"root":71},
    "mysterious": {"bg":"#120c1a","body":"#A98BD6","detail":"#6D4FA8","eye":"#F0B8FF","nose":"#D9C2EE","pcol":"#C7B3F0","particle":"✧","label":"Mysterious","caption":"Slipping through shadow",
                   "scale":[0,1,4,5,7,8,11,12],"root":62},
    "romantic":   {"bg":"#1a0c12","body":"#F2A0BD","detail":"#D9648D","eye":"#1a0010","nose":"#FBE0EA","pcol":"#F7B8CE","particle":"♥","label":"Romantic","caption":"A slow, dreamy waltz",
                   "scale":[0,2,4,7,9,12], "root":67},
    "tense":      {"bg":"#100808","body":"#F0726E","detail":"#C03C38","eye":"#FFB3AE","nose":"#F7C7C4","pcol":"#F2A6A2","particle":"|","label":"Tense","caption":"Coiled and alert",
                   "scale":[0,1,3,6,7,10,12], "root":61},
    "nostalgic":  {"bg":"#160f06","body":"#F2C083","detail":"#D98A3D","eye":"#160f06","nose":"#FBE3C7","pcol":"#F7DDB5","particle":"◦","label":"Nostalgic","caption":"Rocking to old memories",
                   "scale":[0,2,3,7,9,12], "root":65},
    "angry":      {"bg":"#160505","body":"#F0635E","detail":"#A8201C","eye":"#FF6961","nose":"#F7B0AC","pcol":"#F58F8A","particle":"✸","label":"Angry","caption":"Stomping, full of fire",
                   "scale":[0,1,3,5,6,8,10,12],"root":59},
    "neutral":    {"bg":"#0e0f13","body":"#A6ADB8","detail":"#727A86","eye":"#0d0d18","nose":"#D8DDE3","pcol":"#C7CDD6","particle":"·","label":"Neutral","caption":"Steady and unhurried",
                   "scale":[0,2,4,7,9,12], "root":64},
}

# ── Default dance specs (fallback if model call fails) ────────────────────────
# One entry per renderable mood, with the same six numeric fields the model is
# asked to return.
# NOTE(review): "calm" uses tail_speed 3.2, which is above the 3.0 ceiling
# that model-provided values are clamped to — confirm this is deliberate.
DEFAULT_DANCE = {
    "happy":      {"speed":0.7, "jump":50, "sway":6,  "tail_speed":0.4, "tail_range":200,"ear_tilt":8},
    "sad":        {"speed":2.4, "jump":2,  "sway":8,  "tail_speed":2.5, "tail_range":30, "ear_tilt":15},
    "calm":       {"speed":2.8, "jump":10, "sway":2,  "tail_speed":3.2, "tail_range":35, "ear_tilt":3},
    "energetic":  {"speed":0.3, "jump":30, "sway":15, "tail_speed":0.28,"tail_range":180,"ear_tilt":15},
    "mysterious": {"speed":2.0, "jump":15, "sway":5,  "tail_speed":1.8, "tail_range":100,"ear_tilt":5},
    "romantic":   {"speed":1.6, "jump":12, "sway":5,  "tail_speed":1.6,
"tail_range":65, "ear_tilt":3}, "tense": {"speed":0.4, "jump":3, "sway":3, "tail_speed":0.4, "tail_range":10, "ear_tilt":12}, "nostalgic": {"speed":2.2, "jump":6, "sway":6, "tail_speed":2.0, "tail_range":65, "ear_tilt":5}, "angry": {"speed":0.38,"jump":18, "sway":5, "tail_speed":0.32,"tail_range":160,"ear_tilt":20}, "neutral": {"speed":2.0, "jump":8, "sway":1, "tail_speed":2.2, "tail_range":30, "ear_tilt":2}, } # ── Helpers ─────────────────────────────────────────────────────────────────── def pil_to_data_url(image): image = image.convert("RGB") buf = io.BytesIO() image.save(buf, format="JPEG", quality=IMAGE_QUALITY) return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode() def _resolve_key(ui_key): return (os.environ.get("MINICPM_API_KEY","").strip() or (ui_key or "").strip() or PUBLIC_API_KEY) def _client(ui_key): return OpenAI(api_key=_resolve_key(ui_key), base_url=API_BASE_URL) # ── Description (streaming) ─────────────────────────────────────────────────── def stream_description(image, prompt, model_label, max_tokens, temperature, api_key): if image is None: yield "⚠️ Please upload an image first." return try: stream = _client(api_key).chat.completions.create( model=MODELS[model_label], messages=[{"role":"user","content":[ {"type":"image_url","image_url":{"url": pil_to_data_url(image)}}, {"type":"text","text": prompt}, ]}], max_tokens=max_tokens, temperature=temperature, stream=True, ) result = "" for chunk in stream: delta = chunk.choices[0].delta.content or "" if delta: result += delta yield result except APIStatusError as e: yield f"❌ API error {e.status_code}: {e.message}" except APIConnectionError: yield "❌ Cannot reach api.modelbest.cn" except Exception as e: yield f"❌ {e}" # ── Model-driven dance spec ─────────────────────────────────────────────────── DANCE_SYSTEM_PROMPT = f"""You are a cat dance choreographer AI. Given a scene description, return ONLY a valid JSON object — no prose, no markdown, no code fences. 
JSON schema (all fields required): {{ "mood": one of {MOOD_LABELS}, "speed": float 0.3–3.0 (animation cycle seconds; lower = faster), "jump": int 0–60 (vertical bounce in pixels), "sway": int 0–20 (body rotation degrees), "tail_speed": float 0.2–3.0 (tail cycle seconds), "tail_range": int 5–200 (tail swing degrees), "ear_tilt": int 0–25 (ear tilt degrees) }} Choose values that physically match the scene mood. An energetic scene should have low speed (fast), high jump, high sway. A calm scene should have high speed (slow), low jump, low sway. Be creative — the cat's whole body expresses the image's emotion.""" def _keyword_mood(description: str) -> str: """Simple keyword-based mood fallback when JSON parsing fails.""" t = description.lower() for m, kws in [ ("happy",["happy","joy","celebrate","laugh","smile","bright","sunny"]), ("sad",["sad","lonely","rain","sorrow","grief","cry","gloom"]), ("energetic",["energetic","vibrant","excited","dynamic","rush","active"]), ("calm",["calm","peaceful","quiet","gentle","serene","still"]), ("mysterious",["mysterious","dark","eerie","shadow","mystic","fog"]), ("romantic",["romantic","love","tender","intimate","warm","soft"]), ("tense",["tense","anxious","fear","alarm","nervous","danger"]), ("nostalgic",["nostalgic","memory","vintage","old","past","retro"]), ("angry",["angry","furious","rage","fierce","storm"]), ]: if any(w in t for w in kws): return m return "neutral" def get_dance_spec(description: str, api_key: str) -> tuple[str, dict]: """ Returns (mood, dance_params_dict). The model outputs the full dance spec as JSON. Falls back to defaults if parsing fails. 
""" if not description or description.startswith(("⚠️","❌")): return "neutral", DEFAULT_DANCE["neutral"] try: resp = _client(api_key).chat.completions.create( model="MiniCPM-V-4.6-Instruct", messages=[ {"role":"system","content": DANCE_SYSTEM_PROMPT}, {"role":"user", "content": f"Scene description:\n{description[:800]}"}, ], max_tokens=120, temperature=0.3, ) raw = resp.choices[0].message.content.strip() # Strip markdown fences if present raw = re.sub(r"```[a-z]*", "", raw).strip().strip("`").strip() spec = json.loads(raw) mood = spec.get("mood","neutral") if mood not in MOOD_LABELS: mood = "neutral" dance = { "speed": float(max(0.3, min(3.0, spec.get("speed", 1.5)))), "jump": int(max(0, min(60, spec.get("jump", 10)))), "sway": int(max(0, min(20, spec.get("sway", 5)))), "tail_speed": float(max(0.2, min(3.0, spec.get("tail_speed", 1.5)))), "tail_range": int(max(5, min(200, spec.get("tail_range", 40)))), "ear_tilt": int(max(0, min(25, spec.get("ear_tilt", 5)))), } return mood, dance except Exception: mood = _keyword_mood(description) return mood, DEFAULT_DANCE[mood] # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ # OFFLINE / LOCAL BACKEND # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ # Runs entirely on this machine, no internet required after first download. # Model : openbmb/MiniCPM-V-4 (4.1B params, Apache-2.0, fully public) # Cache : ./model_cache/ (weights) + .download_complete (sentinel) # # Heavy deps (torch, transformers) are imported lazily — only when the # user actually selects the Local backend — so API-only users don't need # them installed. 
from pathlib import Path LOCAL_MODEL_ID = "openbmb/MiniCPM-V-4" LOCAL_CACHE_DIR = Path(__file__).parent / "model_cache" LOCAL_SENTINEL = LOCAL_CACHE_DIR / ".download_complete" _local_model = None _local_tokenizer = None def local_is_cached() -> bool: return LOCAL_SENTINEL.exists() def local_cache_size_gb() -> float: if not LOCAL_CACHE_DIR.exists(): return 0.0 return sum(f.stat().st_size for f in LOCAL_CACHE_DIR.rglob("*") if f.is_file()) / 1e9 def local_status_md() -> str: if local_is_cached(): return (f"✅ **Model cached** — `{LOCAL_MODEL_ID}` " f"({local_cache_size_gb():.1f} GB) ready to run offline.") return (f"⬇️ **Not downloaded yet** — `{LOCAL_MODEL_ID}` (~8 GB) will be " f"fetched on first use and cached in `model_cache/`. " f"Requires internet for this one-time download.") def _load_local_model(): """ Lazily import torch/transformers and load MiniCPM-V-4 from local cache, downloading once if needed. Returns (model, tokenizer). """ global _local_model, _local_tokenizer if _local_model is not None: return _local_model, _local_tokenizer try: import torch import transformers from transformers import AutoModel, AutoTokenizer except ImportError as e: raise RuntimeError( "Local backend requires extra packages.\n" "Install with:\n" " pip install torch transformers accelerate\n" f"(original error: {e})" ) # transformers v5 broke MiniCPM-V-4's custom code (all_tied_weights_keys) _tv = tuple(int(x) for x in transformers.__version__.split(".")[:2]) if _tv >= (5, 0): from transformers import modeling_utils as _mu _orig_getattr = getattr(_mu.PreTrainedModel, "__getattr__", None) def _safe_getattr(self, name): if name == "all_tied_weights_keys": return {} if _orig_getattr is not None: return _orig_getattr(self, name) raise AttributeError(name) _mu.PreTrainedModel.__getattr__ = _safe_getattr LOCAL_CACHE_DIR.mkdir(parents=True, exist_ok=True) local_only = local_is_cached() common = dict( trust_remote_code=True, cache_dir=str(LOCAL_CACHE_DIR), local_files_only=local_only, ) 
_local_tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_ID, **common) device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.float16 if device == "cuda" else torch.float32 _local_model = AutoModel.from_pretrained( LOCAL_MODEL_ID, torch_dtype=dtype, attn_implementation="sdpa", device_map="auto" if device == "cuda" else None, low_cpu_mem_usage=True, **common, ) if device == "cpu": _local_model = _local_model.to(device) _local_model.eval() if not local_only: LOCAL_SENTINEL.write_text(f"{LOCAL_MODEL_ID} downloaded.\nDelete to re-download.\n") return _local_model, _local_tokenizer def stream_description_local(image, prompt, max_tokens, temperature): """Local (offline) equivalent of stream_description — non-streaming, single yield.""" if image is None: yield "⚠️ Please upload an image first." return try: model, tokenizer = _load_local_model() msgs = [{"role": "user", "content": [image.convert("RGB"), prompt]}] result = model.chat( image=image.convert("RGB"), msgs=msgs, tokenizer=tokenizer, sampling=(temperature > 0), temperature=max(temperature, 0.01), max_new_tokens=max_tokens, ) yield result except RuntimeError as e: yield f"❌ {e}" except Exception as e: yield f"❌ Local inference error: {e}" def get_dance_spec_local(description: str) -> tuple[str, dict]: """Local equivalent of get_dance_spec — one extra text-only local call.""" if not description or description.startswith(("⚠️","❌")): return "neutral", DEFAULT_DANCE["neutral"] try: model, tokenizer = _load_local_model() msgs = [{"role": "user", "content": [ DANCE_SYSTEM_PROMPT + f"\n\nScene description:\n{description[:800]}" ]}] raw = model.chat( image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=150, ) raw = re.sub(r"```[a-z]*", "", raw).strip().strip("`").strip() spec = json.loads(raw) mood = spec.get("mood","neutral") if mood not in MOOD_LABELS: mood = "neutral" dance = { "speed": float(max(0.3, min(3.0, spec.get("speed", 1.5)))), "jump": int(max(0, min(60, 
spec.get("jump", 10)))), "sway": int(max(0, min(20, spec.get("sway", 5)))), "tail_speed": float(max(0.2, min(3.0, spec.get("tail_speed", 1.5)))), "tail_range": int(max(5, min(200, spec.get("tail_range", 40)))), "ear_tilt": int(max(0, min(25, spec.get("ear_tilt", 5)))), } return mood, dance except Exception: return _keyword_mood(description), DEFAULT_DANCE[_keyword_mood(description)] # ── Keyword dance for text-only tab (no API needed) ─────────────────────────── def generate_animation(text: str) -> str: t = text.lower() mood = "neutral" for m, kws in [ ("happy",["happy","celebrate","party","joy","cheerful"]), ("sad",["sad","lonely","rain","grief","sorrow"]), ("energetic",["energy","dance","excited","lively"]), ("calm",["calm","peace","serene","gentle","quiet"]), ("mysterious",["mysterious","eerie","dark","shadow"]), ("romantic",["romantic","love","tender","warm"]), ("tense",["tense","nervous","anxiety","fear"]), ("nostalgic",["nostalgic","memory","vintage","old"]), ("angry",["angry","furious","rage","fierce"]), ]: if any(w in t for w in kws): mood = m break return cat_html(mood, DEFAULT_DANCE[mood]) # ── Stage chrome — shared studio frame ──────────────────────────────────────── STAGE_FONT = "'Space Grotesk', 'Inter', system-ui, sans-serif" LABEL_FONT = "'Inter', system-ui, sans-serif" MONO_FONT = "'JetBrains Mono', 'SFMono-Regular', Consolas, monospace" def _stage_open(spotlight_color: str, breathe_speed: float = 4.0) -> str: """Opening
+ shared """ def _stage_close() -> str: return "
" # ── Cat stage — all parts stay inside the stage, nothing can overflow ───────── def cat_html(mood: str, dance: dict) -> str: p = MOOD_PALETTE.get(mood, MOOD_PALETTE["neutral"]) B = p["body"]; D = p["detail"]; E = p["eye"]; N = p["nose"] sp = dance["speed"]; jp = dance["jump"] sw = dance["sway"]; tsp = dance["tail_speed"] tr = dance["tail_range"]; et = dance["ear_tilt"] t0 = -tr // 2; t1 = tr // 2 breathe = max(2.0, min(6.0, sp * 2)) stage_id = f"stage_{mood}" # ── music params derived from dance spec ── scale = p["scale"] root = p["root"] # tempo: faster dance (low sp) -> faster notes. Map sp [0.3,3.0] -> note interval [140,520]ms note_ms = int(140 + (sp - 0.3) / (3.0 - 0.3) * (520 - 140)) # register: higher jump -> notes climb higher (octave shift 0,1,2) octave_shift = 12 * min(2, jp // 25) note_root = root + octave_shift cue_chips = ( f'speed {sp}s' f'jump {jp}px' f'sway {sw}°' f'tail {tsp}s / {tr}°' f'ears {et}°' ) return _stage_open(B, breathe) + f"""
{p['label']}  · live emotion
{p['caption']}
{cue_chips}
""" + _stage_close() def placeholder_html(): return _stage_open("#FFD21E", 6.0) + f"""
🐱
No emotion yet
Upload an image — the model reads its mood and the cat performs it, tune and all.
""" + _stage_close() def loading_html(local: bool = False) -> str: title = "Running locally…" if local else "Analyzing image…" caption = ("on-device inference — first run may take a while" if local else "choreographing the emotion") return _stage_open("#FFD21E", 2.0) + f"""
{title}
{caption}
""" + _stage_close() # ── Main pipeline ───────────────────────────────────────────────────────────── def run_image_pipeline(image, prompt, model_label, max_tokens, temperature, api_key, backend): if backend == "Local (offline)": yield "", loading_html(local=True) final_desc = "" for partial in stream_description_local(image, prompt, max_tokens, temperature): final_desc = partial yield final_desc, loading_html(local=True) mood, dance = get_dance_spec_local(final_desc) yield final_desc, cat_html(mood, dance) return final_desc = "" for partial in stream_description(image, prompt, model_label, max_tokens, temperature, api_key): final_desc = partial yield partial, loading_html() # Model determines the full dance spec mood, dance = get_dance_spec(final_desc, api_key) yield final_desc, cat_html(mood, dance) # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ # UI — Cat Dance Studio # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ CSS = """ @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@500;600;700&family=Inter:wght@400;500;600;700&family=JetBrains+Mono:wght@400;500&display=swap'); :root { --bg: #FFFFFF; --surface: #F8F9FA; --raised: #E5E7EB; --text: #111827; --text-dim: #4B5563; --text-faint:#6B7280; --accent: #FFD21E; --accent-ink:#111827; } .gradio-container { background: var(--bg) !important; font-family: 'Inter', system-ui, sans-serif !important; } /* ── Header ────────────────────────────────────────────────────────────── */ #studio-header { text-align:center; padding: 18px 20px 22px; border:1px solid var(--raised); border-radius:12px; background: var(--surface); margin-bottom:8px; } #studio-header h1 { font-family:'Space Grotesk', sans-serif !important; font-weight:700 !important; letter-spacing:.01em; font-size:1.9rem !important; color:var(--text) !important; margin-bottom:6px !important; } #studio-header p { color:var(--text-dim) !important; font-size:.92rem !important; margin:0 
!important; } #studio-header .eyebrow { display:inline-flex; align-items:center; gap:8px; font-family:'JetBrains Mono', monospace; font-size:.7rem; letter-spacing:.18em; text-transform:uppercase; color:var(--text-faint); margin-bottom:10px; } #studio-header .eyebrow .badge { display:inline-flex; align-items:center; gap:5px; background: var(--accent); color: var(--accent-ink); border-radius:999px; padding:2px 10px; font-weight:700; letter-spacing:.1em; } #studio-header .eyebrow .badge .dot { width:6px; height:6px; border-radius:50%; background: var(--accent-ink); opacity:.7; } /* ── Panels ────────────────────────────────────────────────────────────── */ .gr-form, .gr-box, .gr-panel, .gr-block.gr-box { background: var(--bg) !important; border: 1px solid var(--raised) !important; border-radius: 10px !important; } /* Section labels */ .gradio-container label span { font-family:'Inter', sans-serif !important; font-size:.78rem !important; font-weight:600 !important; letter-spacing:.02em !important; color:var(--text-dim) !important; } /* ── Buttons ───────────────────────────────────────────────────────────── */ #submit-img, #submit-txt { background: var(--accent) !important; color: var(--accent-ink) !important; border: 1px solid #E8BD00 !important; font-weight:700 !important; letter-spacing:.02em !important; font-family:'Space Grotesk', sans-serif !important; box-shadow: 0 1px 2px rgba(0,0,0,.04) !important; transition: transform .12s ease, box-shadow .12s ease !important; } #submit-img:hover, #submit-txt:hover { transform: translateY(-1px); box-shadow: 0 4px 12px rgba(255,210,30,.35) !important; } #submit-img:active, #submit-txt:active { transform: translateY(0); } /* ── Description output ───────────────────────────────────────────────── */ #desc-output textarea { font-family:'Inter', sans-serif !important; font-size:.88rem !important; line-height:1.6 !important; color:var(--text) !important; background:var(--surface) !important; } /* ── Run-locally panel 
─────────────────────────────────────────────────── */ #run-locally { border:1px solid var(--raised) !important; background: var(--surface) !important; } #run-locally code { font-family:'JetBrains Mono', monospace !important; font-size:.78rem !important; background:var(--bg) !important; border:1px solid var(--raised) !important; border-radius:6px !important; color:#92660C !important; } #run-locally pre { background:var(--bg) !important; border:1px solid var(--raised) !important; border-radius:8px !important; padding:10px 14px !important; } /* ── Tabs ──────────────────────────────────────────────────────────────── */ .tab-nav button { font-family:'Space Grotesk', sans-serif !important; font-weight:600 !important; letter-spacing:.01em !important; color: var(--text-dim) !important; } .tab-nav button.selected { color: var(--text) !important; border-bottom-color: var(--accent) !important; } /* ── Misc ──────────────────────────────────────────────────────────────── */ footer { display:none !important; } .gr-accordion { border-color: var(--raised) !important; } """ LOCAL_RUN_MD = """ **Run this studio on your own machine** — no install beyond Python. ```bash pip install gradio openai pillow python app_single.py ``` Then open **http://localhost:7860** By default the app uses a shared public API key (rate-limited). To use your own [modelbest.cn](https://modelbest.cn) key without typing it every time, set an environment variable before launching: ```bash # macOS / Linux export MINICPM_API_KEY="sk-your-key-here" # Windows (PowerShell) $env:MINICPM_API_KEY="sk-your-key-here" ``` The app checks `MINICPM_API_KEY` first, then the **API Key** field below, then falls back to the shared public key. --- ### 🔌 Fully offline mode Select **Local (offline)** as the Backend on the Image tab to run everything on-device — no internet needed after the first download. 
```bash pip install torch transformers accelerate python app_single.py ``` The first time you use the Local backend, it downloads `openbmb/MiniCPM-V-4` (4.1B params, Apache-2.0, ~8 GB) into `model_cache/` next to this file. Every run after that loads from disk only — no network calls. To force a fresh download, delete the `model_cache/` folder. A GPU is recommended but not required; the app automatically uses CUDA if available and falls back to CPU otherwise. """ with gr.Blocks(title="An Adventure in Thousand Token Wood · MiniCPM-V 4.6", theme=gr.themes.Soft(), css=CSS) as demo: gr.HTML( """
MiniCPM-V 4.6 An Adventure in Thousand Token Wood

Emberglade - An emotion identifier that makes you HAPPY !!!

Upload an image. The model reads its mood — then a cat performs it, live, with its own tune.

"""
    )

    with gr.Tabs():

        # ── Tab 1: Image pipeline ─────────────────────────────────────────────
        with gr.TabItem("📷 Image → emotion"):
            with gr.Row():
                # Left column: inputs and settings.
                with gr.Column(scale=1):
                    image_input = gr.Image(type="pil", label="Upload image", height=240)
                    prompt_input = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=2)
                    # The "Local (offline)" choice string is what
                    # run_image_pipeline compares against to pick the backend.
                    backend_sel = gr.Radio(
                        choices=["API (online)", "Local (offline)"],
                        value="API (online)",
                        label="Backend",
                    )
                    model_sel = gr.Radio(choices=list(MODELS.keys()),
                                         value=list(MODELS.keys())[0],
                                         label="Model",
                                         info="Used only for the API backend")
                    with gr.Accordion("Generation settings", open=False):
                        max_tok = gr.Slider(64, 2048, value=DEFAULT_MAX_TOKENS, step=64, label="Max tokens")
                        temp = gr.Slider(0.0, 1.5, value=DEFAULT_TEMPERATURE, step=0.05, label="Temperature")
                    with gr.Accordion("API key", open=False):
                        api_key = gr.Textbox(label="Your key (optional)", type="password",
                                             placeholder="sk-… leave blank to use the shared key")
                        gr.Markdown("Get your own at [modelbest.cn](https://modelbest.cn) — see **Run locally** below for setup.")
                    with gr.Accordion("Local model (offline)", open=False, elem_id="local-model"):
                        # Status text is computed once at build time; the
                        # button below re-evaluates it on demand.
                        local_status = gr.Markdown(local_status_md())
                        gr.Markdown(
                            f"Model: `{LOCAL_MODEL_ID}` · 4.1B params · Apache-2.0\n\n"
                            "Selecting **Local (offline)** above will download this model "
                            "the first time it's used (~8 GB, one-time, needs internet), "
                            "then cache it in `model_cache/` for fully offline use afterward.\n\n"
                            "Requires: `pip install torch transformers accelerate`"
                        )
                        refresh_local_btn = gr.Button("Refresh status", size="sm")
                    img_btn = gr.Button("Start emotion", variant="primary", elem_id="submit-img")
                    gr.Examples(examples=PROMPT_EXAMPLES, inputs=[prompt_input], label="Prompt ideas")

                # Right column: the stage and the streamed description.
                with gr.Column(scale=1):
                    cat_out = gr.HTML(value=placeholder_html(), label="Stage")
                    desc_out = gr.Textbox(label="Description (model output, streaming)",
                                          lines=7,
                                          placeholder="The model's description will stream in here…",
                                          elem_id="desc-output")

            # Order must match run_image_pipeline's positional parameters.
            pipeline_inputs = [image_input, prompt_input, model_sel, max_tok, temp, api_key, backend_sel]
            # Both the button and pressing Enter in the prompt box start a run.
            img_btn.click(
                fn=run_image_pipeline,
                inputs=pipeline_inputs,
                outputs=[desc_out, cat_out],
            )
            prompt_input.submit(
                fn=run_image_pipeline,
                inputs=pipeline_inputs,
                outputs=[desc_out, cat_out],
            )
            refresh_local_btn.click(fn=local_status_md, outputs=[local_status])

        # ── Tab 2: Text-only (keyword dance, no API) ──────────────────────────
        with gr.TabItem("✍️ Text → emotion"):
            gr.Markdown("Type mood words for an instant emotion — no API key needed.")
            with gr.Row():
                with gr.Column(scale=1):
                    txt_input = gr.Textbox(
                        label="Describe a mood",
                        placeholder='"happy party" · "sad rain" · "energetic dance"',
                        lines=3,
                    )
                    txt_btn = gr.Button("Start emotion", variant="primary", elem_id="submit-txt")
                    gr.Examples(
                        examples=[["happy celebrate joy"],["sad lonely rain"],
                                  ["energetic dance excited"],["calm peaceful"],
                                  ["mysterious dark shadow"],["romantic love"],
                                  ["tense nervous fear"],["nostalgic memory"],["angry rage"]],
                        inputs=[txt_input], label="Quick examples",
                    )
                with gr.Column(scale=1):
                    txt_cat = gr.HTML(value=placeholder_html(), label="Stage")
            txt_btn.click(fn=generate_animation, inputs=[txt_input], outputs=[txt_cat])
            txt_input.submit(fn=generate_animation, inputs=[txt_input], outputs=[txt_cat])

    # ── Run locally ───────────────────────────────────────────────────────────
    with gr.Accordion("⚙ Run locally", open=False, elem_id="run-locally"):
        gr.Markdown(LOCAL_RUN_MD)


if __name__ == "__main__":
    # NOTE(review): server_name="0.0.0.0" listens on every network interface,
    # not just localhost — confirm that exposure is intended.
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)