""" app_single.py — MiniCPM-V 4.6 · An Adventure in Thousand Token Wood ===================================================================== A storybook playground: MiniCPM-V reads an uploaded image like a page from an adventure, then a woodland cat performs its mood in a forest clearing — complete with a tiny generative tune. Pipeline: 1. Upload image → MiniCPM-V streams a description 2. Model returns a JSON dance spec (mood + 6 numeric animation params) 3. The cat performs in the clearing using those exact params — every move is model-determined, not hardcoded. 4. A free, generative melody (Web Audio API, no audio files) plays along — tempo and register also derived from the model's params. Dance params returned by model: mood : one of 10 mood words speed : animation cycle seconds (0.3 fast … 3.0 slow) jump : vertical bounce px (0 … 60) sway : body rotation degrees (0 … 20) tail_speed : tail cycle seconds (0.2 … 3.0) tail_range : tail swing degrees (5 … 120) ear_tilt : ear rotation degrees (0 … 25) Two backends — switchable in the UI: • API (default) — calls the hosted MiniCPM-V 4.6 API. Needs internet. • Local (offline) — downloads openbmb/MiniCPM-V-4 (4.1B, Apache-2.0) once, caches it to ./model_cache/, then runs fully offline. Requires: pip install torch transformers accelerate Run locally: pip install -r requirements.txt python app_single.py → open http://localhost:7860 Optional: set your own API key so you're not on the shared public quota Windows (PowerShell): $env:MINICPM_API_KEY="sk-..." macOS / Linux: export MINICPM_API_KEY="sk-..." """ import base64, io, os, json, re import gradio as gr from openai import OpenAI, APIStatusError, APIConnectionError from PIL import Image # ── Config ──────────────────────────────────────────────────────────────────── API_BASE_URL = "https://api.modelbest.cn/v1" PUBLIC_API_KEY = "sk-pQ8L2zF3XmR5kY9wV4jB7hN1tC6vM0xG3aD5sH2bJ9lK4cZ8" MODELS = { "⚡ Instruct (fast, direct)": "MiniCPM-V-4.6-Instruct", "🧠 Thinking (reasons first)": "MiniCPM-V-4.6-Thinking", } DEFAULT_PROMPT = "Describe this image in detail." DEFAULT_MAX_TOKENS = 512 DEFAULT_TEMPERATURE = 0.7 IMAGE_QUALITY = 90 MOOD_LABELS = ["happy","sad","calm","energetic","mysterious","depressed", "romantic","tense","nostalgic","angry","neutral"] PROMPT_EXAMPLES = [ ["Describe this image in detail."], ["List every object you can see."], ["What is the mood or atmosphere of this image?"], ["What text, if any, appears in this image?"], ["Explain this image to someone who cannot see it."], ] # ── Mood palettes — each mood is a "firefly color" in the wood ──────────────── # scale: semitone offsets from root (a small mode/scale per mood) # root : MIDI-ish base note number (we map to Hz with 440 * 2^((n-69)/12)) MOOD_PALETTE = { "happy": {"bg":"#1a1605","body":"#FFD166","detail":"#E8A23A","eye":"#2D1B00","nose":"#FF8A3D","pcol":"#FFE08A","particle":"✦","label":"Happy","caption":"Bouncing with joy", "scale":[0,2,4,7,9,12], "root":72}, "sad": {"bg":"#0c1116","body":"#8AA0B2","detail":"#5D7A8E","eye":"#1A2530","nose":"#B7C7D2","pcol":"#A9C8E0","particle":"·","label":"Sad","caption":"Slow, heavy steps", "scale":[0,3,5,7,10,12], "root":60}, "calm": {"bg":"#0a1614","body":"#6FBFB3","detail":"#4A9C8F","eye":"#0A2018","nose":"#A8E0D6","pcol":"#BFEDE4","particle":"○","label":"Calm","caption":"Drifting at ease", "scale":[0,2,5,7,9,12], "root":64}, "energetic": {"bg":"#1a0e05","body":"#FF8A5B","detail":"#E8623A","eye":"#1a0500","nose":"#FFD1BC","pcol":"#FFCB6B","particle":"★","label":"Energetic","caption":"Can't sit still", "scale":[0,2,4,5,7,9,11,12],"root":71}, "mysterious": {"bg":"#120c1a","body":"#A98BD6","detail":"#6D4FA8","eye":"#F0B8FF","nose":"#D9C2EE","pcol":"#C7B3F0","particle":"✧","label":"Mysterious","caption":"Slipping through shadow", "scale":[0,1,4,5,7,8,11,12],"root":62}, "romantic": {"bg":"#1a0c12","body":"#F2A0BD","detail":"#D9648D","eye":"#1a0010","nose":"#FBE0EA","pcol":"#F7B8CE","particle":"♥","label":"Romantic","caption":"A slow, dreamy waltz", "scale":[0,2,4,7,9,12], "root":67}, "tense": {"bg":"#100808","body":"#F0726E","detail":"#C03C38","eye":"#FFB3AE","nose":"#F7C7C4","pcol":"#F2A6A2","particle":"|","label":"Tense","caption":"Coiled and alert", "scale":[0,1,3,6,7,10,12], "root":61}, "nostalgic": {"bg":"#160f06","body":"#F2C083","detail":"#D98A3D","eye":"#160f06","nose":"#FBE3C7","pcol":"#F7DDB5","particle":"◦","label":"Nostalgic","caption":"Rocking to old memories", "scale":[0,2,3,7,9,12], "root":65}, "angry": {"bg":"#160505","body":"#F0635E","detail":"#A8201C","eye":"#FF6961","nose":"#F7B0AC","pcol":"#F58F8A","particle":"✸","label":"Angry","caption":"Stomping, full of fire", "scale":[0,1,3,5,6,8,10,12],"root":59}, "neutral": {"bg":"#0e0f13","body":"#A6ADB8","detail":"#727A86","eye":"#0d0d18","nose":"#D8DDE3","pcol":"#C7CDD6","particle":"·","label":"Neutral","caption":"Steady and unhurried", "scale":[0,2,4,7,9,12], "root":64}, } # ── Default dance specs (fallback if model call fails) ──────────────────────── DEFAULT_DANCE = { "happy": {"speed":0.7, "jump":50, "sway":6, "tail_speed":0.4, "tail_range":200,"ear_tilt":8}, "sad": {"speed":2.4, "jump":2, "sway":8, "tail_speed":2.5, "tail_range":30, "ear_tilt":15}, "calm": {"speed":2.8, "jump":10, "sway":2, "tail_speed":3.2, "tail_range":35, "ear_tilt":3}, "energetic": {"speed":0.3, "jump":30, "sway":15, "tail_speed":0.28,"tail_range":180,"ear_tilt":15}, "mysterious": {"speed":2.0, "jump":15, "sway":5, "tail_speed":1.8, "tail_range":100,"ear_tilt":5}, "romantic": {"speed":1.6, "jump":12, "sway":5, "tail_speed":1.6, "tail_range":65, "ear_tilt":3}, "tense": {"speed":0.4, "jump":3, "sway":3, "tail_speed":0.4, "tail_range":10, "ear_tilt":12}, "nostalgic": {"speed":2.2, "jump":6, "sway":6, "tail_speed":2.0, "tail_range":65, "ear_tilt":5}, "angry": {"speed":0.38,"jump":18, "sway":5, "tail_speed":0.32,"tail_range":160,"ear_tilt":20}, "neutral": {"speed":2.0, "jump":8, "sway":1, "tail_speed":2.2, "tail_range":30, "ear_tilt":2}, } # ── Helpers ─────────────────────────────────────────────────────────────────── def pil_to_data_url(image): image = image.convert("RGB") buf = io.BytesIO() image.save(buf, format="JPEG", quality=IMAGE_QUALITY) return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode() def _resolve_key(ui_key): return (os.environ.get("MINICPM_API_KEY","").strip() or (ui_key or "").strip() or PUBLIC_API_KEY) def _client(ui_key): return OpenAI(api_key=_resolve_key(ui_key), base_url=API_BASE_URL) # ── Description (streaming) ─────────────────────────────────────────────────── def stream_description(image, prompt, model_label, max_tokens, temperature, api_key): if image is None: yield "⚠️ Please upload an image first." return try: stream = _client(api_key).chat.completions.create( model=MODELS[model_label], messages=[{"role":"user","content":[ {"type":"image_url","image_url":{"url": pil_to_data_url(image)}}, {"type":"text","text": prompt}, ]}], max_tokens=max_tokens, temperature=temperature, stream=True, ) result = "" for chunk in stream: delta = chunk.choices[0].delta.content or "" if delta: result += delta yield result except APIStatusError as e: yield f"❌ API error {e.status_code}: {e.message}" except APIConnectionError: yield "❌ Cannot reach api.modelbest.cn" except Exception as e: yield f"❌ {e}" # ── Model-driven dance spec ─────────────────────────────────────────────────── DANCE_SYSTEM_PROMPT = f"""You are a cat dance choreographer AI. Given a scene description, return ONLY a valid JSON object — no prose, no markdown, no code fences. JSON schema (all fields required): {{ "mood": one of {MOOD_LABELS}, "speed": float 0.3–3.0 (animation cycle seconds; lower = faster), "jump": int 0–60 (vertical bounce in pixels), "sway": int 0–20 (body rotation degrees), "tail_speed": float 0.2–3.0 (tail cycle seconds), "tail_range": int 5–200 (tail swing degrees), "ear_tilt": int 0–25 (ear tilt degrees) }} Choose values that physically match the scene mood. An energetic scene should have low speed (fast), high jump, high sway. A calm scene should have high speed (slow), low jump, low sway. Be creative — the cat's whole body expresses the image's emotion.""" def _keyword_mood(description: str) -> str: """Simple keyword-based mood fallback when JSON parsing fails.""" t = description.lower() for m, kws in [ ("happy",["happy","joy","celebrate","laugh","smile","bright","sunny"]), ("sad",["sad","lonely","rain","sorrow","grief","cry","gloom"]), ("energetic",["energetic","vibrant","excited","dynamic","rush","active"]), ("calm",["calm","peaceful","quiet","gentle","serene","still"]), ("mysterious",["mysterious","dark","eerie","shadow","mystic","fog"]), ("romantic",["romantic","love","tender","intimate","warm","soft"]), ("tense",["tense","anxious","fear","alarm","nervous","danger"]), ("nostalgic",["nostalgic","memory","vintage","old","past","retro"]), ("angry",["angry","furious","rage","fierce","storm"]), ]: if any(w in t for w in kws): return m return "neutral" def get_dance_spec(description: str, api_key: str) -> tuple[str, dict]: """ Returns (mood, dance_params_dict). The model outputs the full dance spec as JSON. Falls back to defaults if parsing fails. """ if not description or description.startswith(("⚠️","❌")): return "neutral", DEFAULT_DANCE["neutral"] try: resp = _client(api_key).chat.completions.create( model="MiniCPM-V-4.6-Instruct", messages=[ {"role":"system","content": DANCE_SYSTEM_PROMPT}, {"role":"user", "content": f"Scene description:\n{description[:800]}"}, ], max_tokens=120, temperature=0.3, ) raw = resp.choices[0].message.content.strip() # Strip markdown fences if present raw = re.sub(r"```[a-z]*", "", raw).strip().strip("`").strip() spec = json.loads(raw) mood = spec.get("mood","neutral") if mood not in MOOD_LABELS: mood = "neutral" dance = { "speed": float(max(0.3, min(3.0, spec.get("speed", 1.5)))), "jump": int(max(0, min(60, spec.get("jump", 10)))), "sway": int(max(0, min(20, spec.get("sway", 5)))), "tail_speed": float(max(0.2, min(3.0, spec.get("tail_speed", 1.5)))), "tail_range": int(max(5, min(200, spec.get("tail_range", 40)))), "ear_tilt": int(max(0, min(25, spec.get("ear_tilt", 5)))), } return mood, dance except Exception: mood = _keyword_mood(description) return mood, DEFAULT_DANCE[mood] # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ # OFFLINE / LOCAL BACKEND # ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ # Runs entirely on this machine, no internet required after first download. # Model : openbmb/MiniCPM-V-4 (4.1B params, Apache-2.0, fully public) # Cache : ./model_cache/ (weights) + .download_complete (sentinel) # # Heavy deps (torch, transformers) are imported lazily — only when the # user actually selects the Local backend — so API-only users don't need # them installed. from pathlib import Path LOCAL_MODEL_ID = "openbmb/MiniCPM-V-4" LOCAL_CACHE_DIR = Path(__file__).parent / "model_cache" LOCAL_SENTINEL = LOCAL_CACHE_DIR / ".download_complete" _local_model = None _local_tokenizer = None def local_is_cached() -> bool: return LOCAL_SENTINEL.exists() def local_cache_size_gb() -> float: if not LOCAL_CACHE_DIR.exists(): return 0.0 return sum(f.stat().st_size for f in LOCAL_CACHE_DIR.rglob("*") if f.is_file()) / 1e9 def local_status_md() -> str: if local_is_cached(): return (f"✅ **Model cached** — `{LOCAL_MODEL_ID}` " f"({local_cache_size_gb():.1f} GB) ready to run offline.") return (f"⬇️ **Not downloaded yet** — `{LOCAL_MODEL_ID}` (~8 GB) will be " f"fetched on first use and cached in `model_cache/`. " f"Requires internet for this one-time download.") def _load_local_model(): """ Lazily import torch/transformers and load MiniCPM-V-4 from local cache, downloading once if needed. Returns (model, tokenizer). """ global _local_model, _local_tokenizer if _local_model is not None: return _local_model, _local_tokenizer try: import torch import transformers from transformers import AutoModel, AutoTokenizer except ImportError as e: raise RuntimeError( "Local backend requires extra packages.\n" "Install with:\n" " pip install torch transformers accelerate\n" f"(original error: {e})" ) # transformers v5 broke MiniCPM-V-4's custom code (all_tied_weights_keys) _tv = tuple(int(x) for x in transformers.__version__.split(".")[:2]) if _tv >= (5, 0): from transformers import modeling_utils as _mu _orig_getattr = getattr(_mu.PreTrainedModel, "__getattr__", None) def _safe_getattr(self, name): if name == "all_tied_weights_keys": return {} if _orig_getattr is not None: return _orig_getattr(self, name) raise AttributeError(name) _mu.PreTrainedModel.__getattr__ = _safe_getattr LOCAL_CACHE_DIR.mkdir(parents=True, exist_ok=True) local_only = local_is_cached() common = dict( trust_remote_code=True, cache_dir=str(LOCAL_CACHE_DIR), local_files_only=local_only, ) _local_tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_ID, **common) device = "cuda" if torch.cuda.is_available() else "cpu" dtype = torch.float16 if device == "cuda" else torch.float32 _local_model = AutoModel.from_pretrained( LOCAL_MODEL_ID, torch_dtype=dtype, attn_implementation="sdpa", device_map="auto" if device == "cuda" else None, low_cpu_mem_usage=True, **common, ) if device == "cpu": _local_model = _local_model.to(device) _local_model.eval() if not local_only: LOCAL_SENTINEL.write_text(f"{LOCAL_MODEL_ID} downloaded.\nDelete to re-download.\n") return _local_model, _local_tokenizer def stream_description_local(image, prompt, max_tokens, temperature): """Local (offline) equivalent of stream_description — non-streaming, single yield.""" if image is None: yield "⚠️ Please upload an image first." return try: model, tokenizer = _load_local_model() msgs = [{"role": "user", "content": [image.convert("RGB"), prompt]}] result = model.chat( image=image.convert("RGB"), msgs=msgs, tokenizer=tokenizer, sampling=(temperature > 0), temperature=max(temperature, 0.01), max_new_tokens=max_tokens, ) yield result except RuntimeError as e: yield f"❌ {e}" except Exception as e: yield f"❌ Local inference error: {e}" def get_dance_spec_local(description: str) -> tuple[str, dict]: """Local equivalent of get_dance_spec — one extra text-only local call.""" if not description or description.startswith(("⚠️","❌")): return "neutral", DEFAULT_DANCE["neutral"] try: model, tokenizer = _load_local_model() msgs = [{"role": "user", "content": [ DANCE_SYSTEM_PROMPT + f"\n\nScene description:\n{description[:800]}" ]}] raw = model.chat( image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=150, ) raw = re.sub(r"```[a-z]*", "", raw).strip().strip("`").strip() spec = json.loads(raw) mood = spec.get("mood","neutral") if mood not in MOOD_LABELS: mood = "neutral" dance = { "speed": float(max(0.3, min(3.0, spec.get("speed", 1.5)))), "jump": int(max(0, min(60, spec.get("jump", 10)))), "sway": int(max(0, min(20, spec.get("sway", 5)))), "tail_speed": float(max(0.2, min(3.0, spec.get("tail_speed", 1.5)))), "tail_range": int(max(5, min(200, spec.get("tail_range", 40)))), "ear_tilt": int(max(0, min(25, spec.get("ear_tilt", 5)))), } return mood, dance except Exception: return _keyword_mood(description), DEFAULT_DANCE[_keyword_mood(description)] # ── Keyword dance for text-only tab (no API needed) ─────────────────────────── def generate_animation(text: str) -> str: t = text.lower() mood = "neutral" for m, kws in [ ("happy",["happy","celebrate","party","joy","cheerful"]), ("sad",["sad","lonely","rain","grief","sorrow"]), ("energetic",["energy","dance","excited","lively"]), ("calm",["calm","peace","serene","gentle","quiet"]), ("mysterious",["mysterious","eerie","dark","shadow"]), ("romantic",["romantic","love","tender","warm"]), ("tense",["tense","nervous","anxiety","fear"]), ("nostalgic",["nostalgic","memory","vintage","old"]), ("angry",["angry","furious","rage","fierce"]), ]: if any(w in t for w in kws): mood = m break return cat_html(mood, DEFAULT_DANCE[mood]) # ── Stage chrome — shared studio frame ──────────────────────────────────────── STAGE_FONT = "'Space Grotesk', 'Inter', system-ui, sans-serif" LABEL_FONT = "'Inter', system-ui, sans-serif" MONO_FONT = "'JetBrains Mono', 'SFMono-Regular', Consolas, monospace" def _stage_open(spotlight_color: str, breathe_speed: float = 4.0) -> str: """Opening
Upload an image. The model reads its mood — then a cat performs it, live, with its own tune.