pratik-250620 commited on
Commit
358d3bc
·
verified ·
1 Parent(s): 6da5a84

Upload folder using huggingface_hub

Browse files
app.py CHANGED
@@ -180,29 +180,743 @@ section[data-testid="stSidebar"] > div:first-child { padding-top: 1.2rem; }
180
  # Example prompts
181
  # ---------------------------------------------------------------------------
182
  EXAMPLE_PROMPTS = {
183
- "Nature": [
184
- "A peaceful forest at dawn with birdsong and morning mist",
185
- "A field of golden wheat under a warm summer sunset",
186
- "A dense jungle with exotic birds calling from the canopy",
187
- ],
188
- "Urban": [
189
- "A bustling city street at night with neon lights and traffic",
190
- "A quiet alley in an old town with distant footsteps echoing",
191
- "A cafe terrace on a busy boulevard with clinking glasses",
192
- ],
193
- "Water": [
194
- "Ocean waves crashing on a sandy beach at sunset",
195
- "Rain falling on a pond with ripples spreading across the surface",
196
- "A mountain stream flowing over rocks through a pine forest",
197
- ],
198
- "Mixed": [
199
- "A lighthouse on a cliff during a thunderstorm at night",
200
- "A bonfire on a beach with waves and guitar music at night",
201
- "A train passing through countryside with distant church bells",
202
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
  }
204
  DOMAIN_ICONS = {"nature": "\U0001f33f", "urban": "\U0001f3d9\ufe0f", "water": "\U0001f30a", "mixed": "\U0001f310", "other": "\U0001f4cd"}
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  # ---------------------------------------------------------------------------
207
  # Planning prompt template (same as src/planner/prompts/unified.txt)
208
  # ---------------------------------------------------------------------------
@@ -306,22 +1020,128 @@ def get_inference_client():
306
  return InferenceClient(token=token)
307
 
308
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  # ---------------------------------------------------------------------------
310
  # HF Inference API helpers
311
  # ---------------------------------------------------------------------------
312
 
313
- TEXT_GEN_MODELS = [
 
314
  "mistralai/Mistral-7B-Instruct-v0.3",
 
 
 
 
315
  "HuggingFaceH4/zephyr-7b-beta",
316
  "microsoft/Phi-3-mini-4k-instruct",
317
- "meta-llama/Llama-3.2-3B-Instruct",
318
  ]
 
 
 
 
 
 
 
 
319
 
320
  def _hf_chat(system: str, user: str, max_tokens: int = 500, temperature: float = 0.3) -> str:
321
- """Call HF Inference API chat completion, trying multiple models."""
 
 
 
 
322
  client = get_inference_client()
323
  last_error = None
 
 
324
  for model_id in TEXT_GEN_MODELS:
 
 
 
 
325
  try:
326
  response = client.chat_completion(
327
  model=model_id,
@@ -337,9 +1157,15 @@ def _hf_chat(system: str, user: str, max_tokens: int = 500, temperature: float =
337
  return text
338
  except Exception as e:
339
  last_error = e
340
- logger.warning("Chat model %s failed: %s", model_id, e)
 
 
 
 
341
  continue
342
- raise RuntimeError(f"All text models failed. Last error: {last_error}")
 
 
343
 
344
 
345
  def _parse_plan_json(raw: str) -> Optional[Dict[str, Any]]:
@@ -425,11 +1251,14 @@ def plan_extended(prompt: str) -> Optional[Any]:
425
  # Generation / retrieval functions
426
  # ---------------------------------------------------------------------------
427
 
428
- # HF Inference API model IDs
429
- IMAGE_GEN_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
 
 
 
430
  AUDIO_GEN_MODELS = [
431
- "cvssp/audioldm2",
432
- "facebook/musicgen-small",
433
  ]
434
 
435
  def gen_text(prompt: str, mode: str) -> dict:
@@ -487,25 +1316,44 @@ def gen_text(prompt: str, mode: str) -> dict:
487
 
488
 
489
  def generate_image(prompt: str) -> dict:
490
- """Generate image via HF Inference API (SDXL), fallback to retrieval."""
491
  client = get_inference_client()
492
- try:
493
- image = client.text_to_image(prompt, model=IMAGE_GEN_MODEL)
494
- tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False, dir="/tmp")
495
- image.save(tmp.name)
496
- return {
497
- "path": tmp.name, "backend": "generative",
498
- "model": "SDXL", "failed": False,
499
- }
500
- except Exception as e:
501
- logger.warning("Image generation failed: %s — falling back to retrieval", e)
502
- return retrieve_image(prompt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
 
504
 
505
  def generate_audio(prompt: str) -> dict:
506
- """Generate audio via HF Inference API, fallback to retrieval."""
507
  client = get_inference_client()
 
508
  for model_id in AUDIO_GEN_MODELS:
 
 
 
509
  try:
510
  audio_bytes = client.text_to_audio(prompt, model=model_id)
511
  suffix = ".flac" if "musicgen" in model_id else ".wav"
@@ -514,7 +1362,6 @@ def generate_audio(prompt: str) -> dict:
514
  tmp.write(audio_bytes)
515
  tmp.flush()
516
  else:
517
- # Some API versions return object with .read() or similar
518
  tmp.write(bytes(audio_bytes))
519
  tmp.flush()
520
  model_name = model_id.split("/")[-1]
@@ -523,11 +1370,17 @@ def generate_audio(prompt: str) -> dict:
523
  "model": model_name, "failed": False,
524
  }
525
  except Exception as e:
526
- logger.warning("Audio gen with %s failed: %s", model_id, e)
 
 
 
 
527
  continue
528
- # All generative models failed — fall back to retrieval
529
  logger.warning("All audio generation models failed — falling back to retrieval")
530
- return retrieve_audio(prompt)
 
 
 
531
 
532
 
533
  def retrieve_image(prompt: str) -> dict:
@@ -599,31 +1452,36 @@ def main():
599
  layout="wide",
600
  initial_sidebar_state="expanded",
601
  )
602
- st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
603
 
604
- # Hero
605
- st.markdown(
606
- '<div class="hero-wrap">'
607
- '<div class="hero-title">Multimodal Coherence AI</div>'
608
- '<div class="hero-sub">Generate semantically coherent <b>text + image + audio</b> bundles '
609
- 'and evaluate cross-modal alignment with the <b>MSCI</b> metric.</div>'
610
- '</div>', unsafe_allow_html=True)
611
-
612
- # Sidebar
613
  with st.sidebar:
614
  st.markdown("#### Configuration")
615
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
  backend = st.selectbox(
617
- "Backend",
618
  ["generative", "retrieval"],
619
  format_func=lambda x: {
620
- "generative": "Generative (SDXL + AudioLDM2)",
621
  "retrieval": "Retrieval (CLIP + CLAP index)",
622
  }[x],
623
  )
624
 
625
  mode = st.selectbox(
626
- "Planning Mode",
627
  ["direct", "planner", "council", "extended_prompt"],
628
  format_func=lambda x: {
629
  "direct": "Direct",
@@ -634,13 +1492,25 @@ def main():
634
  )
635
 
636
  st.divider()
637
- st.markdown("#### Examples")
638
- for dname, prompts in EXAMPLE_PROMPTS.items():
639
- icon = DOMAIN_ICONS.get(dname.lower(), "\U0001f4cd")
640
- with st.expander(f"{icon} {dname}"):
641
- for p in prompts:
642
- if st.button(p, key=f"ex_{hash(p)}", use_container_width=True):
643
- st.session_state["prompt_input"] = p
 
 
 
 
 
 
 
 
 
 
 
 
644
 
645
  st.divider()
646
  mode_desc = {
@@ -650,35 +1520,57 @@ def main():
650
  "extended_prompt": "Single LLM call with 3x token budget",
651
  }
652
  if backend == "generative":
653
- img_info = "SDXL via HF API"
654
- aud_info = "AudioLDM2 / MusicGen via HF API"
655
  else:
656
  img_info = "CLIP retrieval (57 images)"
657
  aud_info = "CLAP retrieval (104 clips)"
 
658
  st.markdown(
659
  f'<div class="sidebar-info">'
660
  f'<b>Text</b> HF Inference API<br>'
661
  f'<b>Planning</b> {mode_desc[mode]}<br>'
662
  f'<b>Image</b> {img_info}<br>'
663
- f'<b>Audio</b> {aud_info}<br><br>'
664
  f'<b>Metric</b> MSCI = 0.45 &times; s<sub>t,i</sub> + 0.45 &times; s<sub>t,a</sub><br><br>'
665
  f'<b>Models</b><br>'
666
  f'CLIP ViT-B/32 (coherence eval)<br>'
667
  f'CLAP HTSAT-unfused (coherence eval)'
668
  f'</div>', unsafe_allow_html=True)
669
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
670
  # Prompt input
671
  default_prompt = st.session_state.get("prompt_input", "")
672
  prompt = st.text_area(
673
  "Scene", value=default_prompt, height=80,
674
- placeholder="Describe a scene... e.g., 'A peaceful forest at dawn with birdsong and morning mist'",
675
  label_visibility="collapsed",
676
  )
677
 
678
  # Button + chips
679
  bc1, bc2 = st.columns([1, 3])
680
  with bc1:
681
- go = st.button("Generate Bundle", type="primary", use_container_width=True, disabled=not prompt.strip())
682
  with bc2:
683
  mlbl = {"direct": "Direct", "planner": "Planner", "council": "Council", "extended_prompt": "Extended"}[mode]
684
  mcls = "chip-amber" if mode != "direct" else "chip-purple"
@@ -687,27 +1579,45 @@ def main():
687
  bchip = '<span class="chip chip-pink"><span class="chip-dot chip-dot-pink"></span>Generative</span>'
688
  else:
689
  bchip = '<span class="chip chip-purple"><span class="chip-dot chip-dot-purple"></span>Retrieval</span>'
 
 
 
 
 
 
690
  st.markdown(
691
  f'<div class="chip-row">'
692
  f'{bchip}'
693
  f'<span class="chip {mcls}"><span class="chip-dot {mdot}"></span>{mlbl}</span>'
694
  f'<span class="chip chip-green"><span class="chip-dot chip-dot-green"></span>CLIP + CLAP</span>'
 
695
  f'</div>', unsafe_allow_html=True)
696
 
697
  # Welcome state
698
  if not go and "last_result" not in st.session_state:
699
- st.markdown(
700
- '<div class="welcome">'
701
- '<div class="welcome-icons">\U0001f3a8 \U0001f5bc\ufe0f \U0001f50a</div>'
702
- '<div class="welcome-text">Enter a scene description and click <b>Generate Bundle</b></div>'
703
- '<div class="welcome-hint">or pick an example from the sidebar</div>'
704
- '</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
705
  return
706
 
707
  if go and prompt.strip():
708
- st.session_state["last_result"] = run_pipeline(prompt.strip(), mode, backend)
 
709
 
710
  if "last_result" in st.session_state:
 
 
711
  show_results(st.session_state["last_result"])
712
 
713
 
@@ -715,16 +1625,29 @@ def main():
715
  # Pipeline
716
  # ---------------------------------------------------------------------------
717
 
718
- def run_pipeline(prompt: str, mode: str, backend: str = "generative") -> dict:
719
- R: dict = {"mode": mode, "backend": backend}
720
  t_all = time.time()
721
 
722
- # 1) Text + Planning
 
 
 
 
 
 
 
 
 
 
 
 
 
723
  plan_label = "Generating text..." if mode == "direct" else f"Planning ({mode}) + generating text..."
724
  with st.status(plan_label, expanded=True) as s:
725
  t0 = time.time()
726
  try:
727
- R["text"] = gen_text(prompt, mode)
728
  R["t_text"] = time.time() - t0
729
  has_plan = R["text"].get("plan") is not None
730
  lbl = f"Text ready ({R['t_text']:.1f}s)"
@@ -733,14 +1656,20 @@ def run_pipeline(prompt: str, mode: str, backend: str = "generative") -> dict:
733
  s.update(label=lbl, state="complete")
734
  except Exception as e:
735
  s.update(label=f"Text failed: {e}", state="error")
736
- R["text"] = {"text": prompt, "image_prompt": prompt, "audio_prompt": prompt}
737
  R["t_text"] = time.time() - t0
738
 
739
- ip = R["text"].get("image_prompt", prompt)
740
- ap = R["text"].get("audio_prompt", prompt)
 
 
 
 
 
 
741
 
742
  # 2) Image
743
- img_label = "Generating image (SDXL)..." if backend == "generative" else "Retrieving image..."
744
  with st.status(img_label, expanded=True) as s:
745
  t0 = time.time()
746
  try:
@@ -791,13 +1720,14 @@ def run_pipeline(prompt: str, mode: str, backend: str = "generative") -> dict:
791
  R["audio"] = None
792
  R["t_aud"] = time.time() - t0
793
 
794
- # 4) Coherence evaluation
795
  with st.status("Evaluating coherence...", expanded=True) as s:
796
  t0 = time.time()
797
  try:
798
  imgp = R.get("image", {}).get("path") if R.get("image") else None
799
  audp = R.get("audio", {}).get("path") if R.get("audio") else None
800
- R["coherence"] = eval_coherence(R["text"]["text"], imgp, audp)
 
801
  R["t_eval"] = time.time() - t0
802
  msci = R["coherence"].get("scores", {}).get("msci")
803
  s.update(label=f"MSCI = {msci:.4f} ({R['t_eval']:.1f}s)", state="complete")
@@ -821,23 +1751,52 @@ def show_results(R: dict):
821
  msci = sc.get("msci")
822
  st_i = sc.get("st_i")
823
  st_a = sc.get("st_a")
 
 
824
 
825
- # Score cards
826
- st.markdown('<div class="sec-label">Coherence Scores</div>', unsafe_allow_html=True)
827
- cards = (
828
- score_card_html("MSCI (Overall)", msci)
829
- + score_card_html("Text \u2192 Image", st_i)
830
- + score_card_html("Text \u2192 Audio", st_a)
831
- + score_card_html("Classification", msci, is_class=True)
832
- )
833
- st.markdown(f'<div class="scores-grid">{cards}</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
834
 
835
  # Timing strip
836
  tt = R.get("t_total", 0)
837
  sep = '<span class="t-sep">|</span>'
 
 
838
  st.markdown(
839
- f'<div class="timing">'
840
  f'<span class="t-total">Total {tt:.1f}s</span>{sep}'
 
841
  f'<span>Text {R.get("t_text", 0):.1f}s</span>{sep}'
842
  f'<span>Image {R.get("t_img", 0):.1f}s</span>{sep}'
843
  f'<span>Audio {R.get("t_aud", 0):.1f}s</span>{sep}'
@@ -846,141 +1805,194 @@ def show_results(R: dict):
846
 
847
  st.markdown("---")
848
 
 
 
 
 
849
  # Three columns: text | image | audio
850
  ct, ci, ca = st.columns([1.15, 1, 0.85])
851
 
852
  with ct:
853
- st.markdown('<div class="sec-label">Generated Text</div>', unsafe_allow_html=True)
854
  txt = R.get("text", {}).get("text", "")
855
  text_err = R.get("text", {}).get("text_error")
856
  if text_err:
857
- st.markdown(
858
- f'<div class="warn-banner"><b>Text gen failed</b> — {text_err}</div>',
859
- unsafe_allow_html=True)
860
- st.markdown(f'<div class="text-card">{txt}</div>', unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
 
862
  with ci:
863
- st.markdown('<div class="sec-label">Generated Image</div>', unsafe_allow_html=True)
864
  ii = R.get("image")
865
  if ii and ii.get("path"):
866
  ip = Path(ii["path"])
867
  backend = ii.get("backend", "unknown")
868
 
869
- if backend == "retrieval" and ii.get("failed", False):
870
- sim = ii.get("similarity", 0)
871
- st.markdown(
872
- f'<div class="warn-banner"><b>Retrieval fallback</b> '
873
- f'(sim={sim:.3f}) \u2014 generation unavailable.</div>',
874
- unsafe_allow_html=True)
 
 
 
 
 
 
875
 
876
  if ip.exists():
877
  st.image(str(ip), use_container_width=True)
878
  model = ii.get("model", "")
879
  if backend == "generative":
880
- st.caption(f"Generated via **{model}**")
 
 
881
  else:
882
  sim = ii.get("similarity", 0)
883
  dom = ii.get("domain", "other")
884
  ic = DOMAIN_ICONS.get(dom, "\U0001f4cd")
885
  st.caption(f"{ic} {dom} \u00b7 sim **{sim:.3f}** \u00b7 Retrieved")
886
  else:
887
- st.info("No image.")
888
 
889
  with ca:
890
- st.markdown('<div class="sec-label">Generated Audio</div>', unsafe_allow_html=True)
891
  ai = R.get("audio")
892
  if ai and ai.get("path"):
893
  ap = Path(ai["path"])
894
  backend = ai.get("backend", "unknown")
895
 
896
- if backend == "retrieval" and ai.get("failed", False):
897
- sim = ai.get("similarity", 0)
898
- st.markdown(
899
- f'<div class="warn-banner"><b>Retrieval fallback</b> '
900
- f'(sim={sim:.3f}) \u2014 generation unavailable.</div>',
901
- unsafe_allow_html=True)
 
 
 
 
 
 
902
 
903
  if ap.exists():
904
  st.audio(str(ap))
905
  model = ai.get("model", "")
906
  if backend == "generative":
907
- st.caption(f"Generated via **{model}**")
 
 
908
  else:
909
  sim = ai.get("similarity", 0)
910
  st.caption(f"sim **{sim:.3f}** \u00b7 Retrieved")
911
  else:
912
- st.info("No audio.")
913
 
914
  st.markdown("---")
915
 
916
- # Expandable details
917
- with st.expander("Semantic Plan"):
918
- td = R.get("text", {})
919
- plan = td.get("plan")
920
- if plan:
921
- p1, p2 = st.columns(2)
922
- with p1:
923
- dash = "\u2014"
924
- dot = "\u00b7"
925
- scene = plan.get("scene_summary", dash)
926
- domain = plan.get("domain", dash)
927
- core = plan.get("core_semantics", {})
928
- setting = core.get("setting", dash)
929
- tod = core.get("time_of_day", dash)
930
- weather = core.get("weather", dash)
931
- subjects = ", ".join(core.get("main_subjects", []))
932
- st.markdown(f"**Scene** {scene}")
933
- st.markdown(f"**Domain** {domain}")
934
- st.markdown(f"**Setting** {setting} {dot} **Time** {tod} {dot} **Weather** {weather}")
935
- st.markdown(f"**Subjects** {subjects}")
936
- with p2:
937
- st.markdown("**Image prompt**")
938
- st.code(td.get("image_prompt", ""), language=None)
939
- st.markdown("**Audio prompt**")
940
- st.code(td.get("audio_prompt", ""), language=None)
941
- else:
942
- mode = R.get("mode", "direct")
943
- if mode == "direct":
944
- st.write("Direct mode \u2014 no semantic plan. Prompt used as-is for all modalities.")
945
  else:
946
- st.write(f"Planning ({mode}) did not produce a valid plan. Fell back to direct mode.")
947
-
948
- with st.expander("Generation Details"):
949
- r1, r2 = st.columns(2)
950
- with r1:
951
- ii = R.get("image")
952
- if ii:
953
- backend = ii.get("backend", "unknown")
954
- model = ii.get("model", "")
955
- if backend == "generative":
956
- st.markdown(f"**Image** generated via **{model}**")
957
- st.markdown(f"Prompt: *{R.get('text', {}).get('image_prompt', '')}*")
958
- elif ii.get("top_5"):
959
- st.markdown("**Image** (retrieval fallback)")
960
- bars = "".join(sim_bar_html(n, s) for n, s in ii["top_5"])
961
- st.markdown(bars, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
962
  else:
963
- st.write("No image data.")
964
- with r2:
965
- ai = R.get("audio")
966
- if ai:
967
- backend = ai.get("backend", "unknown")
968
- model = ai.get("model", "")
969
- if backend == "generative":
970
- st.markdown(f"**Audio** generated via **{model}**")
971
- st.markdown(f"Prompt: *{R.get('text', {}).get('audio_prompt', '')}*")
972
- elif ai.get("top_5"):
973
- st.markdown("**Audio** (retrieval fallback)")
974
- bars = "".join(sim_bar_html(n, s) for n, s in ai["top_5"])
975
- st.markdown(bars, unsafe_allow_html=True)
976
  else:
977
- st.write("No audio data.")
978
-
979
- with st.expander("Full Coherence Report"):
980
- if coh:
981
- st.json(coh)
982
- else:
983
- st.write("No data.")
984
 
985
 
986
  if __name__ == "__main__":
 
180
  # Example prompts
181
  # ---------------------------------------------------------------------------
182
  EXAMPLE_PROMPTS = {
183
+ "en": {
184
+ "Nature": [
185
+ "A peaceful forest at dawn with birdsong and morning mist",
186
+ "A field of golden wheat under a warm summer sunset",
187
+ "A dense jungle with exotic birds calling from the canopy",
188
+ ],
189
+ "Urban": [
190
+ "A bustling city street at night with neon lights and traffic",
191
+ "A quiet alley in an old town with distant footsteps echoing",
192
+ "A cafe terrace on a busy boulevard with clinking glasses",
193
+ ],
194
+ "Water": [
195
+ "Ocean waves crashing on a sandy beach at sunset",
196
+ "Rain falling on a pond with ripples spreading across the surface",
197
+ "A mountain stream flowing over rocks through a pine forest",
198
+ ],
199
+ "Mixed": [
200
+ "A lighthouse on a cliff during a thunderstorm at night",
201
+ "A bonfire on a beach with waves and guitar music at night",
202
+ "A train passing through countryside with distant church bells",
203
+ ],
204
+ },
205
+ "de": {
206
+ "Natur": [
207
+ "Ein friedlicher Wald bei Sonnenaufgang mit Vogelgesang und Morgennebel",
208
+ "Ein goldenes Weizenfeld unter einem warmen Sommerabend",
209
+ "Ein dichter Dschungel mit exotischen V\u00f6geln im Bl\u00e4tterdach",
210
+ ],
211
+ "Stadt": [
212
+ "Eine belebte Stra\u00dfe bei Nacht mit Neonlichtern und Verkehr",
213
+ "Eine ruhige Gasse in einer Altstadt mit fernen Schritten",
214
+ "Eine Caf\u00e9-Terrasse an einem belebten Boulevard mit klinkenden Gl\u00e4sern",
215
+ ],
216
+ "Wasser": [
217
+ "Meereswellen am Sandstrand bei Sonnenuntergang",
218
+ "Regen f\u00e4llt auf einen Teich mit sich ausbreitenden Wellen",
219
+ "Ein Bergbach flie\u00dft \u00fcber Felsen durch einen Kiefernwald",
220
+ ],
221
+ "Gemischt": [
222
+ "Ein Leuchtturm auf einer Klippe w\u00e4hrend eines Gewitters bei Nacht",
223
+ "Ein Lagerfeuer am Strand mit Wellen und Gitarrenmusik bei Nacht",
224
+ "Ein Zug f\u00e4hrt durch die Landschaft mit fernen Kirchenglocken",
225
+ ],
226
+ },
227
  }
228
  DOMAIN_ICONS = {"nature": "\U0001f33f", "urban": "\U0001f3d9\ufe0f", "water": "\U0001f30a", "mixed": "\U0001f310", "other": "\U0001f4cd"}
229
 
230
+ # ---------------------------------------------------------------------------
231
+ # Kid Mode — example prompts (German, fun themes for children)
232
+ # ---------------------------------------------------------------------------
233
+ KID_EXAMPLE_PROMPTS = {
234
+ "de": {
235
+ "\U0001f47e Abenteuer": [
236
+ "Pikachu in einem magischen Wald bei Sonnenuntergang",
237
+ "Ein Minecraft-Dorf auf einer Insel mitten im Ozean",
238
+ "Ein kleiner Drache fliegt \u00fcber eine Burg bei Nacht",
239
+ "Ein Weltraumabenteuer mit Raketen und bunten Planeten",
240
+ ],
241
+ "\U0001f43e Tiere": [
242
+ "Ein freundlicher Hund rettet ein K\u00e4tzchen im Regen",
243
+ "Dinosaurier spielen Fu\u00dfball auf einer sonnigen Wiese",
244
+ "Ein Einhorn galoppiert \u00fcber einen leuchtenden Regenbogen",
245
+ "Pinguine machen eine Schneeballschlacht am S\u00fcdpol",
246
+ "Ein kleiner Fuchs entdeckt einen geheimen Garten",
247
+ ],
248
+ "\u2728 Fantasie": [
249
+ "Ein Zauberer braut einen glitzernden Trank in einem Schloss",
250
+ "Eine Fee fliegt durch einen Wald voller leuchtender Pilze",
251
+ "Ein verzaubertes Baumhaus in den Wolken mit Regenbogenbr\u00fccke",
252
+ "Ein Roboter und ein Teddy gehen zusammen auf Schatzsuche",
253
+ "Ein magischer Unterwasserpalast mit sprechenden Fischen",
254
+ ],
255
+ "\U0001f602 Lustig": [
256
+ "Eine Katze f\u00e4hrt Skateboard durch eine bunte Stadt",
257
+ "Aliens landen im Schulgarten und spielen Verstecken",
258
+ "Ein Elefant versucht sich auf einem Trampolin",
259
+ "Ein Schneemann isst Eis am Strand im Sommer",
260
+ "Monster unter dem Bett machen eine Pyjamaparty",
261
+ ],
262
+ "\U0001f3ae Spielwelt": [
263
+ "Super Mario springt durch eine Welt aus S\u00fc\u00dfigkeiten",
264
+ "Ein Ritter k\u00e4mpft gegen einen freundlichen Drachen",
265
+ "Eine Unterwasser-Rennstrecke mit U-Booten und Delfinen",
266
+ "Ein Baumhaus-Dorf im Dschungel mit H\u00e4ngebr\u00fccken",
267
+ "Tiere bauen zusammen eine riesige Sandburg am Meer",
268
+ ],
269
+ },
270
+ "en": {
271
+ "\U0001f47e Adventure": [
272
+ "Pikachu in a magical forest at sunset",
273
+ "A Minecraft village on an island in the middle of the ocean",
274
+ "A little dragon flying over a castle at night",
275
+ "A space adventure with rockets and colorful planets",
276
+ ],
277
+ "\U0001f43e Animals": [
278
+ "A friendly dog rescuing a kitten in the rain",
279
+ "Dinosaurs playing football on a sunny meadow",
280
+ "A unicorn galloping over a glowing rainbow",
281
+ "Penguins having a snowball fight at the South Pole",
282
+ "A little fox discovering a secret garden",
283
+ ],
284
+ "\u2728 Fantasy": [
285
+ "A wizard brewing a sparkling potion in a castle",
286
+ "A fairy flying through a forest of glowing mushrooms",
287
+ "An enchanted treehouse in the clouds with a rainbow bridge",
288
+ "A robot and a teddy bear going on a treasure hunt together",
289
+ "A magical underwater palace with talking fish",
290
+ ],
291
+ "\U0001f602 Funny": [
292
+ "A cat riding a skateboard through a colorful city",
293
+ "Aliens landing in the school garden and playing hide and seek",
294
+ "An elephant trying to jump on a trampoline",
295
+ "A snowman eating ice cream at the beach in summer",
296
+ "Monsters under the bed having a pajama party",
297
+ ],
298
+ "\U0001f3ae Game World": [
299
+ "Super Mario jumping through a world made of candy",
300
+ "A knight fighting a friendly dragon",
301
+ "An underwater race track with submarines and dolphins",
302
+ "A treehouse village in the jungle with rope bridges",
303
+ "Animals building a giant sandcastle at the beach",
304
+ ],
305
+ },
306
+ }
307
+
308
+ # ---------------------------------------------------------------------------
309
+ # Kid Mode — CSS theme (bright, bubbly, playful)
310
+ # ---------------------------------------------------------------------------
311
+ KID_CSS = """
312
+ <style>
313
+ /* ============================================================
314
+ KID MODE — Full theme override
315
+ ============================================================ */
316
+
317
+ /* Kill the top gap */
318
+ .block-container { padding-top: 0.5rem !important; }
319
+ header[data-testid="stHeader"] { display: none !important; }
320
+
321
+ /* Force light colorful background on EVERYTHING */
322
+ .stApp, .stApp > div, .main, .main .block-container,
323
+ [data-testid="stAppViewContainer"], [data-testid="stAppViewBlockContainer"],
324
+ section.main, section.main > div {
325
+ background: linear-gradient(170deg, #dbeafe 0%, #fce7f3 35%, #fef3c7 65%, #dcfce7 100%) !important;
326
+ color: #1e293b !important;
327
+ }
328
+ /* Sidebar light theme */
329
+ section[data-testid="stSidebar"], section[data-testid="stSidebar"] > div {
330
+ background: linear-gradient(180deg, #ede9fe 0%, #fce7f3 100%) !important;
331
+ color: #1e293b !important;
332
+ }
333
+ section[data-testid="stSidebar"] label,
334
+ section[data-testid="stSidebar"] .stMarkdown,
335
+ section[data-testid="stSidebar"] span,
336
+ section[data-testid="stSidebar"] p {
337
+ color: #334155 !important;
338
+ }
339
+ /* Force dark text everywhere */
340
+ .stMarkdown, .stMarkdown p, .stMarkdown span, .stMarkdown div,
341
+ .stTextArea textarea, label, .stSelectbox label {
342
+ color: #1e293b !important;
343
+ }
344
+ .stTextArea textarea {
345
+ background: rgba(255,255,255,0.85) !important;
346
+ border: 2px solid #c4b5fd !important;
347
+ border-radius: 18px !important;
348
+ font-size: 1rem !important;
349
+ }
350
+ .stTextArea textarea:focus {
351
+ border-color: #8b5cf6 !important;
352
+ box-shadow: 0 0 0 4px rgba(139,92,246,0.15) !important;
353
+ }
354
+ /* Status containers */
355
+ [data-testid="stStatusWidget"] {
356
+ background: rgba(255,255,255,0.6) !important;
357
+ border-radius: 14px !important;
358
+ }
359
+
360
+ /* Floating background elements */
361
+ .kid-bg {
362
+ position: fixed; top: 0; left: 0; width: 100%; height: 100%;
363
+ pointer-events: none; z-index: 0; overflow: hidden;
364
+ }
365
+ .kid-bg-item {
366
+ position: absolute; opacity: 0.15;
367
+ animation: kid-float linear infinite;
368
+ }
369
+ @keyframes kid-float {
370
+ 0% { transform: translateY(105vh) rotate(0deg) scale(0.8); opacity: 0; }
371
+ 8% { opacity: 0.35; }
372
+ 92% { opacity: 0.35; }
373
+ 100% { transform: translateY(-10vh) rotate(360deg) scale(1.1); opacity: 0; }
374
+ }
375
+ /* Twinkle for stars */
376
+ @keyframes kid-twinkle {
377
+ 0%, 100% { opacity: 0.15; transform: scale(0.8); }
378
+ 50% { opacity: 0.5; transform: scale(1.2); }
379
+ }
380
+ .kid-star-fixed {
381
+ position: absolute; pointer-events: none;
382
+ animation: kid-twinkle ease-in-out infinite;
383
+ }
384
+ /* Clouds */
385
+ .kid-cloud {
386
+ position: absolute; pointer-events: none; opacity: 0.18;
387
+ width: 120px; height: 50px; background: white;
388
+ border-radius: 50px; animation: kid-drift linear infinite;
389
+ }
390
+ .kid-cloud::before {
391
+ content: ''; position: absolute; background: white; border-radius: 50%;
392
+ width: 55px; height: 55px; top: -25px; left: 20px;
393
+ }
394
+ .kid-cloud::after {
395
+ content: ''; position: absolute; background: white; border-radius: 50%;
396
+ width: 40px; height: 40px; top: -18px; left: 55px;
397
+ }
398
+ @keyframes kid-drift {
399
+ 0% { transform: translateX(-150px); }
400
+ 100% { transform: translateX(calc(100vw + 150px)); }
401
+ }
402
+
403
+ /* Hero — big colorful title */
404
+ .kid-hero {
405
+ text-align: center; padding: 0.8rem 0 0.3rem; position: relative; z-index: 1;
406
+ }
407
+ .kid-hero-title {
408
+ font-size: 3.2rem; font-weight: 900; letter-spacing: -0.02em;
409
+ background: linear-gradient(135deg, #ec4899, #f97316, #eab308, #22c55e, #3b82f6, #8b5cf6);
410
+ background-size: 300% 300%;
411
+ -webkit-background-clip: text; -webkit-text-fill-color: transparent;
412
+ animation: kid-gradient 4s ease infinite;
413
+ text-shadow: none;
414
+ }
415
+ @keyframes kid-gradient {
416
+ 0% { background-position: 0% 50%; }
417
+ 50% { background-position: 100% 50%; }
418
+ 100% { background-position: 0% 50%; }
419
+ }
420
+ .kid-hero-sub {
421
+ font-size: 1.15rem; color: #475569; margin-top: 0.2rem; font-weight: 500;
422
+ }
423
+ .kid-hero-sub b { color: #7c3aed; }
424
+
425
+ /* Mascots — bigger, animated, with speech bubbles */
426
+ .kid-mascot-row {
427
+ display: flex; justify-content: center; gap: 2rem; margin: 0.8rem 0 0.5rem;
428
+ position: relative; z-index: 1;
429
+ }
430
+ .kid-mascot {
431
+ display: flex; flex-direction: column; align-items: center;
432
+ padding: 0.8rem 1.2rem 0.5rem; border-radius: 24px;
433
+ background: rgba(255,255,255,0.9);
434
+ border: 3px solid rgba(255,255,255,1);
435
+ box-shadow: 0 8px 30px rgba(0,0,0,0.08), 0 2px 8px rgba(139,92,246,0.1);
436
+ transition: transform 0.3s cubic-bezier(0.34, 1.56, 0.64, 1);
437
+ cursor: default; position: relative;
438
+ min-width: 105px;
439
+ }
440
+ .kid-mascot:hover {
441
+ transform: scale(1.12) rotate(-3deg);
442
+ box-shadow: 0 12px 40px rgba(139,92,246,0.25);
443
+ }
444
+ .kid-mascot svg { display: block; margin: 0 auto; }
445
+ .kid-mascot-name {
446
+ font-size: 0.9rem; font-weight: 800; margin-top: 0.15rem;
447
+ letter-spacing: 0.04em;
448
+ }
449
+ .kid-mascot:nth-child(1) .kid-mascot-name { color: #3b82f6; }
450
+ .kid-mascot:nth-child(2) .kid-mascot-name { color: #ec4899; }
451
+ .kid-mascot:nth-child(3) .kid-mascot-name { color: #f97316; }
452
+ /* Continuous gentle bounce */
453
+ .kid-mascot:nth-child(1) { animation: kid-bob 2s ease-in-out infinite; }
454
+ .kid-mascot:nth-child(2) { animation: kid-bob 2s ease-in-out 0.3s infinite; }
455
+ .kid-mascot:nth-child(3) { animation: kid-bob 2s ease-in-out 0.6s infinite; }
456
+ @keyframes kid-bob {
457
+ 0%, 100% { transform: translateY(0); }
458
+ 50% { transform: translateY(-6px); }
459
+ }
460
+ .kid-mascot:hover { animation: none; }
461
+ /* Speech bubble */
462
+ .kid-speech {
463
+ position: absolute; top: -32px; left: 50%; transform: translateX(-50%);
464
+ background: #fef3c7; color: #92400e; font-size: 0.65rem; font-weight: 700;
465
+ padding: 3px 10px; border-radius: 12px; white-space: nowrap;
466
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
467
+ opacity: 0; transition: opacity 0.2s;
468
+ }
469
+ .kid-speech::after {
470
+ content: ''; position: absolute; bottom: -5px; left: 50%; margin-left: -5px;
471
+ border-left: 5px solid transparent; border-right: 5px solid transparent;
472
+ border-top: 5px solid #fef3c7;
473
+ }
474
+ .kid-mascot:hover .kid-speech { opacity: 1; }
475
+
476
+ /* Score cards — kid version */
477
+ .kid-scores {
478
+ display: grid; grid-template-columns: repeat(4, 1fr);
479
+ gap: 0.8rem; margin: 0.6rem 0; position: relative; z-index: 1;
480
+ }
481
+ @media (max-width: 768px) { .kid-scores { grid-template-columns: repeat(2, 1fr); } }
482
+ .kid-sc {
483
+ border-radius: 22px; padding: 1.1rem 0.8rem; text-align: center;
484
+ background: rgba(255,255,255,0.85);
485
+ border: 2.5px solid rgba(255,255,255,1);
486
+ box-shadow: 0 6px 24px rgba(0,0,0,0.06);
487
+ position: relative; overflow: hidden;
488
+ animation: kid-pop 0.4s cubic-bezier(0.34, 1.56, 0.64, 1) both;
489
+ }
490
+ .kid-sc:nth-child(1) { animation-delay: 0s; }
491
+ .kid-sc:nth-child(2) { animation-delay: 0.1s; }
492
+ .kid-sc:nth-child(3) { animation-delay: 0.2s; }
493
+ .kid-sc:nth-child(4) { animation-delay: 0.3s; }
494
+ @keyframes kid-pop {
495
+ 0% { transform: scale(0.7); opacity: 0; }
496
+ 100% { transform: scale(1); opacity: 1; }
497
+ }
498
+ .kid-sc::before {
499
+ content: ''; position: absolute; top: 0; left: 0; right: 0; height: 5px;
500
+ border-radius: 22px 22px 0 0;
501
+ }
502
+ .kid-sc-great::before { background: linear-gradient(90deg, #22c55e, #06b6d4); }
503
+ .kid-sc-ok::before { background: linear-gradient(90deg, #f59e0b, #f97316); }
504
+ .kid-sc-low::before { background: linear-gradient(90deg, #ef4444, #ec4899); }
505
+ .kid-sc-main::before { background: linear-gradient(90deg, #8b5cf6, #ec4899, #f97316, #eab308); background-size: 200%; animation: kid-gradient 3s ease infinite; }
506
+ .kid-sc-lbl {
507
+ font-size: 0.72rem; font-weight: 800; color: #64748b;
508
+ text-transform: uppercase; letter-spacing: 0.06em;
509
+ }
510
+ .kid-sc-stars { font-size: 1.8rem; margin: 0.3rem 0; line-height: 1.1; }
511
+ .kid-sc-emoji { font-size: 2.4rem; margin: 0.15rem 0; }
512
+ .kid-sc-val {
513
+ font-size: 0.7rem; color: #94a3b8; font-family: 'JetBrains Mono', monospace;
514
+ }
515
+
516
+ /* Verdict banner */
517
+ .kid-verdict {
518
+ text-align: center; font-size: 1.4rem; font-weight: 800;
519
+ color: #334155; margin: 0.4rem 0 0.6rem;
520
+ animation: kid-pop 0.5s cubic-bezier(0.34, 1.56, 0.64, 1) both;
521
+ }
522
+
523
+ /* Section labels */
524
+ .kid-sec-label {
525
+ font-size: 0.85rem; font-weight: 900; letter-spacing: 0.06em;
526
+ text-transform: uppercase; color: #7c3aed !important;
527
+ padding-bottom: 0.35rem; border-bottom: 3px solid #c4b5fd;
528
+ margin-bottom: 0.6rem;
529
+ }
530
+ .kid-text-card {
531
+ border-radius: 20px; padding: 1.2rem 1.3rem;
532
+ background: rgba(255,255,255,0.8);
533
+ border: 2px solid rgba(255,255,255,1);
534
+ box-shadow: 0 4px 20px rgba(0,0,0,0.05);
535
+ font-size: 0.95rem; line-height: 1.8; color: #334155 !important;
536
+ }
537
+
538
+ .kid-timing {
539
+ display: flex; gap: 0.5rem; flex-wrap: wrap; align-items: center;
540
+ padding: 0.45rem 0.9rem; border-radius: 16px;
541
+ background: rgba(255,255,255,0.6);
542
+ border: 2px solid rgba(255,255,255,0.9);
543
+ font-size: 0.72rem; color: #64748b !important; margin: 0.4rem 0;
544
+ }
545
+ .kid-timing span { color: #64748b !important; }
546
+ .kid-timing .t-total { color: #7c3aed !important; font-weight: 700; }
547
+ .kid-timing .t-sep { color: #cbd5e1 !important; }
548
+
549
+ /* Warn banner */
550
+ .kid-warn {
551
+ border-radius: 16px; padding: 0.8rem 1.1rem; margin-bottom: 0.6rem;
552
+ border-left: 4px solid #f97316; font-size: 0.85rem; color: #9a3412 !important;
553
+ background: rgba(255,237,213,0.7);
554
+ }
555
+
556
+ /* Button override */
557
+ .stButton > button[kind="primary"] {
558
+ background: linear-gradient(135deg, #8b5cf6, #ec4899) !important;
559
+ border: none !important; border-radius: 16px !important;
560
+ font-weight: 800 !important; font-size: 1.05rem !important;
561
+ padding: 0.6rem 1.5rem !important;
562
+ box-shadow: 0 4px 15px rgba(139,92,246,0.3) !important;
563
+ transition: transform 0.2s, box-shadow 0.2s !important;
564
+ }
565
+ .stButton > button[kind="primary"]:hover {
566
+ transform: scale(1.03) !important;
567
+ box-shadow: 0 6px 25px rgba(139,92,246,0.4) !important;
568
+ }
569
+
570
+ /* Divider */
571
+ hr { border-color: rgba(139,92,246,0.15) !important; }
572
+ </style>
573
+ """
574
+
575
+ # ---------------------------------------------------------------------------
576
+ # Kid Mode — mascot HTML, star ratings, emoji feedback
577
+ # ---------------------------------------------------------------------------
578
+
579
# Decorative HTML/SVG injected into the Streamlit page in kid mode:
# floating emoji background (three offset waves), twinkling stars, drifting
# clouds, corner characters (cat/dog/unicorn/rocket), and the three named
# mascots Textino, Pixela and Soundo with hover speech bubbles.
# FIX: the unicorn's cheek <circle> elements used rx="3", which is invalid
# for <circle> (only <ellipse> takes rx/ry), so the cheeks never rendered —
# changed to r="3".
MASCOT_HTML = """
<!-- Rich floating background -->
<div class="kid-bg">
<!-- Wave 1: floating emoji rising (spread across page) -->
<div class="kid-bg-item" style="font-size:30px;left:2%;animation-duration:14s;">\u2b50</div>
<div class="kid-bg-item" style="font-size:24px;left:8%;animation-duration:18s;animation-delay:2s;">\U0001f98b</div>
<div class="kid-bg-item" style="font-size:26px;left:14%;animation-duration:16s;animation-delay:5s;">\U0001f49c</div>
<div class="kid-bg-item" style="font-size:20px;left:20%;animation-duration:22s;animation-delay:1s;">\U0001f680</div>
<div class="kid-bg-item" style="font-size:32px;left:26%;animation-duration:13s;animation-delay:3s;">\u2728</div>
<div class="kid-bg-item" style="font-size:22px;left:32%;animation-duration:19s;animation-delay:7s;">\U0001f338</div>
<div class="kid-bg-item" style="font-size:28px;left:38%;animation-duration:15s;animation-delay:4s;">\U0001f31f</div>
<div class="kid-bg-item" style="font-size:18px;left:44%;animation-duration:20s;animation-delay:0s;">\U0001f984</div>
<div class="kid-bg-item" style="font-size:26px;left:50%;animation-duration:17s;animation-delay:6s;">\U0001f308</div>
<div class="kid-bg-item" style="font-size:24px;left:56%;animation-duration:14s;animation-delay:2s;">\U0001f49b</div>
<div class="kid-bg-item" style="font-size:20px;left:62%;animation-duration:21s;animation-delay:8s;">\U0001f33c</div>
<div class="kid-bg-item" style="font-size:30px;left:68%;animation-duration:16s;animation-delay:1s;">\u2b50</div>
<div class="kid-bg-item" style="font-size:22px;left:74%;animation-duration:18s;animation-delay:5s;">\U0001f98b</div>
<div class="kid-bg-item" style="font-size:28px;left:80%;animation-duration:13s;animation-delay:3s;">\u2728</div>
<div class="kid-bg-item" style="font-size:24px;left:86%;animation-duration:20s;animation-delay:9s;">\U0001f49a</div>
<div class="kid-bg-item" style="font-size:18px;left:92%;animation-duration:15s;animation-delay:4s;">\U0001f30d</div>
<div class="kid-bg-item" style="font-size:26px;left:97%;animation-duration:17s;animation-delay:0s;">\U0001f680</div>
<!-- Wave 2: offset for constant density -->
<div class="kid-bg-item" style="font-size:22px;left:5%;animation-duration:19s;animation-delay:10s;">\U0001f33c</div>
<div class="kid-bg-item" style="font-size:28px;left:15%;animation-duration:15s;animation-delay:11s;">\U0001f49b</div>
<div class="kid-bg-item" style="font-size:18px;left:25%;animation-duration:21s;animation-delay:9s;">\U0001f984</div>
<div class="kid-bg-item" style="font-size:26px;left:35%;animation-duration:16s;animation-delay:12s;">\u2b50</div>
<div class="kid-bg-item" style="font-size:24px;left:45%;animation-duration:18s;animation-delay:8s;">\U0001f98b</div>
<div class="kid-bg-item" style="font-size:20px;left:55%;animation-duration:14s;animation-delay:13s;">\U0001f308</div>
<div class="kid-bg-item" style="font-size:30px;left:65%;animation-duration:20s;animation-delay:10s;">\u2728</div>
<div class="kid-bg-item" style="font-size:22px;left:75%;animation-duration:17s;animation-delay:11s;">\U0001f338</div>
<div class="kid-bg-item" style="font-size:26px;left:85%;animation-duration:13s;animation-delay:14s;">\U0001f49a</div>
<div class="kid-bg-item" style="font-size:24px;left:95%;animation-duration:19s;animation-delay:9s;">\U0001f31f</div>
<!-- Wave 3: more for richness -->
<div class="kid-bg-item" style="font-size:20px;left:10%;animation-duration:17s;animation-delay:15s;">\U0001f680</div>
<div class="kid-bg-item" style="font-size:26px;left:30%;animation-duration:14s;animation-delay:16s;">\U0001f338</div>
<div class="kid-bg-item" style="font-size:22px;left:50%;animation-duration:19s;animation-delay:14s;">\U0001f984</div>
<div class="kid-bg-item" style="font-size:28px;left:70%;animation-duration:15s;animation-delay:17s;">\U0001f49c</div>
<div class="kid-bg-item" style="font-size:24px;left:90%;animation-duration:18s;animation-delay:15s;">\U0001f33c</div>
<!-- Twinkling stars (fixed) -->
<div class="kid-star-fixed" style="font-size:18px;top:5%;left:8%;animation-duration:2.5s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:14px;top:12%;left:30%;animation-duration:3s;animation-delay:0.5s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:16px;top:8%;left:55%;animation-duration:2.8s;animation-delay:1s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:12px;top:15%;left:80%;animation-duration:3.5s;animation-delay:0.3s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:15px;top:35%;left:5%;animation-duration:4s;animation-delay:0.8s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:11px;top:50%;left:92%;animation-duration:3.2s;animation-delay:1.5s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:17px;top:65%;left:15%;animation-duration:2.6s;animation-delay:0.2s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:13px;top:75%;left:70%;animation-duration:3.8s;animation-delay:2s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:10px;top:88%;left:45%;animation-duration:3s;animation-delay:1.2s;">\u2b50</div>
<div class="kid-star-fixed" style="font-size:14px;top:42%;left:88%;animation-duration:2.4s;animation-delay:0.7s;">\u2b50</div>
<!-- Clouds -->
<div class="kid-cloud" style="top:3%;animation-duration:40s;"></div>
<div class="kid-cloud" style="top:20%;animation-duration:55s;animation-delay:12s;width:90px;height:38px;"></div>
<div class="kid-cloud" style="top:45%;animation-duration:48s;animation-delay:25s;width:100px;height:42px;"></div>
<div class="kid-cloud" style="top:65%;animation-duration:52s;animation-delay:8s;width:80px;height:34px;"></div>
<div class="kid-cloud" style="top:85%;animation-duration:44s;animation-delay:20s;"></div>
</div>
<!-- Corner characters: cute SVG creatures -->
<!-- Cat (bottom-left) -->
<div style="position:fixed;bottom:15px;left:260px;z-index:2;opacity:0.4;pointer-events:none;animation:kid-bob 3s ease-in-out infinite;">
<svg width="55" height="50" viewBox="0 0 55 50">
<polygon points="9,16 4,2 17,12" fill="#f97316"/>
<polygon points="46,16 51,2 39,12" fill="#f97316"/>
<ellipse cx="27" cy="27" rx="20" ry="16" fill="#fb923c"/>
<ellipse cx="20" cy="25" rx="2.5" ry="3" fill="#1e293b"/>
<ellipse cx="34" cy="25" rx="2.5" ry="3" fill="#1e293b"/>
<circle cx="21" cy="24" r="0.8" fill="white"/>
<circle cx="35" cy="24" r="0.8" fill="white"/>
<ellipse cx="27" cy="30" rx="2" ry="1.2" fill="#f472b6"/>
<path d="M24 32 Q27 35 30 32" stroke="#ea580c" stroke-width="1" fill="none"/>
<line x1="7" y1="27" x2="0" y2="25" stroke="#fdba74" stroke-width="1.2"/>
<line x1="7" y1="29" x2="0" y2="30" stroke="#fdba74" stroke-width="1.2"/>
<line x1="47" y1="27" x2="55" y2="25" stroke="#fdba74" stroke-width="1.2"/>
<line x1="47" y1="29" x2="55" y2="30" stroke="#fdba74" stroke-width="1.2"/>
<path d="M13 43 Q7 47 10 50" stroke="#fb923c" stroke-width="3.5" fill="none" stroke-linecap="round"/>
</svg></div>
<!-- Dog (bottom-right) -->
<div style="position:fixed;bottom:15px;right:25px;z-index:2;opacity:0.4;pointer-events:none;animation:kid-bob 3.5s ease-in-out 0.5s infinite;">
<svg width="55" height="50" viewBox="0 0 55 50">
<ellipse cx="10" cy="10" rx="9" ry="13" fill="#a16207" transform="rotate(-20,10,10)"/>
<ellipse cx="45" cy="10" rx="9" ry="13" fill="#a16207" transform="rotate(20,45,10)"/>
<circle cx="27" cy="25" r="18" fill="#d97706"/>
<ellipse cx="20" cy="22" rx="2.5" ry="3" fill="#1e293b"/>
<ellipse cx="34" cy="22" rx="2.5" ry="3" fill="#1e293b"/>
<circle cx="21" cy="21" r="0.8" fill="white"/>
<circle cx="35" cy="21" r="0.8" fill="white"/>
<ellipse cx="27" cy="29" rx="3.5" ry="2.5" fill="#1e293b"/>
<ellipse cx="27" cy="28" rx="2" ry="1.2" fill="#f472b6"/>
<path d="M22 33 Q27 38 32 33" stroke="#92400e" stroke-width="1.2" fill="none"/>
</svg></div>
<!-- Unicorn (top-right) -->
<div style="position:fixed;top:75px;right:25px;z-index:2;opacity:0.35;pointer-events:none;animation:kid-bob 4s ease-in-out 1s infinite;">
<svg width="50" height="55" viewBox="0 0 50 55">
<polygon points="25,0 22,15 28,15" fill="#fbbf24"/>
<circle cx="25" cy="25" r="14" fill="white" stroke="#e9d5ff" stroke-width="1"/>
<ellipse cx="19" cy="23" rx="2.5" ry="3" fill="#1e293b"/>
<ellipse cx="31" cy="23" rx="2.5" ry="3" fill="#1e293b"/>
<circle cx="20" cy="22" r="0.8" fill="white"/>
<circle cx="32" cy="22" r="0.8" fill="white"/>
<circle cx="14" cy="28" r="3" fill="#fecdd3" opacity="0.5"/>
<circle cx="36" cy="28" r="3" fill="#fecdd3" opacity="0.5"/>
<path d="M20 30 Q25 34 30 30" stroke="#ec4899" stroke-width="1.2" fill="none"/>
<path d="M11 16 Q5 10 7 18" stroke="#c4b5fd" stroke-width="2.5" fill="none" stroke-linecap="round"/>
<path d="M13 14 Q8 6 9 15" stroke="#fbcfe8" stroke-width="2" fill="none" stroke-linecap="round"/>
<path d="M39 16 Q45 10 43 18" stroke="#bfdbfe" stroke-width="2.5" fill="none" stroke-linecap="round"/>
<path d="M37 14 Q42 6 41 15" stroke="#fde68a" stroke-width="2" fill="none" stroke-linecap="round"/>
</svg></div>
<!-- Rocket (top-left past sidebar) -->
<div style="position:fixed;top:65px;left:260px;z-index:2;opacity:0.35;pointer-events:none;animation:kid-bob 3.2s ease-in-out 0.8s infinite;">
<svg width="35" height="55" viewBox="0 0 35 55">
<ellipse cx="17" cy="22" rx="10" ry="18" fill="#ef4444"/>
<ellipse cx="17" cy="22" rx="6.5" ry="12" fill="#fca5a5"/>
<circle cx="17" cy="19" r="4.5" fill="#dbeafe"/>
<circle cx="17" cy="19" r="2.5" fill="#3b82f6"/>
<polygon points="17,1 14,10 20,10" fill="#ef4444"/>
<polygon points="7,34 2,43 12,36" fill="#f97316"/>
<polygon points="27,34 32,43 22,36" fill="#f97316"/>
<ellipse cx="17" cy="40" rx="4" ry="3.5" fill="#fbbf24"/>
<ellipse cx="17" cy="44" rx="2.5" ry="5" fill="#fb923c" opacity="0.7"/>
<ellipse cx="17" cy="49" rx="1.5" ry="3.5" fill="#fbbf24" opacity="0.4"/>
</svg></div>
<!-- SVG Mascots -->
<div class="kid-mascot-row">
<div class="kid-mascot">
<div class="kid-speech">Ich schreibe!</div>
<svg width="70" height="75" viewBox="0 0 70 75">
<!-- Textino: cute blue robot -->
<!-- Antenna -->
<line x1="35" y1="8" x2="35" y2="0" stroke="#60a5fa" stroke-width="2.5" stroke-linecap="round"/>
<circle cx="35" cy="0" r="4" fill="#fbbf24"/>
<!-- Head -->
<rect x="10" y="8" width="50" height="32" rx="12" fill="#3b82f6"/>
<!-- Face screen -->
<rect x="15" y="13" width="40" height="22" rx="8" fill="#dbeafe"/>
<!-- Eyes -->
<circle cx="27" cy="23" r="5" fill="white"/>
<circle cx="43" cy="23" r="5" fill="white"/>
<circle cx="28" cy="23" r="3" fill="#1e293b"/>
<circle cx="44" cy="23" r="3" fill="#1e293b"/>
<!-- Eye shine -->
<circle cx="29" cy="22" r="1" fill="white"/>
<circle cx="45" cy="22" r="1" fill="white"/>
<!-- Smile -->
<path d="M25 29 Q35 35 45 29" stroke="#3b82f6" stroke-width="2" fill="none" stroke-linecap="round"/>
<!-- Body -->
<rect x="18" y="40" width="34" height="22" rx="8" fill="#60a5fa"/>
<!-- Arms -->
<rect x="5" y="42" width="13" height="8" rx="4" fill="#93c5fd"/>
<rect x="52" y="42" width="13" height="8" rx="4" fill="#93c5fd"/>
<!-- Pencil in right hand -->
<line x1="65" y1="42" x2="69" y2="32" stroke="#f97316" stroke-width="3" stroke-linecap="round"/>
<polygon points="69,32 67,28 71,28" fill="#fbbf24"/>
<!-- Belly button -->
<circle cx="35" cy="51" r="3" fill="#3b82f6"/>
<!-- Feet -->
<rect x="20" y="62" width="12" height="8" rx="4" fill="#3b82f6"/>
<rect x="38" y="62" width="12" height="8" rx="4" fill="#3b82f6"/>
</svg>
<div class="kid-mascot-name">Textino</div>
</div>
<div class="kid-mascot">
<div class="kid-speech">Ich male!</div>
<svg width="70" height="75" viewBox="0 0 70 75">
<!-- Pixela: cute pink artist character -->
<!-- Beret -->
<ellipse cx="35" cy="10" rx="22" ry="8" fill="#ec4899"/>
<circle cx="35" cy="5" r="5" fill="#f472b6"/>
<!-- Head -->
<circle cx="35" cy="25" r="20" fill="#fda4af"/>
<!-- Rosy cheeks -->
<circle cx="22" cy="29" r="5" fill="#fecdd3" opacity="0.7"/>
<circle cx="48" cy="29" r="5" fill="#fecdd3" opacity="0.7"/>
<!-- Eyes -->
<ellipse cx="27" cy="23" rx="4.5" ry="5" fill="white"/>
<ellipse cx="43" cy="23" rx="4.5" ry="5" fill="white"/>
<circle cx="28" cy="23" r="3" fill="#1e293b"/>
<circle cx="44" cy="23" r="3" fill="#1e293b"/>
<circle cx="29" cy="22" r="1" fill="white"/>
<circle cx="45" cy="22" r="1" fill="white"/>
<!-- Cat mouth -->
<path d="M30 31 L35 34 L40 31" stroke="#e11d48" stroke-width="1.5" fill="none" stroke-linecap="round"/>
<!-- Body -->
<rect x="20" y="45" width="30" height="18" rx="10" fill="#fb7185"/>
<!-- Arms -->
<rect x="7" y="47" width="13" height="7" rx="3.5" fill="#fda4af"/>
<rect x="50" y="47" width="13" height="7" rx="3.5" fill="#fda4af"/>
<!-- Paintbrush in right hand -->
<line x1="63" y1="47" x2="68" y2="35" stroke="#a16207" stroke-width="2.5" stroke-linecap="round"/>
<ellipse cx="68" cy="33" rx="4" ry="5" fill="#8b5cf6" transform="rotate(-15,68,33)"/>
<!-- Paint palette in left hand -->
<ellipse cx="4" cy="50" rx="8" ry="5" fill="#fde68a" transform="rotate(10,4,50)"/>
<circle cx="2" cy="48" r="2" fill="#ef4444"/>
<circle cx="6" cy="47" r="2" fill="#3b82f6"/>
<circle cx="4" cy="52" r="2" fill="#22c55e"/>
<!-- Feet -->
<ellipse cx="28" cy="67" rx="7" ry="5" fill="#ec4899"/>
<ellipse cx="42" cy="67" rx="7" ry="5" fill="#ec4899"/>
</svg>
<div class="kid-mascot-name">Pixela</div>
</div>
<div class="kid-mascot">
<div class="kid-speech">Ich spiele!</div>
<svg width="70" height="75" viewBox="0 0 70 75">
<!-- Soundo: cute orange music character -->
<!-- Headphones band -->
<path d="M12 25 Q12 5 35 5 Q58 5 58 25" stroke="#f97316" stroke-width="4" fill="none" stroke-linecap="round"/>
<!-- Headphone pads -->
<rect x="6" y="20" width="12" height="16" rx="6" fill="#f97316"/>
<rect x="52" y="20" width="12" height="16" rx="6" fill="#f97316"/>
<rect x="8" y="22" width="8" height="12" rx="4" fill="#fdba74"/>
<rect x="54" y="22" width="8" height="12" rx="4" fill="#fdba74"/>
<!-- Head -->
<circle cx="35" cy="28" r="18" fill="#fed7aa"/>
<!-- Eyes - happy closed -->
<path d="M24 26 Q28 22 32 26" stroke="#1e293b" stroke-width="2.5" fill="none" stroke-linecap="round"/>
<path d="M38 26 Q42 22 46 26" stroke="#1e293b" stroke-width="2.5" fill="none" stroke-linecap="round"/>
<!-- Big open smile -->
<path d="M25 33 Q35 42 45 33" stroke="#ea580c" stroke-width="2" fill="#fef3c7" stroke-linecap="round"/>
<!-- Body -->
<rect x="22" y="46" width="26" height="16" rx="8" fill="#fb923c"/>
<!-- Arms -->
<rect x="9" y="48" width="13" height="7" rx="3.5" fill="#fdba74"/>
<rect x="48" y="48" width="13" height="7" rx="3.5" fill="#fdba74"/>
<!-- Music notes floating -->
<text x="60" y="15" font-size="14" fill="#8b5cf6" opacity="0.8">\u266a</text>
<text x="4" y="12" font-size="11" fill="#ec4899" opacity="0.7">\u266b</text>
<text x="55" y="45" font-size="10" fill="#f97316" opacity="0.6">\u266a</text>
<!-- Feet -->
<ellipse cx="29" cy="66" rx="7" ry="5" fill="#f97316"/>
<ellipse cx="41" cy="66" rx="7" ry="5" fill="#f97316"/>
</svg>
<div class="kid-mascot-name">Soundo</div>
</div>
</div>
"""
813
+
814
+
815
+ def _kid_stars(v: Optional[float]) -> str:
816
+ """Convert a 0-1 score to 1-5 star rating HTML."""
817
+ if v is None:
818
+ return "\u2b50" * 0
819
+ n = max(1, min(5, round(v * 10))) # 0.1→1 star, 0.5→5 stars
820
+ return "\u2b50" * n + "\u2606" * (5 - n) # filled + empty
821
+
822
+
823
+ def _kid_emoji(v: Optional[float]) -> str:
824
+ """Return emoji face based on coherence score."""
825
+ if v is None:
826
+ return "\U0001f914"
827
+ if v >= 0.45:
828
+ return "\U0001f929" # star-struck
829
+ if v >= 0.35:
830
+ return "\U0001f60a" # happy
831
+ if v >= 0.25:
832
+ return "\U0001f642" # slightly smiling
833
+ return "\U0001f61f" # worried
834
+
835
+
836
+ def _kid_verdict(v: Optional[float], lang: str = "de") -> str:
837
+ """Return kid-friendly verdict text."""
838
+ if v is None:
839
+ return "Hmm..." if lang == "de" else "Hmm..."
840
+ if lang == "de":
841
+ if v >= 0.45:
842
+ return "Super! Alles passt perfekt zusammen! \U0001f389"
843
+ if v >= 0.35:
844
+ return "Gut gemacht! Das passt ziemlich gut! \U0001f44d"
845
+ if v >= 0.25:
846
+ return "Geht so \u2014 ein bisschen passt es! \U0001f914"
847
+ return "Hmm, das passt noch nicht so gut \U0001f61e"
848
+ else:
849
+ if v >= 0.45:
850
+ return "Amazing! Everything fits perfectly together! \U0001f389"
851
+ if v >= 0.35:
852
+ return "Well done! That fits pretty well! \U0001f44d"
853
+ if v >= 0.25:
854
+ return "So-so \u2014 it fits a little bit! \U0001f914"
855
+ return "Hmm, that doesn't quite fit yet \U0001f61e"
856
+
857
+
858
def kid_score_card(label: str, value: Optional[float], is_main: bool = False) -> str:
    """Render a kid-friendly score card (stars + optional emoji) as HTML.

    Args:
        label: Card title shown above the stars.
        value: Score in [0, 1], or None when unavailable.
        is_main: When True, use the animated "main" accent and add an
            emoji face via _kid_emoji().

    Returns:
        An HTML snippet for one score card.
    """
    # Pick the accent class. Use explicit 'is not None' checks (rather than
    # the original's truthiness test 'value and ...') so a legitimate score
    # of exactly 0.0 cannot be confused with a missing score.
    if is_main:
        cls = "kid-sc-main"
    elif value is not None and value >= 0.45:
        cls = "kid-sc-great"
    elif value is not None and value >= 0.30:
        cls = "kid-sc-ok"
    else:
        cls = "kid-sc-low"
    stars = _kid_stars(value)
    emoji = _kid_emoji(value) if is_main else ""
    val_str = f"{value:.3f}" if value is not None else "\u2014"
    emoji_html = f'<div class="kid-sc-emoji">{emoji}</div>' if emoji else ""
    return (
        f'<div class="kid-sc {cls} kid-confetti">'
        f'<div class="kid-sc-lbl">{label}</div>'
        f'{emoji_html}'
        f'<div class="kid-sc-stars">{stars}</div>'
        f'<div class="kid-sc-val">{val_str}</div>'
        f'</div>'
    )
876
+
877
+
878
# Kid-mode UI labels
# Playful i18n strings used when the app runs in kid mode. Keys mirror
# UI_LABELS so rendering code can swap the dictionaries wholesale.
# Values may contain inline HTML (<b>...</b>) — rendered, not escaped.
UI_LABELS_KID = {
    "de": {
        "hero_title": "Multimodale KI f\u00fcr Kids",
        "hero_sub": "Beschreibe eine Szene und die KI erzeugt <b>Text + Bild + Audio</b> dazu!",
        "config": "Einstellungen",
        "backend": "Wie soll es erstellt werden?",
        "planning": "Planungsmodus",
        "language": "Sprache",
        "examples": "Ideen zum Ausprobieren",
        "scene_placeholder": "Beschreibe deine Szene hier... z.B. 'Ein Einhorn fliegt \u00fcber einen Regenbogen' \U0001f308",
        "generate_btn": "\u2728 Los geht's!",
        "welcome_text": "Beschreibe eine Szene und klicke auf <b>\u2728 Los geht's!</b>",
        "welcome_hint": "oder w\u00e4hle eine Idee aus der Seitenleiste \U0001f449",
        "scores_label": "\U0001f3af Wie gut passt alles zusammen?",
        # Generation panel headers name the three mascots from MASCOT_HTML.
        "gen_text_label": "\U0001f916 Textino schreibt...",
        "gen_image_label": "\U0001f3a8 Pixela malt...",
        "gen_audio_label": "\U0001f3b5 Soundo spielt...",
        "translated_note": "Aus dem Deutschen \u00fcbersetzt",
        "original_label": "Original (Deutsch)",
    },
    "en": {
        "hero_title": "Multimodal AI for Kids",
        "hero_sub": "Describe a scene and the AI creates <b>text + image + audio</b> for it!",
        "config": "Settings",
        "backend": "How should it be created?",
        "planning": "Planning Mode",
        "language": "Language",
        "examples": "Ideas to Try",
        "scene_placeholder": "Describe your scene here... e.g., 'A unicorn flying over a rainbow' \U0001f308",
        "generate_btn": "\u2728 Let's Go!",
        "welcome_text": "Describe a scene and click <b>\u2728 Let's Go!</b>",
        "welcome_hint": "or pick an idea from the sidebar \U0001f449",
        "scores_label": "\U0001f3af How well does everything fit together?",
        "gen_text_label": "\U0001f916 Textino writes...",
        "gen_image_label": "\U0001f3a8 Pixela paints...",
        "gen_audio_label": "\U0001f3b5 Soundo plays...",
        "translated_note": "Translated from German",
        "original_label": "Original (German)",
    },
}
919
+
920
  # ---------------------------------------------------------------------------
921
  # Planning prompt template (same as src/planner/prompts/unified.txt)
922
  # ---------------------------------------------------------------------------
 
1020
  return InferenceClient(token=token)
1021
 
1022
 
1023
+ # ---------------------------------------------------------------------------
1024
+ # Translation (German <-> English)
1025
+ # ---------------------------------------------------------------------------
1026
+
1027
# Helsinki-NLP Marian MT checkpoints on the HF Inference API,
# keyed by translation direction as used by translate().
TRANSLATION_MODELS = {
    "de-en": "Helsinki-NLP/opus-mt-de-en",
    "en-de": "Helsinki-NLP/opus-mt-en-de",
}
1031
+
1032
+
1033
def translate(text: str, direction: str) -> str:
    """Translate text using HF Inference API. direction: 'de-en' or 'en-de'."""
    # Empty or whitespace-only input: nothing to translate.
    if not text or not text.strip():
        return text
    model_id = TRANSLATION_MODELS[direction]
    client = get_inference_client()
    try:
        raw = client.translation(text, model=model_id)
        if isinstance(raw, str):
            return raw
        # huggingface_hub may return a TranslationOutput object instead.
        if hasattr(raw, "translation_text"):
            return raw.translation_text
        return str(raw)
    except Exception as exc:
        # Best-effort: any API failure falls back to the untranslated text.
        logger.warning("Translation (%s) failed: %s — returning original", direction, exc)
        return text
1048
+
1049
+
1050
def translate_de_to_en(text: str) -> str:
    """Convenience wrapper: translate German text to English."""
    return translate(text, direction="de-en")
1052
+
1053
+
1054
def translate_en_to_de(text: str) -> str:
    """Convenience wrapper: translate English text to German."""
    return translate(text, direction="en-de")
1056
+
1057
+
1058
+ # ---------------------------------------------------------------------------
1059
+ # UI labels (i18n)
1060
+ # ---------------------------------------------------------------------------
1061
+
1062
# Standard-mode i18n strings, keyed by language code ("en" / "de").
# Values may contain inline HTML (<b>...</b>) — rendered, not escaped.
# UI_LABELS_KID mirrors this key set for kid mode.
UI_LABELS = {
    "en": {
        "hero_title": "Multimodal Coherence AI",
        "hero_sub": 'Generate semantically coherent <b>text + image + audio</b> bundles '
                    'and evaluate cross-modal alignment with the <b>MSCI</b> metric.',
        "config": "Configuration",
        "backend": "Backend",
        "planning": "Planning Mode",
        "language": "Language",
        "examples": "Examples",
        "scene_placeholder": "Describe a scene... e.g., 'A peaceful forest at dawn with birdsong and morning mist'",
        "generate_btn": "Generate Bundle",
        "welcome_text": 'Enter a scene description and click <b>Generate Bundle</b>',
        "welcome_hint": "or pick an example from the sidebar",
        "scores_label": "Coherence Scores",
        "gen_text_label": "Generated Text",
        "gen_image_label": "Generated Image",
        "gen_audio_label": "Generated Audio",
        "translated_note": "Translated from German",
        "original_label": "Original (German)",
    },
    "de": {
        "hero_title": "Multimodale Koh\u00e4renz-KI",
        "hero_sub": 'Erzeuge semantisch koh\u00e4rente <b>Text + Bild + Audio</b> B\u00fcndel '
                    'und bewerte die modale \u00dcbereinstimmung mit der <b>MSCI</b>-Metrik.',
        "config": "Einstellungen",
        "backend": "Verfahren",
        "planning": "Planungsmodus",
        "language": "Sprache",
        "examples": "Beispiele",
        "scene_placeholder": "Beschreibe eine Szene... z.B. 'Ein friedlicher Wald bei Sonnenaufgang mit Vogelgesang'",
        "generate_btn": "B\u00fcndel erzeugen",
        "welcome_text": 'Beschreibe eine Szene und klicke auf <b>B\u00fcndel erzeugen</b>',
        "welcome_hint": "oder w\u00e4hle ein Beispiel aus der Seitenleiste",
        "scores_label": "Koh\u00e4renz-Bewertung",
        "gen_text_label": "Erzeugter Text",
        "gen_image_label": "Erzeugtes Bild",
        "gen_audio_label": "Erzeugtes Audio",
        "translated_note": "Aus dem Deutschen \u00fcbersetzt",
        "original_label": "Original (Deutsch)",
    },
}
1104
+
1105
+
1106
  # ---------------------------------------------------------------------------
1107
  # HF Inference API helpers
1108
  # ---------------------------------------------------------------------------
1109
 
1110
# Primary models (may consume credits via Inference Providers)
TEXT_GEN_MODELS_PAID = [
    "mistralai/Mistral-7B-Instruct-v0.3",
    "meta-llama/Llama-3.2-3B-Instruct",
]
# Free serverless models (rate-limited but no credit cost)
TEXT_GEN_MODELS_FREE = [
    "HuggingFaceH4/zephyr-7b-beta",
    "microsoft/Phi-3-mini-4k-instruct",
    "google/gemma-2-2b-it",
]
# Combined fallback order used by _hf_chat(): try free first, then paid.
# Paid entries are skipped entirely once a 402 credit error has been seen.
TEXT_GEN_MODELS = TEXT_GEN_MODELS_FREE + TEXT_GEN_MODELS_PAID
1123
+
1124
+ def _is_credit_error(e: Exception) -> bool:
1125
+ """Check if an exception is a 402 Payment Required (credits depleted)."""
1126
+ msg = str(e).lower()
1127
+ return "402" in msg or "payment required" in msg or "credit" in msg
1128
+
1129
 
1130
  def _hf_chat(system: str, user: str, max_tokens: int = 500, temperature: float = 0.3) -> str:
1131
+ """Call HF Inference API chat completion, trying multiple models.
1132
+
1133
+ Tries free serverless models first, then paid models.
1134
+ Skips paid models entirely if a 402 credit error is detected.
1135
+ """
1136
  client = get_inference_client()
1137
  last_error = None
1138
+ credits_depleted = False
1139
+
1140
  for model_id in TEXT_GEN_MODELS:
1141
+ # Skip paid models if we already know credits are gone
1142
+ if credits_depleted and model_id in TEXT_GEN_MODELS_PAID:
1143
+ logger.info("Skipping paid model %s (credits depleted)", model_id)
1144
+ continue
1145
  try:
1146
  response = client.chat_completion(
1147
  model=model_id,
 
1157
  return text
1158
  except Exception as e:
1159
  last_error = e
1160
+ if _is_credit_error(e):
1161
+ credits_depleted = True
1162
+ logger.warning("Chat model %s: credits depleted (402)", model_id)
1163
+ else:
1164
+ logger.warning("Chat model %s failed: %s", model_id, e)
1165
  continue
1166
+
1167
+ detail = "Credit balance is depleted." if credits_depleted else f"Last error: {last_error}"
1168
+ raise RuntimeError(f"All text models failed. {detail}")
1169
 
1170
 
1171
  def _parse_plan_json(raw: str) -> Optional[Dict[str, Any]]:
 
1251
  # Generation / retrieval functions
1252
  # ---------------------------------------------------------------------------
1253
 
1254
# HF Inference API model IDs — free models first, paid fallback
# Each list is tried in order by generate_image() / generate_audio();
# paid entries are skipped once a 402 credit error has been observed.
IMAGE_GEN_MODELS = [
    "black-forest-labs/FLUX.1-schnell",  # Free serverless
    "stabilityai/stable-diffusion-xl-base-1.0",  # May need credits
]
AUDIO_GEN_MODELS = [
    "facebook/musicgen-small",  # Free serverless
    "cvssp/audioldm2",  # May need credits
]
1263
 
1264
  def gen_text(prompt: str, mode: str) -> dict:
 
1316
 
1317
 
1318
def generate_image(prompt: str) -> dict:
    """Generate an image via the HF Inference API, trying free models first.

    Iterates IMAGE_GEN_MODELS in order; when a 402 credit error is seen, any
    remaining paid model is skipped. If every generative model fails, falls
    back to CLIP retrieval (retrieve_image) and, when the failure was due to
    depleted credits, tags the result with ``credit_error=True``.

    Returns:
        dict with at least "path" and "backend" keys; generative results also
        carry "model" and "failed"; retrieval fallbacks carry whatever
        retrieve_image returns.
    """
    client = get_inference_client()
    credits_depleted = False
    for model_id in IMAGE_GEN_MODELS:
        # Once a 402 is seen, don't waste a call on the model that needs credits.
        if credits_depleted and model_id == "stabilityai/stable-diffusion-xl-base-1.0":
            logger.info("Skipping paid image model (credits depleted)")
            continue
        try:
            image = client.text_to_image(prompt, model=model_id)
            # Bug fix: close the temp-file handle before PIL writes to the
            # path — the original never closed it, leaking one file
            # descriptor per generated image.
            tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False, dir="/tmp")
            tmp.close()
            image.save(tmp.name)
            return {
                "path": tmp.name, "backend": "generative",
                "model": model_id.split("/")[-1], "failed": False,
            }
        except Exception as e:
            if _is_credit_error(e):
                credits_depleted = True
                logger.warning("Image model %s: credits depleted (402)", model_id)
            else:
                logger.warning("Image gen with %s failed: %s", model_id, e)
            continue
    logger.warning("All image generation models failed — falling back to retrieval")
    result = retrieve_image(prompt)
    if credits_depleted:
        result["credit_error"] = True
    return result
1347
 
1348
 
1349
  def generate_audio(prompt: str) -> dict:
1350
+ """Generate audio via HF Inference API, trying free models first. Falls back to retrieval."""
1351
  client = get_inference_client()
1352
+ credits_depleted = False
1353
  for model_id in AUDIO_GEN_MODELS:
1354
+ if credits_depleted and model_id == "cvssp/audioldm2":
1355
+ logger.info("Skipping paid audio model (credits depleted)")
1356
+ continue
1357
  try:
1358
  audio_bytes = client.text_to_audio(prompt, model=model_id)
1359
  suffix = ".flac" if "musicgen" in model_id else ".wav"
 
1362
  tmp.write(audio_bytes)
1363
  tmp.flush()
1364
  else:
 
1365
  tmp.write(bytes(audio_bytes))
1366
  tmp.flush()
1367
  model_name = model_id.split("/")[-1]
 
1370
  "model": model_name, "failed": False,
1371
  }
1372
  except Exception as e:
1373
+ if _is_credit_error(e):
1374
+ credits_depleted = True
1375
+ logger.warning("Audio model %s: credits depleted (402)", model_id)
1376
+ else:
1377
+ logger.warning("Audio gen with %s failed: %s", model_id, e)
1378
  continue
 
1379
  logger.warning("All audio generation models failed — falling back to retrieval")
1380
+ result = retrieve_audio(prompt)
1381
+ if credits_depleted:
1382
+ result["credit_error"] = True
1383
+ return result
1384
 
1385
 
1386
  def retrieve_image(prompt: str) -> dict:
 
1452
  layout="wide",
1453
  initial_sidebar_state="expanded",
1454
  )
 
1455
 
1456
+ # Sidebar — settings first (needed for CSS choice)
 
 
 
 
 
 
 
 
1457
  with st.sidebar:
1458
  st.markdown("#### Configuration")
1459
 
1460
+ kid_mode = st.toggle("\U0001f476 Kid Mode", value=False)
1461
+
1462
+ lang = st.selectbox(
1463
+ "Language / Sprache",
1464
+ ["en", "de"],
1465
+ format_func=lambda x: {"en": "English", "de": "Deutsch"}[x],
1466
+ )
1467
+
1468
+ # Select labels based on kid mode and language
1469
+ if kid_mode:
1470
+ L = UI_LABELS_KID.get(lang, UI_LABELS_KID["en"])
1471
+ else:
1472
+ L = UI_LABELS[lang]
1473
+
1474
  backend = st.selectbox(
1475
+ L["backend"],
1476
  ["generative", "retrieval"],
1477
  format_func=lambda x: {
1478
+ "generative": "Generative (FLUX/SDXL + MusicGen)",
1479
  "retrieval": "Retrieval (CLIP + CLAP index)",
1480
  }[x],
1481
  )
1482
 
1483
  mode = st.selectbox(
1484
+ L["planning"],
1485
  ["direct", "planner", "council", "extended_prompt"],
1486
  format_func=lambda x: {
1487
  "direct": "Direct",
 
1492
  )
1493
 
1494
  st.divider()
1495
+ st.markdown(f"#### {L['examples']}")
1496
+
1497
+ # Kid mode uses fun themed prompts; normal mode uses domain prompts
1498
+ if kid_mode:
1499
+ lang_examples = KID_EXAMPLE_PROMPTS.get(lang, KID_EXAMPLE_PROMPTS["en"])
1500
+ for dname, prompts in lang_examples.items():
1501
+ with st.expander(dname): # already has emoji in key
1502
+ for p in prompts:
1503
+ if st.button(p, key=f"ex_{hash(p)}", use_container_width=True):
1504
+ st.session_state["prompt_input"] = p
1505
+ else:
1506
+ lang_examples = EXAMPLE_PROMPTS.get(lang, EXAMPLE_PROMPTS["en"])
1507
+ domain_icons_de = {"natur": "\U0001f33f", "stadt": "\U0001f3d9\ufe0f", "wasser": "\U0001f30a", "gemischt": "\U0001f310"}
1508
+ for dname, prompts in lang_examples.items():
1509
+ icon = DOMAIN_ICONS.get(dname.lower(), domain_icons_de.get(dname.lower(), "\U0001f4cd"))
1510
+ with st.expander(f"{icon} {dname}"):
1511
+ for p in prompts:
1512
+ if st.button(p, key=f"ex_{hash(p)}", use_container_width=True):
1513
+ st.session_state["prompt_input"] = p
1514
 
1515
  st.divider()
1516
  mode_desc = {
 
1520
  "extended_prompt": "Single LLM call with 3x token budget",
1521
  }
1522
  if backend == "generative":
1523
+ img_info = "FLUX.1-schnell / SDXL via HF API"
1524
+ aud_info = "MusicGen / AudioLDM2 via HF API"
1525
  else:
1526
  img_info = "CLIP retrieval (57 images)"
1527
  aud_info = "CLAP retrieval (104 clips)"
1528
+ trans_info = "<br><b>Translation</b> opus-mt-de-en / en-de" if lang == "de" else ""
1529
  st.markdown(
1530
  f'<div class="sidebar-info">'
1531
  f'<b>Text</b> HF Inference API<br>'
1532
  f'<b>Planning</b> {mode_desc[mode]}<br>'
1533
  f'<b>Image</b> {img_info}<br>'
1534
+ f'<b>Audio</b> {aud_info}{trans_info}<br><br>'
1535
  f'<b>Metric</b> MSCI = 0.45 &times; s<sub>t,i</sub> + 0.45 &times; s<sub>t,a</sub><br><br>'
1536
  f'<b>Models</b><br>'
1537
  f'CLIP ViT-B/32 (coherence eval)<br>'
1538
  f'CLAP HTSAT-unfused (coherence eval)'
1539
  f'</div>', unsafe_allow_html=True)
1540
 
1541
+ # Apply CSS based on mode
1542
+ if kid_mode:
1543
+ st.markdown(KID_CSS, unsafe_allow_html=True) # kid theme (includes all needed overrides)
1544
+ else:
1545
+ st.markdown(CUSTOM_CSS, unsafe_allow_html=True) # professional dark theme
1546
+
1547
+ # Hero
1548
+ if kid_mode:
1549
+ st.markdown(
1550
+ f'<div class="kid-hero">'
1551
+ f'<div class="kid-hero-title">{L["hero_title"]}</div>'
1552
+ f'<div class="kid-hero-sub">{L["hero_sub"]}</div>'
1553
+ f'</div>', unsafe_allow_html=True)
1554
+ st.markdown(MASCOT_HTML, unsafe_allow_html=True)
1555
+ else:
1556
+ st.markdown(
1557
+ f'<div class="hero-wrap">'
1558
+ f'<div class="hero-title">{L["hero_title"]}</div>'
1559
+ f'<div class="hero-sub">{L["hero_sub"]}</div>'
1560
+ f'</div>', unsafe_allow_html=True)
1561
+
1562
  # Prompt input
1563
  default_prompt = st.session_state.get("prompt_input", "")
1564
  prompt = st.text_area(
1565
  "Scene", value=default_prompt, height=80,
1566
+ placeholder=L["scene_placeholder"],
1567
  label_visibility="collapsed",
1568
  )
1569
 
1570
  # Button + chips
1571
  bc1, bc2 = st.columns([1, 3])
1572
  with bc1:
1573
+ go = st.button(L["generate_btn"], type="primary", use_container_width=True, disabled=not prompt.strip())
1574
  with bc2:
1575
  mlbl = {"direct": "Direct", "planner": "Planner", "council": "Council", "extended_prompt": "Extended"}[mode]
1576
  mcls = "chip-amber" if mode != "direct" else "chip-purple"
 
1579
  bchip = '<span class="chip chip-pink"><span class="chip-dot chip-dot-pink"></span>Generative</span>'
1580
  else:
1581
  bchip = '<span class="chip chip-purple"><span class="chip-dot chip-dot-purple"></span>Retrieval</span>'
1582
+ lang_chip = ""
1583
+ if lang == "de":
1584
+ lang_chip = '<span class="chip chip-amber"><span class="chip-dot chip-dot-amber"></span>DE \u2192 EN</span>'
1585
+ kid_chip = ""
1586
+ if kid_mode:
1587
+ kid_chip = '<span class="chip chip-green"><span class="chip-dot chip-dot-green"></span>\U0001f476 Kid</span>'
1588
  st.markdown(
1589
  f'<div class="chip-row">'
1590
  f'{bchip}'
1591
  f'<span class="chip {mcls}"><span class="chip-dot {mdot}"></span>{mlbl}</span>'
1592
  f'<span class="chip chip-green"><span class="chip-dot chip-dot-green"></span>CLIP + CLAP</span>'
1593
+ f'{lang_chip}{kid_chip}'
1594
  f'</div>', unsafe_allow_html=True)
1595
 
1596
  # Welcome state
1597
  if not go and "last_result" not in st.session_state:
1598
+ if kid_mode:
1599
+ st.markdown(
1600
+ f'<div class="welcome" style="background:rgba(255,255,255,0.5);border-radius:24px;padding:3rem 2rem;">'
1601
+ f'<div class="welcome-icons">\U0001f916\u2728\U0001f3a8\u2728\U0001f3b5</div>'
1602
+ f'<div class="welcome-text" style="color:#334155;">{L["welcome_text"]}</div>'
1603
+ f'<div class="welcome-hint" style="color:#64748b;">{L["welcome_hint"]}</div>'
1604
+ f'</div>', unsafe_allow_html=True)
1605
+ else:
1606
+ st.markdown(
1607
+ f'<div class="welcome">'
1608
+ f'<div class="welcome-icons">\U0001f3a8 \U0001f5bc\ufe0f \U0001f50a</div>'
1609
+ f'<div class="welcome-text">{L["welcome_text"]}</div>'
1610
+ f'<div class="welcome-hint">{L["welcome_hint"]}</div>'
1611
+ f'</div>', unsafe_allow_html=True)
1612
  return
1613
 
1614
  if go and prompt.strip():
1615
+ st.session_state["last_result"] = run_pipeline(prompt.strip(), mode, backend, lang)
1616
+ st.session_state["last_result"]["kid_mode"] = kid_mode
1617
 
1618
  if "last_result" in st.session_state:
1619
+ # Update kid_mode in case user toggled it after generation
1620
+ st.session_state["last_result"]["kid_mode"] = kid_mode
1621
  show_results(st.session_state["last_result"])
1622
 
1623
 
 
1625
  # Pipeline
1626
  # ---------------------------------------------------------------------------
1627
 
1628
+ def run_pipeline(prompt: str, mode: str, backend: str = "generative", lang: str = "en") -> dict:
1629
+ R: dict = {"mode": mode, "backend": backend, "lang": lang, "original_prompt": prompt}
1630
  t_all = time.time()
1631
 
1632
+ # 0) Translate German → English if needed
1633
+ en_prompt = prompt
1634
+ if lang == "de":
1635
+ with st.status("\u00dcbersetze ins Englische...", expanded=True) as s:
1636
+ t0 = time.time()
1637
+ en_prompt = translate_de_to_en(prompt)
1638
+ t_trans = time.time() - t0
1639
+ R["t_translate"] = t_trans
1640
+ R["en_prompt"] = en_prompt
1641
+ s.update(label=f"Translated ({t_trans:.1f}s): {en_prompt[:80]}...", state="complete")
1642
+ else:
1643
+ R["en_prompt"] = prompt
1644
+
1645
+ # 1) Text + Planning (always in English for CLIP/CLAP)
1646
  plan_label = "Generating text..." if mode == "direct" else f"Planning ({mode}) + generating text..."
1647
  with st.status(plan_label, expanded=True) as s:
1648
  t0 = time.time()
1649
  try:
1650
+ R["text"] = gen_text(en_prompt, mode)
1651
  R["t_text"] = time.time() - t0
1652
  has_plan = R["text"].get("plan") is not None
1653
  lbl = f"Text ready ({R['t_text']:.1f}s)"
 
1656
  s.update(label=lbl, state="complete")
1657
  except Exception as e:
1658
  s.update(label=f"Text failed: {e}", state="error")
1659
+ R["text"] = {"text": en_prompt, "image_prompt": en_prompt, "audio_prompt": en_prompt}
1660
  R["t_text"] = time.time() - t0
1661
 
1662
+ # Translate generated text back to German for display
1663
+ if lang == "de":
1664
+ en_text = R["text"].get("text", "")
1665
+ R["text"]["text_en"] = en_text
1666
+ R["text"]["text"] = translate_en_to_de(en_text)
1667
+
1668
+ ip = R["text"].get("image_prompt", en_prompt)
1669
+ ap = R["text"].get("audio_prompt", en_prompt)
1670
 
1671
  # 2) Image
1672
+ img_label = "Generating image..." if backend == "generative" else "Retrieving image..."
1673
  with st.status(img_label, expanded=True) as s:
1674
  t0 = time.time()
1675
  try:
 
1720
  R["audio"] = None
1721
  R["t_aud"] = time.time() - t0
1722
 
1723
+ # 4) Coherence evaluation (always use English text for CLIP/CLAP)
1724
  with st.status("Evaluating coherence...", expanded=True) as s:
1725
  t0 = time.time()
1726
  try:
1727
  imgp = R.get("image", {}).get("path") if R.get("image") else None
1728
  audp = R.get("audio", {}).get("path") if R.get("audio") else None
1729
+ eval_text = R["text"].get("text_en", R["text"]["text"]) # English for CLIP/CLAP
1730
+ R["coherence"] = eval_coherence(eval_text, imgp, audp)
1731
  R["t_eval"] = time.time() - t0
1732
  msci = R["coherence"].get("scores", {}).get("msci")
1733
  s.update(label=f"MSCI = {msci:.4f} ({R['t_eval']:.1f}s)", state="complete")
 
1751
  msci = sc.get("msci")
1752
  st_i = sc.get("st_i")
1753
  st_a = sc.get("st_a")
1754
+ lang = R.get("lang", "en")
1755
+ kid_mode = R.get("kid_mode", False)
1756
 
1757
+ if kid_mode:
1758
+ L = UI_LABELS_KID.get(lang, UI_LABELS_KID["en"])
1759
+ else:
1760
+ L = UI_LABELS.get(lang, UI_LABELS["en"])
1761
+
1762
+ # Warn banner CSS class
1763
+ warn_cls = "kid-warn" if kid_mode else "warn-banner"
1764
+
1765
+ # --- Score cards ---
1766
+ if kid_mode:
1767
+ st.markdown(f'<div class="kid-sec-label">{L["scores_label"]}</div>', unsafe_allow_html=True)
1768
+ # Kid verdict banner
1769
+ verdict = _kid_verdict(msci, lang)
1770
+ st.markdown(f'<div class="kid-verdict">{verdict}</div>', unsafe_allow_html=True)
1771
+ # Balloons for high coherence!
1772
+ if msci is not None and msci >= 0.40:
1773
+ st.balloons()
1774
+ cards = (
1775
+ kid_score_card("\U0001f3af Gesamt" if lang == "de" else "\U0001f3af Overall", msci, is_main=True)
1776
+ + kid_score_card("\U0001f5bc\ufe0f Text \u2192 Bild" if lang == "de" else "\U0001f5bc\ufe0f Text \u2192 Image", st_i)
1777
+ + kid_score_card("\U0001f50a Text \u2192 Ton" if lang == "de" else "\U0001f50a Text \u2192 Audio", st_a)
1778
+ + kid_score_card("\U0001f31f Sterne" if lang == "de" else "\U0001f31f Stars", msci)
1779
+ )
1780
+ st.markdown(f'<div class="kid-scores">{cards}</div>', unsafe_allow_html=True)
1781
+ else:
1782
+ st.markdown(f'<div class="sec-label">{L["scores_label"]}</div>', unsafe_allow_html=True)
1783
+ cards = (
1784
+ score_card_html("MSCI (Overall)", msci)
1785
+ + score_card_html("Text \u2192 Image", st_i)
1786
+ + score_card_html("Text \u2192 Audio", st_a)
1787
+ + score_card_html("Classification", msci, is_class=True)
1788
+ )
1789
+ st.markdown(f'<div class="scores-grid">{cards}</div>', unsafe_allow_html=True)
1790
 
1791
  # Timing strip
1792
  tt = R.get("t_total", 0)
1793
  sep = '<span class="t-sep">|</span>'
1794
+ trans_timing = f'{sep}<span>Translate {R.get("t_translate", 0):.1f}s</span>' if lang == "de" else ""
1795
+ timing_cls = "kid-timing" if kid_mode else "timing"
1796
  st.markdown(
1797
+ f'<div class="{timing_cls}">'
1798
  f'<span class="t-total">Total {tt:.1f}s</span>{sep}'
1799
+ f'{trans_timing}'
1800
  f'<span>Text {R.get("t_text", 0):.1f}s</span>{sep}'
1801
  f'<span>Image {R.get("t_img", 0):.1f}s</span>{sep}'
1802
  f'<span>Audio {R.get("t_aud", 0):.1f}s</span>{sep}'
 
1805
 
1806
  st.markdown("---")
1807
 
1808
+ # CSS class helpers for kid/normal mode
1809
+ sec_cls = "kid-sec-label" if kid_mode else "sec-label"
1810
+ text_cls = "kid-text-card" if kid_mode else "text-card"
1811
+
1812
  # Three columns: text | image | audio
1813
  ct, ci, ca = st.columns([1.15, 1, 0.85])
1814
 
1815
  with ct:
1816
+ st.markdown(f'<div class="{sec_cls}">{L["gen_text_label"]}</div>', unsafe_allow_html=True)
1817
  txt = R.get("text", {}).get("text", "")
1818
  text_err = R.get("text", {}).get("text_error")
1819
  if text_err:
1820
+ if "credit" in text_err.lower() or "402" in text_err:
1821
+ st.markdown(
1822
+ f'<div class="{warn_cls}"><b>Text gen failed</b> — '
1823
+ f'HF credits depleted. Add credits at huggingface.co/settings/billing '
1824
+ f'or wait for free-tier reset.</div>',
1825
+ unsafe_allow_html=True)
1826
+ else:
1827
+ st.markdown(
1828
+ f'<div class="{warn_cls}"><b>Text gen failed</b> — {text_err}</div>',
1829
+ unsafe_allow_html=True)
1830
+ st.markdown(f'<div class="{text_cls}">{txt}</div>', unsafe_allow_html=True)
1831
+ # Show English original when in German mode
1832
+ if lang == "de":
1833
+ text_en = R.get("text", {}).get("text_en", "")
1834
+ if text_en and text_en != txt:
1835
+ with st.expander("English (original)"):
1836
+ st.markdown(f'<div class="{text_cls}" style="opacity:0.7">{text_en}</div>',
1837
+ unsafe_allow_html=True)
1838
 
1839
  with ci:
1840
+ st.markdown(f'<div class="{sec_cls}">{L["gen_image_label"]}</div>', unsafe_allow_html=True)
1841
  ii = R.get("image")
1842
  if ii and ii.get("path"):
1843
  ip = Path(ii["path"])
1844
  backend = ii.get("backend", "unknown")
1845
 
1846
+ if backend == "retrieval" and R.get("backend") == "generative":
1847
+ if ii.get("credit_error"):
1848
+ st.markdown(
1849
+ f'<div class="{warn_cls}"><b>HF credits depleted</b> \u2014 '
1850
+ f'using retrieval fallback.</div>',
1851
+ unsafe_allow_html=True)
1852
+ else:
1853
+ sim = ii.get("similarity", 0)
1854
+ st.markdown(
1855
+ f'<div class="{warn_cls}"><b>Retrieval fallback</b> '
1856
+ f'(sim={sim:.3f}) \u2014 generation unavailable.</div>',
1857
+ unsafe_allow_html=True)
1858
 
1859
  if ip.exists():
1860
  st.image(str(ip), use_container_width=True)
1861
  model = ii.get("model", "")
1862
  if backend == "generative":
1863
+ cap = f"\U0001f3a8 Pixela hat gemalt mit **{model}**" if kid_mode and lang == "de" else (
1864
+ f"\U0001f3a8 Pixela painted with **{model}**" if kid_mode else f"Generated via **{model}**")
1865
+ st.caption(cap)
1866
  else:
1867
  sim = ii.get("similarity", 0)
1868
  dom = ii.get("domain", "other")
1869
  ic = DOMAIN_ICONS.get(dom, "\U0001f4cd")
1870
  st.caption(f"{ic} {dom} \u00b7 sim **{sim:.3f}** \u00b7 Retrieved")
1871
  else:
1872
+ st.info("No image." if not kid_mode else "\U0001f3a8 Kein Bild." if lang == "de" else "\U0001f3a8 No image.")
1873
 
1874
  with ca:
1875
+ st.markdown(f'<div class="{sec_cls}">{L["gen_audio_label"]}</div>', unsafe_allow_html=True)
1876
  ai = R.get("audio")
1877
  if ai and ai.get("path"):
1878
  ap = Path(ai["path"])
1879
  backend = ai.get("backend", "unknown")
1880
 
1881
+ if backend == "retrieval" and R.get("backend") == "generative":
1882
+ if ai.get("credit_error"):
1883
+ st.markdown(
1884
+ f'<div class="{warn_cls}"><b>HF credits depleted</b> \u2014 '
1885
+ f'using retrieval fallback.</div>',
1886
+ unsafe_allow_html=True)
1887
+ else:
1888
+ sim = ai.get("similarity", 0)
1889
+ st.markdown(
1890
+ f'<div class="{warn_cls}"><b>Retrieval fallback</b> '
1891
+ f'(sim={sim:.3f}) \u2014 generation unavailable.</div>',
1892
+ unsafe_allow_html=True)
1893
 
1894
  if ap.exists():
1895
  st.audio(str(ap))
1896
  model = ai.get("model", "")
1897
  if backend == "generative":
1898
+ cap = f"\U0001f3b5 Soundo spielt mit **{model}**" if kid_mode and lang == "de" else (
1899
+ f"\U0001f3b5 Soundo plays with **{model}**" if kid_mode else f"Generated via **{model}**")
1900
+ st.caption(cap)
1901
  else:
1902
  sim = ai.get("similarity", 0)
1903
  st.caption(f"sim **{sim:.3f}** \u00b7 Retrieved")
1904
  else:
1905
+ st.info("No audio." if not kid_mode else "\U0001f3b5 Kein Audio." if lang == "de" else "\U0001f3b5 No audio.")
1906
 
1907
  st.markdown("---")
1908
 
1909
+ # Expandable details (hidden in kid mode to keep it simple)
1910
+ if not kid_mode:
1911
+ with st.expander("Semantic Plan"):
1912
+ td = R.get("text", {})
1913
+ plan = td.get("plan")
1914
+ if plan:
1915
+ p1, p2 = st.columns(2)
1916
+ with p1:
1917
+ dash = "\u2014"
1918
+ dot = "\u00b7"
1919
+ scene = plan.get("scene_summary", dash)
1920
+ domain = plan.get("domain", dash)
1921
+ core = plan.get("core_semantics", {})
1922
+ setting = core.get("setting", dash)
1923
+ tod = core.get("time_of_day", dash)
1924
+ weather = core.get("weather", dash)
1925
+ subjects = ", ".join(core.get("main_subjects", []))
1926
+ st.markdown(f"**Scene** {scene}")
1927
+ st.markdown(f"**Domain** {domain}")
1928
+ st.markdown(f"**Setting** {setting} {dot} **Time** {tod} {dot} **Weather** {weather}")
1929
+ st.markdown(f"**Subjects** {subjects}")
1930
+ with p2:
1931
+ st.markdown("**Image prompt**")
1932
+ st.code(td.get("image_prompt", ""), language=None)
1933
+ st.markdown("**Audio prompt**")
1934
+ st.code(td.get("audio_prompt", ""), language=None)
 
 
 
1935
  else:
1936
+ mode = R.get("mode", "direct")
1937
+ if mode == "direct":
1938
+ st.write("Direct mode \u2014 no semantic plan. Prompt used as-is for all modalities.")
1939
+ else:
1940
+ st.write(f"Planning ({mode}) did not produce a valid plan. Fell back to direct mode.")
1941
+
1942
+ with st.expander("Generation Details"):
1943
+ r1, r2 = st.columns(2)
1944
+ with r1:
1945
+ ii = R.get("image")
1946
+ if ii:
1947
+ backend = ii.get("backend", "unknown")
1948
+ model = ii.get("model", "")
1949
+ if backend == "generative":
1950
+ st.markdown(f"**Image** generated via **{model}**")
1951
+ st.markdown(f"Prompt: *{R.get('text', {}).get('image_prompt', '')}*")
1952
+ elif ii.get("top_5"):
1953
+ st.markdown("**Image** (retrieval fallback)")
1954
+ bars = "".join(sim_bar_html(n, s) for n, s in ii["top_5"])
1955
+ st.markdown(bars, unsafe_allow_html=True)
1956
+ else:
1957
+ st.write("No image data.")
1958
+ with r2:
1959
+ ai = R.get("audio")
1960
+ if ai:
1961
+ backend = ai.get("backend", "unknown")
1962
+ model = ai.get("model", "")
1963
+ if backend == "generative":
1964
+ st.markdown(f"**Audio** generated via **{model}**")
1965
+ st.markdown(f"Prompt: *{R.get('text', {}).get('audio_prompt', '')}*")
1966
+ elif ai.get("top_5"):
1967
+ st.markdown("**Audio** (retrieval fallback)")
1968
+ bars = "".join(sim_bar_html(n, s) for n, s in ai["top_5"])
1969
+ st.markdown(bars, unsafe_allow_html=True)
1970
+ else:
1971
+ st.write("No audio data.")
1972
+
1973
+ with st.expander("Full Coherence Report"):
1974
+ if coh:
1975
+ st.json(coh)
1976
  else:
1977
+ st.write("No data.")
1978
+ else:
1979
+ # Kid mode: simple "how it works" expander instead of technical details
1980
+ label_how = "\U0001f914 Wie funktioniert das?" if lang == "de" else "\U0001f914 How does it work?"
1981
+ with st.expander(label_how):
1982
+ if lang == "de":
1983
+ st.markdown(
1984
+ "1. **Textino** \U0001f916 liest deine Beschreibung und schreibt eine Geschichte\n"
1985
+ "2. **Pixela** \U0001f3a8 malt ein Bild, das zur Geschichte passt\n"
1986
+ "3. **Soundo** \U0001f3b5 erzeugt Ger\u00e4usche und Musik dazu\n"
1987
+ "4. Dann pr\u00fcfen wir, ob alles gut zusammenpasst! \u2b50"
1988
+ )
 
1989
  else:
1990
+ st.markdown(
1991
+ "1. **Textino** \U0001f916 reads your description and writes a story\n"
1992
+ "2. **Pixela** \U0001f3a8 paints a picture that matches the story\n"
1993
+ "3. **Soundo** \U0001f3b5 creates sounds and music for it\n"
1994
+ "4. Then we check if everything fits together! \u2b50"
1995
+ )
 
1996
 
1997
 
1998
  if __name__ == "__main__":
src/coherence/calibration.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Distribution Normalization for cMSCI.
3
+
4
+ Scores from different embedding spaces (CLIP vs CLAP) and different
5
+ pairwise channels (st_i, st_a, gram_volume) have different natural
6
+ distributions. Z-score normalization makes them comparable.
7
+
8
+ The ReferenceDistribution class fits mean/std from existing experiment
9
+ data and normalizes new scores to z-scores or percentile ranks.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ from pathlib import Path
17
+ from typing import Dict, List, Optional
18
+
19
+ import numpy as np
20
+ from scipy import stats as sp_stats
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class ReferenceDistribution:
    """
    Stores mean/std for a single score channel and provides normalization.

    Usage:
        ref = ReferenceDistribution()
        ref.fit(list_of_scores)
        z = ref.normalize(new_score)   # z-score
        p = ref.percentile(new_score)  # percentile rank [0, 1]
    """

    def __init__(self, name: str = ""):
        self.name = name
        self.mean: float = 0.0
        self.std: float = 1.0
        self.n: int = 0
        # Sorted copy of the fitted scores; used only by percentile() and
        # deliberately not serialized by to_dict().
        self._sorted_values: Optional[np.ndarray] = None

    def fit(self, scores: List[float]) -> None:
        """Fit the distribution from a list of observed scores.

        An empty list keeps the neutral defaults (mean=0, std=1) so that
        normalize() is the identity and percentile() returns 0.5.
        """
        arr = np.array(scores, dtype=np.float64)
        self.n = len(arr)
        if self.n == 0:
            # Bug fix: np.mean([]) would return NaN (with a RuntimeWarning)
            # and poison every subsequent normalize() call.
            self.mean = 0.0
            self.std = 1.0
            self._sorted_values = arr
            return
        self.mean = float(np.mean(arr))
        # Sample std (ddof=1) needs at least 2 points; fall back to 1.0.
        self.std = float(np.std(arr, ddof=1)) if self.n > 1 else 1.0
        if self.std < 1e-10:
            # Degenerate (near-constant) channel: avoid division by ~0.
            self.std = 1.0
        self._sorted_values = np.sort(arr)

    def normalize(self, score: float) -> float:
        """Z-score normalization: (score - mean) / std."""
        return float((score - self.mean) / self.std)

    def percentile(self, score: float) -> float:
        """
        Percentile rank of score within the reference distribution.

        Returns a value in [0, 1] where 0.5 = median of reference. Returns
        0.5 when no reference values are held (e.g. after from_dict()).
        """
        if self._sorted_values is None or len(self._sorted_values) == 0:
            return 0.5
        rank = np.searchsorted(self._sorted_values, score, side="right")
        return float(rank / len(self._sorted_values))

    def to_dict(self) -> Dict:
        """Serialize summary statistics only; raw fitted values are dropped."""
        return {
            "name": self.name,
            "mean": self.mean,
            "std": self.std,
            "n": self.n,
        }

    @classmethod
    def from_dict(cls, d: Dict) -> "ReferenceDistribution":
        """Rebuild from to_dict() output (percentile() will return 0.5)."""
        obj = cls(name=d.get("name", ""))
        obj.mean = d["mean"]
        obj.std = d["std"]
        obj.n = d.get("n", 0)
        return obj

    def save(self, path: str) -> None:
        """Write the serialized stats as JSON to *path*."""
        with open(path, "w") as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load(cls, path: str) -> "ReferenceDistribution":
        """Load a distribution previously written by save()."""
        with open(path) as f:
            return cls.from_dict(json.load(f))
92
+
93
+
94
class CalibrationStore:
    """
    Collection of ReferenceDistributions for all score channels.

    Provides save/load for the full calibration state.
    """

    def __init__(self):
        self.distributions: Dict[str, ReferenceDistribution] = {}

    def add(self, name: str, scores: List[float]) -> ReferenceDistribution:
        """Fit a new channel from *scores*, register it, and return it."""
        dist = ReferenceDistribution(name=name)
        dist.fit(scores)
        self.distributions[name] = dist
        logger.info(
            "Calibration[%s]: mean=%.4f, std=%.4f, n=%d",
            name, dist.mean, dist.std, dist.n,
        )
        return dist

    def normalize(self, name: str, score: float) -> float:
        """Z-score *score* against channel *name*; pass through if unknown."""
        dist = self.distributions.get(name)
        return score if dist is None else dist.normalize(score)

    def percentile(self, name: str, score: float) -> float:
        """Percentile rank within channel *name*; 0.5 if channel is unknown."""
        dist = self.distributions.get(name)
        return 0.5 if dist is None else dist.percentile(score)

    def save(self, path: str) -> None:
        """Dump every channel's summary stats to *path* as JSON."""
        payload = {key: dist.to_dict() for key, dist in self.distributions.items()}
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, "w") as fh:
            json.dump(payload, fh, indent=2)
        logger.info("Calibration saved to %s", path)

    @classmethod
    def load(cls, path: str) -> "CalibrationStore":
        """Rebuild a store from a file previously written by save()."""
        store = cls()
        with open(path) as fh:
            for key, entry in json.load(fh).items():
                store.distributions[key] = ReferenceDistribution.from_dict(entry)
        logger.info("Calibration loaded from %s (%d channels)", path, len(store.distributions))
        return store
140
+
141
+
142
def has_channel(store: CalibrationStore, name: str) -> bool:
    """Tell whether *store* already carries a calibration channel *name*."""
    known = store.distributions
    return name in known
145
+
146
+
147
def extend_calibration_with_exmcr(
    store: CalibrationStore,
    gram_coh_ia_scores: List[float],
    gram_coh_tia_scores: Optional[List[float]] = None,
) -> CalibrationStore:
    """
    Extend calibration store with ExMCR-derived channels.

    Args:
        store: Existing CalibrationStore to extend.
        gram_coh_ia_scores: Gram coherence of (image_clip, ExMCR(audio_clap)) pairs.
        gram_coh_tia_scores: Optional 3-way gram coherence of (text, image, ExMCR(audio)).

    Returns:
        Extended CalibrationStore (same object, modified in place).
    """
    # Register each non-empty channel; None/empty inputs are skipped.
    for channel, scores in (
        ("gram_coh_ia_exmcr", gram_coh_ia_scores),
        ("gram_coh_tia", gram_coh_tia_scores),
    ):
        if scores:
            store.add(channel, scores)
    return store
168
+
169
+
170
def extend_calibration_with_uncertainty(
    store: CalibrationStore,
    uncertainty_ti_scores: List[float],
    uncertainty_ta_scores: Optional[List[float]] = None,
) -> CalibrationStore:
    """
    Extend calibration store with ProbVLM uncertainty channels.

    Args:
        store: Existing CalibrationStore to extend.
        uncertainty_ti_scores: Per-sample mean uncertainty for text-image (CLIP adapter).
        uncertainty_ta_scores: Per-sample mean uncertainty for text-audio (CLAP adapter).

    Returns:
        Extended CalibrationStore (same object, modified in place).
    """
    if uncertainty_ti_scores:
        store.add("uncertainty_ti", uncertainty_ti_scores)
    if uncertainty_ta_scores:
        store.add("uncertainty_ta", uncertainty_ta_scores)
    # When both channels exist, also register their element-wise mean.
    if uncertainty_ti_scores and uncertainty_ta_scores:
        paired = zip(uncertainty_ti_scores, uncertainty_ta_scores)
        store.add("uncertainty_mean", [(ti + ta) / 2.0 for ti, ta in paired])
    return store
198
+
199
+
200
def build_reference_distributions(
    rq1_results_path: str,
) -> CalibrationStore:
    """
    Build reference distributions from existing RQ1 baseline results.

    Extracts st_i, st_a, and msci scores from baseline condition only
    (matched image + audio), fitting a distribution for each channel.

    Args:
        rq1_results_path: Path to rq1_results.json

    Returns:
        CalibrationStore with fitted distributions for st_i, st_a, msci
    """
    with open(rq1_results_path) as fh:
        payload = json.load(fh)

    # Collect per-channel score lists from baseline records only.
    channels: Dict[str, list] = {"st_i": [], "st_a": [], "msci": []}
    for record in payload["results"]:
        if record.get("condition") != "baseline":
            continue
        for key, bucket in channels.items():
            value = record.get(key)
            if value is not None:
                bucket.append(value)

    store = CalibrationStore()
    for key in ("st_i", "st_a", "msci"):
        if channels[key]:
            store.add(key, channels[key])

    # GRAM coherence distributions (1 - gram_volume) for gram calibration mode
    # gram_volume = sqrt(1 - cos^2), so gram_coherence = 1 - sqrt(1 - cos^2)
    for src, dst in (("st_i", "gram_coh_ti"), ("st_a", "gram_coh_ta")):
        if channels[src]:
            store.add(dst, [1.0 - np.sqrt(max(0, 1 - s ** 2)) for s in channels[src]])

    return store
src/coherence/cmsci_engine.py ADDED
@@ -0,0 +1,536 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Calibrated Multimodal Semantic Coherence Index (cMSCI) Engine.
3
+
4
+ Replaces fixed weighted averaging (MSCI) with a principled pipeline:
5
+ 1. Gramian Volume: geometric coherence of embedding vectors
6
+ 2. Distribution Normalization: z-score calibration per channel
7
+ 3. Contrastive Margin: comparison against hard negatives
8
+ 4. Cross-Space Alignment: Ex-MCR projects CLAP→CLIP for 3-way GRAM
9
+ 5. Probabilistic Uncertainty: MC sampling for confidence intervals
10
+
11
+ The CalibratedCoherenceEngine runs alongside CoherenceEngine (not replacing
12
+ it) and returns both legacy MSCI and new cMSCI scores for comparison.
13
+
14
+ Variant progression:
15
+ A: MSCI (baseline, weighted cosine average)
16
+ B: GRAM-only (geometric, no calibration)
17
+ C: GRAM + z-norm (normalized geometric)
18
+ D: GRAM + z-norm + contrastive (calibrated geometric)
19
+ E: GRAM + z-norm + contrastive + Ex-MCR (3-way calibrated)
20
+ F: Full cMSCI (probabilistic + calibrated + 3-way)
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ from pathlib import Path
27
+ from typing import Any, Dict, List, Optional
28
+
29
+ import numpy as np
30
+
31
+ from src.coherence.gram_volume import (
32
+ gram_volume_2d,
33
+ gram_volume_3d,
34
+ gram_volume_nd,
35
+ normalized_gram_coherence,
36
+ )
37
+ from src.config.settings import (
38
+ CMSCI_MARGIN_ALPHA,
39
+ CMSCI_CHANNEL_WEIGHT_TI,
40
+ CMSCI_CALIBRATION_MODE,
41
+ CMSCI_W_3D,
42
+ CMSCI_GAMMA,
43
+ )
44
+ from src.embeddings.aligned_embeddings import AlignedEmbedder
45
+ from src.embeddings.similarity import cosine_similarity
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
class CalibratedCoherenceEngine:
    """
    Uncertainty-aware, geometrically-grounded tri-modal coherence engine.

    Computes cMSCI alongside legacy MSCI for comparison.

    Usage:
        engine = CalibratedCoherenceEngine()
        result = engine.evaluate("A beach at sunset", "beach.jpg", "waves.wav")
        print(result["cmsci"])  # Calibrated score
        print(result["msci"])  # Legacy score (for comparison)
        print(result["variant_scores"])  # Scores for each variant A-F
    """

    def __init__(
        self,
        target_dim: int = 512,
        calibration_path: Optional[str] = None,
        exmcr_weights_path: Optional[str] = None,
        bridge_path: Optional[str] = None,
        prob_clip_adapter_path: Optional[str] = None,
        prob_clap_adapter_path: Optional[str] = None,
        negative_bank_enabled: bool = True,
    ):
        # Shared embedder for text (CLIP + CLAP), image, and audio inputs.
        self.embedder = AlignedEmbedder(target_dim=target_dim)

        # Calibration store (Phase 2) — each optional component below is
        # loaded lazily and left as None when its artifact is unavailable,
        # so the engine degrades gracefully to earlier pipeline variants.
        self._calibration = None
        if calibration_path and Path(calibration_path).exists():
            from src.coherence.calibration import CalibrationStore
            self._calibration = CalibrationStore.load(calibration_path)
            logger.info("Calibration loaded from %s", calibration_path)

        # Negative bank (Phase 2)
        self._negative_bank = None
        if negative_bank_enabled:
            try:
                from src.coherence.negative_bank import NegativeBank
                self._negative_bank = NegativeBank()
            except Exception as e:
                # Best-effort: negatives are optional, so log and continue.
                logger.warning("Negative bank disabled: %s", e)

        # Ex-MCR projector (Phase 3 — projects CLAP into CLIP space)
        self._exmcr = None
        if exmcr_weights_path:
            from src.embeddings.space_alignment import ExMCRProjector
            self._exmcr = ExMCRProjector(weights_path=exmcr_weights_path)
            if self._exmcr.is_identity:
                logger.info("Ex-MCR in identity mode (no weights)")
            else:
                logger.info("Ex-MCR projector active")

        # Cross-Space Bridge (projects CLIP image + CLAP audio → shared 256-d)
        self._bridge = None
        if bridge_path and Path(bridge_path).exists():
            from src.embeddings.cross_space_bridge import CrossSpaceBridge
            self._bridge = CrossSpaceBridge.load(bridge_path)
            logger.info("CrossSpaceBridge loaded from %s", bridge_path)

        # Probabilistic adapters (Phase 4)
        self._prob_clip = None
        self._prob_clap = None
        if prob_clip_adapter_path and Path(prob_clip_adapter_path).exists():
            from src.embeddings.probabilistic_adapter import ProbabilisticAdapter
            self._prob_clip = ProbabilisticAdapter.load(prob_clip_adapter_path)
            logger.info("CLIP probabilistic adapter loaded")
        if prob_clap_adapter_path and Path(prob_clap_adapter_path).exists():
            from src.embeddings.probabilistic_adapter import ProbabilisticAdapter
            self._prob_clap = ProbabilisticAdapter.load(prob_clap_adapter_path)
            logger.info("CLAP probabilistic adapter loaded")

    def evaluate(
        self,
        text: str,
        image_path: Optional[str] = None,
        audio_path: Optional[str] = None,
        domain: str = "",
        n_mc_samples: int = 100,
    ) -> Dict[str, Any]:
        """
        Evaluate multimodal coherence with full cMSCI pipeline.

        Returns both legacy MSCI and cMSCI scores along with all
        intermediate computations for ablation analysis.

        Args:
            text: Text prompt.
            image_path: Path to image file.
            audio_path: Path to audio file.
            domain: Domain hint for negative bank (e.g., "nature").
            n_mc_samples: Number of MC samples for uncertainty.

        Returns:
            Dict with keys:
                msci: Legacy MSCI score (weighted cosine average)
                cmsci: Calibrated cMSCI score
                scores: Raw pairwise scores (st_i, st_a, si_a)
                gram: Gramian volume scores
                calibration: Z-normalized scores
                contrastive: Contrastive margin results
                uncertainty: MC sampling uncertainty (if adapters loaded)
                variant_scores: Scores for each variant A-F
        """
        # ── Embed ──────────────────────────────────────────────
        emb_text_clip = self.embedder.embed_text(text)
        emb_text_clap = self.embedder.embed_text_for_audio(text) if audio_path else None
        emb_image = self.embedder.embed_image(image_path) if image_path else None
        emb_audio = self.embedder.embed_audio(audio_path) if audio_path else None

        # ── Legacy MSCI (Variant A) ────────────────────────────
        st_i = None
        st_a = None
        si_a = None

        if emb_text_clip is not None and emb_image is not None:
            st_i = float(round(cosine_similarity(emb_text_clip, emb_image), 4))
        if emb_text_clap is not None and emb_audio is not None:
            st_a = float(round(cosine_similarity(emb_text_clap, emb_audio), 4))

        available = {}
        if st_i is not None:
            available["st_i"] = st_i
        if st_a is not None:
            available["st_a"] = st_a

        # NOTE(review): si_a is still None here (it is only computed later,
        # in the Variant E block), so the 0.10 si_a weight never participates
        # in legacy MSCI — the st_i/st_a weights are simply renormalized.
        weights = {"st_i": 0.45, "st_a": 0.45, "si_a": 0.10}
        if len(available) >= 2:
            total_w = sum(weights[k] for k in available if k in weights)
            msci = sum(available[k] * weights[k] for k in available if k in weights) / max(total_w, 1e-6)
        elif len(available) == 1:
            msci = list(available.values())[0]
        else:
            msci = None

        variant_a = msci

        # ── Gramian Volume (Variant B) ─────────────────────────
        gram_ti = None
        gram_ta = None
        gram_tia = None
        gram_coherence_2way = None

        if emb_text_clip is not None and emb_image is not None:
            gram_ti = gram_volume_2d(emb_text_clip, emb_image)

        if emb_text_clap is not None and emb_audio is not None:
            gram_ta = gram_volume_2d(emb_text_clap, emb_audio)

        # 2-way GRAM coherence (average of text-image and text-audio gram coherences)
        gram_coherences = []
        if gram_ti is not None:
            gram_coherences.append(normalized_gram_coherence(gram_ti))
        if gram_ta is not None:
            gram_coherences.append(normalized_gram_coherence(gram_ta))

        if gram_coherences:
            gram_coherence_2way = float(np.mean(gram_coherences))

        variant_b = gram_coherence_2way

        # ── Z-Score Normalization (Variant C) ──────────────────
        z_st_i = None
        z_st_a = None
        z_gram_ti = None
        z_gram_ta = None
        variant_c = variant_b  # default to B if no calibration

        # Channel weight from settings (optimized via LOO-CV)
        w_ti = CMSCI_CHANNEL_WEIGHT_TI
        cal_mode = CMSCI_CALIBRATION_MODE

        if self._calibration is not None:
            if st_i is not None:
                z_st_i = self._calibration.normalize("st_i", st_i)
            if st_a is not None:
                z_st_a = self._calibration.normalize("st_a", st_a)

            # GRAM coherence z-scores (for gram calibration mode)
            if gram_ti is not None:
                gram_coh_ti = normalized_gram_coherence(gram_ti)
                z_gram_ti = self._calibration.normalize("gram_coh_ti", gram_coh_ti)
            if gram_ta is not None:
                gram_coh_ta = normalized_gram_coherence(gram_ta)
                z_gram_ta = self._calibration.normalize("gram_coh_ta", gram_coh_ta)

            # Select calibration mode: cosine z-scores or gram coherence z-scores
            if cal_mode == "gram" and z_gram_ti is not None and z_gram_ta is not None:
                z_mean = w_ti * z_gram_ti + (1.0 - w_ti) * z_gram_ta
            else:
                # Cosine mode (original behavior) with weighted channels
                z_coherences = []
                z_weights = []
                if z_st_i is not None:
                    z_coherences.append(z_st_i)
                    z_weights.append(w_ti)
                if z_st_a is not None:
                    z_coherences.append(z_st_a)
                    z_weights.append(1.0 - w_ti)

                if z_coherences:
                    total_w = sum(z_weights)
                    z_mean = sum(z * wt for z, wt in zip(z_coherences, z_weights)) / total_w
                else:
                    z_mean = None

            if z_mean is not None:
                # Map z-scores back to [0,1] via sigmoid for interpretability
                variant_c = float(1.0 / (1.0 + np.exp(-z_mean)))

        # ── Contrastive Margin (Variant D) ─────────────────────
        contrastive_result = None
        variant_d = variant_c  # default to C if no negatives
        margin_alpha = CMSCI_MARGIN_ALPHA

        if self._negative_bank is not None and gram_coherence_2way is not None:
            matched_volume = float(np.mean([v for v in [gram_ti, gram_ta] if v is not None]))
            contrastive_result = self._negative_bank.compute_contrastive_margin(
                matched_volume=matched_volume,
                text_clip_emb=emb_text_clip,
                image_emb=emb_image,
                text_clap_emb=emb_text_clap,
                audio_emb=emb_audio,
                domain=domain,
                k=5,
            )

            if contrastive_result["n_negatives"] > 0:
                # cMSCI_D = sigmoid(z_mean + alpha * margin)
                # alpha amplifies the contrastive signal at the sigmoid operating point
                margin = contrastive_result["margin"]

                # Use the same calibration mode and weighting as Variant C
                if cal_mode == "gram" and z_gram_ti is not None and z_gram_ta is not None:
                    z_mean_d = w_ti * z_gram_ti + (1.0 - w_ti) * z_gram_ta
                else:
                    # Fall back to raw cosine scores per channel when the
                    # calibrated z-score is unavailable.
                    z_coherences_d = []
                    z_weights_d = []
                    if z_st_i is not None:
                        z_coherences_d.append(z_st_i)
                        z_weights_d.append(w_ti)
                    elif st_i is not None:
                        z_coherences_d.append(st_i)
                        z_weights_d.append(w_ti)
                    if z_st_a is not None:
                        z_coherences_d.append(z_st_a)
                        z_weights_d.append(1.0 - w_ti)
                    elif st_a is not None:
                        z_coherences_d.append(st_a)
                        z_weights_d.append(1.0 - w_ti)

                    if z_coherences_d:
                        total_wd = sum(z_weights_d)
                        z_mean_d = sum(z * wt for z, wt in zip(z_coherences_d, z_weights_d)) / total_wd
                    else:
                        z_mean_d = None

                if z_mean_d is not None:
                    variant_d = float(1.0 / (1.0 + np.exp(-(z_mean_d + margin_alpha * margin))))
                else:
                    variant_d = variant_c

        # ── Cross-Space Complementarity — Variant E ──────────
        # COMPLEMENTARITY: E = sigmoid(z_2d + w_3d * z_compl + alpha * margin)
        # ExMCR projects CLAP audio → CLIP space, enabling measurement of
        # image-audio complementarity (Gramian dispersion in unified space).
        # High complementarity = image and audio contribute unique perspectives.
        # Low complementarity = redundant cross-modal information.
        # z_compl = z_normalize(gram_volume_ia) — positive z = more complementary.
        # w_3d=0 recovers D exactly (safety guarantee).
        audio_projected = None
        variant_e = variant_d  # default to D if no projector
        z_compl = None  # z-normalized complementarity (exported for optimizer)
        gram_ia_volume = None  # raw image-audio Gramian volume
        w_3d = CMSCI_W_3D

        # Reconstruct D's pre-margin z-score (z_2d) for composition
        z_2d = None
        margin = 0.0
        if contrastive_result is not None and contrastive_result["n_negatives"] > 0:
            margin = contrastive_result["margin"]
        if cal_mode == "gram" and z_gram_ti is not None and z_gram_ta is not None:
            z_2d = w_ti * z_gram_ti + (1.0 - w_ti) * z_gram_ta
        elif z_st_i is not None and z_st_a is not None:
            z_2d = w_ti * z_st_i + (1.0 - w_ti) * z_st_a

        # Project audio into CLIP space via ExMCR and compute complementarity
        if self._exmcr is not None and not self._exmcr.is_identity:
            if emb_audio is not None:
                audio_projected = self._exmcr.project_audio(emb_audio)
                if emb_image is not None:
                    si_a = float(round(cosine_similarity(emb_image, audio_projected), 4))
                    # Image-audio Gramian volume = dispersion = complementarity
                    gram_ia_volume = gram_volume_2d(emb_image, audio_projected)
            if emb_text_clip is not None and emb_image is not None and audio_projected is not None:
                gram_tia = gram_volume_3d(emb_text_clip, emb_image, audio_projected)

            # Z-normalize complementarity (volume, NOT coherence)
            # z_compl = -z_gram_ia_coherence (flipped: high volume = high complementarity)
            if gram_ia_volume is not None and self._calibration is not None:
                gram_ia_coherence = normalized_gram_coherence(gram_ia_volume)
                z_gram_ia_coh = self._calibration.normalize("gram_coh_ia_exmcr", gram_ia_coherence)
                z_compl = -z_gram_ia_coh  # flip: positive = more complementary

        # Compose: E = sigmoid(z_2d + w_3d * z_compl + alpha * margin)
        if z_2d is not None:
            logit_e = z_2d + margin_alpha * margin
            if z_compl is not None:
                logit_e += w_3d * z_compl
            variant_e = float(1.0 / (1.0 + np.exp(-logit_e)))

        # ── Probabilistic Adaptive Weighting (Variant F) ──────
        # ProbVLM drives per-sample channel weights instead of fixed w_ti.
        # adaptive_w = (1/u_ti) / (1/u_ti + 1/u_ta) — trust more confident channel
        # w_ti_final = (1 - gamma) * base_w + gamma * adaptive_w
        # gamma=0 → w_ti_final = base_w → recovers E exactly (safety guarantee)
        # MC sampling remains metadata only (confidence intervals, not scoring).
        uncertainty_result = None
        variant_f = variant_e  # default to E
        u_ti = None  # per-channel uncertainty (exported for optimizer)
        u_ta = None
        adaptive_w_ti = None
        gamma = CMSCI_GAMMA

        if self._prob_clip is not None or self._prob_clap is not None:
            mc_volumes = []

            # Per-channel uncertainty from ProbVLM adapters
            if self._prob_clip is not None and emb_text_clip is not None and emb_image is not None:
                u_text_clip = self._prob_clip.uncertainty(emb_text_clip)
                u_image_clip = self._prob_clip.uncertainty(emb_image)
                u_ti = float(np.mean([u_text_clip, u_image_clip]))

                # MC samples for confidence interval metadata
                text_samples = self._prob_clip.sample(emb_text_clip, n_mc_samples)
                image_samples = self._prob_clip.sample(emb_image, n_mc_samples)
                for t_s, i_s in zip(text_samples, image_samples):
                    mc_volumes.append(gram_volume_2d(t_s, i_s))

            if self._prob_clap is not None and emb_text_clap is not None and emb_audio is not None:
                u_text_clap = self._prob_clap.uncertainty(emb_text_clap)
                u_audio_clap = self._prob_clap.uncertainty(emb_audio)
                u_ta = float(np.mean([u_text_clap, u_audio_clap]))

                text_samples = self._prob_clap.sample(emb_text_clap, n_mc_samples)
                audio_samples = self._prob_clap.sample(emb_audio, n_mc_samples)
                for t_s, a_s in zip(text_samples, audio_samples):
                    mc_volumes.append(gram_volume_2d(t_s, a_s))

            # Compute adaptive channel weight from uncertainty
            if u_ti is not None and u_ta is not None and u_ti > 0 and u_ta > 0 and gamma > 0:
                inv_ti = 1.0 / u_ti
                inv_ta = 1.0 / u_ta
                adaptive_w = inv_ti / (inv_ti + inv_ta)
                w_ti_final = (1.0 - gamma) * w_ti + gamma * adaptive_w
                adaptive_w_ti = float(w_ti_final)

                # Recompute z_2d with adaptive weights
                if cal_mode == "gram" and z_gram_ti is not None and z_gram_ta is not None:
                    z_2d_adaptive = w_ti_final * z_gram_ti + (1.0 - w_ti_final) * z_gram_ta
                elif z_st_i is not None and z_st_a is not None:
                    z_2d_adaptive = w_ti_final * z_st_i + (1.0 - w_ti_final) * z_st_a
                else:
                    z_2d_adaptive = None

                if z_2d_adaptive is not None:
                    logit_f = z_2d_adaptive + margin_alpha * margin
                    if z_compl is not None:
                        logit_f += w_3d * z_compl
                    variant_f = float(1.0 / (1.0 + np.exp(-logit_f)))

            # MC sampling for confidence intervals (metadata, NOT scoring)
            if mc_volumes:
                mc_coherences = [normalized_gram_coherence(v) for v in mc_volumes]
                mc_mean = float(np.mean(mc_coherences))
                mc_std = float(np.std(mc_coherences))
                mc_ci_lower = float(np.percentile(mc_coherences, 2.5))
                mc_ci_upper = float(np.percentile(mc_coherences, 97.5))
            else:
                mc_mean = mc_std = mc_ci_lower = mc_ci_upper = None

            uncertainty_result = {
                "mc_mean": round(mc_mean, 4) if mc_mean is not None else None,
                "mc_std": round(mc_std, 4) if mc_std is not None else None,
                "mc_ci_lower": round(mc_ci_lower, 4) if mc_ci_lower is not None else None,
                "mc_ci_upper": round(mc_ci_upper, 4) if mc_ci_upper is not None else None,
                "u_ti": round(u_ti, 6) if u_ti is not None else None,
                "u_ta": round(u_ta, 6) if u_ta is not None else None,
                "adaptive_w_ti": round(adaptive_w_ti, 4) if adaptive_w_ti is not None else None,
                "gamma": gamma,
                "n_samples": n_mc_samples,
            }

        # ── Assemble cMSCI ─────────────────────────────────────
        # cMSCI is the highest available variant
        # NOTE(review): the cascade below uses exact float equality as a
        # "this stage had no effect" heuristic (each variant defaults to the
        # previous one); it would misreport if a stage coincidentally produced
        # an identical score — confirm this is acceptable.
        cmsci = variant_f
        active_variant = "F"

        if variant_f == variant_e:
            active_variant = "E" if variant_e != variant_d else "D"
            if variant_e == variant_d:
                active_variant = "D" if variant_d != variant_c else "C"
                if variant_d == variant_c:
                    active_variant = "C" if variant_c != variant_b else "B"
                    if variant_c == variant_b:
                        active_variant = "B" if variant_b is not None else "A"

        # Final cMSCI: use the most sophisticated available variant
        if cmsci is None:
            cmsci = msci  # fallback to legacy
            active_variant = "A"

        logger.info(
            "cMSCI = %.4f (variant %s) | MSCI = %s",
            cmsci if cmsci is not None else 0.0,
            active_variant,
            msci,
        )

        return {
            "cmsci": round(cmsci, 4) if cmsci is not None else None,
            "msci": round(msci, 4) if msci is not None else None,
            "active_variant": active_variant,
            "scores": {
                "st_i": st_i,
                "st_a": st_a,
                "si_a": si_a,
            },
            "gram": {
                "text_image": round(gram_ti, 4) if gram_ti is not None else None,
                "text_audio": round(gram_ta, 4) if gram_ta is not None else None,
                "text_image_audio": round(gram_tia, 4) if gram_tia is not None else None,
                "coherence_2way": round(gram_coherence_2way, 4) if gram_coherence_2way is not None else None,
            },
            "calibration": {
                "z_st_i": round(z_st_i, 4) if z_st_i is not None else None,
                "z_st_a": round(z_st_a, 4) if z_st_a is not None else None,
                "z_gram_ti": round(z_gram_ti, 4) if z_gram_ti is not None else None,
                "z_gram_ta": round(z_gram_ta, 4) if z_gram_ta is not None else None,
                "z_compl": round(z_compl, 4) if z_compl is not None else None,
                "gram_ia_volume": round(gram_ia_volume, 4) if gram_ia_volume is not None else None,
                "u_ti": round(u_ti, 6) if u_ti is not None else None,
                "u_ta": round(u_ta, 6) if u_ta is not None else None,
                "adaptive_w_ti": round(adaptive_w_ti, 4) if adaptive_w_ti is not None else None,
                "cal_mode": cal_mode if self._calibration is not None else None,
                "w_ti": w_ti,
                "w_3d": w_3d,
                "gamma": gamma,
                "margin_alpha": CMSCI_MARGIN_ALPHA if contrastive_result else None,
            },
            "contrastive": contrastive_result,
            "uncertainty": uncertainty_result,
            "variant_scores": {
                "A_msci": round(variant_a, 4) if variant_a is not None else None,
                "B_gram": round(variant_b, 4) if variant_b is not None else None,
                "C_gram_znorm": round(variant_c, 4) if variant_c is not None else None,
                "D_gram_znorm_contrastive": round(variant_d, 4) if variant_d is not None else None,
                "E_gram_znorm_contrastive_exmcr": round(variant_e, 4) if variant_e is not None else None,
                "F_full_cmsci": round(variant_f, 4) if variant_f is not None else None,
            },
        }

    def evaluate_batch(
        self,
        items: List[Dict[str, str]],
        n_mc_samples: int = 100,
    ) -> List[Dict[str, Any]]:
        """
        Evaluate a batch of (text, image_path, audio_path) triples.

        Args:
            items: List of dicts with keys "text", "image_path", "audio_path", "domain".
            n_mc_samples: MC samples per item.

        Returns:
            List of result dicts from evaluate().
        """
        results = []
        for item in items:
            result = self.evaluate(
                text=item.get("text", ""),
                image_path=item.get("image_path"),
                audio_path=item.get("audio_path"),
                domain=item.get("domain", ""),
                n_mc_samples=n_mc_samples,
            )
            results.append(result)
        return results
src/coherence/gram_volume.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gramian Volume Scoring for Multimodal Coherence.
3
+
4
+ The Gramian volume measures the geometric dispersion of embedding vectors.
5
+ For n L2-normalized vectors, the Gramian matrix G has G_ij = <vi, vj>.
6
+
7
+ volume = sqrt(det(G))
8
+
9
+ Properties:
10
+ - Identical vectors → det(G) = 0 → volume = 0 (perfect alignment)
11
+ - Mutually orthogonal unit vectors → det(G) = 1 → volume = 1 (max dispersion)
12
+ - Coherence = 1 - volume → [0, 1] where 1 = perfect alignment
13
+
14
+ For 2 unit vectors:
15
+ det(G) = 1 - cos²(θ) = sin²(θ)
16
+ volume = |sin(θ)|
17
+ coherence = 1 - |sin(θ)| ≈ cos(θ) for small angles
18
+
19
+ For 3 unit vectors:
20
+ det(G) = 1 - cos²(a) - cos²(b) - cos²(c) + 2·cos(a)·cos(b)·cos(c)
21
+ where a, b, c are pairwise angles
22
+ This captures the full tri-modal geometric relationship in one number.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import numpy as np
28
+
29
+
30
+ def _normalize(v: np.ndarray, eps: float = 1e-12) -> np.ndarray:
31
+ """L2-normalize a vector."""
32
+ v = v.astype(np.float64).squeeze()
33
+ norm = np.linalg.norm(v) + eps
34
+ return v / norm
35
+
36
+
37
+ def gram_volume_2d(v1: np.ndarray, v2: np.ndarray) -> float:
38
+ """
39
+ Gramian volume for 2 vectors (area of parallelogram).
40
+
41
+ For unit vectors: volume = |sin(θ)| where θ is the angle between them.
42
+ Range: [0, 1] — 0 when identical, 1 when orthogonal.
43
+ """
44
+ v1_n = _normalize(v1)
45
+ v2_n = _normalize(v2)
46
+ cos_sim = np.clip(np.dot(v1_n, v2_n), -1.0, 1.0)
47
+ # det(G) = 1 - cos²(θ)
48
+ det_g = 1.0 - cos_sim ** 2
49
+ return float(np.sqrt(max(det_g, 0.0)))
50
+
51
+
52
+ def gram_volume_3d(
53
+ v1: np.ndarray, v2: np.ndarray, v3: np.ndarray,
54
+ ) -> float:
55
+ """
56
+ Gramian volume for 3 vectors (volume of parallelepiped).
57
+
58
+ For unit vectors with pairwise cosines a, b, c:
59
+ det(G) = 1 - a² - b² - c² + 2abc
60
+
61
+ Range: [0, 1] — 0 when all collinear, 1 when mutually orthogonal.
62
+ """
63
+ v1_n = _normalize(v1)
64
+ v2_n = _normalize(v2)
65
+ v3_n = _normalize(v3)
66
+
67
+ a = np.dot(v1_n, v2_n)
68
+ b = np.dot(v1_n, v3_n)
69
+ c = np.dot(v2_n, v3_n)
70
+
71
+ det_g = 1.0 - a**2 - b**2 - c**2 + 2.0 * a * b * c
72
+ return float(np.sqrt(max(det_g, 0.0)))
73
+
74
+
75
def gram_volume_nd(*vectors: np.ndarray) -> float:
    """
    Gramian volume for n vectors (general case).

    Builds the Gram matrix G_ij = <vi, vj> from L2-normalized vectors
    and returns sqrt(det(G)).

    Improvements over the previous version: the O(n²) Python double loop
    is replaced by a single vectorized ``M @ M.T``, and the special-cased
    delegation for n == 2 / n == 3 is removed — the general Gram-matrix
    formula yields the same values (up to float rounding), so one code
    path now covers every n.

    Args:
        *vectors: Variable number of numpy arrays (embeddings).

    Returns:
        Gramian volume in [0, 1] for unit vectors; 0.0 for n < 2
        (a single vector or none spans no area/volume).
    """
    if len(vectors) < 2:
        return 0.0

    eps = 1e-12
    # Stack L2-normalized rows: M is (n, dim).
    rows = []
    for v in vectors:
        w = v.astype(np.float64).squeeze()
        rows.append(w / (np.linalg.norm(w) + eps))
    M = np.stack(rows)

    # Gram matrix of pairwise inner products, computed in one matmul.
    G = M @ M.T
    det_g = float(np.linalg.det(G))
    # Clamp: near-singular G can produce a tiny negative determinant.
    return float(np.sqrt(max(det_g, 0.0)))
108
+
109
+
110
def normalized_gram_coherence(volume: float, n_vectors: int = 2) -> float:
    """
    Map Gramian volume to coherence score in [0, 1].

    1 = perfect alignment (volume = 0, all vectors identical)
    0 = maximum dispersion (volume = 1, mutually orthogonal)

    Args:
        volume: Gramian volume (output of gram_volume_* functions).
        n_vectors: Number of vectors used (for documentation; mapping is the same).

    Returns:
        Coherence score in [0, 1].
    """
    coherence = 1.0 - volume
    if coherence < 0.0:
        coherence = 0.0
    elif coherence > 1.0:
        coherence = 1.0
    return float(coherence)
src/coherence/negative_bank.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contrastive Negative Bank for cMSCI Calibration.
3
+
4
+ Computes contrastive margins by comparing a matched (text, image, audio)
5
+ triple against hard-negative alternatives from the embedding indexes.
6
+
7
+ A positive contrastive margin means the matched triple has tighter
8
+ geometric coherence than mismatched alternatives — the defining
9
+ property of a well-calibrated metric.
10
+
11
+ Contrastive margin:
12
+ margin = mean(neg_volumes) - matched_volume
13
+ > 0 → matched triple is more coherent than negatives (good)
14
+ ≤ 0 → metric cannot distinguish matched from mismatched (bad)
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ from pathlib import Path
21
+ from typing import Dict, List, Optional, Tuple
22
+
23
+ import numpy as np
24
+
25
+ from src.coherence.gram_volume import gram_volume_2d, gram_volume_3d, normalized_gram_coherence
26
+ from src.embeddings.similarity import l2_normalize
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class NegativeBank:
    """
    Loads pre-computed embedding indexes and provides hard negatives.

    Hard negatives are embeddings with high individual similarity to the
    query but from a different domain — the most challenging cases for
    the coherence metric.
    """

    def __init__(
        self,
        image_index_path: str = "data/embeddings/image_index.npz",
        audio_index_path: str = "data/embeddings/audio_index.npz",
    ):
        # Index arrays stay None until the corresponding .npz file is
        # successfully loaded; a missing file simply disables that modality.
        self._image_ids: Optional[np.ndarray] = None
        self._image_embs: Optional[np.ndarray] = None
        self._image_domains: Optional[np.ndarray] = None
        self._audio_ids: Optional[np.ndarray] = None
        self._audio_embs: Optional[np.ndarray] = None
        self._audio_domains: Optional[np.ndarray] = None

        self._load_index(image_index_path, "image")
        self._load_index(audio_index_path, "audio")

    def _load_index(self, path: str, modality: str) -> None:
        """Load one .npz embedding index for *modality* ("image" or "audio")."""
        p = Path(path)
        if not p.exists():
            logger.warning("Index not found: %s — %s negatives disabled", path, modality)
            return

        # allow_pickle=True because the ids/domains arrays may hold objects.
        # NOTE(review): allow_pickle on untrusted files is unsafe — assumed
        # these indexes are produced locally by the project itself.
        data = np.load(path, allow_pickle=True)
        # Accept either key naming convention: ids/paths, embs/embeddings.
        ids = data["ids"] if "ids" in data else data.get("paths", np.array([]))
        embs = data["embs"] if "embs" in data else data.get("embeddings", np.array([]))
        domains = data["domains"] if "domains" in data else np.array(["other"] * len(ids))

        if modality == "image":
            self._image_ids = ids
            self._image_embs = embs.astype(np.float32)
            self._image_domains = domains
            logger.info("Loaded image index: %d entries", len(ids))
        else:
            self._audio_ids = ids
            self._audio_embs = embs.astype(np.float32)
            self._audio_domains = domains
            logger.info("Loaded audio index: %d entries", len(ids))

    @property
    def has_images(self) -> bool:
        # True when the image index loaded and is non-empty.
        return self._image_embs is not None and len(self._image_embs) > 0

    @property
    def has_audio(self) -> bool:
        # True when the audio index loaded and is non-empty.
        return self._audio_embs is not None and len(self._audio_embs) > 0

    def get_hard_negative_images(
        self,
        text_emb: np.ndarray,
        exclude_domain: str = "",
        k: int = 5,
    ) -> List[np.ndarray]:
        """
        Get top-k hardest negative images (high text similarity but wrong domain).

        Args:
            text_emb: CLIP text embedding for the query.
            exclude_domain: Domain to exclude (the correct domain).
            k: Number of negatives to return.

        Returns:
            List of image embeddings (hard negatives).
        """
        if not self.has_images:
            return []

        # Cosine similarity of every indexed image to the query text
        # (index embeddings are assumed pre-normalized — TODO confirm).
        text_n = l2_normalize(text_emb.squeeze())
        sims = self._image_embs @ text_n

        # Filter by domain: exclude the matched domain
        if exclude_domain:
            mask = np.array([d != exclude_domain for d in self._image_domains])
        else:
            mask = np.ones(len(sims), dtype=bool)

        # Masked-out entries get -inf so they can never rank in the top-k.
        sims_masked = np.where(mask, sims, -np.inf)
        top_k_idx = np.argsort(sims_masked)[-k:][::-1]

        return [self._image_embs[i] for i in top_k_idx if sims_masked[i] > -np.inf]

    def get_hard_negative_audio(
        self,
        text_emb: np.ndarray,
        exclude_domain: str = "",
        k: int = 5,
    ) -> List[np.ndarray]:
        """
        Get top-k hardest negative audio (high text similarity but wrong domain).

        Args:
            text_emb: CLAP text embedding for the query.
            exclude_domain: Domain to exclude.
            k: Number of negatives to return.

        Returns:
            List of audio embeddings (hard negatives).
        """
        if not self.has_audio:
            return []

        text_n = l2_normalize(text_emb.squeeze())
        sims = self._audio_embs @ text_n

        if exclude_domain:
            mask = np.array([d != exclude_domain for d in self._audio_domains])
        else:
            mask = np.ones(len(sims), dtype=bool)

        sims_masked = np.where(mask, sims, -np.inf)
        top_k_idx = np.argsort(sims_masked)[-k:][::-1]

        return [self._audio_embs[i] for i in top_k_idx if sims_masked[i] > -np.inf]

    def compute_contrastive_margin(
        self,
        matched_volume: float,
        text_clip_emb: np.ndarray,
        image_emb: np.ndarray,
        text_clap_emb: Optional[np.ndarray] = None,
        audio_emb: Optional[np.ndarray] = None,
        domain: str = "",
        k: int = 5,
    ) -> Dict[str, float]:
        """
        Compute contrastive margin against hard negatives.

        For each hard negative, computes the gram volume of the negative
        triple and averages. Margin = mean(neg_volumes) - matched_volume.

        A positive margin means the matched triple is geometrically tighter
        than hard-negative alternatives.

        NOTE(review): image_emb and audio_emb are accepted for interface
        symmetry but are not referenced in this body — negatives are scored
        only against the text embeddings (gram_volume_2d of text vs negative).

        Args:
            matched_volume: Gram volume of the matched (text, image, audio) triple.
            text_clip_emb: CLIP text embedding (for finding negative images).
            image_emb: CLIP image embedding of the matched image.
            text_clap_emb: CLAP text embedding (for finding negative audio).
            audio_emb: CLAP audio embedding of the matched audio.
            domain: Domain of the matched prompt (excluded from negatives).
            k: Number of hard negatives per modality.

        Returns:
            Dict with margin, mean_neg_volume, n_negatives.
        """
        neg_volumes = []

        # Image negatives: replace matched image with hard negative
        neg_images = self.get_hard_negative_images(text_clip_emb, domain, k)
        for neg_img in neg_images:
            vol = gram_volume_2d(text_clip_emb, neg_img)
            neg_volumes.append(vol)

        # Audio negatives: replace matched audio with hard negative
        if text_clap_emb is not None:
            neg_audios = self.get_hard_negative_audio(text_clap_emb, domain, k)
            for neg_aud in neg_audios:
                vol = gram_volume_2d(text_clap_emb, neg_aud)
                neg_volumes.append(vol)

        if not neg_volumes:
            # No usable negatives: report a neutral margin of zero.
            return {
                "margin": 0.0,
                "mean_neg_volume": matched_volume,
                "n_negatives": 0,
            }

        mean_neg = float(np.mean(neg_volumes))
        margin = mean_neg - matched_volume

        return {
            "margin": float(margin),
            "mean_neg_volume": mean_neg,
            "n_negatives": len(neg_volumes),
        }
src/config/settings.py CHANGED
@@ -106,3 +106,47 @@ DRIFT_ASYMMETRY_THRESHOLD = 0.15 # |st_i - st_a| gap to flag drift
106
  RERATING_FRACTION = 0.20
107
  KAPPA_ACCEPTABLE_THRESHOLD = 0.70
108
  ALPHA_ACCEPTABLE_THRESHOLD = 0.667
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  RERATING_FRACTION = 0.20
107
  KAPPA_ACCEPTABLE_THRESHOLD = 0.70
108
  ALPHA_ACCEPTABLE_THRESHOLD = 0.667
109
# ---------------------------------------------------------------------------
# cMSCI (Calibrated Multimodal Semantic Coherence Index)
# ---------------------------------------------------------------------------

# Calibration store (fitted from RQ1 baseline data).
CMSCI_CALIBRATION_PATH = PROJECT_ROOT / "artifacts" / "cmsci_calibration.json"

# Ex-MCR cross-space alignment (CLAP → CLIP projection head weights).
EXMCR_WEIGHTS_PATH = PROJECT_ROOT / "models" / "exmcr" / "ex_clap.pt"

# Cross-Space Bridge (CLIP image + CLAP audio → shared 256-d bridge space).
BRIDGE_WEIGHTS_PATH = PROJECT_ROOT / "models" / "bridge" / "bridge_best.pt"

# Probabilistic adapters (ProbVLM-style uncertainty), one per embedding space.
PROB_CLIP_ADAPTER_PATH = PROJECT_ROOT / "models" / "prob_adapters" / "clip_adapter.pt"
PROB_CLAP_ADAPTER_PATH = PROJECT_ROOT / "models" / "prob_adapters" / "clap_adapter.pt"

# Full pipeline optimized parameters (via LOO-CV on RQ3 human ratings).
# Full-sample rho=0.608 (p=0.0004), LOO-CV rho=0.546 (p=0.0018), overfit gap=0.001
# Selected in 87% of LOO folds (26/30) — highly stable.
CMSCI_MARGIN_ALPHA = 16  # Margin scaling factor (amplifies contrastive signal)
CMSCI_CHANNEL_WEIGHT_TI = 0.90  # Text-image channel weight (1 - w for text-audio)
CMSCI_CALIBRATION_MODE = "gram"  # "cosine" (z-norm cosine sims) or "gram" (z-norm gram coherences)

# Variant E: ExMCR cross-modal complementarity (w_3d=0 recovers D exactly).
# ExMCR projects CLAP audio → CLIP space; complementarity = Gramian dispersion.
# High complementarity = image and audio contribute unique perspectives (rewarded).
CMSCI_W_3D = 0.45  # Weight for z-normalized IA complementarity
# Variant F: ProbVLM adaptive channel weighting (gamma=0 recovers E exactly).
CMSCI_GAMMA = 0.10  # Mixing ratio: w_final = (1-gamma)*base_w + gamma*adaptive_w

# Contrastive negative bank.
CMSCI_NEGATIVE_K = 5  # Number of hard negatives per modality
CMSCI_NEGATIVE_BANK_ENABLED = True  # Enable/disable contrastive calibration

# MC sampling for uncertainty estimation.
CMSCI_MC_SAMPLES = 100  # Number of Monte Carlo samples for Variant F

# Probabilistic adapter training hyperparameters
# (consumed by src/embeddings/prob_adapter_trainer.py).
PROB_ADAPTER_EPOCHS = 100
PROB_ADAPTER_LR = 1e-4
PROB_ADAPTER_BATCH_SIZE = 32
PROB_ADAPTER_PATIENCE = 15
src/embeddings/prob_adapter_trainer.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Training Loop for ProbVLM-Style Probabilistic Adapters.
3
+
4
+ Trains lightweight post-hoc adapters on top of frozen CLIP/CLAP encoders.
5
+ Each adapter learns to predict uncertainty (Generalized Gaussian parameters)
6
+ for a single embedding space.
7
+
8
+ Two adapters to train:
9
+ 1. CLIP adapter: trained on (image_embedding, text_embedding) pairs
10
+ 2. CLAP adapter: trained on (audio_embedding, text_embedding) pairs
11
+
12
+ Training data:
13
+ - Our 57 images paired with text descriptions (CLIP pairs)
14
+ - Our 104 audio files paired with text descriptions (CLAP pairs)
15
+ - All 30 RQ1 prompts × matched media as additional pairs
16
+
17
+ Loss:
18
+ L = L1(mu, target) + GenGaussLoss(mu, alpha, beta, target)
19
+
20
+ GenGaussLoss:
21
+ -log p(target | mu, alpha, beta) ∝ log(alpha) - log(beta) + (|target - mu| / alpha)^beta
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ from pathlib import Path
28
+ from typing import Dict, List, Optional, Tuple
29
+
30
+ import numpy as np
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ try:
35
+ import torch
36
+ import torch.nn as nn
37
+ import torch.nn.functional as F
38
+ from torch.utils.data import DataLoader, Dataset, random_split
39
+ TORCH_AVAILABLE = True
40
+ except ImportError:
41
+ TORCH_AVAILABLE = False
42
+
43
+ from src.embeddings.probabilistic_adapter import ProbabilisticAdapter
44
+
45
+
46
class EmbeddingPairDataset(Dataset):
    """Dataset of (input_embedding, target_embedding) pairs.

    Wraps two parallel numpy arrays as float32 tensors so they can be
    served to a ``DataLoader`` for adapter training.
    """

    def __init__(self, inputs: np.ndarray, targets: np.ndarray):
        """
        Args:
            inputs: Source embeddings, shape [N, D].
            targets: Target embeddings, shape [N, D], aligned row-for-row
                with ``inputs``.

        Raises:
            ImportError: If PyTorch is not installed.
            ValueError: If ``inputs`` and ``targets`` differ in length.
        """
        if not TORCH_AVAILABLE:
            raise ImportError("PyTorch required")
        # A bare `assert` would be stripped under `python -O`; raise an
        # explicit error so the length invariant always holds.
        if len(inputs) != len(targets):
            raise ValueError(
                f"inputs and targets must have the same length "
                f"({len(inputs)} != {len(targets)})"
            )
        self.inputs = torch.tensor(inputs, dtype=torch.float32)
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self) -> int:
        return len(self.inputs)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.inputs[idx], self.targets[idx]
61
+
62
+
63
class GenGaussNLL(nn.Module):
    """Simplified negative log-likelihood of a Generalized Gaussian.

    Full NLL:
        -log p(x | mu, alpha, beta)
            = log(2*alpha) + log(Gamma(1/beta)/beta) + (|x - mu| / alpha)^beta

    Constant terms are dropped, leaving the per-dimension loss:
        L = log(alpha) + (|target - mu| / alpha)^beta
    averaged over every element of the batch.
    """

    def forward(
        self,
        mu: torch.Tensor,
        alpha: torch.Tensor,
        beta: torch.Tensor,
        target: torch.Tensor,
    ) -> torch.Tensor:
        # Guard the scale away from zero before dividing by it.
        safe_scale = alpha.clamp(min=1e-6)
        abs_err = (target - mu).abs()
        per_dim_nll = safe_scale.log() + (abs_err / safe_scale) ** beta
        return per_dim_nll.mean()
85
+
86
+
87
def train_prob_adapter(
    input_embeddings: np.ndarray,
    target_embeddings: np.ndarray,
    epochs: int = 100,
    lr: float = 1e-4,
    batch_size: int = 32,
    val_split: float = 0.15,
    patience: int = 15,
    output_path: Optional[str] = None,
    adapter_name: str = "adapter",
) -> ProbabilisticAdapter:
    """
    Train a ProbabilisticAdapter on paired embeddings.

    Loss per batch is L1(mu, target) + GenGaussNLL(mu, alpha, beta, target);
    the best validation checkpoint is kept (and reloaded at the end when
    ``output_path`` is given).

    Args:
        input_embeddings: Source embeddings [N, 512] (e.g. image CLIP or audio CLAP).
        target_embeddings: Target embeddings [N, 512] (e.g. text CLIP or text CLAP).
        epochs: Maximum training epochs.
        lr: Learning rate.
        batch_size: Batch size.
        val_split: Fraction for validation.
        patience: Early stopping patience (epochs without val improvement).
        output_path: If set, save best model here.
        adapter_name: Name for logging.

    Returns:
        Trained ProbabilisticAdapter (best-val weights if ``output_path`` was
        set, otherwise the last-epoch weights moved to CPU).

    Raises:
        ImportError: If PyTorch is not installed.
    """
    if not TORCH_AVAILABLE:
        raise ImportError("PyTorch required for training")

    # Prefer CUDA, then Apple-Silicon MPS, then CPU.
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

    # Build dataset and a fixed-seed train/val split (reproducible folds).
    dataset = EmbeddingPairDataset(input_embeddings, target_embeddings)
    n_val = max(1, int(len(dataset) * val_split))
    n_train = len(dataset) - n_val
    # NOTE(review): a 1-item dataset yields n_train == 0 and an empty train
    # loader (avg_train becomes NaN) — confirm callers always pass >= 2 pairs.
    train_ds, val_ds = random_split(
        dataset, [n_train, n_val],
        generator=torch.Generator().manual_seed(42),
    )

    # Drop a ragged final batch only when there is more than one batch.
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=len(train_ds) > batch_size)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    # Build model + optimizer; cosine LR decay over the full epoch budget.
    input_dim = input_embeddings.shape[1]
    adapter = ProbabilisticAdapter(input_dim=input_dim).to(device)
    optimizer = torch.optim.AdamW(adapter.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    l1_loss = nn.L1Loss()
    gg_loss = GenGaussNLL()

    best_val_loss = float("inf")
    patience_counter = 0

    logger.info(
        "Training %s adapter: %d train, %d val, %d epochs, device=%s",
        adapter_name, n_train, n_val, epochs, device,
    )

    for epoch in range(epochs):
        # --- Train pass ---
        adapter.train()
        train_losses = []
        for inp, tgt in train_loader:
            inp, tgt = inp.to(device), tgt.to(device)
            optimizer.zero_grad()

            mu, alpha, beta = adapter(inp)
            # Combined objective: L1 anchors mu; NLL shapes alpha/beta.
            loss = l1_loss(mu, tgt) + gg_loss(mu, alpha, beta, tgt)
            loss.backward()
            # Gradient clipping keeps the NLL's pow() term from exploding.
            torch.nn.utils.clip_grad_norm_(adapter.parameters(), max_norm=1.0)
            optimizer.step()
            train_losses.append(loss.item())

        scheduler.step()

        # --- Validation pass (no gradients) ---
        adapter.eval()
        val_losses = []
        with torch.no_grad():
            for inp, tgt in val_loader:
                inp, tgt = inp.to(device), tgt.to(device)
                mu, alpha, beta = adapter(inp)
                loss = l1_loss(mu, tgt) + gg_loss(mu, alpha, beta, tgt)
                val_losses.append(loss.item())

        avg_train = np.mean(train_losses)
        avg_val = np.mean(val_losses) if val_losses else float("inf")

        # Log the first epoch and every tenth thereafter.
        if (epoch + 1) % 10 == 0 or epoch == 0:
            logger.info(
                " [%s] Epoch %d/%d: train=%.4f, val=%.4f",
                adapter_name, epoch + 1, epochs, avg_train, avg_val,
            )

        # Early stopping: checkpoint on improvement, count stalls otherwise.
        if avg_val < best_val_loss:
            best_val_loss = avg_val
            patience_counter = 0
            if output_path:
                adapter.save(output_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                logger.info(" [%s] Early stopping at epoch %d", adapter_name, epoch + 1)
                break

    # Restore the best-val checkpoint if one was written; otherwise return
    # the final weights on CPU.
    if output_path and Path(output_path).exists():
        adapter = ProbabilisticAdapter.load(output_path)
        adapter = adapter.to(device)
    else:
        adapter = adapter.cpu()

    adapter.eval()
    logger.info(" [%s] Training complete. Best val_loss=%.4f", adapter_name, best_val_loss)
    return adapter
207
+
208
+
209
def build_training_pairs_from_index(
    embedding_index_path: str,
    text_embedder_fn,
    modality: str = "image",
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build (media_embedding, text_embedding) pairs from an embedding index.

    Each media entry gets a caption derived from its filename (underscores
    and hyphens become spaces, known prefixes are stripped, and the domain
    is prepended when it is not "other"); the caption is then embedded with
    ``text_embedder_fn``. Entries whose embedding fails are logged and
    skipped.

    Args:
        embedding_index_path: Path to image_index.npz or audio_index.npz.
        text_embedder_fn: Callable mapping a caption string to an np.ndarray.
        modality: "image" for CLIP text, "audio" for CLAP text. Currently
            informational only — both modalities share the caption logic.

    Returns:
        (media_embeddings, text_embeddings) both shape [N, 512].
    """
    data = np.load(embedding_index_path, allow_pickle=True)

    # Indexes have shipped under two key conventions; accept both.
    ids = data["ids"] if "ids" in data else data.get("paths", np.array([]))
    embs = data["embs"] if "embs" in data else data.get("embeddings", np.array([]))
    domains = data["domains"] if "domains" in data else np.array(["other"] * len(ids))

    media_rows = []
    text_rows = []

    for row, (file_id, domain) in enumerate(zip(ids, domains)):
        # Turn the filename stem into a rough caption.
        caption = Path(str(file_id)).stem.replace("_", " ").replace("-", " ")
        # Strip recording/processing prefixes that carry no semantics.
        for known_prefix in ("fs ", "wm ", "proc "):
            if caption.lower().startswith(known_prefix):
                caption = caption[len(known_prefix):]
        # Prepend the domain for extra context, unless it is the catch-all.
        if domain != "other":
            caption = f"{domain}: {caption}"

        try:
            embedded_text = text_embedder_fn(caption)
            media_rows.append(embs[row])
            text_rows.append(embedded_text)
        except Exception as err:
            logger.warning("Skipping %s: %s", file_id, err)

    return np.array(media_rows), np.array(text_rows)
src/embeddings/probabilistic_adapter.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ProbVLM-Style Probabilistic Adapter for Uncertainty Estimation.
3
+
4
+ Converts point embeddings into distributions (Generalized Gaussian)
5
+ following the BayesCap approach from ProbVLM.
6
+
7
+ Each adapter takes a frozen embedding and predicts:
8
+ mu: Shift from the input embedding (residual)
9
+ alpha: Scale parameter (controls spread)
10
+ beta: Shape parameter (controls tail behavior)
11
+
12
+ These define a Generalized Gaussian distribution:
13
+ p(x) ∝ exp(-(|x - mu| / alpha)^beta)
14
+
15
+ MC sampling from this distribution produces N embedding samples,
16
+ which propagate uncertainty through the Gramian volume computation.
17
+
18
+ Architecture: BayesCap_MLP
19
+ input → Linear(d, hidden) → ReLU → Dropout
20
+ → Linear(hidden, hidden) → ReLU → Dropout
21
+ → Three heads: mu_head, alpha_head, beta_head
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ from pathlib import Path
28
+ from typing import Dict, Optional, Tuple
29
+
30
+ import numpy as np
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ try:
35
+ import torch
36
+ import torch.nn as nn
37
+ import torch.nn.functional as F
38
+ TORCH_AVAILABLE = True
39
+ except ImportError:
40
+ TORCH_AVAILABLE = False
41
+
42
+
43
def _check_torch():
    """Raise ImportError unless the optional torch dependency was imported."""
    if TORCH_AVAILABLE:
        return
    raise ImportError("PyTorch required for ProbabilisticAdapter")
46
+
47
+
48
class ProbabilisticAdapter(nn.Module):
    """
    BayesCap-style adapter that maps point embeddings to distributions.

    Takes a frozen embedding (from CLIP or CLAP) and predicts
    Generalized Gaussian parameters: (mu, alpha, beta), where
        p(x) ∝ exp(-(|x - mu| / alpha)^beta)

    The adapter is lightweight (~0.5M params) and trains in minutes
    on small datasets.
    """

    def __init__(
        self,
        input_dim: int = 512,
        hidden_dim: int = 256,
        num_layers: int = 3,
        dropout: float = 0.1,
    ):
        """
        Args:
            input_dim: Dimensionality of the input (and output) embedding.
            hidden_dim: Width of the shared backbone layers.
            num_layers: Total layer count; the backbone gets num_layers - 1
                Linear+ReLU+Dropout stages, the heads form the last layer.
            dropout: Dropout probability between backbone stages.
        """
        _check_torch()
        super().__init__()

        self.input_dim = input_dim

        # Shared backbone: (num_layers - 1) Linear → ReLU → Dropout stages.
        layers = []
        in_d = input_dim
        for _ in range(num_layers - 1):
            layers.extend([
                nn.Linear(in_d, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            in_d = hidden_dim
        self.backbone = nn.Sequential(*layers)

        # Three output heads, one per distribution parameter.
        self.mu_head = nn.Linear(hidden_dim, input_dim)
        self.alpha_head = nn.Linear(hidden_dim, input_dim)
        self.beta_head = nn.Linear(hidden_dim, input_dim)

        # Constructor kwargs, persisted alongside the weights by save()
        # so load() can rebuild an identical architecture.
        self.config = {
            "input_dim": input_dim,
            "hidden_dim": hidden_dim,
            "num_layers": num_layers,
            "dropout": dropout,
        }

    def forward(
        self, embedding: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Predict distribution parameters from a point embedding.

        Args:
            embedding: Input embedding [batch, input_dim].

        Returns:
            mu: Location parameter [batch, input_dim] (embedding + residual)
            alpha: Scale parameter [batch, input_dim] (> 0, via softplus)
            beta: Shape parameter [batch, input_dim] (> 0, via softplus)
        """
        h = self.backbone(embedding)

        # mu is a residual on top of the input, anchoring the distribution
        # to the original (frozen) embedding.
        mu = embedding + self.mu_head(h)

        # alpha, beta: strictly positive via softplus + epsilon floor.
        alpha = F.softplus(self.alpha_head(h)) + 1e-6
        beta = F.softplus(self.beta_head(h)) + 1e-6

        return mu, alpha, beta

    def sample(
        self,
        embedding: np.ndarray,
        n_samples: int = 100,
    ) -> np.ndarray:
        """
        Draw Monte Carlo samples from the predicted distribution.

        Uses the reparameterization trick for Generalized Gaussian:
            x = mu + alpha * sign(u) * |u|^(1/beta)
        where u ~ Uniform(-1, 1)

        Args:
            embedding: Input embedding, shape (dim,) or (1, dim).
            n_samples: Number of MC samples.

        Returns:
            Samples array, shape (n_samples, dim), each row L2-normalized.
        """
        _check_torch()
        self.eval()

        # Accept (dim,) or (1, dim); canonicalize to a [1, dim] batch.
        emb = embedding.squeeze()
        if emb.ndim == 1:
            emb = emb[np.newaxis, :]

        with torch.no_grad():
            # NOTE(review): tensor is created on the default device; assumes
            # the adapter lives on CPU at inference time — confirm.
            x = torch.tensor(emb, dtype=torch.float32)
            mu, alpha, beta = self.forward(x)

            # Expand for sampling: [1, dim] -> [n_samples, dim] (views, no copy).
            mu = mu.expand(n_samples, -1)
            alpha = alpha.expand(n_samples, -1)
            beta = beta.expand(n_samples, -1)

            # Reparameterized sampling from the Generalized Gaussian;
            # the 1e-8 floor keeps |u|^(1/beta) finite at u == 0.
            u = torch.rand_like(mu) * 2 - 1  # Uniform(-1, 1)
            sign = torch.sign(u)
            samples = mu + alpha * sign * (torch.abs(u) + 1e-8).pow(1.0 / beta)

            # L2 normalize samples (stay on unit sphere, like CLIP/CLAP outputs)
            samples = F.normalize(samples, p=2, dim=-1)

        return samples.cpu().numpy()

    def uncertainty(self, embedding: np.ndarray) -> float:
        """
        Compute scalar aleatoric uncertainty for an embedding.

        Returns the mean predicted alpha (scale parameter) across dimensions.
        High alpha → high uncertainty → wide distribution.

        Args:
            embedding: Input embedding, shape (dim,) or (1, dim).

        Returns:
            Scalar uncertainty value (mean alpha).
        """
        _check_torch()
        self.eval()

        emb = embedding.squeeze()
        if emb.ndim == 1:
            emb = emb[np.newaxis, :]

        with torch.no_grad():
            x = torch.tensor(emb, dtype=torch.float32)
            _, alpha, _ = self.forward(x)
            return float(alpha.mean().item())

    def save(self, path: str) -> None:
        """Save adapter weights (.pt) + architecture config (sibling .json)."""
        _check_torch()
        import json
        p = Path(path)
        p.parent.mkdir(parents=True, exist_ok=True)
        torch.save(self.state_dict(), p)
        # Config travels next to the weights so load() can rebuild the model.
        config_path = p.with_suffix(".json")
        with config_path.open("w") as f:
            json.dump(self.config, f, indent=2)
        logger.info("Saved ProbabilisticAdapter to %s", path)

    @classmethod
    def load(cls, path: str) -> "ProbabilisticAdapter":
        """Load adapter from saved weights (expects the sibling .json config)."""
        _check_torch()
        import json
        p = Path(path)
        config_path = p.with_suffix(".json")
        with config_path.open("r") as f:
            config = json.load(f)
        model = cls(**config)
        # weights_only=True avoids arbitrary-object unpickling on load.
        state_dict = torch.load(p, map_location="cpu", weights_only=True)
        model.load_state_dict(state_dict)
        model.eval()
        logger.info("Loaded ProbabilisticAdapter from %s", path)
        return model
src/embeddings/space_alignment.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ex-MCR Cross-Space Alignment: CLAP Audio → CLIP Space.
3
+
4
+ Ex-MCR (Ex-Modal Contrastive Retrieval) projects CLAP audio embeddings
5
+ INTO CLIP space while keeping CLIP embeddings unchanged. This lets us
6
+ compute meaningful image-audio similarity and full 3-way Gramian volume.
7
+
8
+ Architecture decision: Ex-MCR over C-MCR because:
9
+ - Ex-MCR keeps CLIP embeddings frozen (no recomputation needed)
10
+ - C-MCR projects BOTH spaces into a new space (breaks everything)
11
+
12
+ The projector is a lightweight MLP:
13
+ CLAP 512-d → Linear(512, 512) → ReLU → Linear(512, 512) → L2 norm
14
+
15
+ If Ex-MCR weights are not available, falls back to an untrained identity
16
+ projection (which is equivalent to not using the projector).
17
+
18
+ CLAP compatibility note:
19
+ Our project uses `laion/clap-htsat-unfused`.
20
+ Ex-MCR uses `laion_clap_fullset_fusion` (different model).
21
+ If projections are poor with our CLAP, switch to the fusion model.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import logging
27
+ from pathlib import Path
28
+ from typing import Optional
29
+
30
+ import numpy as np
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ try:
35
+ import torch
36
+ import torch.nn as nn
37
+ import torch.nn.functional as F
38
+ TORCH_AVAILABLE = True
39
+ except ImportError:
40
+ TORCH_AVAILABLE = False
41
+
42
+
43
class ExMCRProjector:
    """
    Projects CLAP audio embeddings into CLIP space.

    Usage:
        proj = ExMCRProjector("models/exmcr/ex_clap.pt")
        audio_in_clip = proj.project_audio(clap_embedding)  # now comparable to CLIP

    Without trained weights the projector runs in identity mode: inputs are
    only L2-normalized and passed through.
    """

    def __init__(
        self,
        weights_path: Optional[str] = None,
        device: str = "cpu",
    ):
        """
        Args:
            weights_path: Path to Ex-MCR CLAP→CLIP projection weights (.pt).
                If None or file doesn't exist, uses identity (passthrough).
            device: Torch device for inference.
        """
        self._model = None
        self._device = device
        self._identity_mode = True

        if weights_path and Path(weights_path).exists() and TORCH_AVAILABLE:
            self._load_weights(weights_path)
        elif weights_path and not Path(weights_path).exists():
            logger.warning(
                "Ex-MCR weights not found: %s — using identity projection", weights_path
            )

    def _load_weights(self, path: str) -> None:
        """Load the Ex-MCR projection head from a saved state dict.

        Supports three checkpoint layouts (``layers.N.*`` keys, bare
        Sequential ``N.*`` keys, or a generic two-weight MLP). Falls back
        to identity mode when the format is unrecognized.
        """
        # weights_only=True avoids arbitrary-object unpickling on load.
        state_dict = torch.load(path, map_location=self._device, weights_only=True)
        keys = list(state_dict.keys())

        if any("layers" in k for k in keys):
            # Layout: layers.0.weight / layers.0.bias / layers.2.weight / ...
            in_dim = state_dict["layers.0.weight"].shape[1]
            hidden_dim = state_dict["layers.0.weight"].shape[0]
            out_dim = state_dict["layers.2.weight"].shape[0]
            model = nn.Sequential(
                nn.Linear(in_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, out_dim),
            )
            # Strip the "layers." prefix so keys match the bare Sequential.
            renamed = {k.replace("layers.", ""): v for k, v in state_dict.items()}
            model.load_state_dict(renamed)
        elif any(k.startswith("0.") for k in keys):
            # Layout: 0.weight, 0.bias, 2.weight, 2.bias (plain Sequential).
            in_dim = state_dict["0.weight"].shape[1]
            hidden_dim = state_dict["0.weight"].shape[0]
            out_dim = state_dict["2.weight"].shape[0]
            model = nn.Sequential(
                nn.Linear(in_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, out_dim),
            )
            model.load_state_dict(state_dict)
        else:
            # Generic: infer dimensions from the first and last weight tensors.
            weight_keys = [k for k in keys if "weight" in k]
            if len(weight_keys) >= 2:
                first_w = state_dict[weight_keys[0]]
                last_w = state_dict[weight_keys[-1]]
                in_dim = first_w.shape[1]
                hidden_dim = first_w.shape[0]
                out_dim = last_w.shape[0]
                model = nn.Sequential(
                    nn.Linear(in_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, out_dim),
                )
                model.load_state_dict(state_dict)
            else:
                # Leave _identity_mode True so the projector still works.
                logger.warning("Unrecognized Ex-MCR weight format — using identity")
                return

        model.to(self._device)
        model.eval()
        self._model = model
        self._identity_mode = False
        logger.info(
            "Ex-MCR projector loaded: %d → %d → %d (from %s)",
            in_dim, hidden_dim, out_dim, path,
        )

    @property
    def is_identity(self) -> bool:
        """True if projector is passthrough (no trained weights loaded)."""
        return self._identity_mode

    def project_audio(self, clap_embedding: np.ndarray) -> np.ndarray:
        """
        Project CLAP audio embedding into CLIP space.

        Args:
            clap_embedding: CLAP audio embedding, shape (512,) or (N, 512).

        Returns:
            Projected embedding in CLIP space, L2-normalized. A (512,) or
            (1, 512) input yields shape (512,); an (N, 512) input with
            N > 1 yields shape (N, 512).
        """
        if self._identity_mode:
            emb = np.asarray(clap_embedding, dtype=np.float32)
            if emb.ndim > 1 and emb.shape[0] > 1:
                # BUGFIX: batched input must be normalized per row; a single
                # global norm would mis-scale every embedding in the batch.
                norms = np.linalg.norm(emb, axis=-1, keepdims=True) + 1e-12
                return emb / norms
            flat = emb.squeeze()
            return flat / (np.linalg.norm(flat) + 1e-12)

        if not TORCH_AVAILABLE:
            # Defensive fallback; non-identity mode normally implies torch.
            return clap_embedding.squeeze().astype(np.float32)

        was_1d = clap_embedding.ndim == 1 or (
            clap_embedding.ndim == 2 and clap_embedding.shape[0] == 1
        )
        emb = clap_embedding.squeeze()
        if emb.ndim == 1:
            emb = emb[np.newaxis, :]

        with torch.no_grad():
            x = torch.tensor(emb, dtype=torch.float32, device=self._device)
            projected = self._model(x)
            projected = F.normalize(projected, p=2, dim=-1)
            result = projected.cpu().numpy()

        if was_1d:
            return result.squeeze(0)
        return result

    def project_audio_batch(self, clap_embeddings: np.ndarray) -> np.ndarray:
        """
        Batch projection of CLAP audio embeddings into CLIP space.

        Args:
            clap_embeddings: Shape (N, 512).

        Returns:
            Projected embeddings in CLIP space, shape (N, 512), L2-normalized.
        """
        if self._identity_mode:
            norms = np.linalg.norm(clap_embeddings, axis=1, keepdims=True) + 1e-12
            return (clap_embeddings / norms).astype(np.float32)

        if not TORCH_AVAILABLE:
            norms = np.linalg.norm(clap_embeddings, axis=1, keepdims=True) + 1e-12
            return (clap_embeddings / norms).astype(np.float32)

        with torch.no_grad():
            x = torch.tensor(clap_embeddings, dtype=torch.float32, device=self._device)
            projected = self._model(x)
            projected = F.normalize(projected, p=2, dim=-1)
        return projected.cpu().numpy()