pratik-250620 committed on
Commit
5f2e51b
·
verified ·
1 Parent(s): 6835659

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +300 -36
app.py CHANGED
@@ -5,17 +5,19 @@ Live demonstration of multimodal generation + coherence evaluation.
5
  Enter a scene description and the system produces coherent text, image,
6
  and audio with real-time MSCI scoring.
7
 
8
- Pipeline: HF Inference API (text) + CLIP retrieval (image) + CLAP retrieval (audio)
 
9
  """
10
 
11
  from __future__ import annotations
12
 
 
13
  import logging
14
  import os
15
  import sys
16
  import time
17
  from pathlib import Path
18
- from typing import Optional
19
 
20
  import streamlit as st
21
 
@@ -71,10 +73,14 @@ html, body, [class*="css"] { font-family: 'Inter', -apple-system, sans-serif; }
71
  font-size: 0.7rem; font-weight: 600; letter-spacing: 0.03em;
72
  }
73
  .chip-purple { background: rgba(129,140,248,0.14); color: #a5b4fc; }
 
74
  .chip-green { background: rgba(52,211,153,0.14); color: #6ee7b7; }
 
75
  .chip-dot { width: 6px; height: 6px; border-radius: 50%; }
76
  .chip-dot-purple { background: #818cf8; }
 
77
  .chip-dot-green { background: #34d399; }
 
78
 
79
  .scores-grid {
80
  display: grid; grid-template-columns: repeat(4, 1fr);
@@ -196,6 +202,82 @@ EXAMPLE_PROMPTS = {
196
  }
197
  DOMAIN_ICONS = {"nature": "\U0001f33f", "urban": "\U0001f3d9\ufe0f", "water": "\U0001f30a", "mixed": "\U0001f310", "other": "\U0001f4cd"}
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  # ---------------------------------------------------------------------------
201
  # Cached model loading
@@ -223,12 +305,136 @@ def get_inference_client():
223
  return InferenceClient(token=token)
224
 
225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  # ---------------------------------------------------------------------------
227
  # Generation / retrieval functions
228
  # ---------------------------------------------------------------------------
229
 
230
- def gen_text_hf(prompt: str) -> dict:
231
- """Generate descriptive text using HF Inference API."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  system_prompt = (
233
  "You are a concise descriptive writer. "
234
  "Write a literal description of the scene in 3 to 5 natural sentences. "
@@ -236,21 +442,19 @@ def gen_text_hf(prompt: str) -> dict:
236
  "Focus on concrete visual details AND the likely audio ambience."
237
  )
238
  try:
239
- client = get_inference_client()
240
- response = client.chat_completion(
241
- messages=[
242
- {"role": "system", "content": system_prompt},
243
- {"role": "user", "content": f"Describe this scene: {prompt}"},
244
- ],
245
- max_tokens=250,
246
- )
247
- text = response.choices[0].message.content.strip()
248
  if not text:
249
  raise ValueError("Empty response")
250
- return {"text": text, "image_prompt": prompt, "audio_prompt": prompt, "plan": None}
251
  except Exception as e:
252
- logger.warning("HF Inference API failed: %s — using prompt as text", e)
253
- return {"text": prompt, "image_prompt": prompt, "audio_prompt": prompt, "plan": None}
 
 
 
 
 
 
 
254
 
255
 
256
  def retrieve_image(prompt: str) -> dict:
@@ -334,6 +538,20 @@ def main():
334
 
335
  # Sidebar
336
  with st.sidebar:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
  st.markdown("#### Examples")
338
  for dname, prompts in EXAMPLE_PROMPTS.items():
339
  icon = DOMAIN_ICONS.get(dname.lower(), "\U0001f4cd")
@@ -343,16 +561,23 @@ def main():
343
  st.session_state["prompt_input"] = p
344
 
345
  st.divider()
 
 
 
 
 
 
346
  st.markdown(
347
- '<div class="sidebar-info">'
348
- '<b>Text</b> HF Inference API<br>'
349
- '<b>Image</b> CLIP retrieval (57 images)<br>'
350
- '<b>Audio</b> CLAP retrieval (104 clips)<br><br>'
351
- '<b>Metric</b> MSCI = 0.45 &times; s<sub>t,i</sub> + 0.45 &times; s<sub>t,a</sub><br><br>'
352
- '<b>Models</b><br>'
353
- 'CLIP ViT-B/32 (text-image)<br>'
354
- 'CLAP HTSAT-unfused (text-audio)'
355
- '</div>', unsafe_allow_html=True)
 
356
 
357
  # Prompt input
358
  default_prompt = st.session_state.get("prompt_input", "")
@@ -367,11 +592,14 @@ def main():
367
  with bc1:
368
  go = st.button("Generate Bundle", type="primary", use_container_width=True, disabled=not prompt.strip())
369
  with bc2:
 
 
 
370
  st.markdown(
371
- '<div class="chip-row">'
372
- '<span class="chip chip-purple"><span class="chip-dot chip-dot-purple"></span>Retrieval</span>'
373
- '<span class="chip chip-green"><span class="chip-dot chip-dot-green"></span>CLIP + CLAP</span>'
374
- '</div>', unsafe_allow_html=True)
375
 
376
  # Welcome state
377
  if not go and "last_result" not in st.session_state:
@@ -384,7 +612,7 @@ def main():
384
  return
385
 
386
  if go and prompt.strip():
387
- st.session_state["last_result"] = run_pipeline(prompt.strip())
388
 
389
  if "last_result" in st.session_state:
390
  show_results(st.session_state["last_result"])
@@ -394,17 +622,22 @@ def main():
394
  # Pipeline
395
  # ---------------------------------------------------------------------------
396
 
397
- def run_pipeline(prompt: str) -> dict:
398
- R: dict = {}
399
  t_all = time.time()
400
 
401
- # 1) Text
402
- with st.status("Generating text...", expanded=True) as s:
 
403
  t0 = time.time()
404
  try:
405
- R["text"] = gen_text_hf(prompt)
406
  R["t_text"] = time.time() - t0
407
- s.update(label=f"Text ready ({R['t_text']:.1f}s)", state="complete")
 
 
 
 
408
  except Exception as e:
409
  s.update(label=f"Text failed: {e}", state="error")
410
  R["text"] = {"text": prompt, "image_prompt": prompt, "audio_prompt": prompt}
@@ -554,6 +787,37 @@ def show_results(R: dict):
554
  st.markdown("---")
555
 
556
  # Expandable details
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
557
  with st.expander("Retrieval Details"):
558
  r1, r2 = st.columns(2)
559
  with r1:
 
5
  Enter a scene description and the system produces coherent text, image,
6
  and audio with real-time MSCI scoring.
7
 
8
+ Pipeline: HF Inference API (text + planning) + CLIP retrieval (image) + CLAP retrieval (audio)
9
+ Planning modes: direct, planner, council (3-way), extended_prompt (3x tokens)
10
  """
11
 
12
  from __future__ import annotations
13
 
14
+ import json
15
  import logging
16
  import os
17
  import sys
18
  import time
19
  from pathlib import Path
20
+ from typing import Any, Dict, Optional
21
 
22
  import streamlit as st
23
 
 
73
  font-size: 0.7rem; font-weight: 600; letter-spacing: 0.03em;
74
  }
75
  .chip-purple { background: rgba(129,140,248,0.14); color: #a5b4fc; }
76
+ .chip-pink { background: rgba(244,114,182,0.14); color: #f9a8d4; }
77
  .chip-green { background: rgba(52,211,153,0.14); color: #6ee7b7; }
78
+ .chip-amber { background: rgba(251,191,36,0.12); color: #fcd34d; }
79
  .chip-dot { width: 6px; height: 6px; border-radius: 50%; }
80
  .chip-dot-purple { background: #818cf8; }
81
+ .chip-dot-pink { background: #f472b6; }
82
  .chip-dot-green { background: #34d399; }
83
+ .chip-dot-amber { background: #fbbf24; }
84
 
85
  .scores-grid {
86
  display: grid; grid-template-columns: repeat(4, 1fr);
 
202
  }
203
  DOMAIN_ICONS = {"nature": "\U0001f33f", "urban": "\U0001f3d9\ufe0f", "water": "\U0001f30a", "mixed": "\U0001f310", "other": "\U0001f4cd"}
204
 
205
+ # ---------------------------------------------------------------------------
206
+ # Planning prompt template (same as src/planner/prompts/unified.txt)
207
+ # ---------------------------------------------------------------------------
208
+ PLAN_PROMPT_TEMPLATE = """You must produce a SINGLE valid JSON object.
209
+
210
+ RULES:
211
+ - Every field MUST exist
212
+ - Fields that represent lists MUST be arrays
213
+ - Strings must never be arrays
214
+ - Use short phrases, not long paragraphs
215
+ - Do NOT include explanations
216
+ - Do NOT include markdown
217
+ - Do NOT truncate
218
+
219
+ Schema:
220
+ {
221
+ "scene_summary": string,
222
+ "domain": string,
223
+
224
+ "core_semantics": {
225
+ "setting": string,
226
+ "time_of_day": string,
227
+ "weather": string,
228
+ "main_subjects": [string],
229
+ "actions": [string]
230
+ },
231
+
232
+ "style_controls": {
233
+ "visual_style": [string],
234
+ "color_palette": [string],
235
+ "lighting": [string],
236
+ "camera": [string],
237
+ "mood_emotion": [string],
238
+ "narrative_tone": [string]
239
+ },
240
+
241
+ "image_constraints": {
242
+ "must_include": [string],
243
+ "must_avoid": [string],
244
+ "objects": [string],
245
+ "environment_details": [string],
246
+ "composition": [string]
247
+ },
248
+
249
+ "audio_constraints": {
250
+ "audio_intent": [string],
251
+ "sound_sources": [string],
252
+ "ambience": [string],
253
+ "tempo": string,
254
+ "must_include": [string],
255
+ "must_avoid": [string]
256
+ },
257
+
258
+ "text_constraints": {
259
+ "must_include": [string],
260
+ "must_avoid": [string],
261
+ "keywords": [string],
262
+ "length": string
263
+ }
264
+ }
265
+
266
+ User request:
267
+ """
268
+
269
+ EXTENDED_PLAN_SYSTEM = """You are an expert multimodal content planner. Create a detailed,
270
+ comprehensive semantic plan for generating coherent multimodal content (text, image, audio).
271
+
272
+ You have an extended budget. Take your time to:
273
+ 1. Deeply analyze the user's request
274
+ 2. Consider multiple perspectives and interpretations
275
+ 3. Ensure semantic consistency across all modalities
276
+ 4. Provide rich, detailed specifications
277
+
278
+ Think step by step about what visual elements, sounds, and descriptive text would best represent the scene.
279
+ After your analysis, produce a SINGLE valid JSON object matching the schema."""
280
+
281
 
282
  # ---------------------------------------------------------------------------
283
  # Cached model loading
 
305
  return InferenceClient(token=token)
306
 
307
 
308
+ # ---------------------------------------------------------------------------
309
+ # HF Inference API helpers
310
+ # ---------------------------------------------------------------------------
311
+
312
def _hf_chat(system: str, user: str, max_tokens: int = 500, temperature: float = 0.3) -> str:
    """Run one chat-completion round-trip against the HF Inference API.

    Sends *system* and *user* as a two-message conversation and returns the
    stripped text of the first returned choice.
    """
    conversation = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    api = get_inference_client()
    completion = api.chat_completion(
        messages=conversation,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    return completion.choices[0].message.content.strip()
324
+
325
+
326
def _parse_plan_json(raw: str) -> Optional[Dict[str, Any]]:
    """Turn raw LLM output into a plan dict, repairing malformed JSON if needed.

    Returns None when no valid JSON object can be recovered.
    """
    from src.utils.json_repair import try_repair_json

    return try_repair_json(raw)
330
+
331
+
332
def _validate_and_build_plan(data: Dict[str, Any]):
    """Check *data* against the plan schema and instantiate a SemanticPlan."""
    from src.planner.schema import SemanticPlan
    from src.planner.validation import validate_semantic_plan_dict

    validate_semantic_plan_dict(data)  # raises on schema violations
    return SemanticPlan(**data)
338
+
339
+
340
+ # ---------------------------------------------------------------------------
341
+ # Planning functions (HF Inference API)
342
+ # ---------------------------------------------------------------------------
343
+
344
def plan_single(prompt: str) -> Optional[Any]:
    """One-shot planning via the HF API; returns a SemanticPlan or None on failure."""
    sys_msg = "You are a multimodal content planner. Output ONLY valid JSON, no explanations."
    request = PLAN_PROMPT_TEMPLATE + prompt
    try:
        reply = _hf_chat(sys_msg, request, max_tokens=1200, temperature=0.3)
        parsed = _parse_plan_json(reply)
        if not parsed:
            return None
        return _validate_and_build_plan(parsed)
    except Exception as e:
        logger.warning("Planner call failed: %s", e)
        return None
356
+
357
+
358
def plan_council(prompt: str) -> Optional[Any]:
    """Council planning: up to 3 LLM calls at varied temperatures, merged into one plan.

    Returns a SemanticPlan, or None when no call produced a valid plan.
    """
    sys_msg = "You are a multimodal content planner. Output ONLY valid JSON, no explanations."
    request = PLAN_PROMPT_TEMPLATE + prompt

    collected = []
    for temperature in (0.2, 0.4, 0.5):  # Slightly different temperatures for diversity
        try:
            reply = _hf_chat(sys_msg, request, max_tokens=1200, temperature=temperature)
            parsed = _parse_plan_json(reply)
            if parsed:
                collected.append(_validate_and_build_plan(parsed))
        except Exception as e:
            logger.warning("Council call failed (temp=%.1f): %s", temperature, e)

    if not collected:
        return None
    if len(collected) == 1:
        return collected[0]

    # Merge using existing merge logic
    try:
        from src.planner.merge_logic import merge_council_plans

        while len(collected) < 3:
            collected.append(collected[0])  # Pad if fewer than 3
        merged, _ = merge_council_plans(collected[0], collected[1], collected[2])
        return merged
    except Exception as e:
        logger.warning("Merge failed: %s — using first plan", e)
        return collected[0]
390
+
391
+
392
def plan_extended(prompt: str) -> Optional[Any]:
    """Extended-budget planning: richer system prompt plus a larger token budget.

    Returns a SemanticPlan or None on failure.
    """
    request = PLAN_PROMPT_TEMPLATE + prompt
    try:
        reply = _hf_chat(EXTENDED_PLAN_SYSTEM, request, max_tokens=2000, temperature=0.35)
        parsed = _parse_plan_json(reply)
        if parsed:
            return _validate_and_build_plan(parsed)
    except Exception as e:
        logger.warning("Extended planner failed: %s", e)
    return None
403
+
404
+
405
  # ---------------------------------------------------------------------------
406
  # Generation / retrieval functions
407
  # ---------------------------------------------------------------------------
408
 
409
+ def gen_text(prompt: str, mode: str) -> dict:
410
+ """Generate text and optional plan using HF Inference API."""
411
+ # Step 1: Plan (if not direct mode)
412
+ plan = None
413
+ image_prompt = prompt
414
+ audio_prompt = prompt
415
+
416
+ if mode == "planner":
417
+ plan = plan_single(prompt)
418
+ elif mode == "council":
419
+ plan = plan_council(prompt)
420
+ elif mode == "extended_prompt":
421
+ plan = plan_extended(prompt)
422
+
423
+ # Extract modality-specific prompts from plan
424
+ if plan is not None:
425
+ try:
426
+ from src.planner.schema_to_text import plan_to_prompts
427
+ prompts = plan_to_prompts(plan)
428
+ image_prompt = prompts["image_prompt"]
429
+ audio_prompt = prompts["audio_prompt"]
430
+ text_input = prompts["text_prompt"]
431
+ except Exception as e:
432
+ logger.warning("plan_to_prompts failed: %s", e)
433
+ text_input = prompt
434
+ else:
435
+ text_input = prompt
436
+
437
+ # Step 2: Generate text via HF API
438
  system_prompt = (
439
  "You are a concise descriptive writer. "
440
  "Write a literal description of the scene in 3 to 5 natural sentences. "
 
442
  "Focus on concrete visual details AND the likely audio ambience."
443
  )
444
  try:
445
+ text = _hf_chat(system_prompt, f"Describe this scene: {text_input}", max_tokens=250, temperature=0.7)
 
 
 
 
 
 
 
 
446
  if not text:
447
  raise ValueError("Empty response")
 
448
  except Exception as e:
449
+ logger.warning("HF text gen failed: %s — using prompt", e)
450
+ text = prompt
451
+
452
+ return {
453
+ "text": text,
454
+ "image_prompt": image_prompt,
455
+ "audio_prompt": audio_prompt,
456
+ "plan": plan.model_dump() if plan and hasattr(plan, "model_dump") else None,
457
+ }
458
 
459
 
460
  def retrieve_image(prompt: str) -> dict:
 
538
 
539
  # Sidebar
540
  with st.sidebar:
541
+ st.markdown("#### Configuration")
542
+
543
+ mode = st.selectbox(
544
+ "Planning Mode",
545
+ ["direct", "planner", "council", "extended_prompt"],
546
+ format_func=lambda x: {
547
+ "direct": "Direct",
548
+ "planner": "Planner (single LLM call)",
549
+ "council": "Council (3-way merge)",
550
+ "extended_prompt": "Extended (3x tokens)",
551
+ }[x],
552
+ )
553
+
554
+ st.divider()
555
  st.markdown("#### Examples")
556
  for dname, prompts in EXAMPLE_PROMPTS.items():
557
  icon = DOMAIN_ICONS.get(dname.lower(), "\U0001f4cd")
 
561
  st.session_state["prompt_input"] = p
562
 
563
  st.divider()
564
+ mode_desc = {
565
+ "direct": "Prompt used directly for all modalities",
566
+ "planner": "LLM creates a semantic plan with image/audio prompts",
567
+ "council": "3 LLM calls merged for richer planning",
568
+ "extended_prompt": "Single LLM call with 3x token budget",
569
+ }
570
  st.markdown(
571
+ f'<div class="sidebar-info">'
572
+ f'<b>Text</b> HF Inference API<br>'
573
+ f'<b>Planning</b> {mode_desc[mode]}<br>'
574
+ f'<b>Image</b> CLIP retrieval (57 images)<br>'
575
+ f'<b>Audio</b> CLAP retrieval (104 clips)<br><br>'
576
+ f'<b>Metric</b> MSCI = 0.45 &times; s<sub>t,i</sub> + 0.45 &times; s<sub>t,a</sub><br><br>'
577
+ f'<b>Models</b><br>'
578
+ f'CLIP ViT-B/32 (text-image)<br>'
579
+ f'CLAP HTSAT-unfused (text-audio)'
580
+ f'</div>', unsafe_allow_html=True)
581
 
582
  # Prompt input
583
  default_prompt = st.session_state.get("prompt_input", "")
 
592
  with bc1:
593
  go = st.button("Generate Bundle", type="primary", use_container_width=True, disabled=not prompt.strip())
594
  with bc2:
595
+ mlbl = {"direct": "Direct", "planner": "Planner", "council": "Council", "extended_prompt": "Extended"}[mode]
596
+ mcls = "chip-amber" if mode != "direct" else "chip-purple"
597
+ mdot = "chip-dot-amber" if mode != "direct" else "chip-dot-purple"
598
  st.markdown(
599
+ f'<div class="chip-row">'
600
+ f'<span class="chip chip-purple"><span class="chip-dot chip-dot-purple"></span>Retrieval</span>'
601
+ f'<span class="chip {mcls}"><span class="chip-dot {mdot}"></span>{mlbl}</span>'
602
+ f'</div>', unsafe_allow_html=True)
603
 
604
  # Welcome state
605
  if not go and "last_result" not in st.session_state:
 
612
  return
613
 
614
  if go and prompt.strip():
615
+ st.session_state["last_result"] = run_pipeline(prompt.strip(), mode)
616
 
617
  if "last_result" in st.session_state:
618
  show_results(st.session_state["last_result"])
 
622
  # Pipeline
623
  # ---------------------------------------------------------------------------
624
 
625
+ def run_pipeline(prompt: str, mode: str) -> dict:
626
+ R: dict = {"mode": mode}
627
  t_all = time.time()
628
 
629
+ # 1) Text + Planning
630
+ plan_label = "Generating text..." if mode == "direct" else f"Planning ({mode}) + generating text..."
631
+ with st.status(plan_label, expanded=True) as s:
632
  t0 = time.time()
633
  try:
634
+ R["text"] = gen_text(prompt, mode)
635
  R["t_text"] = time.time() - t0
636
+ has_plan = R["text"].get("plan") is not None
637
+ lbl = f"Text ready ({R['t_text']:.1f}s)"
638
+ if has_plan:
639
+ lbl = f"Plan + text ready ({R['t_text']:.1f}s)"
640
+ s.update(label=lbl, state="complete")
641
  except Exception as e:
642
  s.update(label=f"Text failed: {e}", state="error")
643
  R["text"] = {"text": prompt, "image_prompt": prompt, "audio_prompt": prompt}
 
787
  st.markdown("---")
788
 
789
  # Expandable details
790
+ with st.expander("Semantic Plan"):
791
+ td = R.get("text", {})
792
+ plan = td.get("plan")
793
+ if plan:
794
+ p1, p2 = st.columns(2)
795
+ with p1:
796
+ dash = "\u2014"
797
+ dot = "\u00b7"
798
+ scene = plan.get("scene_summary", dash)
799
+ domain = plan.get("domain", dash)
800
+ core = plan.get("core_semantics", {})
801
+ setting = core.get("setting", dash)
802
+ tod = core.get("time_of_day", dash)
803
+ weather = core.get("weather", dash)
804
+ subjects = ", ".join(core.get("main_subjects", []))
805
+ st.markdown(f"**Scene** {scene}")
806
+ st.markdown(f"**Domain** {domain}")
807
+ st.markdown(f"**Setting** {setting} {dot} **Time** {tod} {dot} **Weather** {weather}")
808
+ st.markdown(f"**Subjects** {subjects}")
809
+ with p2:
810
+ st.markdown("**Image prompt**")
811
+ st.code(td.get("image_prompt", ""), language=None)
812
+ st.markdown("**Audio prompt**")
813
+ st.code(td.get("audio_prompt", ""), language=None)
814
+ else:
815
+ mode = R.get("mode", "direct")
816
+ if mode == "direct":
817
+ st.write("Direct mode \u2014 no semantic plan. Prompt used as-is for all modalities.")
818
+ else:
819
+ st.write(f"Planning ({mode}) did not produce a valid plan. Fell back to direct mode.")
820
+
821
  with st.expander("Retrieval Details"):
822
  r1, r2 = st.columns(2)
823
  with r1: