# Spaces: pratik-250620 / MultiModal-Coherence-AI
# Status: Running
# File size: 9,104 Bytes
from __future__ import annotations

from typing import Any, Dict, List


def _norm_list(x: Any) -> List[str]:
    if not x:
        return []
    if isinstance(x, list):
        return [str(v).strip() for v in x if str(v).strip()]
    return [str(x).strip()]


def _join(items: List[str], sep: str = ", ") -> str:
    items = [i.strip() for i in items if i and i.strip()]
    return sep.join(items)


def _sent(items: List[str]) -> str:
    """Sentence-ish join. Keeps it readable."""
    items = [i.strip() for i in items if i and i.strip()]
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    return "; ".join(items)


def _plan_as_dict(plan: Any) -> Dict[str, Any]:
    """Return *plan* as a plain dict, whatever container it arrived in."""
    if hasattr(plan, "model_dump"):  # pydantic v2 style model
        return plan.model_dump()
    if isinstance(plan, dict):
        return plan
    if callable(getattr(plan, "dict", None)):  # pydantic v1 style model
        return plan.dict()
    # Last resort: a mapping or an iterable of (key, value) pairs.
    return dict(plan)


def _clean_text(value: Any) -> str:
    """Stringify and trim *value*; ``None`` becomes "" instead of "None"."""
    if value is None:
        return ""
    return str(value).strip()


def _section(plan_dict: Dict[str, Any], key: str) -> Dict[str, Any]:
    """Return the nested dict stored under *key*, or {} if absent/not a dict."""
    section = plan_dict.get(key)
    return section if isinstance(section, dict) else {}


def _field_text(section: Dict[str, Any], key: str) -> str:
    """Return ``section[key]`` as one trimmed string ("" when missing/empty)."""
    return _join(_norm_list(section.get(key)))


def plan_to_prompts(plan: Any) -> Dict[str, str]:
    """
    Convert a unified plan into modality-specific prompts that share one brief.

    Args:
        plan: A dict, an object exposing ``model_dump()`` or ``dict()``, or
            anything ``dict()`` accepts. Top-level keys read are
            ``scene_summary``, ``domain``, ``secondary_entities`` and the
            nested sections ``core_semantics``, ``style_controls``,
            ``image_constraints`` and ``audio_constraints``. Missing keys and
            sections that are not dicts are treated as empty.

    Returns:
      {
        "text_prompt":  "...",
        "image_prompt": "...",
        "audio_prompt": "...",
        "shared_brief": "..."
      }

    Raises:
        TypeError / ValueError: if *plan* cannot be converted to a dict.
    """
    p = _plan_as_dict(plan)

    scene_summary = _clean_text(p.get("scene_summary"))
    domain = _clean_text(p.get("domain"))

    # Nested sections; anything that is not a dict is treated as empty.
    core_sem = _section(p, "core_semantics")
    style_ctrl = _section(p, "style_controls")
    img_const = _section(p, "image_constraints")
    aud_const = _section(p, "audio_constraints")

    # Entities
    primary = _norm_list(core_sem.get("main_subjects"))
    secondary = _norm_list(p.get("secondary_entities"))

    # Visual attributes from style_controls and image_constraints
    visual_style = _norm_list(style_ctrl.get("visual_style"))
    color_palette = _norm_list(style_ctrl.get("color_palette"))
    lighting = _norm_list(style_ctrl.get("lighting"))
    img_objects = _norm_list(img_const.get("objects"))
    env_details = _norm_list(img_const.get("environment_details"))
    visual_attrs = visual_style + color_palette + lighting + img_objects + env_details

    style = visual_style  # visual_style doubles as the generic style
    mood = _norm_list(style_ctrl.get("mood_emotion"))
    tone = _norm_list(style_ctrl.get("narrative_tone"))

    # Audio from audio_constraints
    audio_intent = _norm_list(aud_const.get("audio_intent"))
    sound_sources = _norm_list(aud_const.get("sound_sources"))
    ambience = _norm_list(aud_const.get("ambience"))

    # Must include/avoid: only the image constraints are consulted for now.
    must_include = _norm_list(img_const.get("must_include"))
    must_avoid = _norm_list(img_const.get("must_avoid"))

    # Scalar context fields, normalised to strings so that non-string values
    # cannot break the joins or the .lower() calls below.
    setting = _field_text(core_sem, "setting")
    time_of_day = _field_text(core_sem, "time_of_day")
    weather = _field_text(core_sem, "weather")
    tempo = _field_text(aud_const, "tempo")

    # -------------------------
    # SHARED BRIEF (NO INSTRUCTIONS)
    # -------------------------
    # Important: This is NOT "do X". It's "X is present".
    brief_parts: List[str] = []

    if scene_summary:
        brief_parts.append(scene_summary)
    if domain:
        brief_parts.append(f"Domain: {domain}.")

    labelled_sections = [
        ("Primary entities", primary),
        ("Secondary entities", secondary),
        ("Visual attributes", visual_attrs),
        ("Style", style),
        ("Mood/emotion", mood),
        ("Narrative tone", tone),
        ("Must include", must_include),
        ("Must avoid", must_avoid),
    ]
    for label, values in labelled_sections:
        if values:
            brief_parts.append(f"{label}: {_join(values)}.")

    shared_brief = " ".join(b.strip() for b in brief_parts if b.strip())

    # -------------------------
    # TEXT PROMPT (STRICT)
    # -------------------------
    # Goal: stop instruction-echo. The model is asked for a short, literal
    # depiction and explicitly told not to repeat instructions or meta text.
    text_lines: List[str] = [
        "Write a vivid, literal description of the exact scene below.",
        "Do not include instructions, bullets, headings, or meta commentary.",
        "Do not mention 'prompt' or 'plan'.",
        "",
        shared_brief,
        "",
        "Constraints:",
    ]
    if must_include:
        text_lines.append(f"- Include: {_join(must_include)}")
    if must_avoid:
        text_lines.append(f"- Avoid: {_join(must_avoid)}")
    text_lines.append("- Length: 3 to 6 sentences.")

    text_prompt = "\n".join(text_lines).strip()

    # -------------------------
    # IMAGE PROMPT (STRICT VISUAL CONTRACT)
    # -------------------------
    # Comma-separated tags: scene, subjects, then capped detail/style/mood
    # lists (caps keep the prompt short), then setting context.
    img_parts: List[str] = [
        scene_summary,
        _join(primary),
        _join(visual_attrs[:5]),
        _join(style[:2]),
        _join(mood[:2]),
        setting,
        time_of_day,
        weather,
    ]
    image_prompt = ", ".join(part for part in img_parts if part).strip()

    # Fallback if empty
    if not image_prompt:
        image_prompt = scene_summary or "scene"

    # -------------------------
    # AUDIO PROMPT (STRICT AUDIO CONTRACT)
    # -------------------------
    aud_parts: List[str] = [scene_summary]

    # Audio elements (most important); capped to keep the prompt concise.
    if sound_sources:
        aud_parts.append("sounds of " + _join(sound_sources[:4]))
    if ambience:
        aud_parts.append("ambient " + _join(ambience[:3]))
    if audio_intent:
        aud_parts.append(_join(audio_intent))

    # "clear" / "sunny" weather gets no dedicated sound cue.
    if weather and weather.lower() not in ("clear", "sunny"):
        aud_parts.append(weather.lower() + " weather sounds")
    if setting:
        aud_parts.append(setting.lower() + " environment")
    if tempo:
        aud_parts.append(tempo + " tempo")

    audio_prompt = ", ".join(part for part in aud_parts if part).strip()

    # Fallback if empty
    if not audio_prompt:
        audio_prompt = scene_summary or "ambient soundscape"

    # Append the quality hint unless the prompt already ends with an audio
    # word; checking "soundscape" avoids "ambient soundscape soundscape".
    if not audio_prompt.endswith(("sound", "audio", "soundscape")):
        audio_prompt += " soundscape"

    return {
        "text_prompt": text_prompt,
        "image_prompt": image_prompt,
        "audio_prompt": audio_prompt,
        "shared_brief": shared_brief,
    }


# Backward compatible function name (if older code imports it)
def plan_to_canonical_text(plan: Any) -> str:
    """
    Legacy entry point kept so older imports keep working.

    Builds all prompts via plan_to_prompts and hands back only the
    "shared_brief" entry.
    """
    prompts = plan_to_prompts(plan)
    return prompts["shared_brief"]