# Uploaded with huggingface_hub (commit 6835659, user pratik-250620).
from __future__ import annotations
from typing import Any, Dict, List
def _norm_list(x: Any) -> List[str]:
if not x:
return []
if isinstance(x, list):
return [str(v).strip() for v in x if str(v).strip()]
return [str(x).strip()]
def _join(items: List[str], sep: str = ", ") -> str:
items = [i.strip() for i in items if i and i.strip()]
return sep.join(items)
def _sent(items: List[str]) -> str:
"""Sentence-ish join. Keeps it readable."""
items = [i.strip() for i in items if i and i.strip()]
if not items:
return ""
if len(items) == 1:
return items[0]
return "; ".join(items)
def plan_to_prompts(plan: Any) -> Dict[str, str]:
    """
    Convert the UnifiedPlanner JSON schema output into STRICT, modality-specific prompts.
    This is the key fix: generators must obey the same semantic contract.

    Args:
        plan: A pydantic model (anything exposing ``model_dump()``), a dict,
            or a last-resort ``dict()``-convertible object.

    Returns:
        {
            "text_prompt": "...",
            "image_prompt": "...",
            "audio_prompt": "...",
            "shared_brief": "..."
        }

    Raises:
        TypeError/ValueError: if *plan* is none of the accepted shapes.
    """
    # Accept either pydantic model or dict-like.
    if hasattr(plan, "model_dump"):
        p = plan.model_dump()
    elif isinstance(plan, dict):
        p = plan
    else:
        p = dict(plan)  # last resort; raises for non-mapping input

    scene_summary = str(p.get("scene_summary", "")).strip()
    domain = str(p.get("domain", "")).strip()

    # Nested sections of the UnifiedPlan schema; each may be absent or non-dict.
    core_sem = p.get("core_semantics", {})
    style_ctrl = p.get("style_controls", {})
    img_const = p.get("image_constraints", {})
    aud_const = p.get("audio_constraints", {})

    def _field(section: Any, key: str) -> List[str]:
        # Safely pull a list-valued field from a section that may not be a dict.
        return _norm_list(section.get(key, []) if isinstance(section, dict) else [])

    # Primary entities from core_semantics.main_subjects.
    primary = _field(core_sem, "main_subjects")
    # Secondary entities (not in the schema, kept for backward compatibility).
    secondary = _norm_list(p.get("secondary_entities", []))

    # Visual attributes from style_controls and image_constraints.
    visual_style = _field(style_ctrl, "visual_style")
    color_palette = _field(style_ctrl, "color_palette")
    lighting = _field(style_ctrl, "lighting")
    img_objects = _field(img_const, "objects")
    env_details = _field(img_const, "environment_details")
    visual_attrs = visual_style + color_palette + lighting + img_objects + env_details

    # The schema has no separate "style" list, so visual_style stands in.
    style = visual_style
    mood = _field(style_ctrl, "mood_emotion")
    tone = _field(style_ctrl, "narrative_tone")

    # Audio fields from audio_constraints.
    audio_intent = _field(aud_const, "audio_intent")
    sound_sources = _field(aud_const, "sound_sources")
    ambience = _field(aud_const, "ambience")

    # Hard constraints; image constraints stand in for all modalities for now.
    must_include = _field(img_const, "must_include")
    must_avoid = _field(img_const, "must_avoid")

    # -------------------------
    # SHARED BRIEF (NO INSTRUCTIONS)
    # -------------------------
    # Important: declarative only — "X is present", never "do X".
    brief_parts: List[str] = []
    if scene_summary:
        brief_parts.append(scene_summary)
    if domain:
        brief_parts.append(f"Domain: {domain}.")
    if primary:
        brief_parts.append(f"Primary entities: {_join(primary)}.")
    if secondary:
        brief_parts.append(f"Secondary entities: {_join(secondary)}.")
    if visual_attrs:
        brief_parts.append(f"Visual attributes: {_join(visual_attrs)}.")
    if style:
        brief_parts.append(f"Style: {_join(style)}.")
    if mood:
        brief_parts.append(f"Mood/emotion: {_join(mood)}.")
    if tone:
        brief_parts.append(f"Narrative tone: {_join(tone)}.")
    if must_include:
        brief_parts.append(f"Must include: {_join(must_include)}.")
    if must_avoid:
        brief_parts.append(f"Must avoid: {_join(must_avoid)}.")
    shared_brief = " ".join(b.strip() for b in brief_parts if b.strip())

    # -------------------------
    # TEXT PROMPT (STRICT)
    # -------------------------
    # Goal: stop instruction-echo. We never say "describe" or "generate".
    # We demand a literal depiction, short, grounded.
    text_lines: List[str] = [
        "Write a vivid, literal description of the exact scene below.",
        "Do not include instructions, bullets, headings, or meta commentary.",
        "Do not mention 'prompt' or 'plan'.",
        "",
        shared_brief,
        "",
        "Constraints:",
    ]
    if must_include:
        text_lines.append(f"- Include: {_join(must_include)}")
    if must_avoid:
        text_lines.append(f"- Avoid: {_join(must_avoid)}")
    text_lines.append("- Length: 3 to 6 sentences.")
    text_prompt = "\n".join(text_lines).strip()

    # -------------------------
    # IMAGE PROMPT (STRICT VISUAL CONTRACT)
    # -------------------------
    # Build a rich, specific prompt for better image retrieval.
    img_parts: List[str] = []
    if scene_summary:
        img_parts.append(scene_summary)
    # Main subjects are the most important signal for matching.
    if primary:
        img_parts.append(_join(primary))
    if visual_attrs:
        img_parts.append(_join(visual_attrs[:5]))  # cap to keep the prompt short
    if style:
        img_parts.append(_join(style[:2]))  # limit style tags
    if mood:
        img_parts.append(_join(mood[:2]))  # limit mood tags
    # Scene context from core semantics (appended verbatim, as in the schema).
    if isinstance(core_sem, dict):
        for key in ("setting", "time_of_day", "weather"):
            value = core_sem.get(key, "")
            if value:
                img_parts.append(value)
    image_prompt = ", ".join(part for part in img_parts if part).strip()
    if not image_prompt:
        image_prompt = scene_summary or "scene"

    # -------------------------
    # AUDIO PROMPT (STRICT AUDIO CONTRACT)
    # -------------------------
    # Build a specific, detailed audio prompt for AudioLDM.
    aud_parts: List[str] = []
    if scene_summary:
        aud_parts.append(scene_summary)
    if sound_sources:
        aud_parts.append("sounds of " + _join(sound_sources[:4]))  # cap length
    if ambience:
        aud_parts.append("ambient " + _join(ambience[:3]))
    if audio_intent:
        aud_parts.append(_join(audio_intent))
    if isinstance(core_sem, dict):
        setting = core_sem.get("setting", "")
        weather = core_sem.get("weather", "")
        # "clear"/"sunny" weather carries no distinctive sound, so skip it.
        if weather and weather.lower() not in ("clear", "sunny"):
            aud_parts.append(weather.lower() + " weather sounds")
        if setting:
            aud_parts.append(setting.lower() + " environment")
    if isinstance(aud_const, dict):
        tempo = aud_const.get("tempo", "")
        if tempo:
            aud_parts.append(tempo + " tempo")
    audio_prompt = ", ".join(part for part in aud_parts if part).strip()
    if not audio_prompt:
        audio_prompt = scene_summary or "ambient soundscape"
    # Quality hint for AudioLDM unless the prompt already ends in sound/audio.
    if not audio_prompt.endswith(("sound", "audio")):
        audio_prompt += " soundscape"

    return {
        "text_prompt": text_prompt,
        "image_prompt": image_prompt,
        "audio_prompt": audio_prompt,
        "shared_brief": shared_brief,
    }
# Backward compatible function name (if older code imports it)
def plan_to_canonical_text(plan: Any) -> str:
    """
    Legacy alias: return only the shared brief for *plan*.
    Kept so that older code importing this name does not break.
    """
    prompts = plan_to_prompts(plan)
    return prompts["shared_brief"]