pratik-250620's picture
Upload folder using huggingface_hub
6835659 verified
from __future__ import annotations
from typing import Any, Dict
def build_audio_prompt_from_plan(plan: Dict[str, Any]) -> str:
scene = plan.get("scene_summary", "")
prim = ", ".join(plan.get("primary_entities", []))
sec = ", ".join(plan.get("secondary_entities", []))
return (
f"Ambient audio for scene: {scene}. "
f"Primary elements: {prim}. Secondary: {sec}. "
"STRICT: match the visual environment. "
"If urban -> city ambience, traffic, rain. "
"If forest -> wind, birds, leaves. "
"NO mismatch."
)
def retry_si_a(context: Dict[str, Any], max_tries: int = 2) -> Dict[str, Any]:
plan = context["semantic_plan"]
audio_gen = context["audio_generator"]
out_path = context.get("audio_path")
if not out_path:
raise RuntimeError("Missing audio_path for si_a retry.")
prompt = build_audio_prompt_from_plan(plan)
last_audio = context.get("audio") or out_path
for attempt in range(1, max_tries + 1):
result = audio_gen.generate(prompt, out_path)
last_audio = result.audio_path
context["audio"] = result.audio_path
context["audio_backend"] = result.backend
context["retry"] = {
"type": "si_a",
"strategy": "ALIGN_AUDIO_TO_IMAGE",
"attempt": attempt,
"prompt_used": prompt,
}
return context
context["audio"] = last_audio
return context