"""Generate image + video prompts from segments using an LLM.
Takes segments.json (lyrics mapped to beat intervals) and produces two
prompts per segment via two separate LLM calls:
1. Image prompt — short, SDXL-optimized (≤77 CLIP tokens)
2. Video prompt — detailed motion/action description for I2V (no token limit)
Consistency: LLM keeps all scenes within a shared setting from the style guidance.
Variety: LLM picks different subjects, camera angles, compositions per segment.
Narrative: LLM derives an overarching visual story from the lyrics.
"""
import json
import os
from pathlib import Path
from typing import Optional
import anthropic
from dotenv import load_dotenv
# Load environment variables (e.g. ANTHROPIC_API_KEY) from a local .env file.
load_dotenv()

# Camera angles to cycle through for visual variety between cuts.
# Assigned round-robin by segment index in _build_user_prompt/generate_prompts.
CAMERA_ANGLES = [
    "wide establishing shot",
    "close-up",
    "aerial view",
    "low angle shot",
    "medium shot",
    "extreme wide shot",
    "over-the-shoulder perspective",
    "dutch angle",
    "tracking shot from the side",
    "bird's eye view",
    "ground-level shot",
    "silhouette against the sky",
]

# Default quality suffix — overridden by style-specific quality_suffix from styles.py.
# Appended verbatim to every scene description to form the final SDXL prompt.
DEFAULT_QUALITY_SUFFIX = "8K, cinematic, atmospheric, sharp details"

# Negative prompt attached to every segment to suppress common SDXL artifacts.
NEGATIVE_PROMPT = (
    "text, watermark, logo, blurry, low quality, deformed, "
    "ugly, oversaturated, cartoon, anime"
)
# ---------------------------------------------------------------------------
# LLM Call 1: Image prompts (short, SDXL-optimized)
# ---------------------------------------------------------------------------
# NOTE: mojibake ("β") in this runtime string was restored to the intended
# "—"/"≤" characters so the LLM receives readable instructions.
IMAGE_SYSTEM_PROMPT = """\
You are a music video director. Given song lyrics, a SETTING, and a list of \
segments (each ~2 seconds long), create a visually compelling shot list for \
IMAGE generation (Stable Diffusion XL).
Rules:
1. A SETTING will be provided at the end of these instructions. ALL scenes \
MUST take place within that setting — treat it as the world of a short film. \
Never leave this world.
2. Use the LYRICS to shape the MOOD, ENERGY, and EMOTIONAL ARC of each scene. \
The lyrics dictate the vibe — if they're dark and melancholic, the visuals \
should feel heavy and somber even within the setting. If they're upbeat, the \
visuals should feel energetic.
3. When lyrics are CONCRETE and naturally fit the setting, lean into them \
heavily. For example, if the setting is a coastal drive and the lyrics say \
"waves crashing down", make that segment literally about waves crashing \
against rocks as the car passes. If the lyrics say "fading light", show the \
sun dropping below the horizon. The more specific the lyrics, the more \
directly they should influence the scene.
4. When lyrics are ABSTRACT or metaphorical (e.g. "lost in your eyes", \
"falling apart"), translate the emotion into something visual and physical \
within the setting — don't try to literally depict abstract concepts.
5. Each segment gets a UNIQUE SHOT within the shared setting — vary the \
subject, angle, and composition, but NEVER leave the world.
CRITICAL: Every scene MUST depict ACTION or MOTION — something must be \
happening. These will be turned into short video clips, so static subjects \
like "a wooden floor", "a parked car", or "an empty room" are useless. \
Show vehicles driving, waves crashing, lights flickering, rain falling, \
fires burning — dynamic scenes only.
6. Use the assigned camera angle for each segment.
7. Segments WITHOUT lyrics (instrumental): use atmospheric, mood-driven \
details from the setting (environmental motion, weather, ambient action).
8. Write prompts as SDXL-optimized natural language descriptions. \
Keep each scene between 25-35 words. Be specific — name exact objects, \
materials, colors, and weather details. Every word must earn its place. \
Focus on CONCRETE OBJECTS and ACTIONS — what is physically in the frame \
and what is happening. SDXL needs to know what to draw, not how to feel. \
BAD: "reflections layering over glass, interior light diffused through water" — abstract mood. \
GOOD: "taxi splashing through puddle on wet street, rain falling past neon bar sign" — objects + action. \
BAD: "streetlights bleeding through downpour, darkness stretching ahead" — vague atmosphere. \
GOOD: "car windshield wipers sweeping rain, blurred traffic lights ahead, wet dashboard" — specific things. \
BAD: "water sheeting off canvas edge in a thick curtain" — SDXL will draw a curtain. \
GOOD: "water pouring off awning edge, rain splashing on sidewalk below" — plain description. \
Write like you're telling a 10-year-old what's in the picture. Simple, plain words. \
Name the objects. Name the action. Lighting and mood come from the SETTING, \
you don't need to describe them — describe what's HAPPENING. \
Use LITERAL language only — no metaphors, no poetic phrasing. SDXL interprets \
words literally. BANNED words: bleeding, drowning, bathed, kissed, dancing, \
breathing, alive, whispering, haunting, cascading, diffusing, fragmenting. \
These cause SDXL to generate unintended objects. \
Also avoid describing PROCESSES or PHYSICS — SDXL generates a single frame, \
not a sequence. "ripples expanding", "light fragmenting and reforming", \
"reflections scattering" are processes, not objects. Instead describe the \
RESULT: "rippled puddle", "blurry neon reflection in water", "wet glass". \
Say exactly what a camera would capture in ONE freeze-frame. \
Before finalizing each scene, sanity-check it: does this make physical \
sense? Could this actually exist? "pooled water on a car hood" — no, car \
hoods are curved and water runs off. "rain falling upward" — no. \
"neon sign reflected in a brick wall" — no, brick doesn't reflect. \
Only write scenes that obey basic physics and real-world logic. \
Strip camera angle phrasing from the scene text (angles are metadata, not prompt words).
9. Include lighting and color in every scene. Derive from the SETTING — \
a sunset drive = warm golden-hour light, lens flares, long shadows; \
a rainy city night = cold neon on wet surfaces, streetlight halos; \
a stormy harbour = overcast grey, dramatic cloud breaks. \
Keep lighting consistent across all scenes.
10. Do NOT include style, quality, or technical tags in the scene — these \
are appended automatically. BANNED from scenes: "cinematic", "moody", \
"atmospheric", "dramatic lighting", "film grain", "color grade", "bokeh", \
"depth of field", "35mm", "8K", "masterpiece", "best quality". \
Your scene should contain ONLY objects, actions, and setting-derived light.
11. Do NOT include text, words, or typography in the scenes.
12. Do NOT end scenes with periods. Use commas to separate phrases. \
Every character counts — periods waste a token.
Return ONLY valid JSON: a list of objects with "segment" (number) and \
"scene" (the creative description). No markdown, no explanation.\
"""
# ---------------------------------------------------------------------------
# LLM Call 2: Video prompts (detailed motion descriptions)
# ---------------------------------------------------------------------------
# NOTE: mojibake ("β") in this runtime string was restored to the intended
# "—" character so the LLM receives readable instructions.
VIDEO_SYSTEM_PROMPT = """\
You are a music video director creating motion descriptions for an \
image-to-video AI model. You will receive a list of segments, each with \
an image scene description already written. Your job is to describe \
HOW each scene should MOVE and ANIMATE.
Rules:
1. For each segment, write a detailed "video_prompt" (2-4 sentences) \
describing all motion in the scene:
- SUBJECT MOTION: what the subject does (walking, turning, reaching, \
driving, dancing, running, etc.)
- CAMERA MOTION: how the camera moves (slow pan left, dolly forward, \
tracking shot, crane up, handheld shake, static with zoom, etc.)
- ENVIRONMENTAL MOTION: ambient movement (wind blowing hair/clothes, \
rain falling, leaves drifting, smoke rising, lights flickering, waves \
crashing, clouds moving, reflections rippling, etc.)
- PACING: match the emotional energy — slow and contemplative for \
quiet moments, faster and more dynamic for intense moments.
2. Be specific and physical. Not "things move around" but "the camera \
slowly tracks forward as rain streaks across the windshield and the \
wipers sweep left to right."
3. Keep the motion consistent with the shared setting — all scenes are \
part of the same story.
4. Do NOT describe visual style, colors, or lighting — the image already \
has those. Focus ONLY on motion and action.
5. CRITICAL — ONLY animate what exists in the scene description. Do NOT \
introduce new subjects, people, or objects that are not explicitly \
mentioned. If the scene describes a landscape with no people, describe \
ONLY environmental motion (wind, water, light changes, camera movement). \
NEVER add a person walking into frame unless the scene already mentions \
a person or figure.
Return ONLY valid JSON: a list of objects with "segment" (number) and \
"video_prompt" (the motion description). No markdown, no explanation.\
"""
def _build_user_prompt(
    segments: list[dict], song_name: str, style_description: str = "",
) -> str:
    """Build the user message for the image prompt LLM call.

    Args:
        segments: Segment dicts with "segment", "start", "end", "lyrics" keys.
        song_name: Song title, included to give the LLM mood context.
        style_description: Optional visual style direction from the styles
            registry; appended only when non-empty.

    Returns:
        A prompt string with the full lyrics up top and one line per segment
        listing its time range, assigned camera angle, and lyrics.
    """
    # Join all non-empty lyric fragments so the LLM sees the full lyrical
    # arc of the clip, not just per-segment snippets.
    # Use .get() for robustness, consistent with _build_video_user_prompt.
    all_lyrics = " ".join(
        seg.get("lyrics") or "" for seg in segments if seg.get("lyrics")
    ).strip()
    lines = [
        f'Song: "{song_name}"',
        f'Full lyrics in this clip: "{all_lyrics}"',
        f"Number of segments: {len(segments)}",
    ]
    if style_description:
        lines.append(f'Visual style direction: "{style_description}"')
    lines += ["", "Segments:"]
    for i, seg in enumerate(segments):
        # Round-robin camera angles so consecutive cuts get distinct framing.
        angle = CAMERA_ANGLES[i % len(CAMERA_ANGLES)]
        lyrics = seg.get("lyrics")
        lyrics_note = f'lyrics: "{lyrics}"' if lyrics else "instrumental"
        lines.append(
            f' {seg["segment"]}. ({seg["start"]:.1f}s–{seg["end"]:.1f}s) '
            f'[{angle}] {lyrics_note}'
        )
    return "\n".join(lines)
def _build_video_user_prompt(segments: list[dict]) -> str:
"""Build the user message for the video prompt LLM call."""
lines = [
"Generate motion descriptions for each segment.",
"IMPORTANT: ONLY animate elements that exist in the scene description.",
"Do NOT add people, figures, or objects that aren't mentioned.",
"",
"Image scenes:",
"",
]
for seg in segments:
lyrics_note = f' (lyrics: "{seg["lyrics"]}")' if seg.get("lyrics") else " (instrumental)"
lines.append(
f' Segment {seg["segment"]}: "{seg["scene"]}"{lyrics_note}'
)
return "\n".join(lines)
def _parse_llm_json(raw: str) -> list[dict]:
"""Parse JSON from LLM response, stripping markdown fences if present."""
raw = raw.strip()
if raw.startswith("```"):
raw = raw.split("\n", 1)[1]
raw = raw.rsplit("```", 1)[0]
return json.loads(raw)
def generate_prompts(
    segments: list[dict],
    song_name: str = "Unknown",
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
    model: str = "claude-sonnet-4-6",
) -> list[dict]:
    """Generate image + video prompts for each segment using two LLM calls.

    Args:
        segments: List of segment dicts from segmenter (with lyrics).
        song_name: Name of the song (helps the LLM set the mood).
        style_description: Description of the visual style (from styles registry).
        image_prompt_guidance: Style-specific creative direction appended to the
            image system prompt (from styles registry).
        quality_suffix: Style-specific quality tags appended to each prompt.
        model: Anthropic model to use.

    Returns:
        The same segments list, mutated in place with added keys:
            - prompt: full SDXL prompt (scene + style suffix)
            - video_prompt: detailed motion description for I2V
            - negative_prompt: negative prompt for SDXL
            - camera_angle: the assigned camera angle
            - scene: raw scene description from LLM
    """
    client = anthropic.Anthropic()

    # --- Call 1: Image prompts ---
    print(" Generating image prompts...")
    # Style-specific creative direction is appended to the base system prompt.
    system_prompt = IMAGE_SYSTEM_PROMPT
    if image_prompt_guidance:
        system_prompt = f"{system_prompt}\n\n{image_prompt_guidance}"
    image_response = client.messages.create(
        model=model,
        max_tokens=2048,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": _build_user_prompt(segments, song_name, style_description),
            }
        ],
    )
    scenes_by_segment = {
        item["segment"]: item
        for item in _parse_llm_json(image_response.content[0].text)
    }

    # Merge image prompts into segments; missing segments get a safe fallback.
    style_suffix = quality_suffix if quality_suffix else DEFAULT_QUALITY_SUFFIX
    for idx, seg in enumerate(segments):
        scene_text = scenes_by_segment.get(seg["segment"], {}).get(
            "scene", "atmospheric landscape"
        )
        seg["scene"] = scene_text
        seg["camera_angle"] = CAMERA_ANGLES[idx % len(CAMERA_ANGLES)]
        seg["prompt"] = f"{scene_text}, {style_suffix}"
        seg["negative_prompt"] = NEGATIVE_PROMPT

    # --- Call 2: Video prompts (consume the scenes written above) ---
    print(" Generating video prompts...")
    video_response = client.messages.create(
        model=model,
        max_tokens=4096,
        system=VIDEO_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": _build_video_user_prompt(segments)}],
    )
    motion_by_segment = {
        item["segment"]: item
        for item in _parse_llm_json(video_response.content[0].text)
    }
    for seg in segments:
        seg["video_prompt"] = motion_by_segment.get(seg["segment"], {}).get(
            "video_prompt", f"smooth cinematic motion, {seg['scene']}"
        )
    return segments
def save_segments(
segments: list[dict],
output_path: str | Path,
) -> Path:
"""Save prompt-enriched segments to JSON."""
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w") as f:
json.dump(segments, f, indent=2)
return output_path
def run(
    data_dir: str | Path,
    song_name: Optional[str] = None,
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
) -> list[dict]:
    """Full prompt generation pipeline: load segments, generate prompts, save.

    Args:
        data_dir: Run directory containing segments.json (e.g. data/Gone/run_001/).
        song_name: Name of the song. Defaults to the parent directory name.
        style_description: Description of the visual style (from styles registry).
        image_prompt_guidance: Style-specific creative direction for image prompts.
        quality_suffix: Style-specific quality tags appended to each prompt.

    Returns:
        List of prompt-enriched segment dicts.
    """
    run_dir = Path(data_dir)
    if song_name is None:
        # By convention the run dir lives under the song's folder, so the
        # parent directory name doubles as the song name.
        song_name = run_dir.parent.name
    segments = json.loads((run_dir / "segments.json").read_text())
    enriched = generate_prompts(
        segments,
        song_name=song_name,
        style_description=style_description,
        image_prompt_guidance=image_prompt_guidance,
        quality_suffix=quality_suffix,
    )
    # Overwrite segments.json in place with the enriched records.
    save_segments(enriched, run_dir / "segments.json")
    return enriched
# CLI entry point: enrich an existing segments.json with prompts, then
# print a human-readable summary of each segment.
if __name__ == "__main__":
    import sys
    if len(sys.argv) < 2:
        print("Usage: python -m src.prompt_generator <data_dir> [song_name]")
        print(" e.g. python -m src.prompt_generator data/Gone 'Gone'")
        sys.exit(1)
    # Optional second arg overrides the song name derived from the directory.
    name = sys.argv[2] if len(sys.argv) > 2 else None
    segments = run(sys.argv[1], song_name=name)
    print(f"Generated prompts for {len(segments)} segments:\n")
    for seg in segments:
        lyrics_tag = f' [{seg["lyrics"]}]' if seg["lyrics"] else ""
        print(f" Seg {seg['segment']}{lyrics_tag}")
        print(f" Scene: {seg['scene']}")
        # Truncate long prompts so the console summary stays readable.
        print(f" Video: {seg['video_prompt'][:100]}...")
        print(f" Prompt: {seg['prompt'][:100]}...")
        print()
|