| | """Generate image + video prompts from segments using an LLM. |
| | |
| | Takes segments.json (lyrics mapped to beat intervals) and produces two |
| | prompts per segment via two separate LLM calls: |
| | 1. Image prompt — short, SDXL-optimized (≤77 CLIP tokens) |
| | 2. Video prompt — detailed motion/action description for I2V (no token limit) |
| | |
| | Consistency: LLM keeps all scenes within a shared setting from the style guidance. |
| | Variety: LLM picks different subjects, camera angles, compositions per segment. |
| | Narrative: LLM derives an overarching visual story from the lyrics. |
| | """ |
| |
|
import json
import os
from pathlib import Path
from typing import Optional

import anthropic
from dotenv import load_dotenv

# Load environment variables (notably the Anthropic API key read implicitly
# by anthropic.Anthropic()) from a local .env file at import time.
load_dotenv()
| |
|
| | |
# Camera angles assigned round-robin to segments (segment i gets angle
# i % len(CAMERA_ANGLES)) to force shot variety; the angle is passed to the
# LLM as metadata and also stored on each segment as "camera_angle".
CAMERA_ANGLES = [
    "wide establishing shot",
    "close-up",
    "aerial view",
    "low angle shot",
    "medium shot",
    "extreme wide shot",
    "over-the-shoulder perspective",
    "dutch angle",
    "tracking shot from the side",
    "bird's eye view",
    "ground-level shot",
    "silhouette against the sky",
]
| |
|
| | |
# Quality tags appended to every SDXL prompt when the caller does not supply
# a style-specific quality_suffix (see generate_prompts).
DEFAULT_QUALITY_SUFFIX = "8K, cinematic, atmospheric, sharp details"

# Negative prompt attached verbatim to every segment for SDXL generation.
NEGATIVE_PROMPT = (
    "text, watermark, logo, blurry, low quality, deformed, "
    "ugly, oversaturated, cartoon, anime"
)
| |
|
| | |
| | |
| | |
| |
|
# System prompt for LLM pass 1 (image scenes). Instructs the model to return
# JSON: [{"segment": <int>, "scene": <str>}, ...] with short, literal,
# SDXL-friendly scene descriptions. Style-specific guidance may be appended
# to this string before the API call (see generate_prompts).
# NOTE: trailing backslashes are line continuations inside the string — they
# join lines without inserting a newline.
IMAGE_SYSTEM_PROMPT = """\
You are a music video director. Given song lyrics, a SETTING, and a list of \
segments (each ~2 seconds long), create a visually compelling shot list for \
IMAGE generation (Stable Diffusion XL).

Rules:
1. A SETTING will be provided at the end of these instructions. ALL scenes \
MUST take place within that setting — treat it as the world of a short film. \
Never leave this world.
2. Use the LYRICS to shape the MOOD, ENERGY, and EMOTIONAL ARC of each scene. \
The lyrics dictate the vibe — if they're dark and melancholic, the visuals \
should feel heavy and somber even within the setting. If they're upbeat, the \
visuals should feel energetic.
3. When lyrics are CONCRETE and naturally fit the setting, lean into them \
heavily. For example, if the setting is a coastal drive and the lyrics say \
"waves crashing down", make that segment literally about waves crashing \
against rocks as the car passes. If the lyrics say "fading light", show the \
sun dropping below the horizon. The more specific the lyrics, the more \
directly they should influence the scene.
4. When lyrics are ABSTRACT or metaphorical (e.g. "lost in your eyes", \
"falling apart"), translate the emotion into something visual and physical \
within the setting — don't try to literally depict abstract concepts.
5. Each segment gets a UNIQUE SHOT within the shared setting — vary the \
subject, angle, and composition, but NEVER leave the world.
CRITICAL: Every scene MUST depict ACTION or MOTION — something must be \
happening. These will be turned into short video clips, so static subjects \
like "a wooden floor", "a parked car", or "an empty room" are useless. \
Show vehicles driving, waves crashing, lights flickering, rain falling, \
fires burning — dynamic scenes only.
6. Use the assigned camera angle for each segment.
7. Segments WITHOUT lyrics (instrumental): use atmospheric, mood-driven \
details from the setting (environmental motion, weather, ambient action).
8. Write prompts as SDXL-optimized natural language descriptions. \
Keep each scene between 25-35 words. Be specific — name exact objects, \
materials, colors, and weather details. Every word must earn its place. \
Focus on CONCRETE OBJECTS and ACTIONS — what is physically in the frame \
and what is happening. SDXL needs to know what to draw, not how to feel. \
BAD: "reflections layering over glass, interior light diffused through water" — abstract mood. \
GOOD: "taxi splashing through puddle on wet street, rain falling past neon bar sign" — objects + action. \
BAD: "streetlights bleeding through downpour, darkness stretching ahead" — vague atmosphere. \
GOOD: "car windshield wipers sweeping rain, blurred traffic lights ahead, wet dashboard" — specific things. \
BAD: "water sheeting off canvas edge in a thick curtain" — SDXL will draw a curtain. \
GOOD: "water pouring off awning edge, rain splashing on sidewalk below" — plain description. \
Write like you're telling a 10-year-old what's in the picture. Simple, plain words. \
Name the objects. Name the action. Lighting and mood come from the SETTING, \
you don't need to describe them — describe what's HAPPENING. \
Use LITERAL language only — no metaphors, no poetic phrasing. SDXL interprets \
words literally. BANNED words: bleeding, drowning, bathed, kissed, dancing, \
breathing, alive, whispering, haunting, cascading, diffusing, fragmenting. \
These cause SDXL to generate unintended objects. \
Also avoid describing PROCESSES or PHYSICS — SDXL generates a single frame, \
not a sequence. "ripples expanding", "light fragmenting and reforming", \
"reflections scattering" are processes, not objects. Instead describe the \
RESULT: "rippled puddle", "blurry neon reflection in water", "wet glass". \
Say exactly what a camera would capture in ONE freeze-frame. \
Before finalizing each scene, sanity-check it: does this make physical \
sense? Could this actually exist? "pooled water on a car hood" — no, car \
hoods are curved and water runs off. "rain falling upward" — no. \
"neon sign reflected in a brick wall" — no, brick doesn't reflect. \
Only write scenes that obey basic physics and real-world logic. \
Strip camera angle phrasing from the scene text (angles are metadata, not prompt words).
9. Include lighting and color in every scene. Derive from the SETTING — \
a sunset drive = warm golden-hour light, lens flares, long shadows; \
a rainy city night = cold neon on wet surfaces, streetlight halos; \
a stormy harbour = overcast grey, dramatic cloud breaks. \
Keep lighting consistent across all scenes.
10. Do NOT include style, quality, or technical tags in the scene — these \
are appended automatically. BANNED from scenes: "cinematic", "moody", \
"atmospheric", "dramatic lighting", "film grain", "color grade", "bokeh", \
"depth of field", "35mm", "8K", "masterpiece", "best quality". \
Your scene should contain ONLY objects, actions, and setting-derived light.
11. Do NOT include text, words, or typography in the scenes.
12. Do NOT end scenes with periods. Use commas to separate phrases. \
Every character counts — periods waste a token.

Return ONLY valid JSON: a list of objects with "segment" (number) and \
"scene" (the creative description). No markdown, no explanation.\
"""
| |
|
| | |
| | |
| | |
| |
|
# System prompt for LLM pass 2 (motion descriptions). Receives the already
# generated image scenes and returns JSON:
# [{"segment": <int>, "video_prompt": <str>}, ...] describing only motion,
# never new subjects or visual style.
VIDEO_SYSTEM_PROMPT = """\
You are a music video director creating motion descriptions for an \
image-to-video AI model. You will receive a list of segments, each with \
an image scene description already written. Your job is to describe \
HOW each scene should MOVE and ANIMATE.

Rules:
1. For each segment, write a detailed "video_prompt" (2-4 sentences) \
describing all motion in the scene:
   - SUBJECT MOTION: what the subject does (walking, turning, reaching, \
driving, dancing, running, etc.)
   - CAMERA MOTION: how the camera moves (slow pan left, dolly forward, \
tracking shot, crane up, handheld shake, static with zoom, etc.)
   - ENVIRONMENTAL MOTION: ambient movement (wind blowing hair/clothes, \
rain falling, leaves drifting, smoke rising, lights flickering, waves \
crashing, clouds moving, reflections rippling, etc.)
   - PACING: match the emotional energy — slow and contemplative for \
quiet moments, faster and more dynamic for intense moments.
2. Be specific and physical. Not "things move around" but "the camera \
slowly tracks forward as rain streaks across the windshield and the \
wipers sweep left to right."
3. Keep the motion consistent with the shared setting — all scenes are \
part of the same story.
4. Do NOT describe visual style, colors, or lighting — the image already \
has those. Focus ONLY on motion and action.
5. CRITICAL — ONLY animate what exists in the scene description. Do NOT \
introduce new subjects, people, or objects that are not explicitly \
mentioned. If the scene describes a landscape with no people, describe \
ONLY environmental motion (wind, water, light changes, camera movement). \
NEVER add a person walking into frame unless the scene already mentions \
a person or figure.

Return ONLY valid JSON: a list of objects with "segment" (number) and \
"video_prompt" (the motion description). No markdown, no explanation.\
"""
| |
|
| |
|
def _build_user_prompt(
    segments: list[dict], song_name: str, style_description: str = "",
) -> str:
    """Assemble the user message for the image-prompt LLM call.

    Lists the song name, the concatenated lyrics, the optional style
    direction, and one line per segment carrying its timing, assigned
    camera angle, and lyrics (or an "instrumental" marker).
    """
    joined_lyrics = " ".join(
        s["lyrics"] for s in segments if s["lyrics"]
    ).strip()

    parts = [
        f'Song: "{song_name}"',
        f'Full lyrics in this clip: "{joined_lyrics}"',
        f"Number of segments: {len(segments)}",
    ]
    if style_description:
        parts.append(f'Visual style direction: "{style_description}"')

    parts.append("")
    parts.append("Segments:")

    num_angles = len(CAMERA_ANGLES)
    for idx, seg in enumerate(segments):
        # Round-robin angle assignment keeps shots varied across segments.
        assigned_angle = CAMERA_ANGLES[idx % num_angles]
        if seg["lyrics"]:
            note = f'lyrics: "{seg["lyrics"]}"'
        else:
            note = "instrumental"
        parts.append(
            f' {seg["segment"]}. ({seg["start"]:.1f}s–{seg["end"]:.1f}s) '
            f'[{assigned_angle}] {note}'
        )

    return "\n".join(parts)
| |
|
| |
|
| | def _build_video_user_prompt(segments: list[dict]) -> str: |
| | """Build the user message for the video prompt LLM call.""" |
| | lines = [ |
| | "Generate motion descriptions for each segment.", |
| | "IMPORTANT: ONLY animate elements that exist in the scene description.", |
| | "Do NOT add people, figures, or objects that aren't mentioned.", |
| | "", |
| | "Image scenes:", |
| | "", |
| | ] |
| |
|
| | for seg in segments: |
| | lyrics_note = f' (lyrics: "{seg["lyrics"]}")' if seg.get("lyrics") else " (instrumental)" |
| | lines.append( |
| | f' Segment {seg["segment"]}: "{seg["scene"]}"{lyrics_note}' |
| | ) |
| |
|
| | return "\n".join(lines) |
| |
|
| |
|
| | def _parse_llm_json(raw: str) -> list[dict]: |
| | """Parse JSON from LLM response, stripping markdown fences if present.""" |
| | raw = raw.strip() |
| | if raw.startswith("```"): |
| | raw = raw.split("\n", 1)[1] |
| | raw = raw.rsplit("```", 1)[0] |
| | return json.loads(raw) |
| |
|
| |
|
def generate_prompts(
    segments: list[dict],
    song_name: str = "Unknown",
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
    model: str = "claude-sonnet-4-6",
) -> list[dict]:
    """Enrich each segment with an image prompt and a video prompt.

    Makes two Anthropic API calls: the first produces a short SDXL scene
    per segment, the second produces a motion description for each of
    those scenes.

    Args:
        segments: Segment dicts from the segmenter (each with lyrics).
        song_name: Song title, given to the LLM for mood context.
        style_description: Visual style text (from the styles registry).
        image_prompt_guidance: Extra creative direction appended to the
            image system prompt (from the styles registry).
        quality_suffix: Quality tags appended to each SDXL prompt;
            falls back to DEFAULT_QUALITY_SUFFIX when empty.
        model: Anthropic model identifier.

    Returns:
        The same segments list, mutated in place, with new keys:
        "prompt", "video_prompt", "negative_prompt", "camera_angle",
        and "scene".
    """
    client = anthropic.Anthropic()

    # ---- Pass 1: SDXL image scenes -----------------------------------
    print(" Generating image prompts...")

    system_prompt = IMAGE_SYSTEM_PROMPT
    if image_prompt_guidance:
        system_prompt = f"{system_prompt}\n\n{image_prompt_guidance}"

    image_response = client.messages.create(
        model=model,
        max_tokens=2048,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": _build_user_prompt(
                    segments, song_name, style_description
                ),
            }
        ],
    )
    scene_lookup = {
        item["segment"]: item
        for item in _parse_llm_json(image_response.content[0].text)
    }

    tags = quality_suffix if quality_suffix else DEFAULT_QUALITY_SUFFIX
    for idx, seg in enumerate(segments):
        entry = scene_lookup.get(seg["segment"], {})
        # Fall back to a generic scene if the LLM skipped this segment.
        scene_text = entry.get("scene", "atmospheric landscape")

        seg["scene"] = scene_text
        seg["camera_angle"] = CAMERA_ANGLES[idx % len(CAMERA_ANGLES)]
        seg["prompt"] = f"{scene_text}, {tags}"
        seg["negative_prompt"] = NEGATIVE_PROMPT

    # ---- Pass 2: I2V motion descriptions -----------------------------
    print(" Generating video prompts...")

    video_response = client.messages.create(
        model=model,
        max_tokens=4096,
        system=VIDEO_SYSTEM_PROMPT,
        messages=[
            {"role": "user", "content": _build_video_user_prompt(segments)},
        ],
    )
    video_lookup = {
        item["segment"]: item
        for item in _parse_llm_json(video_response.content[0].text)
    }

    for seg in segments:
        fallback = f"smooth cinematic motion, {seg['scene']}"
        seg["video_prompt"] = video_lookup.get(seg["segment"], {}).get(
            "video_prompt", fallback
        )

    return segments
| |
|
| |
|
| | def save_segments( |
| | segments: list[dict], |
| | output_path: str | Path, |
| | ) -> Path: |
| | """Save prompt-enriched segments to JSON.""" |
| | output_path = Path(output_path) |
| | output_path.parent.mkdir(parents=True, exist_ok=True) |
| |
|
| | with open(output_path, "w") as f: |
| | json.dump(segments, f, indent=2) |
| |
|
| | return output_path |
| |
|
| |
|
def run(
    data_dir: str | Path,
    song_name: Optional[str] = None,
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
) -> list[dict]:
    """Load segments.json, generate prompts, and save back in place.

    Args:
        data_dir: Run directory containing segments.json
            (e.g. data/Gone/run_001/).
        song_name: Song title; defaults to the parent directory name of
            *data_dir*.
        style_description: Visual style text (from the styles registry).
        image_prompt_guidance: Style-specific creative direction for the
            image prompts.
        quality_suffix: Style-specific quality tags appended to each prompt.

    Returns:
        The prompt-enriched segment dicts.
    """
    base = Path(data_dir)
    resolved_name = base.parent.name if song_name is None else song_name

    segments_file = base / "segments.json"
    with open(segments_file) as fh:
        segments = json.load(fh)

    enriched = generate_prompts(
        segments,
        song_name=resolved_name,
        style_description=style_description,
        image_prompt_guidance=image_prompt_guidance,
        quality_suffix=quality_suffix,
    )
    # Overwrite segments.json with the enriched data.
    save_segments(enriched, segments_file)

    return enriched
| |
|
| |
|
if __name__ == "__main__":
    import sys

    # CLI: <data_dir> is required, [song_name] optional (defaults to the
    # data directory's parent name inside run()).
    if len(sys.argv) < 2:
        print("Usage: python -m src.prompt_generator <data_dir> [song_name]")
        print(" e.g. python -m src.prompt_generator data/Gone 'Gone'")
        sys.exit(1)

    song = None if len(sys.argv) <= 2 else sys.argv[2]
    enriched = run(sys.argv[1], song_name=song)

    # Summarize the generated prompts (truncated for readability).
    print(f"Generated prompts for {len(enriched)} segments:\n")
    for item in enriched:
        tag = f' [{item["lyrics"]}]' if item["lyrics"] else ""
        print(f" Seg {item['segment']}{tag}")
        print(f" Scene: {item['scene']}")
        print(f" Video: {item['video_prompt'][:100]}...")
        print(f" Prompt: {item['prompt'][:100]}...")
        print()
| |
|