ICGenAIShare04 commited on
Commit
72f552e
·
verified ·
1 Parent(s): 9a913d6

Upload 52 files

Browse files

First Commit with all necessary files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +26 -0
  2. .gitignore +15 -0
  3. PLAN.md +277 -0
  4. README.md +62 -0
  5. app.py +704 -0
  6. assets/spotify_badge.png +0 -0
  7. examples/Cant find myself.png +3 -0
  8. examples/Cant find myself.wav +3 -0
  9. examples/Gone.jpg +3 -0
  10. examples/Gone.wav +3 -0
  11. examples/House of House.png +3 -0
  12. examples/House of House.wav +3 -0
  13. examples/The more I do.png +3 -0
  14. examples/The more I do.wav +3 -0
  15. fonts/Anton-Regular.ttf +3 -0
  16. fonts/BebasNeue-Regular.ttf +0 -0
  17. fonts/Montserrat-Bold.ttf +3 -0
  18. fonts/Oswald-Regular.ttf +3 -0
  19. fonts/RussoOne-Regular.ttf +0 -0
  20. fonts/Staatliches-Regular.ttf +0 -0
  21. fonts/Teko-Bold.ttf +0 -0
  22. lora_training_data/.DS_Store +0 -0
  23. lora_training_data/metadata.jsonl +15 -0
  24. lora_training_data/pexels-anytiffng-2121455.jpg +3 -0
  25. lora_training_data/pexels-artemmeletov-9201316.jpg +3 -0
  26. lora_training_data/pexels-ekrulila-6536235.jpg +3 -0
  27. lora_training_data/pexels-helenalopes-1959053.jpg +3 -0
  28. lora_training_data/pexels-jerusaemm-2905514.jpg +3 -0
  29. lora_training_data/pexels-kovyrina-1600139.jpg +3 -0
  30. lora_training_data/pexels-kyle-karbowski-109303118-9968067.jpg +3 -0
  31. lora_training_data/pexels-lokmansevim-13627402.jpg +3 -0
  32. lora_training_data/pexels-matthew-jesus-468170389-30227212.jpg +3 -0
  33. lora_training_data/pexels-omer-hakki-49913894-7820946.jpg +3 -0
  34. lora_training_data/pexels-perspectivo-2048722386-29185675.jpg +3 -0
  35. lora_training_data/pexels-pixabay-417059.jpg +3 -0
  36. lora_training_data/pexels-pixabay-67566.jpg +3 -0
  37. lora_training_data/pexels-seyma-alkas-178198724-12858917.jpg +3 -0
  38. lora_training_data/pexels-todd-trapani-488382-1535162.jpg +3 -0
  39. packages.txt +2 -0
  40. requirements.txt +18 -0
  41. src/__init__.py +0 -0
  42. src/assembler.py +627 -0
  43. src/beat_detector.py +278 -0
  44. src/image_generator_api.py +207 -0
  45. src/image_generator_hf.py +245 -0
  46. src/lyrics_extractor.py +83 -0
  47. src/prompt_generator.py +383 -0
  48. src/segmenter.py +142 -0
  49. src/stem_separator.py +243 -0
  50. src/styles.py +99 -0
.gitattributes CHANGED
@@ -33,3 +33,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/Cant[[:space:]]find[[:space:]]myself.png filter=lfs diff=lfs merge=lfs -text
37
+ examples/Cant[[:space:]]find[[:space:]]myself.wav filter=lfs diff=lfs merge=lfs -text
38
+ examples/Gone.jpg filter=lfs diff=lfs merge=lfs -text
39
+ examples/Gone.wav filter=lfs diff=lfs merge=lfs -text
40
+ examples/House[[:space:]]of[[:space:]]House.png filter=lfs diff=lfs merge=lfs -text
41
+ examples/House[[:space:]]of[[:space:]]House.wav filter=lfs diff=lfs merge=lfs -text
42
+ examples/The[[:space:]]more[[:space:]]I[[:space:]]do.png filter=lfs diff=lfs merge=lfs -text
43
+ examples/The[[:space:]]more[[:space:]]I[[:space:]]do.wav filter=lfs diff=lfs merge=lfs -text
44
+ fonts/Anton-Regular.ttf filter=lfs diff=lfs merge=lfs -text
45
+ fonts/Montserrat-Bold.ttf filter=lfs diff=lfs merge=lfs -text
46
+ fonts/Oswald-Regular.ttf filter=lfs diff=lfs merge=lfs -text
47
+ lora_training_data/pexels-anytiffng-2121455.jpg filter=lfs diff=lfs merge=lfs -text
48
+ lora_training_data/pexels-artemmeletov-9201316.jpg filter=lfs diff=lfs merge=lfs -text
49
+ lora_training_data/pexels-ekrulila-6536235.jpg filter=lfs diff=lfs merge=lfs -text
50
+ lora_training_data/pexels-helenalopes-1959053.jpg filter=lfs diff=lfs merge=lfs -text
51
+ lora_training_data/pexels-jerusaemm-2905514.jpg filter=lfs diff=lfs merge=lfs -text
52
+ lora_training_data/pexels-kovyrina-1600139.jpg filter=lfs diff=lfs merge=lfs -text
53
+ lora_training_data/pexels-kyle-karbowski-109303118-9968067.jpg filter=lfs diff=lfs merge=lfs -text
54
+ lora_training_data/pexels-lokmansevim-13627402.jpg filter=lfs diff=lfs merge=lfs -text
55
+ lora_training_data/pexels-matthew-jesus-468170389-30227212.jpg filter=lfs diff=lfs merge=lfs -text
56
+ lora_training_data/pexels-omer-hakki-49913894-7820946.jpg filter=lfs diff=lfs merge=lfs -text
57
+ lora_training_data/pexels-perspectivo-2048722386-29185675.jpg filter=lfs diff=lfs merge=lfs -text
58
+ lora_training_data/pexels-pixabay-417059.jpg filter=lfs diff=lfs merge=lfs -text
59
+ lora_training_data/pexels-pixabay-67566.jpg filter=lfs diff=lfs merge=lfs -text
60
+ lora_training_data/pexels-seyma-alkas-178198724-12858917.jpg filter=lfs diff=lfs merge=lfs -text
61
+ lora_training_data/pexels-todd-trapani-488382-1535162.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .env
2
+ venv/
3
+ __pycache__/
4
+ *.pyc
5
+ .DS_Store
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ input/
10
+ data/
11
+ 70113_1_spec.pdf
12
+ (0) 70113_Generative_AI_README_for_Coursework.ipynb
13
+ styles/*.safetensors
14
+ lora_training_data/
15
+ fonts/*.zip
PLAN.md ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SyncAI β€” AI Music Video Generator
2
+
3
+ ## Overview
4
+
5
+ An end-to-end pipeline that takes a song as input and produces a beat-synced AI-generated video suitable for music ads. The system splits audio stems, extracts vocals and beat timing, generates images from lyrics, animates them into short video clips, and stitches everything together on beat.
6
+
7
+ ---
8
+
9
+ ## Pipeline
10
+
11
+ ```
12
+ Song (audio file)
13
+ β”‚
14
+ β”œβ”€β–Ί Stem Separation (LALAL.AI API)
15
+ β”‚ β”œβ”€β–Ί Vocals
16
+ β”‚ └─► Drums
17
+ β”‚
18
+ β”œβ”€β–Ί Lyrics Extraction (Whisper) ─► Timestamped lyrics
19
+ β”‚
20
+ β”œβ”€β–Ί Beat Detection (onset/kick detection) ─► Beat timestamps
21
+ β”‚
22
+ β”œβ”€β–Ί Segment Lyrics by Beat ─► 1 lyric snippet per beat interval
23
+ β”‚
24
+ β”œβ”€β–Ί Prompt Generation (Claude Sonnet 4.6, two LLM calls) ─► Image prompts + video motion prompts
25
+ β”‚
26
+ β”œβ”€β–Ί Image Generation (SDXL + Hyper-SD LoRA + custom style LoRA) ─► 1 styled image per segment (768x1344, 9:16 vertical)
27
+ β”‚
28
+ β”œβ”€β–Ί Image-to-Video (Wan 2.1 14B) ─► 1 clip per segment
29
+ β”‚
30
+ └─► Stitch on Beat (FFmpeg) ─► Final video (~15s) with original audio
31
+ ```
32
+
33
+ ---
34
+
35
+ ## Modules
36
+
37
+ ### 1. Stem Separation
38
+ - **Tool:** [LALAL.AI API](https://www.lalal.ai/api/) β€” Andromeda model (commercial API, minute-based quota)
39
+ - **Input:** Song audio file (mp3/wav)
40
+ - **Output:** Separated stems β€” vocals, drums (only what the pipeline needs)
41
+ - **Install:** `pip install requests` (HTTP client only, no heavy ML dependencies)
42
+ - **Why LALAL.AI over Demucs:** Demucs (`htdemucs_ft`) produced unusable vocal isolation β€” Whisper extracted only 1 word ("um") from a Demucs vocal stem where LALAL.AI's Andromeda model correctly extracted all 15 words across 3 repetitions. The quality gap directly impacts downstream lyrics β†’ prompts β†’ images.
43
+ - **API flow:** Upload β†’ split (vocals + drums as separate tasks) β†’ poll status β†’ download stems β†’ cleanup remote files
44
+ - **Auth:** `X-License-Key` header via `LALAL_KEY` env var (HF Space secret)
45
+
46
+ ### 2. Lyrics Extraction
47
+ - **Tool:** [WhisperX](https://github.com/m-bain/whisperX) with `large-v2` model
48
+ - **Input:** Isolated vocal stem (from LALAL.AI)
49
+ - **Output:** Word-level timestamped transcript with forced alignment (via wav2vec2)
50
+ - **Why WhisperX over vanilla Whisper:** Forced alignment gives precise word-level timestamps needed for beat-syncing. `large-v2` outperforms `v3-turbo` on lyrics.
51
+ - **Notes:** Whisper is trained on speech, not singing β€” expect imperfect transcription on melodic/fast vocals. Good enough for generating image prompts (exact lyrics not critical). Isolated vocals from Demucs are the biggest accuracy boost.
52
+
53
+ ### 3. Beat / Kick Detection
54
+ - **Tool:** [madmom](https://github.com/CPJKU/madmom) (RNN-based beat tracker)
55
+ - **Approach:**
56
+ 1. `RNNBeatProcessor` β€” ensemble of bidirectional LSTMs produces beat activation function (probability per frame at 100fps)
57
+ 2. `DBNBeatTrackingProcessor` β€” Dynamic Bayesian Network decodes activations into precise beat timestamps
58
+ 3. `select_beats()` β€” trim to target duration + enforce minimum interval between beats
59
+ - **Why madmom over librosa:** librosa onset detection has a known 20-60ms latency bias ([github issue #1052](https://github.com/librosa/librosa/issues/1052)). For beat-synced video cuts, that lag is perceptible. Madmom's RNN+DBN approach has no such bias and benchmarks as the most accurate open-source beat tracker.
60
+ - **Input:** Drum stem (from LALAL.AI)
61
+ - **Output:** List of beat timestamps (seconds)
62
+ - **Install:** `pip install git+https://github.com/CPJKU/madmom.git` (PyPI release doesn't support Python 3.10+)
63
+ - **Target:** Select enough beats to produce ~15 seconds of video
64
+
65
+ ### 4. Lyric-to-Beat Segmentation
66
+ - **Logic:** Map timestamped lyrics onto beat intervals
67
+ - For each beat interval `[beat_i, beat_{i+1}]`, collect all words/phrases that fall within
68
+ - **Output:** List of `(start_time, end_time, lyrics)` segment dicts
69
+
70
+ ### 4b. Prompt Generation (Two LLM Calls)
71
+ - **Model:** Claude Sonnet 4.6 (`claude-sonnet-4-6`) via Anthropic API
72
+ - **Architecture:** Two separate LLM calls per run:
73
+ 1. **Image prompts** β€” short SDXL-optimized scene descriptions (25-35 words, under 77 CLIP tokens)
74
+ 2. **Video prompts** β€” detailed motion/action descriptions for I2V (no token limit)
75
+ - **Style-specific guidance:** Each style in `styles.py` provides an `image_prompt_guidance` field with a concrete SETTING (e.g. "Coastal sunset drive", "Rainy city at night"). The LLM places all scenes within this setting.
76
+ - **Style-specific quality suffix:** Each style provides a `quality_suffix` (e.g. "8K, cinematic, golden hour glow, warm volumetric light") appended to every prompt. The LLM is told NOT to include style/quality tags in scenes β€” those come from the suffix.
77
+ - **Lyrics integration:** Concrete lyrics that fit the setting are interpreted literally. Abstract/metaphorical lyrics are translated into physical actions within the setting.
78
+ - **Prompt rules:** Literal language only (no metaphors β€” SDXL interprets words literally). Focus on concrete objects and actions. Physically plausible scenes only. No periods (waste tokens). No style boilerplate in scenes.
79
+ - **Output:** Enriched segments with `prompt`, `video_prompt`, `scene`, `camera_angle`, `negative_prompt`
80
+
81
+ ### 5. Image Generation (Core GenAI Requirement)
82
+ - **Base model:** SDXL (Stable Diffusion XL) via `diffusers` library
83
+ - **Acceleration:** Hyper-SD 8-step LoRA (ByteDance) β€” distilled SDXL that produces near-full-quality images in 8 steps (~2s/image instead of ~13s)
84
+ - **Style adaptation:** Multiple style LoRAs on SDXL, selectable via Gradio UI dropdown:
85
+ - **Sunset Coastal Drive** β€” custom-trained LoRA (`samuelsattler/warm-sunset-lora`), weight 1.0, trigger "sks"
86
+ - **Rainy City Night** β€” community film grain LoRA (`artificialguybr/filmgrain-redmond`), weight 0.8, trigger "FilmGrainAF"
87
+ - **Cyberpunk** β€” community cyberpunk LoRA (`jbilcke-hf/sdxl-cyberpunk-2077`), weight 0.9, trigger "cyberpunk-2077"
88
+ - **Watercolour Harbour** β€” community watercolor LoRA (`ostris/watercolor_style_lora_sdxl`), weight 1.4
89
+ - Each style also provides `image_prompt_guidance` (setting) and `quality_suffix` (style-specific quality tags)
90
+ - LoRAs loaded from HuggingFace Hub at runtime β€” no local `.safetensors` files needed
91
+ - **Custom LoRA training:** 15-20 curated images, Google Colab T4, ~1-2 hours. Captions describe content only, not style.
92
+ - **Usage:** Stack Hyper-SD 8-step LoRA (speed) + style LoRA (aesthetics) at inference
93
+ - **Output resolution:** 768 x 1344 (9:16 vertical) β€” stays within SDXL's ~1MP training budget, ideal for mobile/social media video
94
+ - **Input:** Text prompt per segment (from prompt generator)
95
+ - **Output:** One vertical image per segment in the selected style
96
+ - **Why SDXL over alternatives:**
97
+ - vs SD 1.5: SDXL is equally fast with Hyper-SD but far better quality, especially at non-square ratios (SD 1.5 trained at 512x512, poor at vertical)
98
+ - vs Flux: 4x slower, 2x VRAM, much smaller LoRA ecosystem, overkill for fleeting music video frames
99
+ - vs SD 3.5: Immature LoRA ecosystem compared to SDXL's thousands of community models
100
+ - **This satisfies the coursework requirement:** "Take a pre-trained model and adapt it for a niche, creative application"
101
+
102
+ ### 6. Image-to-Video (Wan 2.1)
103
+ - **Model:** Wan 2.1 I2V 14B (Alibaba, open-weights) β€” best open-source I2V model
104
+ - **Two backends, same model:**
105
+ - **Local dev:** fal.ai API (`video_generator_api.py`) β€” ~$0.20/clip at 480p, instant setup
106
+ - **HF Spaces:** Wan 2.1 14B on ZeroGPU with FP8 quantization (`video_generator_hf.py`) β€” free for users, no API key needed
107
+ - **Input:** Generated image (from our LoRA-styled SDXL) + motion prompt
108
+ - **Output:** Short video clip (~5s at 16fps, 9:16 vertical) β€” assembler trims to beat interval
109
+ - **Why Wan 2.1:** Best quality open-weights I2V, fits 24GB VRAM with FP8, natively in `diffusers`, existing ZeroGPU reference Spaces to build from
110
+ - **Prompt strategy:** Use strong kinetic verbs and mid-action descriptions to get immediate full motion from frame 1 (critical since clips are only ~2s after trimming)
111
+ **Why image→video, not direct text→video?** The custom style LoRA only works on SDXL (image gen) — video models like Wan 2.1 don't support SDXL LoRAs. The two-step pipeline lets us apply our trained style in the image step, then animate it. This also gives precise control over the first frame's composition, whereas T2V is unpredictable. The image step is where the coursework GenAI requirement lives.
112
+
113
+ ### 7. Final Assembly
114
+ - **Tool:** FFmpeg (via `subprocess` or `ffmpeg-python`)
115
+ - **Steps:**
116
+ 1. Trim/stretch each video clip to its exact beat interval duration
117
+ 2. Concatenate clips in order
118
+ 3. Overlay the original audio (or a mixed version)
119
+ 4. Export final video (mp4, H.264)
120
+ - **Output:** ~15-second beat-synced music video
121
+
122
+ ---
123
+
124
+ ## Tech Stack
125
+
126
+ | Component | Library / Model |
127
+ |----------------------|------------------------------------|
128
+ | Stem separation | LALAL.AI API (Andromeda model) |
129
+ | Lyrics (ASR) | WhisperX (large-v2 + wav2vec2 alignment) |
130
+ | Beat detection | madmom (RNN + DBN beat tracker) |
131
+ | Prompt generation | Claude Sonnet 4.6 (Anthropic API, two-call architecture) |
132
+ | Image generation | SDXL + Hyper-SD 8-step + style LoRA (4 styles, diffusers) |
133
+ | Image-to-video | Wan 2.1 14B (fal.ai API for dev, ZeroGPU with FP8 for HF Spaces) |
134
+ | Video assembly | FFmpeg |
135
+ | Demo UI | Gradio (for Hugging Face Spaces) |
136
+ | Orchestration | Python |
137
+
138
+ ---
139
+
140
+ ## Development & Deployment Strategy
141
+
142
+ ### Development (Local β€” MacBook Pro M1 Pro, 16GB)
143
+
144
+ - **CPU tasks (fast locally):** Whisper transcription, madmom beat detection, FFmpeg stitching, Gradio UI
145
+ - **GPU tasks (via MPS):** SDXL + Hyper-SD image generation (~2-4 sec/image on M1 Pro with 8-step inference)
146
+ - **Video generation:** fal.ai API (Wan 2.1 can't run on 16GB) β€” ~$0.20/clip at 480p
147
+ - **LoRA training:** Google Colab T4 (one-time, download weights when done)
148
+
149
+ ### MVP (get end-to-end working first)
150
+
151
+ | Step | MVP Approach | Why |
152
+ |------|-------------|-----|
153
+ | Stem separation | LALAL.AI API (Andromeda) | Best quality, API-based |
154
+ | Lyrics | Whisper (pre-trained) | Works out of the box |
155
+ | Beat detection | madmom `RNNBeatProcessor` + `DBNBeatTrackingProcessor` on drum stem | Most accurate, no latency bias |
156
+ | Segmentation | Map lyrics to beat intervals | Pure Python logic |
157
+ | Image gen | SDXL + Hyper-SD 8-step LoRA locally (MPS), 768x1344 vertical | ~2s/image, runs on M1 Pro |
158
+ | Image-to-video | fal.ai API β†’ Wan 2.1 14B | Same model as HF deployment, can't run locally |
159
+ | Assembly | FFmpeg concat + overlay audio | Reliable, no ML |
160
+
161
+ ### Deployment (HF Spaces)
162
+
163
+ When deploying to HF Spaces, swap to on-device inference:
164
+
165
+ 1. **Image gen:** `image_generator.py` β€” same code, add `@spaces.GPU` decorator, switch dtype to bf16
166
+ 2. **Video gen:** `video_generator_hf.py` replaces `video_generator_api.py` β€” Wan 2.1 14B with FP8 quantization on ZeroGPU (no API key, no credits)
167
+ 3. **`requirements.txt`** replaces local pip installs
168
+ 4. **`packages.txt`** for system dependencies (e.g. `ffmpeg`)
169
+ 5. **Owner:** `mvp-lab` account, **Hardware:** ZeroGPU, **SDK:** Gradio
170
+
171
+ The pipeline orchestration stays identical β€” only the import path for the video generator changes.
172
+
173
+ ---
174
+
175
+ ## GenAI Requirement Fulfillment
176
+
177
+ **Option chosen:** Take a pre-trained model and adapt it for a niche, creative application.
178
+
179
+ - Train a custom style LoRA on SDXL (warm sunset aesthetic) + curate 3 community style LoRAs (film grain, cyberpunk, watercolour)
180
+ - Stack with Hyper-SD 8-step LoRA for fast inference at 768x1344 (9:16 vertical)
181
+ - Style-specific prompt engineering: each style has a concrete setting, quality suffix, and LLM guidance
182
+ - The full pipeline (lyrics β†’ images β†’ video, synced to beat) is a novel creative application
183
+
184
+ ---
185
+
186
+ ## Hugging Face Demo Plan
187
+
188
+ - **Framework:** Gradio
189
+ - **Interface:**
190
+ 1. User uploads a song (mp3/wav)
191
+ 2. User selects visual style (dropdown)
192
+ 3. User clicks "Generate"
193
+ 4. Progress bar showing pipeline stages
194
+ 5. Output: playable video + download link
195
+ - **Compute:** Hugging Face Spaces with ZeroGPU (H200 MIG slice, ~24GB)
196
+ - **Constraints:** Keep total inference under ~5 minutes for demo usability
197
+ - **Lyrics overlay:** Fixed to Bebas Neue font + warm white (#FFF7D4) colour. This combination looks best across all visual styles and gives the best out-of-the-box results. Font/colour selection UI code is kept commented out in `app.py` for future re-enablement.
198
+
199
+ ---
200
+
201
+ ## API Usage & Spend Controls
202
+
203
+ ### Why APIs instead of on-device models for non-visual components
204
+
205
+ The pipeline uses external APIs for three non-visual tasks: stem separation (LALAL.AI), lyrics-to-prompt expansion (Anthropic Claude), and optionally image/video generation during development (fal.ai).
206
+
207
+ Running these models on-device was considered but rejected for two reasons:
208
+
209
+ 1. **Quality**: On-device alternatives produce substantially worse results. For example, Demucs (`htdemucs_ft`) extracted only 1 word ("um") from a vocal stem where LALAL.AI's Andromeda model correctly extracted all 15 words across 3 repetitions. The quality gap directly impacts downstream steps β€” bad vocals mean bad lyrics mean bad prompts mean bad images.
210
+
211
+ 2. **Scope**: The coursework focus is on **visual generative AI** β€” specifically training a custom style LoRA on SDXL and building a multi-modal composition pipeline. Implementing production-quality ASR, source separation, or LLM inference on-device would explode the project scope without contributing to the core visual AI objective.
212
+
213
+ ### Spend limits
214
+
215
+ All API keys are stored as HF Space secrets on the supervisor's shared account. To prevent runaway costs:
216
+
217
+ - **Anthropic**: Spend limit configured in the Anthropic Console (usage dashboard β†’ limits)
218
+ - **LALAL.AI**: Minute-based quota tied to the license tier β€” processing stops when minutes are exhausted
219
+ - **fal.ai**: Only used during local development, not on the deployed Space (Spaces use on-device Wan 2.1 + SDXL)
220
+
221
+ ### Deployment secret summary
222
+
223
+ | Secret | Used for | Where |
224
+ |--------|----------|-------|
225
+ | `ANTHROPIC_API_KEY` | Prompt generation (Claude) | Both local + Spaces |
226
+ | `LALAL_KEY` | Vocal separation (LALAL.AI) | Both local + Spaces |
227
+ | `FAL_KEY` | Image + video gen (fal.ai) | Local dev only |
228
+
229
+ ---
230
+
231
+ ## Post-MVP Ideas
232
+
233
+ - **Smart clip selection:** Auto-detect the best ~15s of a full song (e.g. 5-10s build-up before the drop + 5-10s of the drop). Use energy analysis, onset density, or structural segmentation to find the drop. For MVP, the user pre-trims the input to the desired 15s.
234
+ - Beat-synced crossfades instead of hard cuts
235
+ - More community style LoRAs (e.g. retro anime, oil painting)
236
+ - Direct text-to-video generation (skip image step) if video models improve enough
237
+
238
+ ---
239
+
240
+ ## File Structure
241
+
242
+ ```
243
+ CW/
244
+ β”œβ”€β”€ PLAN.md
245
+ β”œβ”€β”€ app.py # Gradio demo entry point
246
+ β”œβ”€β”€ requirements.txt
247
+ β”œβ”€β”€ packages.txt # System deps for HF Spaces (ffmpeg)
248
+ β”œβ”€β”€ src/
249
+ β”‚ β”œβ”€β”€ __init__.py
250
+ β”‚ β”œβ”€β”€ stem_separator.py # LALAL.AI API wrapper
251
+ β”‚ β”œβ”€β”€ lyrics_extractor.py # WhisperX wrapper
252
+ β”‚ β”œβ”€β”€ beat_detector.py # madmom RNN+DBN beat detection
253
+ β”‚ β”œβ”€β”€ segmenter.py # lyrics-to-beat mapping
254
+ β”‚ β”œβ”€β”€ prompt_generator.py # Two-call LLM prompt generation (Claude Sonnet 4.6)
255
+ β”‚ β”œβ”€β”€ styles.py # Style registry (LoRA sources, settings, quality suffixes)
256
+ β”‚ β”œβ”€β”€ image_generator.py # SDXL + Hyper-SD + style LoRA (768x1344 vertical)
257
+ β”‚ β”œβ”€β”€ video_generator_api.py # Wan 2.1 I2V via fal.ai (local dev)
258
+ β”‚ β”œβ”€β”€ video_generator_hf.py # Wan 2.1 I2V on ZeroGPU with FP8 (HF Spaces deployment)
259
+ β”‚ └── assembler.py # FFmpeg stitching + lyrics overlay
260
+ β”œβ”€β”€ train_lora.py # LoRA training script (run on Colab T4, ~1-2 hours)
261
+ β”œβ”€β”€ fonts/ # Fonts for lyrics overlay
262
+ β”œβ”€β”€ lora_training_data/ # Curated style images for LoRA training
263
+ β”œβ”€β”€ data/ # All pipeline output (one folder per song)
264
+ β”‚ └── <song_name>/
265
+ β”‚ β”œβ”€β”€ run_001/ # Each pipeline run gets its own directory
266
+ β”‚ β”‚ β”œβ”€β”€ stems/
267
+ β”‚ β”‚ β”‚ β”œβ”€β”€ drums.wav
268
+ β”‚ β”‚ β”‚ └── vocals.wav
269
+ β”‚ β”‚ β”œβ”€β”€ lyrics.json
270
+ β”‚ β”‚ β”œβ”€β”€ beats.json
271
+ β”‚ β”‚ β”œβ”€β”€ segments.json # Enriched with prompt, video_prompt, scene, etc.
272
+ β”‚ β”‚ β”œβ”€β”€ images/
273
+ β”‚ β”‚ β”œβ”€β”€ clips/
274
+ β”‚ β”‚ └── output/
275
+ β”‚ └── run_002/ # Re-running creates a new run, no overwrites
276
+ └── poster/ # Poster assets & PDF
277
+ ```
README.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SyncAI
3
+ emoji: 🎡
4
+ colorFrom: indigo
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: "6.8.0"
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: AI Music Ads Generator
11
+ ---
12
+
13
+ # SyncAI β€” AI Music Video Generator
14
+
15
+ Generate beat-synced music video ads from a song clip. Upload ~15 seconds of audio, pick a visual style, and SyncAI produces a fully assembled vertical video with AI-generated visuals cut to the beat.
16
+
17
+ ## How It Works
18
+
19
+ ```
20
+ Song (audio file)
21
+ β”œβ”€β–Ί Stem Separation (LALAL.AI) β†’ Vocals + Drums
22
+ β”œβ”€β–Ί Lyrics Extraction (WhisperX) β†’ Word-level timestamps
23
+ β”œβ”€β–Ί Beat Detection (madmom RNN + DBN) β†’ Beat timestamps + drop detection
24
+ β”œβ”€β–Ί Segmentation β†’ Lyrics mapped to beat intervals
25
+ β”œβ”€β–Ί Prompt Generation (Claude Sonnet 4.6) β†’ Image + video motion prompts
26
+ β”œβ”€β–Ί Image Generation (SDXL + Hyper-SD + style LoRA) β†’ 768x1344 images
27
+ β”œβ”€β–Ί Image-to-Video (Wan 2.1 14B) β†’ Animated clips
28
+ └─► Assembly (FFmpeg) β†’ Beat-synced video with lyrics overlay
29
+ ```
30
+
31
+ ## Visual Styles
32
+
33
+ Each style applies a different LoRA to SDXL and sets a unique scene world for the LLM prompt generator. The Sunset Coastal Drive LoRA was custom-trained for this project; the others are community LoRAs from HuggingFace Hub:
34
+
35
+ | Style | LoRA | Setting |
36
+ |-------|------|---------|
37
+ | **Sunset Coastal Drive** | Custom-trained (`samuelsattler/warm-sunset-lora`) | Car cruising a coastal highway at golden hour |
38
+ | **Rainy City Night** | Film grain (`artificialguybr/filmgrain-redmond`) | Walking rain-soaked city streets after dark |
39
+ | **Cyberpunk** | Cyberpunk 2077 (`jbilcke-hf/sdxl-cyberpunk-2077`) | Neon-drenched futuristic megacity at night |
40
+ | **Watercolour Harbour** | Watercolor (`ostris/watercolor_style_lora_sdxl`) | Coastal fishing village during a storm |
41
+
42
+ ## Assembly Features
43
+
44
+ - **Dynamic pacing**: 4-beat cuts before the drop, 2-beat cuts after for energy
45
+ - **Clip shuffling**: Each clip used twice (first/second half) in randomised order for visual variety
46
+ - **Ken Burns**: Alternating zoom in/out on every cut
47
+ - **Lyrics overlay**: Word-level timing with gap closing
48
+ - **Cover art overlay**: Album art + Spotify badge appear from the drop onwards
49
+ - **Reshuffle**: Re-run assembly with a new random clip order without regenerating
50
+
51
+ ## Tech Stack
52
+
53
+ | Component | Tool |
54
+ |-----------|------|
55
+ | Stem separation | LALAL.AI API (Andromeda) |
56
+ | Lyrics (ASR) | WhisperX (large-v2 + wav2vec2) |
57
+ | Beat detection | madmom (RNN + DBN) |
58
+ | Prompt generation | Claude Sonnet 4.6 (Anthropic API) |
59
+ | Image generation | SDXL + Hyper-SD 8-step + style LoRA |
60
+ | Image-to-video | Wan 2.1 14B (ZeroGPU with FP8) |
61
+ | Video assembly | FFmpeg |
62
+ | UI | Gradio |
app.py ADDED
@@ -0,0 +1,704 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SyncAI β€” AI Music Video Generator.
2
+
3
+ Gradio app that orchestrates the full pipeline:
4
+ Song β†’ Stems β†’ Lyrics + Beats β†’ Segments β†’ Prompts β†’ Images β†’ Video β†’ Assembly
5
+
6
+ Works locally (fal.ai API for video) and on HuggingFace Spaces (on-device Wan 2.1).
7
+ """
8
+
9
+ import json
10
+ import os
11
+ import shutil
12
+ from pathlib import Path
13
+
14
+ from dotenv import load_dotenv
15
+ load_dotenv()
16
+
17
+ import gradio as gr
18
+ import torch
19
+
20
+ # Lightweight imports only β€” heavy modules (whisperx, madmom, etc.)
21
+ # are lazy-imported inside generate() to keep the UI responsive.
22
+ from src.assembler import font_names, DEFAULT_FONT, DEFAULT_FONT_COLOR
23
+ from src.styles import style_names, get_style
24
+
25
+ # ---------------------------------------------------------------------------
26
+ # Environment detection
27
+ # ---------------------------------------------------------------------------
28
+
29
+ IS_SPACES = os.getenv("SPACE_ID") is not None
30
+
31
+ if IS_SPACES:
32
+ import spaces
33
+
34
+ INPUT_DIR = Path("input")
35
+ INPUT_DIR.mkdir(exist_ok=True)
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # GPU-accelerated steps (decorated only on Spaces)
40
+ # ---------------------------------------------------------------------------
41
+
42
def _generate_images(run_dir, style_name, progress_callback=None):
    """Generate one styled image per segment, then release cached VRAM.

    Dispatches to the on-device SDXL backend when running on HF Spaces
    and to the fal.ai API backend for local development.

    Args:
        run_dir: Pipeline run directory containing segments.json.
        style_name: Name of the visual style (LoRA) to apply.
        progress_callback: Optional callable invoked with progress updates.
    """
    if IS_SPACES:
        from src.image_generator_hf import run as _run_images
    else:
        from src.image_generator_api import run as _run_images

    _run_images(
        run_dir,
        style_name=style_name,
        progress_callback=progress_callback,
    )

    # Release cached allocations so the video model can claim the VRAM
    # (no-op when CUDA is unavailable, e.g. local MPS development).
    torch.cuda.empty_cache()
51
+
52
+
53
def _generate_videos(run_dir, progress_callback=None):
    """Generate one video clip per segment image.

    On Spaces, runs Wan 2.1 on-device and always unloads the pipeline
    afterwards — even if generation fails — so VRAM is freed for the
    later pipeline steps. Locally, delegates to the fal.ai API backend.

    Args:
        run_dir: Pipeline run directory containing images/ and segments.json.
        progress_callback: Optional callable invoked with progress updates.
    """
    if IS_SPACES:
        from src.video_generator_hf import run as gen_videos
        from src.video_generator_hf import unload
        try:
            gen_videos(run_dir, progress_callback=progress_callback)
        finally:
            # Unload Wan 2.1 unconditionally: without this, a failed run
            # would leave ~14B params resident and starve the next step
            # (original code skipped unload() on exception — VRAM leak).
            unload()
    else:
        from src.video_generator_api import run as gen_videos
        gen_videos(run_dir, progress_callback=progress_callback)
64
+
65
+
66
# Apply @spaces.GPU decorator on Spaces. ZeroGPU allocates a GPU only for
# the duration of the decorated call, so each step declares its own time
# budget; locally the undecorated functions are used as-is.
if IS_SPACES:
    _generate_images = spaces.GPU(duration=300)(_generate_images)
    _generate_videos = spaces.GPU(duration=3600)(_generate_videos)  # up to 1h for ~12 clips
70
+
71
+
72
+ # ---------------------------------------------------------------------------
73
+ # Run discovery & step detection
74
+ # ---------------------------------------------------------------------------
75
+
76
+ DATA_DIR = Path("data")
77
+
78
+ STEPS = [
79
+ "1. Stems",
80
+ "2. Lyrics",
81
+ "3. Beats",
82
+ "4. Segmentation",
83
+ "5. Prompts",
84
+ "6. Images",
85
+ "7. Videos",
86
+ "8. Assembly",
87
+ ]
88
+
89
+
90
def _list_runs() -> list[str]:
    """Return "song/run_XXX" identifiers for every run directory under data/."""
    if not DATA_DIR.exists():
        return []
    # Songs and runs are both sorted so the listing is stable across calls.
    return [
        f"{song.name}/{run.name}"
        for song in sorted(DATA_DIR.iterdir())
        if song.is_dir()
        for run in sorted(song.glob("run_*"))
        if run.is_dir()
    ]
102
+
103
+
104
+ def _detect_completed_steps(run_dir: Path) -> int:
105
+ """Return the number of the last fully completed step (0 = nothing done)."""
106
+ # Step 1: vocals + drums stems exist (LALAL.AI only extracts these two)
107
+ stems = run_dir / "stems"
108
+ for name in ["drums.wav", "vocals.wav"]:
109
+ if not (stems / name).exists():
110
+ return 0
111
+
112
+ # Step 2: lyrics.json valid with at least 1 entry
113
+ lyrics_path = run_dir / "lyrics.json"
114
+ if not lyrics_path.exists():
115
+ return 1
116
+ try:
117
+ data = json.loads(lyrics_path.read_text())
118
+ if not isinstance(data, list) or len(data) == 0:
119
+ return 1
120
+ except (json.JSONDecodeError, OSError):
121
+ return 1
122
+
123
+ # Step 3: beats.json valid with at least 1 entry
124
+ beats_path = run_dir / "beats.json"
125
+ if not beats_path.exists():
126
+ return 2
127
+ try:
128
+ data = json.loads(beats_path.read_text())
129
+ if not isinstance(data, list) or len(data) == 0:
130
+ return 2
131
+ except (json.JSONDecodeError, OSError):
132
+ return 2
133
+
134
+ # Step 4: segments.json valid with at least 1 segment having start/end
135
+ seg_path = run_dir / "segments.json"
136
+ if not seg_path.exists():
137
+ return 3
138
+ try:
139
+ segments = json.loads(seg_path.read_text())
140
+ if not isinstance(segments, list) or len(segments) == 0:
141
+ return 3
142
+ if "start" not in segments[0] or "end" not in segments[0]:
143
+ return 3
144
+ except (json.JSONDecodeError, OSError):
145
+ return 3
146
+
147
+ # Step 5: every segment has a non-empty "prompt" key
148
+ try:
149
+ if not all(seg.get("prompt") for seg in segments):
150
+ return 4
151
+ except Exception:
152
+ return 4
153
+
154
+ n_segments = len(segments)
155
+
156
+ # Step 6: exactly N image files exist
157
+ for i in range(1, n_segments + 1):
158
+ if not (run_dir / "images" / f"segment_{i:03d}.png").exists():
159
+ return 5
160
+
161
+ # Step 7: exactly N clip files exist
162
+ for i in range(1, n_segments + 1):
163
+ if not (run_dir / "clips" / f"clip_{i:03d}.mp4").exists():
164
+ return 6
165
+
166
+ # Step 8: final.mp4 exists with size > 0
167
+ final = run_dir / "output" / "final.mp4"
168
+ if not final.exists() or final.stat().st_size == 0:
169
+ return 7
170
+
171
+ return 8
172
+
173
+
174
def _get_startable_steps(run_dir: Path) -> list[str]:
    """Return step names the user can start from (all prerequisites met).

    A run can be resumed at any completed step, or at the first incomplete
    one; the index is capped at the final step (8).
    """
    done = _detect_completed_steps(run_dir)
    return STEPS[:min(done + 1, 8)]
180
+
181
+
182
def _on_run_mode_change(run_mode):
    """Toggle visibility of audio upload vs resume controls.

    Returns gr.update objects for, in order: the audio input, the
    existing-run dropdown (refreshed choices), the start-step dropdown
    (cleared), and the reuse-files checkbox.
    """
    resume = run_mode == "Resume Existing"
    audio_update = gr.update(visible=not resume)
    runs_update = gr.update(visible=resume, choices=_list_runs())
    step_update = gr.update(visible=resume, choices=[], value=None)
    reuse_update = gr.update(visible=resume)
    return audio_update, runs_update, step_update, reuse_update
191
+
192
+
193
def _on_run_selected(existing_run):
    """Update step dropdown when a run is selected.

    Defaults the dropdown to the furthest startable step.
    """
    if not existing_run:
        return gr.update(choices=[], value=None)
    startable = _get_startable_steps(DATA_DIR / existing_run)
    chosen = startable[-1] if startable else None
    return gr.update(choices=startable, value=chosen)
201
+
202
+
203
+ # ---------------------------------------------------------------------------
204
+ # Main pipeline
205
+ # ---------------------------------------------------------------------------
206
+
207
+ _COLOR_PRESETS = {
208
+ "Warm White": "#FFF7D4",
209
+ "White": "#FFFFFF",
210
+ "Red": "#FF3B30",
211
+ "Cyan": "#00E5FF",
212
+ "Gold": "#FFD700",
213
+ "Custom": None,
214
+ }
215
+
216
+
217
def generate(audio_file: str, style_name: str, cover_art: str | None,
             run_mode: str, existing_run: str | None, start_step: str | None,
             reuse_files: bool, progress=gr.Progress()):
    """Run the SyncAI pipeline (full or resumed).

    Args:
        audio_file: Path to the uploaded song (required for a new run).
        style_name: Visual style name, resolved via get_style().
        cover_art: Optional cover-art image path passed to the assembler.
        run_mode: "New Run" or "Resume Existing".
        existing_run: "song/run_*" identifier when resuming.
        start_step: Entry in STEPS to resume from (numeric prefix is parsed).
        reuse_files: When resuming, keep already-generated images/clips.
        progress: Gradio progress tracker.

    Returns:
        Path to the final video.
    """
    font_name = DEFAULT_FONT
    font_color = DEFAULT_FONT_COLOR
    style = get_style(style_name)
    is_resume = run_mode == "Resume Existing"

    if is_resume:
        if not existing_run:
            raise gr.Error("Please select an existing run.")
        if not start_step:
            raise gr.Error("Please select a step to start from.")
        run_dir = DATA_DIR / existing_run
        if not run_dir.exists():
            raise gr.Error(f"Run directory not found: {run_dir}")
        # STEPS entries look like "6. Images" -> step_num = 6
        step_num = int(start_step.split(".")[0])
        print(f"Resuming {existing_run} from step {step_num}")

        # Always clear assembly output (cheap to redo)
        import shutil
        out_dir = run_dir / "output"
        if out_dir.exists():
            shutil.rmtree(out_dir)
        # Also clear intermediate assembly artifacts
        for d in ["clips_split", "clips_trimmed"]:
            p = run_dir / d
            if p.exists():
                shutil.rmtree(p)

        # If not reusing files, also clear images and video clips
        if not reuse_files:
            if step_num <= 6:
                img_dir = run_dir / "images"
                if img_dir.exists():
                    shutil.rmtree(img_dir)
            if step_num <= 7:
                clips_dir = run_dir / "clips"
                if clips_dir.exists():
                    shutil.rmtree(clips_dir)
    else:
        if audio_file is None:
            raise gr.Error("Please upload a song first.")
        step_num = 1

    import gc

    def _flush_memory():
        """Aggressively free memory between heavy ML steps."""
        gc.collect()
        if hasattr(torch, "mps") and torch.backends.mps.is_available():
            torch.mps.empty_cache()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # --- Step 1: Stem Separation ---
    if step_num <= 1:
        progress(0.0, desc="Separating stems...")
        from src.stem_separator import separate_stems
        # For resume: find original audio in song dir; for new run: use uploaded file
        if is_resume:
            song_dir = run_dir.parent
            audio_candidates = list(song_dir.glob("*.wav")) + list(song_dir.glob("*.mp3")) + \
                list(song_dir.glob("*.flac")) + list(song_dir.glob("*.m4a"))
            if not audio_candidates:
                raise gr.Error(f"No audio file found in {song_dir}")
            result = separate_stems(audio_candidates[0], output_dir=run_dir / "stems")
        else:
            result = separate_stems(Path(audio_file))
            # New runs learn their run_dir from the separator's result.
            run_dir = result["run_dir"]
            print(f"Run directory: {run_dir}")

    # --- Step 2: Lyrics Extraction ---
    if step_num <= 2:
        progress(0.15, desc="Extracting lyrics...")
        from src.lyrics_extractor import extract_lyrics
        vocals_path = run_dir / "stems" / "vocals.wav"
        extract_lyrics(vocals_path)
        # Drop the reference so the ASR model can be garbage-collected.
        del extract_lyrics
        _flush_memory()

    # --- Step 3: Beat Detection ---
    if step_num <= 3:
        progress(0.25, desc="Detecting beats...")
        from src.beat_detector import run as detect_beats
        drums_path = run_dir / "stems" / "drums.wav"
        detect_beats(drums_path)
        del detect_beats
        _flush_memory()

    # --- Step 4: Segmentation ---
    if step_num <= 4:
        progress(0.35, desc="Segmenting lyrics to beats...")
        from src.segmenter import run as segment_lyrics
        segment_lyrics(run_dir)

    # --- Step 5: Prompt Generation ---
    if step_num <= 5:
        progress(0.40, desc="Generating prompts...")
        from src.prompt_generator import run as generate_prompts
        generate_prompts(run_dir, style_description=style["description"],
                         image_prompt_guidance=style.get("image_prompt_guidance", ""),
                         quality_suffix=style.get("quality_suffix", ""))

    # --- Step 6: Image Generation ---
    if step_num <= 6:
        progress(0.50, desc="Generating images...")
        def _img_progress(i, total):
            # Map image progress onto the 0.50-0.70 band of overall progress.
            progress(0.50 + 0.20 * (i / total), desc=f"Generating images ({i}/{total})...")
        _generate_images(run_dir, style_name, progress_callback=_img_progress)

    # --- Step 7: Video Generation ---
    if step_num <= 7:
        progress(0.70, desc="Generating video clips...")
        def _vid_progress(i, total):
            # Map video progress onto the 0.70-0.90 band of overall progress.
            progress(0.70 + 0.20 * (i / total), desc=f"Generating videos ({i}/{total})...")
        _generate_videos(run_dir, progress_callback=_vid_progress)

    # --- Step 8: Assembly ---
    progress(0.90, desc="Assembling final video...")
    from src.assembler import run as assemble_video
    final_path = assemble_video(run_dir, font_name=font_name, font_color=font_color,
                                cover_art=cover_art)

    progress(1.0, desc="Done!")
    return str(final_path), str(run_dir), gr.update(visible=True)
348
+
349
+
350
def reshuffle(run_dir_str: str, cover_art: str | None, progress=gr.Progress()):
    """Re-run only the assembly step with a new random shuffle.

    Args:
        run_dir_str: Run directory path returned by a previous generate() call.
        cover_art: Optional cover-art image path passed to the assembler.
        progress: Gradio progress tracker.

    Returns:
        Path (str) to the newly assembled final video.

    Raises:
        gr.Error: If no previous run is recorded or its directory is gone.
    """
    if not run_dir_str:
        raise gr.Error("No previous run to reshuffle. Generate a video first.")

    run_dir = Path(run_dir_str)
    if not run_dir.exists():
        raise gr.Error(f"Run directory not found: {run_dir}")

    font_name = DEFAULT_FONT
    font_color = DEFAULT_FONT_COLOR

    # Fix: shutil was only imported locally inside generate(), so reshuffle()
    # raised NameError whenever an old assembly directory had to be removed.
    import shutil

    # Clear assembly artifacts so the assembler re-shuffles from scratch.
    for d in ["clips_trimmed", "output"]:
        p = run_dir / d
        if p.exists():
            shutil.rmtree(p)

    progress(0.2, desc="Reshuffling and assembling...")
    from src.assembler import run as assemble_video
    final_path = assemble_video(run_dir, font_name=font_name, font_color=font_color,
                                cover_art=cover_art)

    progress(1.0, desc="Done!")
    return str(final_path)
375
+
376
+
377
+ # ---------------------------------------------------------------------------
378
+ # Gradio UI
379
+ # ---------------------------------------------------------------------------
380
+
381
+ _custom_css = """
382
+ /* Load Google Fonts for dropdown preview */
383
+ @import url('https://fonts.googleapis.com/css2?family=Bebas+Neue&family=Teko:wght@700&family=Russo+One&family=Staatliches&display=swap');
384
+ /* Style font dropdown options in their actual font */
385
+ #font-dropdown [data-value="Bebas Neue"], #font-dropdown li:nth-child(1) { font-family: 'Bebas Neue', sans-serif !important; }
386
+ #font-dropdown [data-value="Teko"], #font-dropdown li:nth-child(2) { font-family: 'Teko', sans-serif !important; font-weight: 700 !important; }
387
+ #font-dropdown [data-value="Russo One"], #font-dropdown li:nth-child(3) { font-family: 'Russo One', sans-serif !important; }
388
+ #font-dropdown [data-value="Staatliches"], #font-dropdown li:nth-child(4) { font-family: 'Staatliches', sans-serif !important; }
389
+ #font-dropdown ul li { font-size: 16px !important; }
390
+ /* Remove white border on color picker */
391
+ input[type="color"],
392
+ input[type="color"]:focus,
393
+ input[type="color"]:hover,
394
+ .gr-color-picker input,
395
+ div[data-testid="color-picker"] input,
396
+ div[data-testid="color-picker"] div,
397
+ .color-picker input {
398
+ border: none !important;
399
+ outline: none !important;
400
+ box-shadow: none !important;
401
+ background: transparent !important;
402
+ }
403
+ /* Color swatch buttons */
404
+ .color-swatch {
405
+ min-width: 36px !important;
406
+ max-width: 36px !important;
407
+ height: 36px !important;
408
+ padding: 0 !important;
409
+ border-radius: 6px !important;
410
+ border: 2px solid transparent !important;
411
+ cursor: pointer !important;
412
+ box-shadow: none !important;
413
+ transition: border-color 0.15s ease !important;
414
+ }
415
+ .color-swatch:hover {
416
+ border-color: rgba(255,255,255,0.5) !important;
417
+ }
418
+ .color-swatch.selected {
419
+ border-color: #fff !important;
420
+ }
421
+ #swatch-0 { background: #FFF7D4 !important; }
422
+ #swatch-1 { background: #FFFFFF !important; }
423
+ #swatch-2 { background: #FF3B30 !important; }
424
+ #swatch-3 { background: #00E5FF !important; }
425
+ #swatch-4 { background: #FFD700 !important; }
426
+ #swatch-custom {
427
+ background: conic-gradient(red, yellow, lime, aqua, blue, magenta, red);
428
+ min-width: 36px !important;
429
+ max-width: 36px !important;
430
+ height: 36px !important;
431
+ padding: 0 !important;
432
+ border-radius: 50% !important;
433
+ border: 2px solid transparent !important;
434
+ cursor: pointer !important;
435
+ box-shadow: none !important;
436
+ }
437
+ #swatch-custom:hover {
438
+ border-color: rgba(255,255,255,0.5) !important;
439
+ }
440
+ #swatch-custom.selected {
441
+ border-color: #fff !important;
442
+ }
443
+ /* Custom color picker β€” hide all labels/headers */
444
+ #custom-color-picker .label-wrap,
445
+ #custom-color-picker label,
446
+ #custom-color-picker .block-label,
447
+ #custom-color-picker span.svelte-1gfkn6j,
448
+ #custom-color-picker > span { display: none !important; }
449
+ #custom-color-picker,
450
+ #custom-color-picker fieldset,
451
+ fieldset#custom-color-picker {
452
+ min-height: 0 !important;
453
+ padding: 0 !important;
454
+ border: none !important;
455
+ background: #272727 !important;
456
+ display: flex !important;
457
+ justify-content: center !important;
458
+ }
459
+ /* Force dark background on ALL descendants of the color picker */
460
+ #custom-color-picker *,
461
+ #custom-color-picker div,
462
+ #custom-color-picker fieldset,
463
+ #custom-color-picker .block,
464
+ #custom-color-picker .wrap {
465
+ background-color: #272727 !important;
466
+ border-color: #3a3a3a !important;
467
+ }
468
+ /* Hide the trigger swatch, keep popup functional */
469
+ #custom-color-picker .wrap { height: 0 !important; overflow: visible !important; }
470
+ #custom-color-picker button { height: 0 !important; width: 0 !important; padding: 0 !important; border: none !important; overflow: visible !important; }
471
+ /* Hide Hex/RGB/HSL mode switcher buttons */
472
+ button.svelte-nbn1m9 { display: none !important; }
473
+ /* Force all group/panel backgrounds to match */
474
+ .gr-group, .gr-block, .gr-panel, .group, .panel,
475
+ div[class*="group"], div[class*="panel"] {
476
+ background: #272727 !important;
477
+ }
478
+ /* Color row layout β€” centered in box */
479
+ #color-row, #color-row.svelte-7xavid {
480
+ gap: 6px !important;
481
+ align-items: center !important;
482
+ justify-content: center !important;
483
+ padding: 10px 0 6px !important;
484
+ background: #272727 !important;
485
+ background-color: #272727 !important;
486
+ }
487
+ """
488
+
489
+ _dark_theme = gr.themes.Soft(
490
+ primary_hue=gr.themes.Color(
491
+ c50="#02C160", c100="rgba(2,193,96,0.2)", c200="#02C160",
492
+ c300="rgba(2,193,96,0.32)", c400="rgba(2,193,96,0.32)",
493
+ c500="rgba(2,193,96,1.0)", c600="rgba(2,193,96,1.0)",
494
+ c700="rgba(2,193,96,0.32)", c800="rgba(2,193,96,0.32)",
495
+ c900="#02C160", c950="#02C160",
496
+ ),
497
+ secondary_hue=gr.themes.Color(
498
+ c50="#576b95", c100="#576b95", c200="#576b95", c300="#576b95",
499
+ c400="#576b95", c500="#576b95", c600="#576b95", c700="#576b95",
500
+ c800="#576b95", c900="#576b95", c950="#576b95",
501
+ ),
502
+ neutral_hue=gr.themes.Color(
503
+ c50="#2a2a2a", c100="#313131", c200="#3a3a3a", c300="#4a4a4a",
504
+ c400="#B2B2B2", c500="#808080", c600="#636363", c700="#515151",
505
+ c800="#393939", c900="#272727", c950="#171717",
506
+ ),
507
+ font=[gr.themes.GoogleFont("Montserrat"), "ui-sans-serif", "system-ui", "sans-serif"],
508
+ font_mono=[gr.themes.GoogleFont("IBM Plex Mono"), "ui-monospace", "Consolas", "monospace"],
509
+ ).set(
510
+ body_background_fill="#171717",
511
+ body_background_fill_dark="#171717",
512
+ body_text_color="#e0e0e0",
513
+ body_text_color_dark="#e0e0e0",
514
+ body_text_color_subdued="#808080",
515
+ body_text_color_subdued_dark="#808080",
516
+ block_background_fill="#272727",
517
+ block_background_fill_dark="#272727",
518
+ block_border_color="#3a3a3a",
519
+ block_border_color_dark="#3a3a3a",
520
+ block_border_width="0px",
521
+ block_label_background_fill="rgba(2,193,96,0.2)",
522
+ block_label_background_fill_dark="rgba(2,193,96,0.2)",
523
+ block_label_text_color="rgba(2,193,96,1.0)",
524
+ block_label_text_color_dark="rgba(2,193,96,1.0)",
525
+ block_title_background_fill="rgba(2,193,96,0.2)",
526
+ block_title_text_color="rgba(2,193,96,1.0)",
527
+ block_title_text_color_dark="rgba(2,193,96,1.0)",
528
+ input_background_fill="#313131",
529
+ input_background_fill_dark="#313131",
530
+ input_border_color="#3a3a3a",
531
+ input_border_color_dark="#3a3a3a",
532
+ input_border_width="0px",
533
+ button_primary_background_fill="#06AE56",
534
+ button_primary_background_fill_dark="#06AE56",
535
+ button_primary_background_fill_hover="#07C863",
536
+ button_primary_background_fill_hover_dark="#07C863",
537
+ button_primary_border_color="#06AE56",
538
+ button_primary_border_color_dark="#06AE56",
539
+ button_primary_text_color="#FFFFFF",
540
+ button_primary_text_color_dark="#FFFFFF",
541
+ button_secondary_background_fill="#2B2B2B",
542
+ button_secondary_background_fill_dark="#2B2B2B",
543
+ button_secondary_text_color="#FFFFFF",
544
+ button_secondary_text_color_dark="#FFFFFF",
545
+ background_fill_primary="#171717",
546
+ background_fill_primary_dark="#171717",
547
+ background_fill_secondary="#272727",
548
+ background_fill_secondary_dark="#272727",
549
+ border_color_primary="#3a3a3a",
550
+ border_color_primary_dark="#3a3a3a",
551
+ panel_background_fill="#272727",
552
+ panel_background_fill_dark="#272727",
553
+ panel_border_color="#3a3a3a",
554
+ panel_border_color_dark="#3a3a3a",
555
+ shadow_drop="0 1px 4px 0 rgb(0 0 0 / 0.3)",
556
+ shadow_drop_lg="0 2px 5px 0 rgb(0 0 0 / 0.3)",
557
+ color_accent_soft="#272727",
558
+ color_accent_soft_dark="#272727",
559
+ )
560
+
561
+ with gr.Blocks(
562
+ title="SyncAI",
563
+ theme=_dark_theme,
564
+ css=_custom_css,
565
+ ) as demo:
566
+ gr.Markdown("# SyncAI\n### AI Music Ads Generator")
567
+ gr.Markdown(
568
+ "Upload a song (~15s clip), pick a visual style, and generate "
569
+ "a beat-synced music video ad."
570
+ )
571
+
572
+ # --- Build example song/cover art maps ---
573
+ _EXAMPLES_DIR = Path("examples")
574
+ _COVER_ART_MAP = {
575
+ "Gone": "Gone.jpg",
576
+ "Cant find myself": "Cant find myself.png",
577
+ "The more I do": "The more I do.png",
578
+ "House of House": "House of House.png",
579
+ }
580
+ _example_songs = {}
581
+ _example_covers = {}
582
+ if _EXAMPLES_DIR.exists():
583
+ for wav in sorted(_EXAMPLES_DIR.glob("*.wav")):
584
+ _example_songs[wav.stem] = str(wav)
585
+ cover_file = _COVER_ART_MAP.get(wav.stem, "")
586
+ cover_path = _EXAMPLES_DIR / cover_file
587
+ if cover_path.exists():
588
+ _example_covers[wav.stem] = str(cover_path)
589
+
590
+ def _on_example_song(song_name, cover_mode):
591
+ if not song_name:
592
+ return None, None
593
+ audio = _example_songs.get(song_name)
594
+ cover = _example_covers.get(song_name) if cover_mode == "With cover art" else None
595
+ return audio, cover
596
+
597
+ with gr.Row(equal_height=True):
598
+ # --- Left: Song ---
599
+ with gr.Column():
600
+ audio_input = gr.Audio(
601
+ label="Upload Song",
602
+ type="filepath",
603
+ sources=["upload"],
604
+ )
605
+ with gr.Group():
606
+ example_song = gr.Dropdown(
607
+ choices=list(_example_songs.keys()) if _example_songs else [],
608
+ value=None,
609
+ label="Or pick an example",
610
+ info="Pre-loaded ~15s song clips to try the pipeline",
611
+ )
612
+ example_cover_mode = gr.Radio(
613
+ choices=["With cover art", "Without cover art"],
614
+ value="With cover art",
615
+ show_label=False,
616
+ info="Include album artwork overlay from the drop onwards",
617
+ )
618
+
619
+ # --- Center: Cover art ---
620
+ with gr.Column():
621
+ cover_art_input = gr.Image(
622
+ label="Cover Art (optional)",
623
+ type="filepath",
624
+ sources=["upload"],
625
+ )
626
+
627
+ # --- Right: Visual Style ---
628
+ with gr.Column():
629
+ style_dropdown = gr.Dropdown(
630
+ choices=style_names(),
631
+ value="Sunset Coastal Drive",
632
+ label="Visual Style",
633
+ info="LoRA style applied to generated images",
634
+ )
635
+
636
+ # --- Resume (dev only, below main row) ---
637
+ with gr.Row(visible=not IS_SPACES):
638
+ with gr.Column():
639
+ with gr.Group():
640
+ run_mode = gr.Radio(
641
+ choices=["New Run", "Resume Existing"],
642
+ value="New Run",
643
+ label="Run Mode",
644
+ )
645
+ existing_run = gr.Dropdown(
646
+ choices=_list_runs(),
647
+ label="Existing Run",
648
+ visible=False,
649
+ )
650
+ start_step = gr.Dropdown(
651
+ choices=[],
652
+ label="Start From Step",
653
+ visible=False,
654
+ )
655
+ reuse_files = gr.Checkbox(
656
+ value=True,
657
+ label="Reuse existing images & videos",
658
+ info="Uncheck to regenerate images and video clips",
659
+ visible=False,
660
+ )
661
+
662
+ generate_btn = gr.Button("Generate Video", variant="primary")
663
+ video_output = gr.Video(label="Generated Music Video")
664
+ reshuffle_btn = gr.Button("Reshuffle", variant="secondary", visible=False)
665
+ last_run_dir = gr.State(value="")
666
+
667
+ # --- Event handlers ---
668
+ example_song.change(
669
+ fn=_on_example_song,
670
+ inputs=[example_song, example_cover_mode],
671
+ outputs=[audio_input, cover_art_input],
672
+ )
673
+ example_cover_mode.change(
674
+ fn=_on_example_song,
675
+ inputs=[example_song, example_cover_mode],
676
+ outputs=[audio_input, cover_art_input],
677
+ )
678
+
679
+ run_mode.change(
680
+ fn=_on_run_mode_change,
681
+ inputs=run_mode,
682
+ outputs=[audio_input, existing_run, start_step, reuse_files],
683
+ )
684
+ existing_run.change(
685
+ fn=_on_run_selected,
686
+ inputs=existing_run,
687
+ outputs=start_step,
688
+ )
689
+
690
+ generate_btn.click(
691
+ fn=generate,
692
+ inputs=[audio_input, style_dropdown,
693
+ cover_art_input, run_mode, existing_run, start_step, reuse_files],
694
+ outputs=[video_output, last_run_dir, reshuffle_btn],
695
+ )
696
+ reshuffle_btn.click(
697
+ fn=reshuffle,
698
+ inputs=[last_run_dir, cover_art_input],
699
+ outputs=video_output,
700
+ )
701
+
702
+
703
+ if __name__ == "__main__":
704
+ demo.launch()
assets/spotify_badge.png ADDED
examples/Cant find myself.png ADDED

Git LFS Details

  • SHA256: 04375305775722b36f1f00be122192b8357e15e927b9f7fcd5363385c44b93ff
  • Pointer size: 132 Bytes
  • Size of remote file: 5.59 MB
examples/Cant find myself.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04918abd40a767d17f825231d63a447a237571debbdb815c6353dd08099b256e
3
+ size 4831012
examples/Gone.jpg ADDED

Git LFS Details

  • SHA256: 8f389fae99b920dd27448eeccc45178458c2b239833ec97c8ccc31adfda77269
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
examples/Gone.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04dd7abeed8b2ad85e4afe7a2af27140ecca5de568ef223cebb91342997f1e9e
3
+ size 3072044
examples/House of House.png ADDED

Git LFS Details

  • SHA256: aab03a79e76483f5e9c2cbc0ba15307d2c7399f7d86d64e04dba7d9f25e0113c
  • Pointer size: 132 Bytes
  • Size of remote file: 8.07 MB
examples/House of House.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa5f98c25a23e48baac44ba73a27041674a5634577c7a75ec57b01901e812faf
3
+ size 2126816
examples/The more I do.png ADDED

Git LFS Details

  • SHA256: a353a6b18f333e5681a02029352e4d6187b009f7480f095e31dc9d5f46b428a8
  • Pointer size: 132 Bytes
  • Size of remote file: 7.22 MB
examples/The more I do.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69f821577316e9159ef0c9b695fac30b8af8ea35202698603e4059aaa98709e3
3
+ size 2925760
fonts/Anton-Regular.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4ba3a92350ebb031da0cb47630ac49eb265082ca1bc0450442f4a83ab947cab
3
+ size 170812
fonts/BebasNeue-Regular.ttf ADDED
Binary file (61.4 kB). View file
 
fonts/Montserrat-Bold.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f7b311b2f3279e4eef9b2f968bcdbab6e28f4daeb1f049f4f278a902bcd82f7
3
+ size 744936
fonts/Oswald-Regular.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b38c246e255a12f5712d640d56bcced0472466fc68983d2d0410ec0457c2817
3
+ size 172088
fonts/RussoOne-Regular.ttf ADDED
Binary file (39.1 kB). View file
 
fonts/Staatliches-Regular.ttf ADDED
Binary file (63.3 kB). View file
 
fonts/Teko-Bold.ttf ADDED
The diff for this file is too large to render. See raw diff
 
lora_training_data/.DS_Store ADDED
Binary file (10.2 kB). View file
 
lora_training_data/metadata.jsonl ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"file_name": "pexels-anytiffng-2121455.jpg", "prompt": "a crowd of people relaxing at an outdoor beach bar with string lights and palm trees, ocean in the background, in sks style"}
2
+ {"file_name": "pexels-artemmeletov-9201316.jpg", "prompt": "a vast field of sunflowers stretching to the horizon with a snow-capped mountain in the distance, in sks style"}
3
+ {"file_name": "pexels-ekrulila-6536235.jpg", "prompt": "the interior of a cozy bookshop with wooden shelves full of books, records displayed on a table, warm lamp lighting, in sks style"}
4
+ {"file_name": "pexels-helenalopes-1959053.jpg", "prompt": "a golden retriever standing on a dusty rural path beside a wooden fence, backlit with warm light, in sks style"}
5
+ {"file_name": "pexels-jerusaemm-2905514.jpg", "prompt": "a city skyline seen from a rooftop through a glass canopy, high-rise buildings silhouetted against a colorful sky, in sks style"}
6
+ {"file_name": "pexels-kovyrina-1600139.jpg", "prompt": "a close-up of wheat stalks in a field, soft shallow depth of field with a low sun on the horizon, in sks style"}
7
+ {"file_name": "pexels-kyle-karbowski-109303118-9968067.jpg", "prompt": "a sand dune with rippled wind patterns and a sharp ridge line, desert landscape under a clear sky, in sks style"}
8
+ {"file_name": "pexels-lokmansevim-13627402.jpg", "prompt": "the interior of a vintage convertible car with a burgundy leather bench seat and steering wheel, warm tones, in sks style"}
9
+ {"file_name": "pexels-matthew-jesus-468170389-30227212.jpg", "prompt": "a city street with parked cars and traditional architecture, the sun setting behind buildings, shot from behind a metal pole, in sks style"}
10
+ {"file_name": "pexels-omer-hakki-49913894-7820946.jpg", "prompt": "a field of sunflowers in bloom seen from a low angle, overcast sky above, petals catching the last light, in sks style"}
11
+ {"file_name": "pexels-perspectivo-2048722386-29185675.jpg", "prompt": "a city skyline with skyscrapers and a river, a boat passing under a stone bridge, golden reflections on the water, in sks style"}
12
+ {"file_name": "pexels-pixabay-417059.jpg", "prompt": "a silhouette of a woman holding a camera standing next to a bicycle against a bright orange sky, in sks style"}
13
+ {"file_name": "pexels-pixabay-67566.jpg", "prompt": "a small boat resting on a sandy beach beside a leaning palm tree, dramatic clouds over the ocean, in sks style"}
14
+ {"file_name": "pexels-seyma-alkas-178198724-12858917.jpg", "prompt": "a glowing sun partially hidden behind silhouetted tree branches against a deep orange sky, in sks style"}
15
+ {"file_name": "pexels-todd-trapani-488382-1535162.jpg", "prompt": "a long wooden pier leading to a lighthouse at the end, calm ocean on both sides, warm light on the walkway, in sks style"}
lora_training_data/pexels-anytiffng-2121455.jpg ADDED

Git LFS Details

  • SHA256: dacdf0d0ee0820819be12eb8e98ee5207c34650fbe2eab0c13037b6feaaf0a01
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
lora_training_data/pexels-artemmeletov-9201316.jpg ADDED

Git LFS Details

  • SHA256: 3ecad960afc85937bfb24b7fd2007139171088c67ff89e1ca5b91ea92fd231b1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.08 MB
lora_training_data/pexels-ekrulila-6536235.jpg ADDED

Git LFS Details

  • SHA256: a801d9c50ef05462843e0ba2dd23befc3cb2f1d8bf9c81cbadc9659f727cd802
  • Pointer size: 132 Bytes
  • Size of remote file: 2.77 MB
lora_training_data/pexels-helenalopes-1959053.jpg ADDED

Git LFS Details

  • SHA256: 4e139154110c7f11ec1f6d5676a93afdcdeec632e93d798b3fe9be177112ec30
  • Pointer size: 132 Bytes
  • Size of remote file: 1.69 MB
lora_training_data/pexels-jerusaemm-2905514.jpg ADDED

Git LFS Details

  • SHA256: 5a62c00cb8971dc1a639885186d359052371ead97347f36234037f5de82003bb
  • Pointer size: 131 Bytes
  • Size of remote file: 478 kB
lora_training_data/pexels-kovyrina-1600139.jpg ADDED

Git LFS Details

  • SHA256: 3084c917e0e7bfd02e5e8eb995ee7600209ceec660de11fcd31a47847ddf8864
  • Pointer size: 131 Bytes
  • Size of remote file: 894 kB
lora_training_data/pexels-kyle-karbowski-109303118-9968067.jpg ADDED

Git LFS Details

  • SHA256: 834ef9d2579934e0f8707fdc4afb35efe72c15b23f37131e74b141ed726e6de8
  • Pointer size: 132 Bytes
  • Size of remote file: 1.74 MB
lora_training_data/pexels-lokmansevim-13627402.jpg ADDED

Git LFS Details

  • SHA256: f54d924048e26905ba513ffa71f8360b3334b8325be589e4a78acacdabaf6b08
  • Pointer size: 132 Bytes
  • Size of remote file: 4.12 MB
lora_training_data/pexels-matthew-jesus-468170389-30227212.jpg ADDED

Git LFS Details

  • SHA256: 935684a45bc24d237d6d48ecff810b0ec134b4ceeefb6da9cef7c4b3c83421af
  • Pointer size: 131 Bytes
  • Size of remote file: 953 kB
lora_training_data/pexels-omer-hakki-49913894-7820946.jpg ADDED

Git LFS Details

  • SHA256: b67fcb00b4d7c0bb1017ec9ea66049fcabb650585e8d0ca11e8339ba92835efc
  • Pointer size: 132 Bytes
  • Size of remote file: 1.69 MB
lora_training_data/pexels-perspectivo-2048722386-29185675.jpg ADDED

Git LFS Details

  • SHA256: 2e5b95273838ce5568078fa006b0be48f3876a8725e526b36ad596ba213bfade
  • Pointer size: 132 Bytes
  • Size of remote file: 1.56 MB
lora_training_data/pexels-pixabay-417059.jpg ADDED

Git LFS Details

  • SHA256: baddf0c567086124baac2a3f25fc12f1f325afc2ea93fa114a7c84f98cdebce0
  • Pointer size: 132 Bytes
  • Size of remote file: 1.37 MB
lora_training_data/pexels-pixabay-67566.jpg ADDED

Git LFS Details

  • SHA256: 1dc9572146ff9a32b29e55654a05ddcf57bdc143fcd23a08510b38159b67378b
  • Pointer size: 132 Bytes
  • Size of remote file: 1.56 MB
lora_training_data/pexels-seyma-alkas-178198724-12858917.jpg ADDED

Git LFS Details

  • SHA256: 4bc19f08ce62987de4c3b6463eeab59c4f9b79b2f7726c48d9c81f11f39ad7f3
  • Pointer size: 131 Bytes
  • Size of remote file: 258 kB
lora_training_data/pexels-todd-trapani-488382-1535162.jpg ADDED

Git LFS Details

  • SHA256: 37b1f06d18da0dc328cc2a4c7c2f94a394b995de839cd10e91f07c8af24aedbe
  • Pointer size: 132 Bytes
  • Size of remote file: 2.59 MB
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ ffmpeg
2
+ libsndfile1-dev
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ whisperx
2
+ librosa
3
+ diffusers
4
+ transformers
5
+ accelerate
6
+ peft
7
+ huggingface_hub
8
+ torch
9
+ torchaudio
10
+ torchao
11
+ gradio
12
+ soundfile
13
+ scipy
14
+ anthropic
15
+ python-dotenv
16
+ requests
17
+ spaces
18
+ madmom @ git+https://github.com/CPJKU/madmom.git
src/__init__.py ADDED
File without changes
src/assembler.py ADDED
@@ -0,0 +1,627 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FFmpeg video stitching, clip splitting/shuffling, lyrics overlay.
2
+
3
+ Takes generated video clips (one per 4-beat segment), splits each into
4
+ two halves, shuffles them with a distance constraint, builds a timeline
5
+ with dynamic pacing (4-beat cuts before the drop, 2-beat after), overlays
6
+ audio and lyrics text.
7
+ """
8
+
9
+ import json
10
+ import random
11
+ import subprocess
12
+ import tempfile
13
+ from pathlib import Path
14
+
15
+
16
+ def _get_audio_path(run_dir: Path) -> Path:
17
+ """Find the original audio file one level above the run directory."""
18
+ song_dir = run_dir.parent
19
+ for ext in [".wav", ".mp3", ".flac", ".m4a"]:
20
+ candidates = list(song_dir.glob(f"*{ext}"))
21
+ if candidates:
22
+ return candidates[0]
23
+ raise FileNotFoundError(f"No audio file found in {song_dir}")
24
+
25
+
26
def _get_clip_duration(clip_path: Path) -> float:
    """Return the duration of *clip_path* in seconds, read via ffprobe."""
    probe = subprocess.run(
        ["ffprobe", "-v", "error",
         "-show_entries", "format=duration",
         "-of", "csv=p=0",
         str(clip_path)],
        capture_output=True, text=True, check=True,
    )
    return float(probe.stdout.strip())
35
+
36
+
37
def _get_clip_fps(clip_path: Path) -> float:
    """Return the frame rate of *clip_path*'s first video stream as a float."""
    probe = subprocess.run(
        ["ffprobe", "-v", "error",
         "-select_streams", "v:0",
         "-show_entries", "stream=r_frame_rate",
         "-of", "csv=p=0",
         str(clip_path)],
        capture_output=True, text=True, check=True,
    )
    # r_frame_rate is reported as a rational, e.g. "30000/1001"
    numerator, denominator = probe.stdout.strip().split("/")
    return int(numerator) / int(denominator)
48
+
49
+
50
def _trim_clip(clip_path: Path, start: float, duration: float, output_path: Path):
    """Re-encode *clip_path* from *start* for *duration* seconds (audio stripped)."""
    trim_cmd = [
        "ffmpeg", "-y",
        "-ss", f"{start:.3f}",
        "-i", str(clip_path),
        "-t", f"{duration:.3f}",
        "-c:v", "libx264", "-preset", "fast",
        "-an",
        str(output_path),
    ]
    subprocess.run(trim_cmd, check=True, capture_output=True)
62
+
63
+
64
# ---------------------------------------------------------------------------
# Ken Burns effects — subtle pan/zoom applied per slot for added motion
# ---------------------------------------------------------------------------

# Total zoom travel over a clip's duration (0.45 = 45% — fairly pronounced).
_KB_ZOOM = 0.45

# Effects are cycled per slot, in this order; only zooms are implemented.
KEN_BURNS_EFFECTS = [
    "zoom_in",
    "zoom_out",
]
75
+
76
+
77
def _ken_burns_filter(
    effect: str, n_frames: int, width: int, height: int,
) -> str:
    """Build an FFmpeg filter for a smooth Ken Burns zoom effect on video.

    Upscales the video by the ``UP`` factor (8x) before applying zoompan
    with d=1 (one output frame per input frame), then scales back to the
    original size. The upscale makes integer rounding in zoompan
    negligible, eliminating visible jitter.

    Args:
        effect: "zoom_in", "zoom_out", or anything else for a plain rescale.
        n_frames: Number of frames the zoom is spread over.
        width: Output width in pixels.
        height: Output height in pixels.

    Returns:
        An FFmpeg video-filter string.
    """
    z = _KB_ZOOM
    N = max(n_frames, 1)  # guard against zero-length slots
    W, H = width, height
    # Upscale factor — higher = smoother but slower
    UP = 8
    UW, UH = W * UP, H * UP

    if effect == "zoom_in":
        zoom_expr = f"1+{z}*on/{N}"
    elif effect == "zoom_out":
        zoom_expr = f"1+{z}-{z}*on/{N}"
    else:
        # Unknown effect: no motion, just normalize to the target size
        return f"scale={W}:{H}"

    return (
        f"scale={UW}:{UH}:flags=lanczos,"
        f"zoompan=z='{zoom_expr}':"
        f"x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':"
        f"d=1:s={UW}x{UH},"
        f"scale={W}:{H}:flags=lanczos"
    )
108
+
109
+
110
def _get_clip_dimensions(clip_path: Path) -> tuple[int, int]:
    """Return (width, height) of *clip_path*'s first video stream."""
    probe = subprocess.run(
        ["ffprobe", "-v", "error", "-select_streams", "v:0",
         "-show_entries", "stream=width,height",
         "-of", "csv=s=x:p=0", str(clip_path)],
        capture_output=True, text=True, check=True,
    )
    width, height = probe.stdout.strip().split("x")
    return int(width), int(height)
120
+
121
+
122
def _split_clip(clip_path: Path, clip_id: int) -> dict:
    """Register a clip's two halves without pre-splitting.

    The "first" half plays from the start, the "second" half plays from
    the end (offset back by the slot duration at trim time), making the
    two halves maximally different — there is no fixed midpoint split.

    Returns:
        Dict with the original path and full duration recorded for each half.
    """
    full_duration = _get_clip_duration(clip_path)
    return {
        "clip_id": clip_id,
        "first": clip_path,
        "second": clip_path,
        "first_duration": full_duration,
        "second_duration": full_duration,
    }
140
+
141
+
142
+ def _build_sub_segments(segments: list[dict], drop_time: float | None) -> list[dict]:
143
+ """Build the final timeline of sub-segments.
144
+
145
+ Before the drop: one slot per 4-beat segment.
146
+ After the drop: each 4-beat segment splits into two 2-beat slots
147
+ using the beat timestamps stored in the segment.
148
+ """
149
+ sub_segments = []
150
+
151
+ for seg in segments:
152
+ beats = seg.get("beats", [seg["start"], seg["end"]])
153
+ is_after_drop = drop_time is not None and seg["start"] >= drop_time
154
+
155
+ if is_after_drop and len(beats) >= 3:
156
+ # Split at midpoint beat (beat 2 of 4)
157
+ mid_idx = len(beats) // 2
158
+ mid_time = beats[mid_idx]
159
+
160
+ sub_segments.append({
161
+ "start": seg["start"],
162
+ "end": mid_time,
163
+ "duration": round(mid_time - seg["start"], 3),
164
+ "lyrics": seg.get("lyrics", ""),
165
+ "parent_segment": seg["segment"],
166
+ })
167
+ sub_segments.append({
168
+ "start": mid_time,
169
+ "end": seg["end"],
170
+ "duration": round(seg["end"] - mid_time, 3),
171
+ "lyrics": "", # lyrics stay on the first half
172
+ "parent_segment": seg["segment"],
173
+ })
174
+ else:
175
+ # Before drop: one slot for the full 4-beat segment
176
+ sub_segments.append({
177
+ "start": seg["start"],
178
+ "end": seg["end"],
179
+ "duration": seg["duration"],
180
+ "lyrics": seg.get("lyrics", ""),
181
+ "parent_segment": seg["segment"],
182
+ })
183
+
184
+ return sub_segments
185
+
186
+
187
+ def _shuffle_with_distance(pool: list[tuple], n_slots: int) -> list[tuple]:
188
+ """Select n_slots sub-clips maximising clip diversity and spacing.
189
+
190
+ Shuffles clip IDs once, then repeats that order to fill all slots.
191
+ First pass uses "first" halves, second pass uses "second" halves.
192
+ Same clip is always exactly n_clips positions apart β€” maximum spacing.
193
+
194
+ Each item is (clip_id, half_label, path, duration).
195
+ """
196
+ by_clip: dict[int, list[tuple]] = {}
197
+ for item in pool:
198
+ by_clip.setdefault(item[0], []).append(item)
199
+
200
+ clip_ids = list(by_clip.keys())
201
+ random.shuffle(clip_ids)
202
+
203
+ # Repeat the shuffled order: [4,5,1,2,6,3, 4,5,1,2,6,3, ...]
204
+ result = []
205
+ cycle = 0
206
+ while len(result) < n_slots:
207
+ for cid in clip_ids:
208
+ if len(result) >= n_slots:
209
+ break
210
+ halves = by_clip[cid]
211
+ # First cycle uses "first" half, second cycle uses "second", etc.
212
+ half_idx = cycle % len(halves)
213
+ result.append(halves[half_idx])
214
+ cycle += 1
215
+
216
+ return result
217
+
218
+
219
# Font registry — maps display names to .ttf filenames in fonts/
FONTS = {
    "Bebas Neue": "BebasNeue-Regular.ttf",
    "Teko": "Teko-Bold.ttf",
    "Russo One": "RussoOne-Regular.ttf",
    "Staatliches": "Staatliches-Regular.ttf",
}

# Fallbacks used when the caller passes no (or an unknown) font/color.
DEFAULT_FONT = "Bebas Neue"
DEFAULT_FONT_COLOR = "#FFF7D4"

# fonts/ directory sits next to src/ at the project root.
_FONTS_DIR = Path(__file__).resolve().parent.parent / "fonts"
231
+
232
+
233
def font_names() -> list[str]:
    """Return the display names of all registered lyric fonts."""
    return [name for name in FONTS]
236
+
237
+
238
def _get_font_path(font_name: str) -> Path:
    """Resolve a font display name to its .ttf path, falling back to the default."""
    return _FONTS_DIR / FONTS.get(font_name, FONTS[DEFAULT_FONT])
242
+
243
+
244
# Static "listen on Spotify" badge image, overlaid alongside the cover art.
_SPOTIFY_BADGE = Path(__file__).resolve().parent.parent / "assets" / "spotify_badge.png"
245
+
246
+
247
def _add_lyrics_overlay(
    video_path: Path,
    segments: list[dict],
    output_path: Path,
    audio_offset: float,
    font_name: str = DEFAULT_FONT,
    font_color: str = DEFAULT_FONT_COLOR,
    cover_art: Path | None = None,
    drop_time: float | None = None,
    song_name: str = "",
):
    """Add lyrics text and optional cover art overlay using FFmpeg filters.

    Args:
        video_path: Input video (already has audio muxed in).
        segments: Segment dicts; word-level timings read from seg["words"].
        output_path: Where to write the overlaid video.
        audio_offset: Absolute song time of the video's first frame —
            subtracted from word timestamps to get video-relative times.
        font_name: Display name resolved via the FONTS registry.
        font_color: Hex color for all drawn text.
        cover_art: Optional cover image; shown from the drop onwards.
        drop_time: Absolute song time of the drop (required for cover art).
        song_name: Title drawn under the cover art as '"<name>" out now!'.
    """
    font_path = _get_font_path(font_name)

    # If cover art provided, lyrics stop at the drop
    lyrics_cutoff = None
    if cover_art is not None and drop_time is not None:
        lyrics_cutoff = drop_time

    # Collect all words with timestamps
    all_words = []
    for seg in segments:
        for word_info in seg.get("words", []):
            word = word_info["word"].strip().lower()
            if not word:
                continue
            w_start = word_info["start"]
            w_end = word_info["end"]
            # Skip words that start after the cutoff
            if lyrics_cutoff is not None and w_start >= lyrics_cutoff:
                continue
            # Clamp end to cutoff for words that span the drop
            if lyrics_cutoff is not None and w_end > lyrics_cutoff:
                w_end = lyrics_cutoff
            all_words.append({"word": word, "start": w_start, "end": w_end})

    # Close small gaps: both words meet in the middle of the gap
    gap_threshold = 0.5
    for i in range(len(all_words) - 1):
        gap = all_words[i + 1]["start"] - all_words[i]["end"]
        if 0 < gap < gap_threshold:
            mid = all_words[i]["end"] + gap / 2
            all_words[i]["end"] = mid
            all_words[i + 1]["start"] = mid

    # Build drawtext filter chain — one filter per word, timed to speech
    drawtext_filters = []
    for w in all_words:
        # Escape characters that are special to ffmpeg's drawtext/filtergraph
        # syntax (apostrophes are swapped for a typographic quote instead,
        # since single quotes terminate the filter argument).
        escaped = (w["word"]
                   .replace("\\", "\\\\")
                   .replace("'", "\u2019")
                   .replace('"', '\\"')
                   .replace(":", "\\:")
                   .replace("%", "%%")
                   .replace("[", "\\[")
                   .replace("]", "\\]"))

        start = w["start"] - audio_offset
        end = w["end"] - audio_offset

        drawtext_filters.append(
            f"drawtext=text='{escaped}'"
            f":fontfile='{font_path}'"
            f":fontsize=36"
            f":fontcolor={font_color}"
            f":x=(w-text_w)/2:y=(h-text_h)/2"
            f":enable='between(t,{start:.3f},{end:.3f})'"
        )

    has_cover = cover_art is not None and drop_time is not None
    has_lyrics = len(drawtext_filters) > 0

    # Nothing to draw: pass the video through untouched (stream copy)
    if not has_cover and not has_lyrics:
        subprocess.run([
            "ffmpeg", "-y", "-i", str(video_path),
            "-c", "copy", str(output_path),
        ], check=True, capture_output=True)
        return

    if has_cover:
        drop_start = drop_time - audio_offset
        enable = f"enable='gte(t,{drop_start:.3f})'"

        # --- Cover art layout (change these to adjust) ---
        art_h = 270  # cover art height in px
        art_y_offset = 10  # px below center (positive = down)
        badge_h = 56  # spotify badge height in px

        # Probe video height for position calculations
        vid_h = int(subprocess.run([
            "ffprobe", "-v", "error", "-select_streams", "v:0",
            "-show_entries", "stream=height", "-of", "csv=p=0",
            str(video_path),
        ], capture_output=True, text=True, check=True).stdout.strip())
        art_center = vid_h / 2 + art_y_offset
        art_top = art_center - art_h / 2
        art_bottom = art_center + art_h / 2

        # Square = 9:16 crop region (side = vid_h * 9/16)
        sq_side = vid_h * 9 / 16
        sq_top = (vid_h - sq_side) / 2
        sq_bottom = (vid_h + sq_side) / 2

        # Badge centered between square top and art top
        badge_center_y = (sq_top + art_top) / 2
        badge_y = int(badge_center_y - badge_h / 2)

        # Title centered between art bottom and square bottom
        title_center_y = int((art_bottom + sq_bottom) / 2)

        art_overlay_y = int(art_center - art_h / 2)

        # filter_complex graph: scale the art and badge inputs, then overlay
        # them on the main video, gated by the drop-time enable expression
        parts = [
            f"[1:v]scale=-2:{art_h}:flags=lanczos[art]",
            f"[2:v]scale=-2:{badge_h}:flags=lanczos[badge]",
            f"[0:v][art]overlay=(W-w)/2:{art_overlay_y}:{enable}[v1]",
            f"[v1][badge]overlay=(W-w)/2:{badge_y}:{enable}",
        ]

        # Add song title drawtext below cover art
        title_escaped = (song_name
                         .replace("\\", "\\\\")
                         .replace("'", "\u2019")
                         .replace('"', '\\"')
                         .replace(":", "\\:")
                         .replace("%", "%%"))
        title_text = f'\\"{title_escaped}\\" out now!'.lower()
        parts[-1] += (
            f",drawtext=text='{title_text}'"
            f":fontfile='{font_path}'"
            f":fontsize=40"
            f":fontcolor={font_color}"
            f":x=(w-text_w)/2:y={title_center_y}-text_h/2"
            f":{enable}"
        )

        # Chain drawtext lyrics filters
        if has_lyrics:
            parts[-1] += "," + ",".join(drawtext_filters)
        filter_chain = ";".join(parts)

        cmd = [
            "ffmpeg", "-y",
            "-i", str(video_path),
            "-i", str(cover_art),
            "-i", str(_SPOTIFY_BADGE),
            "-filter_complex", filter_chain,
            "-c:v", "libx264", "-preset", "fast",
            "-c:a", "copy",
            str(output_path),
        ]
        subprocess.run(cmd, check=True, capture_output=True)
    else:
        # Lyrics only, no cover art
        filter_chain = ",".join(drawtext_filters)
        subprocess.run([
            "ffmpeg", "-y",
            "-i", str(video_path),
            "-vf", filter_chain,
            "-c:v", "libx264", "-preset", "fast",
            "-c:a", "copy",
            str(output_path),
        ], check=True, capture_output=True)
410
+
411
+
412
def assemble(
    run_dir: str | Path,
    audio_path: str | Path | None = None,
    font_name: str = DEFAULT_FONT,
    font_color: str = DEFAULT_FONT_COLOR,
    cover_art: str | Path | None = None,
) -> Path:
    """Assemble final video with dynamic pacing, clip shuffling, and lyrics.

    Pipeline: register clip halves -> build beat-aligned slot timeline ->
    shuffle halves into slots -> frame-accurate trim with Ken Burns motion ->
    concat -> mux audio -> lyrics/cover overlay -> 9:16 crop.

    Args:
        run_dir: Run directory containing clips/, segments.json, drop.json.
        audio_path: Path to the original audio. Auto-detected if None.
        font_name: Display name of the font for lyrics overlay.
        font_color: Hex color for lyrics text (e.g. '#FFF7D4').
        cover_art: Path to cover art image. Overlayed from the drop onwards.

    Returns:
        Path to the final video file.
    """
    run_dir = Path(run_dir)
    clips_dir = run_dir / "clips"
    output_dir = run_dir / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(run_dir / "segments.json") as f:
        segments = json.load(f)

    # Load drop time
    drop_time = None
    drop_path = run_dir / "drop.json"
    if drop_path.exists():
        with open(drop_path) as f:
            drop_time = json.load(f).get("drop_time")
        print(f" Drop at {drop_time:.3f}s")
    else:
        print(" No drop detected — using uniform pacing")

    if audio_path is None:
        audio_path = _get_audio_path(run_dir)
    audio_path = Path(audio_path)

    # --- Step 1: Register clip halves (no pre-splitting needed) ---
    sub_clips = []  # list of (clip_id, half, path, full_duration)
    for seg in segments:
        idx = seg["segment"]
        clip_path = clips_dir / f"clip_{idx:03d}.mp4"
        if not clip_path.exists():
            # Missing clips are tolerated — the timeline just reuses others
            print(f" Warning: {clip_path.name} not found, skipping")
            continue

        halves = _split_clip(clip_path, idx)
        sub_clips.append((idx, "first", halves["first"], halves["first_duration"]))
        sub_clips.append((idx, "second", halves["second"], halves["second_duration"]))
        print(f" Registered {clip_path.name} ({halves['first_duration']:.1f}s)")

    if not sub_clips:
        raise FileNotFoundError(f"No clips found in {clips_dir}")

    # --- Step 2: Build sub-segment timeline ---
    sub_segments = _build_sub_segments(segments, drop_time)
    print(f" Timeline: {len(sub_segments)} slots "
          f"({len([s for s in sub_segments if s['duration'] < 1.5])} fast cuts)")

    # --- Step 3: Shuffle sub-clips into slots ---
    assigned = _shuffle_with_distance(sub_clips.copy(), n_slots=len(sub_segments))

    # --- Step 4: Frame-accurate trim of each sub-clip to slot duration ---
    # Detect FPS from first available sub-clip
    fps = _get_clip_fps(assigned[0][2])
    print(f" Source FPS: {fps}")

    trimmed_dir = run_dir / "clips_trimmed"
    trimmed_dir.mkdir(exist_ok=True)
    trimmed_paths = []

    # Get clip dimensions from the first available clip (all clips share resolution)
    clip_width, clip_height = _get_clip_dimensions(assigned[0][2])
    print(f" Clip resolution: {clip_width}x{clip_height}")

    # Track cumulative frames to prevent drift between cuts and beats
    cumulative_frames = 0
    cumulative_target = 0.0

    for i, (sub_seg, (clip_id, half, clip_path, clip_dur)) in enumerate(
        zip(sub_segments, assigned)
    ):
        slot_dur = sub_seg["duration"]
        # Round against the running total (not per-slot) so rounding errors
        # cannot accumulate across the timeline
        cumulative_target += min(slot_dur, clip_dur)
        target_frame = round(cumulative_target * fps)
        n_frames = max(1, target_frame - cumulative_frames)
        cumulative_frames = target_frame

        # "first" half starts from 0, "second" half starts from end minus slot duration
        # This makes the two halves show maximally different frames
        if half == "second":
            ss = max(0, clip_dur - slot_dur)
        else:
            ss = 0

        # Apply Ken Burns effect — cycle through effects per slot
        effect = KEN_BURNS_EFFECTS[i % len(KEN_BURNS_EFFECTS)]
        vf = _ken_burns_filter(effect, n_frames, clip_width, clip_height)

        trimmed_path = trimmed_dir / f"slot_{i:03d}.mp4"
        cmd = [
            "ffmpeg", "-y",
            "-ss", f"{ss:.3f}",
            "-i", str(clip_path),
            "-frames:v", str(n_frames),
            "-vf", vf,
            "-c:v", "libx264", "-preset", "fast",
            "-r", str(int(fps)),
            "-an",
            str(trimmed_path),
        ]
        subprocess.run(cmd, check=True, capture_output=True)
        trimmed_paths.append(trimmed_path)
        actual_dur = n_frames / fps
        print(f" Slot {i}: clip {clip_id} ({half}, ss={ss:.1f}s, {effect}) → "
              f"{n_frames}f/{actual_dur:.3f}s (target {slot_dur:.3f}s)")

    # --- Step 5: Concatenate (copy, no re-encode to preserve timing) ---
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, dir=str(run_dir)
    ) as f:
        for p in trimmed_paths:
            f.write(f"file '{p.resolve()}'\n")
        concat_list = f.name

    concat_path = output_dir / "video_only.mp4"
    subprocess.run([
        "ffmpeg", "-y",
        "-f", "concat", "-safe", "0",
        "-i", concat_list,
        "-c", "copy",
        str(concat_path),
    ], check=True, capture_output=True)

    # --- Step 6: Overlay audio ---
    audio_start = segments[0]["start"]
    video_duration = cumulative_frames / fps  # actual frame-accurate duration

    with_audio_path = output_dir / "with_audio.mp4"
    subprocess.run([
        "ffmpeg", "-y",
        "-i", str(concat_path),
        "-ss", f"{audio_start:.3f}",
        "-i", str(audio_path),
        "-t", f"{video_duration:.3f}",
        "-c:v", "copy",
        "-c:a", "aac", "-b:a", "192k",
        "-map", "0:v:0", "-map", "1:a:0",
        "-shortest",
        str(with_audio_path),
    ], check=True, capture_output=True)

    # --- Step 7: Lyrics + cover art overlay ---
    overlay_path = output_dir / "with_overlay.mp4"
    cover_path = Path(cover_art) if cover_art else None
    song_name = run_dir.parent.name
    _add_lyrics_overlay(with_audio_path, segments, overlay_path, audio_start,
                        font_name=font_name, font_color=font_color,
                        cover_art=cover_path, drop_time=drop_time,
                        song_name=song_name)

    # --- Step 8: Crop to exact 9:16 ---
    # floor(...*... /2)*2 keeps the crop width even, as libx264 requires
    final_path = output_dir / "final.mp4"
    subprocess.run([
        "ffmpeg", "-y",
        "-i", str(overlay_path),
        "-vf", "crop=2*floor(ih*9/16/2):ih:(iw-2*floor(ih*9/16/2))/2:0",
        "-c:v", "libx264", "-preset", "fast",
        "-c:a", "copy",
        str(final_path),
    ], check=True, capture_output=True)

    # Clean up
    Path(concat_list).unlink(missing_ok=True)

    print(f"\nFinal video: {final_path}")
    print(f" Duration: {video_duration:.2f}s")
    print(f" Slots: {len(sub_segments)} ({len(segments)} original segments)")
    return final_path
595
+
596
+
597
def run(
    run_dir: str | Path,
    font_name: str = DEFAULT_FONT,
    font_color: str = DEFAULT_FONT_COLOR,
    cover_art: str | Path | None = None,
) -> Path:
    """Assemble final video from clips + audio.

    Thin pipeline entry point — delegates straight to :func:`assemble`
    (audio path is auto-detected from the run directory's parent).

    Args:
        run_dir: Run directory (e.g. data/Gone/run_001/).
        font_name: Display name of the font for lyrics overlay.
        font_color: Hex color for lyrics text.
        cover_art: Path to cover art image (optional).

    Returns:
        Path to final video.
    """
    print("Assembling final video...")
    return assemble(run_dir, font_name=font_name, font_color=font_color,
                    cover_art=cover_art)
617
+
618
+
619
if __name__ == "__main__":
    import sys

    # CLI entry point: expects a single run-directory argument.
    if len(sys.argv) < 2:
        print("Usage: python -m src.assembler <run_dir>")
        print(" e.g. python -m src.assembler data/Gone/run_001")
        sys.exit(1)

    run(sys.argv[1])
src/beat_detector.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Beat/kick detection using madmom's RNN beat tracker."""
2
+
3
+ import json
4
+ import subprocess
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import numpy as np
10
+ from madmom.features.beats import DBNBeatTrackingProcessor, RNNBeatProcessor
11
+
12
# Bandpass filter band (Hz) used to isolate kick-drum transients.
# NOTE: as configured below the band is 50-500 Hz (low-pass at 500, not 200).
HIGHPASS_CUTOFF = 50
LOWPASS_CUTOFF = 500
15
+
16
+
17
def _bandpass_filter(input_path: Path) -> Path:
    """Apply a HIGHPASS_CUTOFF–LOWPASS_CUTOFF Hz bandpass to isolate kick transients.

    Returns path to a temporary filtered WAV file. The file is created with
    delete=False — the caller is responsible for unlinking it.
    """
    filtered = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    filtered.close()  # close the handle so ffmpeg can overwrite the file
    subprocess.run([
        "ffmpeg", "-y",
        "-i", str(input_path),
        "-af", f"highpass=f={HIGHPASS_CUTOFF},lowpass=f={LOWPASS_CUTOFF}",
        str(filtered.name),
    ], check=True, capture_output=True)
    return Path(filtered.name)
31
+
32
+
33
def detect_beats(
    drum_stem_path: str | Path,
    min_bpm: float = 55.0,
    max_bpm: float = 215.0,
    transition_lambda: float = 100,
    fps: int = 1000,
) -> np.ndarray:
    """Detect beat timestamps from a drum stem using madmom.

    Uses an ensemble of bidirectional LSTMs to produce a beat activation
    function, then a Dynamic Bayesian Network to decode beat positions.

    Args:
        drum_stem_path: Path to the isolated drum stem WAV file.
        min_bpm: Minimum expected tempo. Narrow this if you know the song's
            approximate BPM for better accuracy.
        max_bpm: Maximum expected tempo.
        transition_lambda: Tempo smoothness — higher values penalise tempo
            changes more (100 = very steady, good for most pop/rock).
        fps: Frames per second for the DBN decoder. The RNN outputs at 100fps;
            higher values interpolate for finer timestamp resolution (1ms at 1000fps).

    Returns:
        1D numpy array of beat timestamps in seconds, sorted chronologically.
    """
    drum_stem_path = Path(drum_stem_path)

    # Step 0: Bandpass filter to isolate the kick drum band
    # (see HIGHPASS_CUTOFF / LOWPASS_CUTOFF above)
    filtered_path = _bandpass_filter(drum_stem_path)

    # Step 1: RNN produces beat activation function (probability per frame at 100fps)
    act_proc = RNNBeatProcessor()
    activations = act_proc(str(filtered_path))

    # Clean up temp file
    filtered_path.unlink(missing_ok=True)

    # Step 2: Interpolate to higher fps for finer timestamp resolution (1ms at 1000fps)
    if fps != 100:
        from scipy.interpolate import interp1d
        n_frames = len(activations)
        t_orig = np.linspace(0, n_frames / 100, n_frames, endpoint=False)
        n_new = int(n_frames * fps / 100)
        t_new = np.linspace(0, n_frames / 100, n_new, endpoint=False)
        activations = interp1d(t_orig, activations, kind="cubic", fill_value="extrapolate")(t_new)
        activations = np.clip(activations, 0, None)  # cubic spline can go negative

    # Step 3: DBN decodes activations into beat timestamps
    # correct=False lets the DBN place beats using its own high-res state space
    # instead of snapping to the coarse 100fps activation peaks
    beat_proc = DBNBeatTrackingProcessor(
        min_bpm=min_bpm,
        max_bpm=max_bpm,
        transition_lambda=transition_lambda,
        fps=fps,
        correct=False,
    )
    beats = beat_proc(activations)

    return beats
93
+
94
+
95
def detect_drop(
    audio_path: str | Path,
    beat_times: np.ndarray,
    window_sec: float = 0.5,
) -> float:
    """Find the beat where the biggest energy jump occurs (the drop).

    Computes RMS energy in a window around each beat and returns the beat
    with the largest increase compared to the previous beat.

    Args:
        audio_path: Path to the full mix audio file.
        beat_times: Array of beat timestamps in seconds.
        window_sec: Duration of the analysis window around each beat.

    Returns:
        Timestamp (seconds) of the detected drop beat.
    """
    # Deferred import: librosa is heavy and only needed by this function
    import librosa

    y, sr = librosa.load(str(audio_path), sr=None, mono=True)
    half_win = int(window_sec / 2 * sr)

    # RMS energy in a window centered on each beat (clamped to signal bounds)
    rms_values = []
    for t in beat_times:
        center = int(t * sr)
        start = max(0, center - half_win)
        end = min(len(y), center + half_win)
        segment = y[start:end]
        rms = np.sqrt(np.mean(segment ** 2)) if len(segment) > 0 else 0.0
        rms_values.append(rms)

    rms_values = np.array(rms_values)

    # Find largest positive jump between consecutive beats
    diffs = np.diff(rms_values)
    drop_idx = int(np.argmax(diffs)) + 1  # +1 because diff shifts by one
    drop_time = float(beat_times[drop_idx])

    print(f" Drop detected at beat {drop_idx + 1}: {drop_time:.3f}s "
          f"(energy jump: {diffs[drop_idx - 1]:.4f})")
    return drop_time
137
+
138
+
139
def select_beats(
    beats: np.ndarray,
    max_duration: float = 15.0,
    min_interval: float = 0.3,
) -> np.ndarray:
    """Select a subset of beats for video generation.

    Keeps only beats within *max_duration* seconds, then greedily drops any
    beat closer than *min_interval* to the previously kept one (to avoid
    generating too many frames).

    Args:
        beats: Array of beat timestamps in seconds.
        max_duration: Maximum video duration in seconds.
        min_interval: Minimum time between selected beats in seconds.
            Beats closer together than this are skipped.

    Returns:
        Filtered array of beat timestamps.
    """
    if len(beats) == 0:
        return beats

    # Trim to the duration limit first
    in_range = beats[beats <= max_duration]
    if len(in_range) == 0:
        return in_range

    # Greedy pass: keep a beat only if far enough from the last kept one
    kept = [in_range[0]]
    for timestamp in in_range[1:]:
        if timestamp - kept[-1] >= min_interval:
            kept.append(timestamp)

    return np.array(kept)
174
+
175
+
176
+ def save_beats(
177
+ beats: np.ndarray,
178
+ output_path: str | Path,
179
+ ) -> Path:
180
+ """Save beat timestamps to a JSON file.
181
+
182
+ Format matches the project convention (same style as lyrics.json):
183
+ a list of objects with beat index and timestamp.
184
+
185
+ Args:
186
+ beats: Array of beat timestamps in seconds.
187
+ output_path: Path to save the JSON file.
188
+
189
+ Returns:
190
+ Path to the saved JSON file.
191
+ """
192
+ output_path = Path(output_path)
193
+ output_path.parent.mkdir(parents=True, exist_ok=True)
194
+
195
+ data = [
196
+ {"beat": i + 1, "time": round(float(t), 3)}
197
+ for i, t in enumerate(beats)
198
+ ]
199
+
200
+ with open(output_path, "w") as f:
201
+ json.dump(data, f, indent=2)
202
+
203
+ return output_path
204
+
205
+
206
def run(
    drum_stem_path: str | Path,
    output_dir: Optional[str | Path] = None,
    min_bpm: float = 55.0,
    max_bpm: float = 215.0,
) -> dict:
    """Full beat detection pipeline: detect, select, and save.

    Args:
        drum_stem_path: Path to the isolated drum stem WAV file.
        output_dir: Directory to save beats.json. Defaults to the
            parent of the drum stem's parent (e.g. data/Gone/ if
            stem is at data/Gone/stems/drums.wav).
        min_bpm: Minimum expected tempo.
        max_bpm: Maximum expected tempo.

    Returns:
        Dict with 'all_beats', 'selected_beats', 'beats_path', and
        'drop_time' (None when no full-mix audio was found or there were
        too few beats to analyse).
    """
    drum_stem_path = Path(drum_stem_path)

    if output_dir is None:
        # stems/drums.wav -> parent is stems/, parent.parent is data/Gone/
        output_dir = drum_stem_path.parent.parent
    output_dir = Path(output_dir)

    all_beats = detect_beats(drum_stem_path, min_bpm=min_bpm, max_bpm=max_bpm)
    selected = select_beats(all_beats)

    # Detect drop using the full mix audio (one level above stems/)
    song_dir = output_dir.parent if output_dir.name.startswith("run_") else output_dir
    audio_path = None
    for ext in [".wav", ".mp3", ".flac", ".m4a"]:
        candidates = list(song_dir.glob(f"*{ext}"))
        if candidates:
            audio_path = candidates[0]
            break

    # Drop detection needs at least 3 beats (np.diff over the RMS series)
    drop_time = None
    if audio_path and len(all_beats) > 2:
        drop_time = detect_drop(audio_path, all_beats)

    beats_path = save_beats(all_beats, output_dir / "beats.json")

    # Save drop time alongside beats
    if drop_time is not None:
        drop_path = output_dir / "drop.json"
        with open(drop_path, "w") as f:
            json.dump({"drop_time": round(drop_time, 3)}, f, indent=2)

    return {
        "all_beats": all_beats,
        "selected_beats": selected,
        "beats_path": beats_path,
        "drop_time": drop_time,
    }
262
+
263
+
264
if __name__ == "__main__":
    import sys

    # CLI entry point: run beat detection on a drum stem and print results.
    if len(sys.argv) < 2:
        print("Usage: python -m src.beat_detector <drum_stem.wav>")
        sys.exit(1)

    result = run(sys.argv[1])
    all_beats = result["all_beats"]
    selected = result["selected_beats"]

    print(f"Detected {len(all_beats)} beats (saved to {result['beats_path']})")
    print(f"Selected {len(selected)} beats (max 15s, min 0.3s apart):")
    for i, t in enumerate(selected):
        print(f" Beat {i + 1}: {t:.3f}s")
src/image_generator_api.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image generation using SDXL + LoRA styles via fal.ai API.
2
+
3
+ API counterpart to image_generator_hf.py (on-device diffusers).
4
+ Uses the fal-ai/lora endpoint which accepts HuggingFace LoRA repo IDs
5
+ directly, so styles.py works unchanged.
6
+
7
+ Set FAL_KEY env var before use.
8
+ """
9
+
10
+ import json
11
+ import time
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ import requests
16
+ from dotenv import load_dotenv
17
+
18
+ from src.styles import get_style
19
+
20
+ load_dotenv()
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Config β€” matches image_generator_hf.py output
24
+ # ---------------------------------------------------------------------------
25
+
26
+ FAL_MODEL_ID = "fal-ai/lora"
27
+
28
+ BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
29
+
30
+ WIDTH = 768
31
+ HEIGHT = 1344
32
+ NUM_STEPS = 30
33
+ GUIDANCE_SCALE = 7.5
34
+
35
+
36
+ def _build_loras(style: dict) -> list[dict]:
37
+ """Build the LoRA list for the fal.ai API from a style dict.
38
+
39
+ Note: Hyper-SD speed LoRA is NOT used here (it's an on-device optimization
40
+ requiring specific scheduler config). fal.ai runs on fast GPUs so we use
41
+ standard settings (30 steps, DPM++ 2M Karras) instead.
42
+ """
43
+ loras = []
44
+
45
+ if style["source"] is not None:
46
+ # Pass HF repo ID directly β€” fal.ai resolves it internally.
47
+ # Full URLs to /resolve/main/ can fail with redirect issues.
48
+ loras.append({"path": style["source"], "scale": style["weight"]})
49
+
50
+ return loras
51
+
52
+
53
def _download_image(url: str, output_path: Path, retries: int = 3) -> Path:
    """Download an image from *url* to *output_path*, retrying transient errors.

    Args:
        url: Image URL returned by the fal.ai API.
        output_path: Destination file; parent directories are created.
        retries: Number of attempts before giving up (must be >= 1).

    Returns:
        The output path on success.

    Raises:
        ValueError: If retries < 1 (previously this returned None silently).
        requests.RequestException: If all attempts fail.
    """
    if retries < 1:
        # Bug fix: with retries <= 0 the original loop body never ran and the
        # function silently returned None, breaking callers expecting a Path.
        raise ValueError("retries must be >= 1")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    for attempt in range(retries):
        try:
            resp = requests.get(url, timeout=120)
            resp.raise_for_status()
            with open(output_path, "wb") as f:
                f.write(resp.content)
            return output_path
        except (
            requests.exceptions.SSLError,
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,  # transient like the others — retry it too
        ):
            if attempt == retries - 1:
                raise
            print(f"  Download failed (attempt {attempt + 1}), retrying...")
68
+
69
+
70
def generate_image(
    prompt: str,
    negative_prompt: str = "",
    loras: list[dict] | None = None,
    seed: Optional[int] = None,
) -> dict:
    """Run one SDXL generation on fal.ai and return the raw API response.

    Args:
        prompt: SDXL prompt.
        negative_prompt: Negative prompt.
        loras: List of LoRA dicts with 'path' and 'scale'.
        seed: Random seed.

    Returns:
        API response dict with 'images' list and 'seed'.
    """
    # Imported lazily so the module can be imported without fal_client installed.
    import fal_client

    payload = {
        "model_name": BASE_MODEL,
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "image_size": {"width": WIDTH, "height": HEIGHT},
        "num_inference_steps": NUM_STEPS,
        "guidance_scale": GUIDANCE_SCALE,
        "scheduler": "DPM++ 2M Karras",
        "num_images": 1,
        "image_format": "png",
        "enable_safety_checker": False,
    }
    # Optional fields are only sent when provided.
    if loras:
        payload["loras"] = loras
    if seed is not None:
        payload["seed"] = seed

    # subscribe() blocks until the remote job completes.
    return fal_client.subscribe(FAL_MODEL_ID, arguments=payload)
108
+
109
+
110
def generate_all(
    segments: list[dict],
    output_dir: str | Path,
    style_name: str = "Warm Sunset",
    seed: int = 42,
    progress_callback=None,
) -> list[Path]:
    """Generate one image per segment via fal.ai, skipping files that exist.

    Args:
        segments: List of segment dicts (with 'prompt' and 'negative_prompt').
        output_dir: Directory to save images.
        style_name: Style from styles.py registry.
        seed: Base seed (incremented per segment).
        progress_callback: Optional callable(done_index, total) invoked per segment.

    Returns:
        List of saved image paths.
    """
    style = get_style(style_name)
    loras = _build_loras(style)
    trigger = style["trigger"]
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    total = len(segments)
    saved: list[Path] = []
    for seg in segments:
        idx = seg["segment"]
        target = out_dir / f"segment_{idx:03d}.png"

        # Resumable: a previously generated segment is reused as-is.
        if target.exists():
            print(f"  Segment {idx}/{total}: already exists, skipping")
            saved.append(target)
            continue

        full_prompt = seg["prompt"]
        if trigger:
            full_prompt = f"{trigger} style, {full_prompt}"
        negative = seg.get("negative_prompt", "")

        print(f"  Segment {idx}/{total}: generating image (fal.ai)...")
        started = time.time()
        result = generate_image(full_prompt, negative, loras=loras, seed=seed + idx)
        elapsed = time.time() - started

        _download_image(result["images"][0]["url"], target)
        saved.append(target)
        print(f"  Saved {target.name} ({elapsed:.1f}s)")
        if progress_callback:
            progress_callback(idx, total)

    return saved
162
+
163
+
164
def run(
    data_dir: str | Path,
    style_name: str = "Warm Sunset",
    seed: int = 42,
    progress_callback=None,
) -> list[Path]:
    """Full image generation pipeline: read segments, generate via API, save.

    Args:
        data_dir: Run directory containing segments.json.
        style_name: Style from the registry (see src/styles.py).
        seed: Base random seed.
        progress_callback: Optional callable(done_index, total) forwarded on.

    Returns:
        List of saved image paths.
    """
    run_dir = Path(data_dir)

    # Segments were enriched with prompts by an earlier pipeline stage.
    with open(run_dir / "segments.json") as fh:
        segments = json.load(fh)

    image_paths = generate_all(
        segments, run_dir / "images", style_name, seed, progress_callback
    )

    print(f"\nGenerated {len(image_paths)} images in {run_dir / 'images'}")
    return image_paths
189
+
190
+
191
if __name__ == "__main__":
    import os
    import sys

    # CLI entry: <data_dir> is required, style name is optional.
    if len(sys.argv) < 2:
        print("Usage: python -m src.image_generator_api <data_dir> [style_name]")
        print('  e.g. python -m src.image_generator_api data/Gone/run_001 "Warm Sunset"')
        print("\nRequires FAL_KEY environment variable.")
        sys.exit(1)

    # Fail fast before touching the network.
    if not os.getenv("FAL_KEY"):
        print("Error: FAL_KEY environment variable not set.")
        print("Get your key at https://fal.ai/dashboard/keys")
        sys.exit(1)

    chosen_style = sys.argv[2] if len(sys.argv) > 2 else "Warm Sunset"
    run(sys.argv[1], style_name=chosen_style)
src/image_generator_hf.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate images using SDXL + Hyper-SD 8-step + style LoRA from registry.
2
+
3
+ Reads segments.json (with prompts from prompt_generator) and generates
4
+ one 768x1344 (9:16 vertical) image per segment.
5
+
6
+ Pipeline: SDXL base β†’ Hyper-SD 8-step CFG LoRA (speed) β†’ style LoRA (aesthetics)
7
+ """
8
+
9
+ import json
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ import torch
14
+ from diffusers import AutoencoderKL, DDIMScheduler, DiffusionPipeline
15
+ from huggingface_hub import hf_hub_download
16
+
17
+ from src.styles import get_style
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Config
21
+ # ---------------------------------------------------------------------------
22
+
23
+ BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
24
+ VAE_MODEL = "madebyollin/sdxl-vae-fp16-fix"
25
+ HYPER_SD_REPO = "ByteDance/Hyper-SD"
26
+ HYPER_SD_FILE = "Hyper-SDXL-8steps-CFG-lora.safetensors"
27
+
28
+ WIDTH = 768
29
+ HEIGHT = 1344
30
+ NUM_STEPS = 8
31
+ GUIDANCE_SCALE = 5.0
32
+
33
+ HYPER_SD_WEIGHT = 0.125 # official recommendation
34
+
35
+
36
+ def _get_device_and_dtype():
37
+ """Detect best available device and matching dtype."""
38
+ if torch.cuda.is_available():
39
+ return "cuda", torch.float16
40
+ if torch.backends.mps.is_available():
41
+ return "mps", torch.float32 # float32 required for MPS reliability
42
+ return "cpu", torch.float32
43
+
44
+
45
def load_pipeline(style_name: str = "Warm Sunset"):
    """Load SDXL pipeline with Hyper-SD and a style LoRA from the registry.

    Args:
        style_name: Key in STYLES registry. Use "None" for no style LoRA.

    Returns:
        Configured DiffusionPipeline ready for inference.
    """
    # Resolve the style first so an unknown name fails before any downloads.
    style = get_style(style_name)
    device, dtype = _get_device_and_dtype()
    print(f"Loading SDXL pipeline on {device} ({dtype})...")

    # Dedicated VAE (madebyollin/sdxl-vae-fp16-fix) — presumably chosen for
    # fp16 stability; verify against the model card if changing VAE_MODEL.
    vae = AutoencoderKL.from_pretrained(VAE_MODEL, torch_dtype=dtype)

    load_kwargs = {"torch_dtype": dtype, "vae": vae, "use_safetensors": True}
    if dtype == torch.float16:
        # fp16 weight variant only exists (and only makes sense) for half precision.
        load_kwargs["variant"] = "fp16"

    pipe = DiffusionPipeline.from_pretrained(BASE_MODEL, **load_kwargs)

    # Hyper-SD 8-step CFG LoRA (always loaded) — registered under a named
    # adapter so _apply_style can combine it with the style adapter.
    hyper_path = hf_hub_download(HYPER_SD_REPO, HYPER_SD_FILE)
    pipe.load_lora_weights(hyper_path, adapter_name="hyper-sd")

    # Style LoRA from registry (also sets the adapter weights).
    _apply_style(pipe, style)

    # DDIMScheduler with trailing timestep spacing — required for Hyper-SD
    pipe.scheduler = DDIMScheduler.from_config(
        pipe.scheduler.config, timestep_spacing="trailing"
    )

    pipe.to(device)

    if device == "mps":
        # Slicing trades a little speed for lower peak memory on Apple Silicon.
        pipe.enable_attention_slicing()
        pipe.enable_vae_slicing()

    print("Pipeline ready.")
    return pipe
86
+
87
+
88
def _apply_style(pipe, style: dict):
    """Attach the style LoRA (if any) to *pipe* and set adapter weights."""
    source = style["source"]
    if source is None:
        # No style requested — keep only the Hyper-SD speed adapter active.
        pipe.set_adapters(["hyper-sd"], adapter_weights=[HYPER_SD_WEIGHT])
        print("No style LoRA — using base SDXL + Hyper-SD.")
        return

    kwargs = {"adapter_name": "style"}

    # A relative path under the project root means a local safetensors file:
    # diffusers wants the containing directory plus weight_name for those.
    project_root = Path(__file__).resolve().parent.parent
    local_candidate = (project_root / source).resolve()
    if local_candidate.is_file():
        kwargs["weight_name"] = local_candidate.name
        pipe.load_lora_weights(str(local_candidate.parent), **kwargs)
    else:
        # Otherwise treat the source as a HF Hub repo ID.
        if style["weight_name"]:
            kwargs["weight_name"] = style["weight_name"]
        pipe.load_lora_weights(source, **kwargs)

    # Activate both adapters together with their respective strengths.
    pipe.set_adapters(
        ["hyper-sd", "style"],
        adapter_weights=[HYPER_SD_WEIGHT, style["weight"]],
    )
    print(f"Loaded style LoRA: {source}")
114
+
115
+
116
def switch_style(pipe, style_name: str):
    """Switch to a different style LoRA at runtime.

    Unloads all LoRAs then reloads Hyper-SD + new style.
    """
    new_style = get_style(style_name)

    # Drop every adapter, then rebuild the stack from scratch.
    pipe.unload_lora_weights()

    # Hyper-SD must come back first so _apply_style can reference it.
    hyper_path = hf_hub_download(HYPER_SD_REPO, HYPER_SD_FILE)
    pipe.load_lora_weights(hyper_path, adapter_name="hyper-sd")

    _apply_style(pipe, new_style)
    print(f"Switched to style: {style_name}")
132
+
133
+
134
def generate_image(
    pipe,
    prompt: str,
    negative_prompt: str = "",
    seed: Optional[int] = None,
) -> "PIL.Image.Image":
    """Generate a single 768x1344 vertical image."""
    # A CPU generator keeps seeded results reproducible across devices.
    gen = (
        torch.Generator(device="cpu").manual_seed(seed)
        if seed is not None
        else None
    )

    result = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=NUM_STEPS,
        guidance_scale=GUIDANCE_SCALE,
        height=HEIGHT,
        width=WIDTH,
        generator=gen,
    )
    return result.images[0]
154
+
155
+
156
def generate_all(
    segments: list[dict],
    pipe,
    output_dir: str | Path,
    trigger_word: str = "",
    seed: int = 42,
    progress_callback=None,
) -> list[Path]:
    """Generate images for all segments.

    Args:
        segments: List of segment dicts (with 'prompt' and 'negative_prompt').
        pipe: Loaded DiffusionPipeline.
        output_dir: Directory to save images.
        trigger_word: LoRA trigger word appended to prompts.
        seed: Base seed (incremented per segment for variety).
        progress_callback: Optional callable(done_index, total) invoked per segment.

    Returns:
        List of saved image paths.
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    paths = []
    for seg in segments:
        idx = seg["segment"]
        path = output_dir / f"segment_{idx:03d}.png"

        # Resumable: a previously generated segment is reused as-is.
        if path.exists():
            print(f"  Segment {idx}/{len(segments)}: already exists, skipping")
            paths.append(path)
            continue

        prompt = seg["prompt"]
        if trigger_word:
            prompt = f"{trigger_word} style, {prompt}"
        neg = seg.get("negative_prompt", "")

        print(f"  Segment {idx}/{len(segments)}: generating...")
        image = generate_image(pipe, prompt, neg, seed=seed + idx)

        # Fix: `path` was redundantly recomputed here with an identical
        # expression — the value from the top of the loop is reused instead.
        image.save(path)
        paths.append(path)
        print(f"  Saved {path.name}")
        if progress_callback:
            progress_callback(idx, len(segments))

    return paths
205
+
206
+
207
def run(
    data_dir: str | Path,
    style_name: str = "Warm Sunset",
    seed: int = 42,
    progress_callback=None,
) -> list[Path]:
    """Full image generation pipeline: load model, read segments, generate, save.

    Args:
        data_dir: Run directory containing segments.json (e.g. data/Gone/run_001/).
        style_name: Style from the registry (see src/styles.py).
        seed: Base random seed.
        progress_callback: Optional callable(done_index, total) forwarded on.

    Returns:
        List of saved image paths.
    """
    run_dir = Path(data_dir)
    style = get_style(style_name)

    with open(run_dir / "segments.json") as fh:
        segments = json.load(fh)

    pipe = load_pipeline(style_name)
    image_paths = generate_all(
        segments, pipe, run_dir / "images", style["trigger"], seed, progress_callback
    )

    print(f"\nGenerated {len(image_paths)} images in {run_dir / 'images'}")
    return image_paths
234
+
235
+
236
if __name__ == "__main__":
    import sys

    # CLI entry: <data_dir> is required, style name is optional.
    if len(sys.argv) < 2:
        print("Usage: python -m src.image_generator_hf <data_dir> [style_name]")
        print('  e.g. python -m src.image_generator_hf data/Gone/run_001 "Warm Sunset"')
        sys.exit(1)

    chosen_style = sys.argv[2] if len(sys.argv) > 2 else "Warm Sunset"
    run(sys.argv[1], style_name=chosen_style)
src/lyrics_extractor.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """WhisperX wrapper for lyrics extraction with word-level timestamps."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+ import whisperx
8
+
9
+
10
def extract_lyrics(
    vocal_path: str | Path,
    model_name: str = "large-v2",
    device: str = "cpu",
    language: str = "en",
    output_dir: Optional[str | Path] = None,
) -> list[dict]:
    """Extract timestamped lyrics from an isolated vocal stem.

    Args:
        vocal_path: Path to the isolated vocal audio file (data/<song>/stems/vocals.wav).
        model_name: Whisper model size. Default "large-v2" (best for lyrics).
        device: Device to run on ("cpu", "cuda").
        language: Language code for transcription.
        output_dir: Directory to save lyrics.json. Defaults to data/<song>/.

    Returns:
        List of word dicts with keys: "word", "start", "end".
        Example: [{"word": "hello", "start": 0.5, "end": 0.8}, ...]
    """
    vocal_path = str(vocal_path)

    # Load audio
    audio = whisperx.load_audio(vocal_path)

    # Transcribe (compute_type="int8" keeps the memory footprint small —
    # presumably tuned for CPU runs; confirm before switching devices)
    model = whisperx.load_model(model_name, device, compute_type="int8", language=language)
    result = model.transcribe(audio, batch_size=4)
    del model  # free Whisper model before loading alignment model

    # Forced alignment for word-level timestamps
    model_a, metadata = whisperx.load_align_model(language_code=language, device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device)
    del model_a, metadata  # free alignment model

    # Flatten to word list; words the aligner could not timestamp
    # (missing "start"/"end") are dropped
    words = []
    for segment in result["segments"]:
        for word in segment.get("words", []):
            if "start" in word and "end" in word:
                words.append({
                    "word": word["word"].strip(),
                    "start": word["start"],
                    "end": word["end"],
                })

    # Save to JSON in the song directory (stems/ parent = data/<song>/)
    if output_dir is None:
        output_dir = Path(vocal_path).parent.parent
    output_dir = Path(output_dir)

    output_path = output_dir / "lyrics.json"
    with open(output_path, "w") as f:
        json.dump(words, f, indent=2)

    # Explicit collection to reclaim the deleted models' memory promptly
    import gc
    gc.collect()

    return words
69
+
70
+
71
if __name__ == "__main__":
    import sys

    # CLI entry: expects the path to an isolated vocal stem.
    if len(sys.argv) < 2:
        print("Usage: python -m src.lyrics_extractor <vocal_file>")
        sys.exit(1)

    extracted = extract_lyrics(sys.argv[1])
    for w in extracted:
        print(f"{w['start']:6.2f} - {w['end']:6.2f}: {w['word']}")

    # Mirrors the default output location used inside extract_lyrics.
    saved_to = Path(sys.argv[1]).parent.parent / "lyrics.json"
    print(f"\nSaved to {saved_to}")
src/prompt_generator.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate image + video prompts from segments using an LLM.
2
+
3
+ Takes segments.json (lyrics mapped to beat intervals) and produces two
4
+ prompts per segment via two separate LLM calls:
5
+ 1. Image prompt β€” short, SDXL-optimized (≀77 CLIP tokens)
6
+ 2. Video prompt β€” detailed motion/action description for I2V (no token limit)
7
+
8
+ Consistency: LLM keeps all scenes within a shared setting from the style guidance.
9
+ Variety: LLM picks different subjects, camera angles, compositions per segment.
10
+ Narrative: LLM derives an overarching visual story from the lyrics.
11
+ """
12
+
13
+ import json
14
+ import os
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ import anthropic
19
+ from dotenv import load_dotenv
20
+
21
+ load_dotenv()
22
+
23
+ # Camera angles to cycle through for visual variety between cuts
24
+ CAMERA_ANGLES = [
25
+ "wide establishing shot",
26
+ "close-up",
27
+ "aerial view",
28
+ "low angle shot",
29
+ "medium shot",
30
+ "extreme wide shot",
31
+ "over-the-shoulder perspective",
32
+ "dutch angle",
33
+ "tracking shot from the side",
34
+ "bird's eye view",
35
+ "ground-level shot",
36
+ "silhouette against the sky",
37
+ ]
38
+
39
+ # Default quality suffix β€” overridden by style-specific quality_suffix from styles.py
40
+ DEFAULT_QUALITY_SUFFIX = "8K, cinematic, atmospheric, sharp details"
41
+
42
+ NEGATIVE_PROMPT = (
43
+ "text, watermark, logo, blurry, low quality, deformed, "
44
+ "ugly, oversaturated, cartoon, anime"
45
+ )
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # LLM Call 1: Image prompts (short, SDXL-optimized)
49
+ # ---------------------------------------------------------------------------
50
+
51
+ IMAGE_SYSTEM_PROMPT = """\
52
+ You are a music video director. Given song lyrics, a SETTING, and a list of \
53
+ segments (each ~2 seconds long), create a visually compelling shot list for \
54
+ IMAGE generation (Stable Diffusion XL).
55
+
56
+ Rules:
57
+ 1. A SETTING will be provided at the end of these instructions. ALL scenes \
58
+ MUST take place within that setting β€” treat it as the world of a short film. \
59
+ Never leave this world.
60
+ 2. Use the LYRICS to shape the MOOD, ENERGY, and EMOTIONAL ARC of each scene. \
61
+ The lyrics dictate the vibe β€” if they're dark and melancholic, the visuals \
62
+ should feel heavy and somber even within the setting. If they're upbeat, the \
63
+ visuals should feel energetic.
64
+ 3. When lyrics are CONCRETE and naturally fit the setting, lean into them \
65
+ heavily. For example, if the setting is a coastal drive and the lyrics say \
66
+ "waves crashing down", make that segment literally about waves crashing \
67
+ against rocks as the car passes. If the lyrics say "fading light", show the \
68
+ sun dropping below the horizon. The more specific the lyrics, the more \
69
+ directly they should influence the scene.
70
+ 4. When lyrics are ABSTRACT or metaphorical (e.g. "lost in your eyes", \
71
+ "falling apart"), translate the emotion into something visual and physical \
72
+ within the setting β€” don't try to literally depict abstract concepts.
73
+ 5. Each segment gets a UNIQUE SHOT within the shared setting β€” vary the \
74
+ subject, angle, and composition, but NEVER leave the world.
75
+ CRITICAL: Every scene MUST depict ACTION or MOTION β€” something must be \
76
+ happening. These will be turned into short video clips, so static subjects \
77
+ like "a wooden floor", "a parked car", or "an empty room" are useless. \
78
+ Show vehicles driving, waves crashing, lights flickering, rain falling, \
79
+ fires burning β€” dynamic scenes only.
80
+ 6. Use the assigned camera angle for each segment.
81
+ 7. Segments WITHOUT lyrics (instrumental): use atmospheric, mood-driven \
82
+ details from the setting (environmental motion, weather, ambient action).
83
+ 8. Write prompts as SDXL-optimized natural language descriptions. \
84
+ Keep each scene between 25-35 words. Be specific β€” name exact objects, \
85
+ materials, colors, and weather details. Every word must earn its place. \
86
+ Focus on CONCRETE OBJECTS and ACTIONS β€” what is physically in the frame \
87
+ and what is happening. SDXL needs to know what to draw, not how to feel. \
88
+ BAD: "reflections layering over glass, interior light diffused through water" β€” abstract mood. \
89
+ GOOD: "taxi splashing through puddle on wet street, rain falling past neon bar sign" β€” objects + action. \
90
+ BAD: "streetlights bleeding through downpour, darkness stretching ahead" β€” vague atmosphere. \
91
+ GOOD: "car windshield wipers sweeping rain, blurred traffic lights ahead, wet dashboard" β€” specific things. \
92
+ BAD: "water sheeting off canvas edge in a thick curtain" β€” SDXL will draw a curtain. \
93
+ GOOD: "water pouring off awning edge, rain splashing on sidewalk below" β€” plain description. \
94
+ Write like you're telling a 10-year-old what's in the picture. Simple, plain words. \
95
+ Name the objects. Name the action. Lighting and mood come from the SETTING, \
96
+ you don't need to describe them β€” describe what's HAPPENING. \
97
+ Use LITERAL language only β€” no metaphors, no poetic phrasing. SDXL interprets \
98
+ words literally. BANNED words: bleeding, drowning, bathed, kissed, dancing, \
99
+ breathing, alive, whispering, haunting, cascading, diffusing, fragmenting. \
100
+ These cause SDXL to generate unintended objects. \
101
+ Also avoid describing PROCESSES or PHYSICS β€” SDXL generates a single frame, \
102
+ not a sequence. "ripples expanding", "light fragmenting and reforming", \
103
+ "reflections scattering" are processes, not objects. Instead describe the \
104
+ RESULT: "rippled puddle", "blurry neon reflection in water", "wet glass". \
105
+ Say exactly what a camera would capture in ONE freeze-frame. \
106
+ Before finalizing each scene, sanity-check it: does this make physical \
107
+ sense? Could this actually exist? "pooled water on a car hood" β€” no, car \
108
+ hoods are curved and water runs off. "rain falling upward" β€” no. \
109
+ "neon sign reflected in a brick wall" β€” no, brick doesn't reflect. \
110
+ Only write scenes that obey basic physics and real-world logic. \
111
+ Strip camera angle phrasing from the scene text (angles are metadata, not prompt words).
112
+ 9. Include lighting and color in every scene. Derive from the SETTING β€” \
113
+ a sunset drive = warm golden-hour light, lens flares, long shadows; \
114
+ a rainy city night = cold neon on wet surfaces, streetlight halos; \
115
+ a stormy harbour = overcast grey, dramatic cloud breaks. \
116
+ Keep lighting consistent across all scenes.
117
+ 10. Do NOT include style, quality, or technical tags in the scene β€” these \
118
+ are appended automatically. BANNED from scenes: "cinematic", "moody", \
119
+ "atmospheric", "dramatic lighting", "film grain", "color grade", "bokeh", \
120
+ "depth of field", "35mm", "8K", "masterpiece", "best quality". \
121
+ Your scene should contain ONLY objects, actions, and setting-derived light.
122
+ 11. Do NOT include text, words, or typography in the scenes.
123
+ 12. Do NOT end scenes with periods. Use commas to separate phrases. \
124
+ Every character counts β€” periods waste a token.
125
+
126
+ Return ONLY valid JSON: a list of objects with "segment" (number) and \
127
+ "scene" (the creative description). No markdown, no explanation.\
128
+ """
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # LLM Call 2: Video prompts (detailed motion descriptions)
132
+ # ---------------------------------------------------------------------------
133
+
134
+ VIDEO_SYSTEM_PROMPT = """\
135
+ You are a music video director creating motion descriptions for an \
136
+ image-to-video AI model. You will receive a list of segments, each with \
137
+ an image scene description already written. Your job is to describe \
138
+ HOW each scene should MOVE and ANIMATE.
139
+
140
+ Rules:
141
+ 1. For each segment, write a detailed "video_prompt" (2-4 sentences) \
142
+ describing all motion in the scene:
143
+ - SUBJECT MOTION: what the subject does (walking, turning, reaching, \
144
+ driving, dancing, running, etc.)
145
+ - CAMERA MOTION: how the camera moves (slow pan left, dolly forward, \
146
+ tracking shot, crane up, handheld shake, static with zoom, etc.)
147
+ - ENVIRONMENTAL MOTION: ambient movement (wind blowing hair/clothes, \
148
+ rain falling, leaves drifting, smoke rising, lights flickering, waves \
149
+ crashing, clouds moving, reflections rippling, etc.)
150
+ - PACING: match the emotional energy β€” slow and contemplative for \
151
+ quiet moments, faster and more dynamic for intense moments.
152
+ 2. Be specific and physical. Not "things move around" but "the camera \
153
+ slowly tracks forward as rain streaks across the windshield and the \
154
+ wipers sweep left to right."
155
+ 3. Keep the motion consistent with the shared setting β€” all scenes are \
156
+ part of the same story.
157
+ 4. Do NOT describe visual style, colors, or lighting β€” the image already \
158
+ has those. Focus ONLY on motion and action.
159
+ 5. CRITICAL β€” ONLY animate what exists in the scene description. Do NOT \
160
+ introduce new subjects, people, or objects that are not explicitly \
161
+ mentioned. If the scene describes a landscape with no people, describe \
162
+ ONLY environmental motion (wind, water, light changes, camera movement). \
163
+ NEVER add a person walking into frame unless the scene already mentions \
164
+ a person or figure.
165
+
166
+ Return ONLY valid JSON: a list of objects with "segment" (number) and \
167
+ "video_prompt" (the motion description). No markdown, no explanation.\
168
+ """
169
+
170
+
171
def _build_user_prompt(
    segments: list[dict], song_name: str, style_description: str = "",
) -> str:
    """Build the user message for the image prompt LLM call."""
    # Concatenate every non-empty lyric so the LLM sees the full arc up front.
    all_lyrics = " ".join(
        seg["lyrics"] for seg in segments if seg["lyrics"]
    ).strip()

    lines = [
        f'Song: "{song_name}"',
        f'Full lyrics in this clip: "{all_lyrics}"',
        f"Number of segments: {len(segments)}",
    ]

    if style_description:
        lines.append(f'Visual style direction: "{style_description}"')

    lines += ["", "Segments:"]

    # Cycle camera angles so consecutive cuts vary visually.
    for i, seg in enumerate(segments):
        angle = CAMERA_ANGLES[i % len(CAMERA_ANGLES)]
        lyrics_note = f'lyrics: "{seg["lyrics"]}"' if seg["lyrics"] else "instrumental"
        lines.append(
            f'  {seg["segment"]}. ({seg["start"]:.1f}s–{seg["end"]:.1f}s) '
            f'[{angle}] {lyrics_note}'
        )

    return "\n".join(lines)
199
+
200
+
201
+ def _build_video_user_prompt(segments: list[dict]) -> str:
202
+ """Build the user message for the video prompt LLM call."""
203
+ lines = [
204
+ "Generate motion descriptions for each segment.",
205
+ "IMPORTANT: ONLY animate elements that exist in the scene description.",
206
+ "Do NOT add people, figures, or objects that aren't mentioned.",
207
+ "",
208
+ "Image scenes:",
209
+ "",
210
+ ]
211
+
212
+ for seg in segments:
213
+ lyrics_note = f' (lyrics: "{seg["lyrics"]}")' if seg.get("lyrics") else " (instrumental)"
214
+ lines.append(
215
+ f' Segment {seg["segment"]}: "{seg["scene"]}"{lyrics_note}'
216
+ )
217
+
218
+ return "\n".join(lines)
219
+
220
+
221
+ def _parse_llm_json(raw: str) -> list[dict]:
222
+ """Parse JSON from LLM response, stripping markdown fences if present."""
223
+ raw = raw.strip()
224
+ if raw.startswith("```"):
225
+ raw = raw.split("\n", 1)[1]
226
+ raw = raw.rsplit("```", 1)[0]
227
+ return json.loads(raw)
228
+
229
+
230
def generate_prompts(
    segments: list[dict],
    song_name: str = "Unknown",
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
    model: str = "claude-sonnet-4-6",
) -> list[dict]:
    """Generate image + video prompts for each segment using two LLM calls.

    Mutates and returns the same *segments* list.

    Args:
        segments: List of segment dicts from segmenter (with lyrics).
        song_name: Name of the song (helps the LLM set the mood).
        style_description: Description of the visual style (from styles registry).
        image_prompt_guidance: Style-specific creative direction appended to the
            image system prompt (from styles registry).
        quality_suffix: Style-specific quality tags appended to each prompt.
        model: Anthropic model to use.

    Returns:
        Updated segments list with added keys:
        - prompt: full SDXL prompt (scene + style suffix)
        - video_prompt: detailed motion description for I2V
        - negative_prompt: negative prompt for SDXL
        - camera_angle: the assigned camera angle
        - scene: raw scene description from LLM
    """
    # Reads credentials from the environment (loaded via dotenv at module import).
    client = anthropic.Anthropic()

    # --- Call 1: Image prompts ---
    print("  Generating image prompts...")
    user_prompt = _build_user_prompt(segments, song_name, style_description)

    # Inject style-specific guidance into the system prompt
    image_system = IMAGE_SYSTEM_PROMPT
    if image_prompt_guidance:
        image_system += f"\n\n{image_prompt_guidance}"

    response = client.messages.create(
        model=model,
        max_tokens=2048,
        system=image_system,
        messages=[{"role": "user", "content": user_prompt}],
    )

    scenes = _parse_llm_json(response.content[0].text)
    # Index by segment number so missing or reordered entries don't break merging.
    scene_map = {s["segment"]: s for s in scenes}

    # Merge image prompts into segments
    suffix = quality_suffix or DEFAULT_QUALITY_SUFFIX
    for i, seg in enumerate(segments):
        # Must mirror the angle cycling in _build_user_prompt (same index math).
        angle = CAMERA_ANGLES[i % len(CAMERA_ANGLES)]
        scene_data = scene_map.get(seg["segment"], {})
        # Fallback scene keeps the pipeline alive if the LLM skipped a segment.
        scene = scene_data.get("scene", "atmospheric landscape")

        seg["scene"] = scene
        seg["camera_angle"] = angle
        seg["prompt"] = f"{scene}, {suffix}"
        seg["negative_prompt"] = NEGATIVE_PROMPT

    # --- Call 2: Video prompts --- (depends on the scenes merged above)
    print("  Generating video prompts...")
    video_user_prompt = _build_video_user_prompt(segments)

    response = client.messages.create(
        model=model,
        max_tokens=4096,
        system=VIDEO_SYSTEM_PROMPT,
        messages=[{"role": "user", "content": video_user_prompt}],
    )

    video_scenes = _parse_llm_json(response.content[0].text)
    video_map = {s["segment"]: s for s in video_scenes}

    # Merge video prompts into segments
    for seg in segments:
        video_data = video_map.get(seg["segment"], {})
        # Generic motion fallback if the LLM skipped this segment.
        seg["video_prompt"] = video_data.get(
            "video_prompt", f"smooth cinematic motion, {seg['scene']}"
        )

    return segments
312
+
313
+
314
+ def save_segments(
315
+ segments: list[dict],
316
+ output_path: str | Path,
317
+ ) -> Path:
318
+ """Save prompt-enriched segments to JSON."""
319
+ output_path = Path(output_path)
320
+ output_path.parent.mkdir(parents=True, exist_ok=True)
321
+
322
+ with open(output_path, "w") as f:
323
+ json.dump(segments, f, indent=2)
324
+
325
+ return output_path
326
+
327
+
328
def run(
    data_dir: str | Path,
    song_name: Optional[str] = None,
    style_description: str = "",
    image_prompt_guidance: str = "",
    quality_suffix: str = "",
) -> list[dict]:
    """Full prompt generation pipeline: load segments, generate prompts, save.

    Args:
        data_dir: Run directory containing segments.json (e.g. data/Gone/run_001/).
        song_name: Name of the song. Defaults to the parent directory name.
        style_description: Description of the visual style (from styles registry).
        image_prompt_guidance: Style-specific creative direction for image prompts.
        quality_suffix: Style-specific quality tags appended to each prompt.

    Returns:
        List of prompt-enriched segment dicts.
    """
    run_dir = Path(data_dir)

    # Run dirs live under data/<song>/run_NNN, so the song name is the parent.
    resolved_name = song_name if song_name is not None else run_dir.parent.name

    segments_file = run_dir / "segments.json"
    segments = json.loads(segments_file.read_text())

    enriched = generate_prompts(
        segments,
        song_name=resolved_name,
        style_description=style_description,
        image_prompt_guidance=image_prompt_guidance,
        quality_suffix=quality_suffix,
    )

    # Overwrite segments.json in place with the enriched version.
    save_segments(enriched, segments_file)
    return enriched
363
+
364
+
365
+ if __name__ == "__main__":
366
+ import sys
367
+
368
+ if len(sys.argv) < 2:
369
+ print("Usage: python -m src.prompt_generator <data_dir> [song_name]")
370
+ print(" e.g. python -m src.prompt_generator data/Gone 'Gone'")
371
+ sys.exit(1)
372
+
373
+ name = sys.argv[2] if len(sys.argv) > 2 else None
374
+ segments = run(sys.argv[1], song_name=name)
375
+
376
+ print(f"Generated prompts for {len(segments)} segments:\n")
377
+ for seg in segments:
378
+ lyrics_tag = f' [{seg["lyrics"]}]' if seg["lyrics"] else ""
379
+ print(f" Seg {seg['segment']}{lyrics_tag}")
380
+ print(f" Scene: {seg['scene']}")
381
+ print(f" Video: {seg['video_prompt'][:100]}...")
382
+ print(f" Prompt: {seg['prompt'][:100]}...")
383
+ print()
src/segmenter.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Lyrics-to-beat mapping: group beats into segments and assign lyrics."""
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Optional
6
+
7
+
8
def segment_lyrics(
    beats: list[dict],
    lyrics: list[dict],
    beats_per_segment: int = 4,
) -> list[dict]:
    """Map timestamped lyrics onto beat-grouped segments.

    Groups consecutive beats into segments (e.g. 4 beats = 1 bar in 4/4 time)
    and assigns words to the segment where they start. Words that start
    before the first beat (e.g. a pickup vocal) or after the last segment
    boundary are clamped to the first/last segment respectively, so no
    lyrics are silently dropped.

    Args:
        beats: List of beat dicts with "beat" and "time" keys.
        lyrics: List of word dicts with "word", "start", "end" keys.
        beats_per_segment: Number of beats per segment. 4 = one bar in 4/4 time.

    Returns:
        List of segment dicts with keys:
        - segment: 1-indexed segment number
        - start: start time in seconds
        - end: end time in seconds
        - duration: segment duration in seconds
        - beats: individual beat timestamps covered by this segment
        - lyrics: raw lyrics text for this segment (may be empty)
        - words: list of word dicts that fall in this segment
    """
    beat_times = [b["time"] for b in beats]

    # Build segment boundaries by grouping every N beats
    segments = []
    seg_num = 1
    for i in range(0, len(beat_times) - 1, beats_per_segment):
        start = beat_times[i]
        # End is either N beats later or the last beat
        end_idx = min(i + beats_per_segment, len(beat_times) - 1)
        end = beat_times[end_idx]

        # Store individual beat timestamps for this segment (inclusive of the
        # boundary beat that also starts the next segment)
        seg_beat_times = [
            round(beat_times[j], 3)
            for j in range(i, min(i + beats_per_segment + 1, len(beat_times)))
        ]

        segments.append({
            "segment": seg_num,
            "start": round(start, 3),
            "end": round(end, 3),
            "duration": round(end - start, 3),
            "beats": seg_beat_times,
            "lyrics": "",
            "words": [],
        })
        seg_num += 1

    # Assign words to segments based on where the word starts
    for word in lyrics:
        word_start = word["start"]
        for seg in segments:
            if seg["start"] <= word_start < seg["end"]:
                seg["words"].append(word)
                break
        else:
            # Word falls outside every [start, end) window. Segments are
            # contiguous, so it is either before the first beat or at/after
            # the last boundary — clamp instead of dropping it.
            if not segments:
                continue
            if word_start < segments[0]["start"]:
                segments[0]["words"].append(word)
            else:
                segments[-1]["words"].append(word)

    # Build lyrics text per segment
    for seg in segments:
        seg["lyrics"] = " ".join(w["word"] for w in seg["words"])

    return segments
77
+
78
+
79
+ def save_segments(
80
+ segments: list[dict],
81
+ output_path: str | Path,
82
+ ) -> Path:
83
+ """Save segments to a JSON file.
84
+
85
+ Args:
86
+ segments: List of segment dicts.
87
+ output_path: Path to save the JSON file.
88
+
89
+ Returns:
90
+ Path to the saved JSON file.
91
+ """
92
+ output_path = Path(output_path)
93
+ output_path.parent.mkdir(parents=True, exist_ok=True)
94
+
95
+ with open(output_path, "w") as f:
96
+ json.dump(segments, f, indent=2)
97
+
98
+ return output_path
99
+
100
+
101
def run(
    data_dir: str | Path,
    beats_per_segment: int = 4,
) -> list[dict]:
    """Full segmentation pipeline: load beats + lyrics, segment, and save.

    Args:
        data_dir: Song data directory containing beats.json and lyrics.json
            (e.g. data/Gone/).
        beats_per_segment: Number of beats per segment (4 = one bar).

    Returns:
        List of segment dicts.
    """
    base = Path(data_dir)

    # Both input files are produced by earlier pipeline stages.
    beats = json.loads((base / "beats.json").read_text())
    lyrics = json.loads((base / "lyrics.json").read_text())

    segments = segment_lyrics(beats, lyrics, beats_per_segment=beats_per_segment)
    save_segments(segments, base / "segments.json")
    return segments
127
+
128
+
129
+ if __name__ == "__main__":
130
+ import sys
131
+
132
+ if len(sys.argv) < 2:
133
+ print("Usage: python -m src.segmenter <data_dir>")
134
+ print(" e.g. python -m src.segmenter data/Gone")
135
+ sys.exit(1)
136
+
137
+ segments = run(sys.argv[1])
138
+ print(f"Created {len(segments)} segments:\n")
139
+ for seg in segments:
140
+ lyrics_display = f'"{seg["lyrics"]}"' if seg["lyrics"] else "(instrumental)"
141
+ print(f" Seg {seg['segment']}: {seg['start']:.3f}s - {seg['end']:.3f}s "
142
+ f"({seg['duration']:.3f}s) {lyrics_display}")
src/stem_separator.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """LALAL.AI API wrapper for audio stem separation."""
2
+
3
+ import os
4
+ import shutil
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import requests
10
+
11
# Base URL for all LALAL.AI REST endpoints used below.
API_BASE = "https://www.lalal.ai/api/v1"
# Local data root (<repo>/data), resolved relative to this source file.
DATA_DIR = Path(__file__).parent.parent / "data"

# Stems we need for the pipeline
STEMS_TO_EXTRACT = ["vocals", "drum"]
# Map LALAL.AI track labels to our file naming convention
LABEL_TO_FILENAME = {"vocals": "vocals.wav", "drum": "drums.wav"}
18
+
19
+
20
+ def _get_api_key() -> str:
21
+ key = os.environ.get("LALAL_KEY")
22
+ if not key:
23
+ raise RuntimeError(
24
+ "LALAL_KEY environment variable not set. "
25
+ "Set it locally or as a HuggingFace Space secret."
26
+ )
27
+ return key
28
+
29
+
30
+ def _headers(api_key: str) -> dict:
31
+ return {"X-License-Key": api_key}
32
+
33
+
34
+ def _next_run_dir(song_dir: Path) -> Path:
35
+ """Find the next available run directory (run_001, run_002, ...)."""
36
+ existing = sorted(song_dir.glob("run_*"))
37
+ next_num = 1
38
+ for d in existing:
39
+ try:
40
+ num = int(d.name.split("_")[1])
41
+ next_num = max(next_num, num + 1)
42
+ except (IndexError, ValueError):
43
+ continue
44
+ return song_dir / f"run_{next_num:03d}"
45
+
46
+
47
def _upload(audio_path: Path, api_key: str) -> str:
    """Upload an audio file to LALAL.AI. Returns source_id.

    Args:
        audio_path: Local audio file to upload.
        api_key: LALAL.AI license key.

    Returns:
        The source_id LALAL.AI assigns to the upload, used by split calls.

    Raises:
        requests.HTTPError: If the upload request fails.
    """
    with open(audio_path, "rb") as f:
        resp = requests.post(
            f"{API_BASE}/upload/",
            headers={
                **_headers(api_key),
                "Content-Disposition": f'attachment; filename="{audio_path.name}"',
            },
            data=f,
            # Generous bound for large files, but never hang forever.
            timeout=600,
        )
    resp.raise_for_status()
    data = resp.json()
    source_id = data["id"]
    print(f" Uploaded {audio_path.name} → source_id={source_id} "
          f"(duration: {data['duration']:.1f}s)")
    return source_id
64
+
65
+
66
def _split_stem(source_id: str, stem: str, api_key: str) -> str:
    """Start a stem separation task. Returns task_id.

    Args:
        source_id: Id of a previously uploaded source file.
        stem: Stem label to extract (e.g. "vocals", "drum").
        api_key: LALAL.AI license key.

    Raises:
        requests.HTTPError: If the split request fails.
    """
    # Andromeda is best for vocals but doesn't support all stems — use auto for others
    splitter = "andromeda" if stem == "vocals" else None
    resp = requests.post(
        f"{API_BASE}/split/stem_separator/",
        headers=_headers(api_key),
        json={
            "source_id": source_id,
            "presets": {
                "stem": stem,
                "splitter": splitter,
                "dereverb_enabled": False,
                "encoder_format": "wav",
                "extraction_level": "deep_extraction",
            },
        },
        # Fail fast instead of hanging forever if the API stalls.
        timeout=30,
    )
    resp.raise_for_status()
    data = resp.json()
    task_id = data["task_id"]
    print(f" Split task started: stem={stem}, task_id={task_id}")
    return task_id
89
+
90
+
91
def _poll_tasks(task_ids: list[str], api_key: str, poll_interval: float = 5.0) -> dict:
    """Poll tasks until all complete. Returns {task_id: result_data}.

    Args:
        task_ids: Task ids returned by _split_stem.
        api_key: LALAL.AI license key.
        poll_interval: Seconds to sleep between polling rounds.

    Returns:
        Mapping from task_id to its success payload.

    Raises:
        RuntimeError: If any task reports error / cancelled / server_error.
        requests.HTTPError: If a poll request itself fails.
    """
    pending = set(task_ids)
    results = {}

    while pending:
        resp = requests.post(
            f"{API_BASE}/check/",
            headers=_headers(api_key),
            json={"task_ids": list(pending)},
            # Bound each poll so a stalled connection can't hang the loop.
            timeout=30,
        )
        resp.raise_for_status()
        # Parse the body once; some responses nest the payload under "result".
        payload = resp.json()
        data = payload.get("result", payload)

        for task_id, info in data.items():
            status = info.get("status")
            if status == "success":
                results[task_id] = info
                pending.discard(task_id)
                print(f" Task {task_id}: complete")
            elif status == "progress":
                print(f" Task {task_id}: {info.get('progress', 0)}%")
            elif status == "error":
                error = info.get("error", {})
                raise RuntimeError(
                    f"LALAL.AI task {task_id} failed: "
                    f"{error.get('detail', 'unknown error')} "
                    f"(code: {error.get('code')})"
                )
            elif status == "cancelled":
                raise RuntimeError(f"LALAL.AI task {task_id} was cancelled")
            elif status == "server_error":
                raise RuntimeError(
                    f"LALAL.AI server error for task {task_id}: "
                    f"{info.get('error', 'unknown')}"
                )

        if pending:
            time.sleep(poll_interval)

    return results
132
+
133
+
134
def _download_track(url: str, output_path: Path) -> None:
    """Download a track from the LALAL.AI CDN to output_path.

    Streams in 8 KB chunks so large WAVs never load fully into memory; the
    response context manager guarantees the connection is released even if
    the write fails (the original never closed the streamed response).

    Raises:
        requests.HTTPError: If the download request fails.
    """
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(output_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=8192):
                f.write(chunk)
    print(f" Downloaded → {output_path.name} ({output_path.stat().st_size / 1024:.0f} KB)")
142
+
143
+
144
def _delete_source(source_id: str, api_key: str) -> None:
    """Best-effort delete of an uploaded source file from LALAL.AI servers.

    Failures are deliberately swallowed — cleanup is non-critical and must
    never break the pipeline.
    """
    try:
        requests.post(
            f"{API_BASE}/delete/",
            headers=_headers(api_key),
            json={"source_id": source_id},
            # Don't let cleanup hang the whole pipeline.
            timeout=30,
        )
        print(f" Cleaned up remote source {source_id}")
    except Exception:
        pass  # non-critical: the remote copy simply lingers
155
+
156
+
157
def separate_stems(
    audio_path: str | Path,
    output_dir: Optional[str | Path] = None,
) -> dict[str, Path]:
    """Separate an audio file into vocals and drums using LALAL.AI.

    Creates a new run directory for each invocation so multiple runs
    on the same song don't overwrite each other.

    Args:
        audio_path: Path to the input audio file (mp3/wav) from input/.
        output_dir: Directory to save stems. If None, auto-creates
            data/<song>/run_NNN/stems/.

    Returns:
        Dict mapping stem names to their file paths.
        Keys: "drums", "vocals", "run_dir"

    Raises:
        RuntimeError: If LALAL_KEY is unset, a LALAL.AI task fails, or a
            result contains no stem track.
    """
    audio_path = Path(audio_path)
    # Song name is derived from the input filename (without extension).
    song_name = audio_path.stem
    song_dir = DATA_DIR / song_name
    api_key = _get_api_key()

    if output_dir is None:
        run_dir = _next_run_dir(song_dir)
        output_dir = run_dir / "stems"
    else:
        # Caller supplied a stems dir; treat its parent as the run dir.
        output_dir = Path(output_dir)
        run_dir = output_dir.parent

    output_dir.mkdir(parents=True, exist_ok=True)

    # Copy original song into song directory (shared across runs)
    song_copy = song_dir / audio_path.name
    if not song_copy.exists():
        shutil.copy2(audio_path, song_copy)

    # 1. Upload
    print("Stem separation (LALAL.AI):")
    source_id = _upload(audio_path, api_key)

    # 2. Start split tasks for each stem
    task_to_stem = {}
    for stem in STEMS_TO_EXTRACT:
        task_id = _split_stem(source_id, stem, api_key)
        task_to_stem[task_id] = stem

    # 3. Poll until all tasks complete
    results = _poll_tasks(list(task_to_stem.keys()), api_key)

    # 4. Download the separated stem tracks
    stem_paths = {"run_dir": run_dir}
    for task_id, result_data in results.items():
        stem = task_to_stem[task_id]
        filename = LABEL_TO_FILENAME[stem]
        tracks = result_data.get("result", {}).get("tracks", [])

        # Find the "stem" track (not the "back"/inverse track)
        stem_track = next((t for t in tracks if t["type"] == "stem"), None)
        if stem_track is None:
            raise RuntimeError(f"No stem track found in result for {stem}")

        output_path = output_dir / filename
        _download_track(stem_track["url"], output_path)

        # Map to our naming: "drum" API stem → "drums" key
        key = "drums" if stem == "drum" else stem
        stem_paths[key] = output_path

    # 5. Cleanup remote files
    _delete_source(source_id, api_key)

    return stem_paths
230
+
231
+
232
+ if __name__ == "__main__":
233
+ import sys
234
+
235
+ if len(sys.argv) < 2:
236
+ print("Usage: python -m src.stem_separator <audio_file>")
237
+ sys.exit(1)
238
+
239
+ result = separate_stems(sys.argv[1])
240
+ print(f"Run directory: {result['run_dir']}")
241
+ for name, path in result.items():
242
+ if name != "run_dir":
243
+ print(f" {name}: {path}")
src/styles.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Style registry β€” maps style names to LoRA sources.
2
+
3
+ Each style can point to a local .safetensors file or a HuggingFace Hub repo.
4
+ pipe.load_lora_weights() handles both transparently.
5
+ """
6
+
7
# Registry of selectable visual styles. Each entry declares:
#   source                — local .safetensors path or HuggingFace Hub repo id
#   weight_name           — weights filename within that source
#   weight                — LoRA scale applied when loading
#   trigger               — LoRA trigger word to include in prompts ("" if none)
#   description           — short human-readable summary of the look
#   quality_suffix        — quality tags appended to every image prompt
#   image_prompt_guidance — shared-world creative direction injected into the
#                           image system prompt
STYLES = {
    "Sunset Coastal Drive": {
        "source": "samuelsattler/warm-sunset-lora",
        "weight_name": "pytorch_lora_weights.safetensors",
        "weight": 1.0,
        "trigger": "sks",
        "description": "Golden hour warmth, sun flares, silhouettes, warm color grading",
        "quality_suffix": "8K, cinematic, golden hour glow, warm volumetric light, lens flare, shallow depth of field",
        "image_prompt_guidance": (
            "SETTING — Coastal sunset drive:\n"
            "The shared world is a drive along a coastal highway as the sun "
            "sets. All scenes take place in or around this journey — a car "
            "cruising along cliffs above the ocean, waves crashing against "
            "rocks below, wind whipping through open windows, palm trees "
            "swaying overhead, the sun sinking into the sea on the horizon. "
            "No humans visible — focus on the car, the road, the ocean, "
            "and the landscape. Every shot must have motion: wheels turning, "
            "waves rolling, sun flares shifting, clouds drifting."
        ),
    },
    "Rainy City Night": {
        "source": "artificialguybr/filmgrain-redmond-filmgrain-lora-for-sdxl",
        "weight_name": "FilmGrainRedmond-FilmGrain-FilmGrainAF.safetensors",
        "weight": 0.8,
        "trigger": "FilmGrainAF",
        "description": "35mm film grain, moody color grading, cinematic lighting",
        "quality_suffix": "8K, cinematic, shot on 35mm film, dramatic rim lighting, high contrast, shallow depth of field",
        "image_prompt_guidance": (
            "SETTING — Rainy city at night:\n"
            "The shared world is a walk through a rain-soaked city after dark. "
            "All scenes take place on these streets — rain streaking through "
            "streetlights, puddles reflecting neon signs, steam rising from "
            "grates, traffic passing with blurred headlights, wet umbrellas, "
            "rain hammering awnings, water streaming down windows. "
            "No humans visible — focus on the environment, the rain, the "
            "reflections, and the city itself. Every shot must have motion: "
            "rain falling, cars passing, lights flickering, water flowing "
            "through gutters."
        ),
    },
    "Cyberpunk": {
        "source": "jbilcke-hf/sdxl-cyberpunk-2077",
        "weight_name": "pytorch_lora_weights.safetensors",
        "weight": 0.9,
        "trigger": "cyberpunk-2077",
        "description": "Neon-lit cityscapes, dark futuristic vibes, glowing signs",
        "quality_suffix": "8K, cinematic, neon-drenched, volumetric fog, sharp details, high contrast, dramatic lighting",
        "image_prompt_guidance": (
            "SETTING — Cyberpunk nightlife cityscape:\n"
            "The shared world is a futuristic megacity at night. All scenes "
            "take place in this neon-drenched urban sprawl — holographic "
            "billboards flickering on skyscrapers, flying vehicles streaking "
            "between towers, neon signs buzzing and glitching, rain falling "
            "through laser grids, steam erupting from vents, LED-lit market "
            "stalls with flickering displays. "
            "No humans visible — focus on the city, the machines, the neon, "
            "and the architecture. Every shot must have motion: vehicles "
            "flying, signs flickering, rain falling, smoke drifting, lights "
            "pulsing."
        ),
    },
    "Watercolour Harbour": {
        "source": "ostris/watercolor_style_lora_sdxl",
        "weight_name": "watercolor_v1_sdxl.safetensors",
        "weight": 1.4,
        "trigger": "",
        "description": "Soft watercolor painting style, fluid washes, gentle blending",
        "quality_suffix": "8K, watercolor painting, soft painterly washes, fluid blending, delicate brushstrokes, atmospheric",
        "image_prompt_guidance": (
            "SETTING — Stormy harbour village:\n"
            "The shared world is a coastal fishing village during a storm. "
            "All scenes take place in and around this harbour — waves "
            "crashing against stone sea walls, fishing boats rocking and "
            "pulling at their moorings, rain sweeping across the harbour "
            "in sheets, wind tearing through flags and sails, seabirds "
            "wheeling against dark clouds, lanterns swinging on posts, "
            "water pouring off rooftops into cobblestone streets. "
            "No humans visible — focus on the sea, the boats, the storm, "
            "and the village. Every shot must have motion: waves surging, "
            "boats swaying, rain lashing, flags snapping in the wind."
        ),
    },
}
90
+
91
+
92
def get_style(name: str) -> dict:
    """Look up a style by name.

    Raises:
        KeyError: If the style is unknown; the message lists valid names
            (still a KeyError, so existing callers' handlers keep working).
    """
    try:
        return STYLES[name]
    except KeyError:
        raise KeyError(
            f"Unknown style {name!r}. Available styles: {', '.join(STYLES)}"
        ) from None
95
+
96
+
97
def style_names() -> list[str]:
    """Return the available style names, in registry order, for UI dropdowns."""
    return [*STYLES]