File size: 9,110 Bytes
9da0f47
 
 
520aa4f
 
9da0f47
 
 
520aa4f
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da0f47
 
520aa4f
 
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da0f47
 
520aa4f
 
 
 
 
9da0f47
520aa4f
 
9da0f47
520aa4f
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
9da0f47
 
520aa4f
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9da0f47
520aa4f
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from diffusers import StableVideoDiffusionPipeline
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write
import torch
import tempfile
import os
import cv2
import numpy as np
from PIL import Image

# ---------------------------------------------------------------------------
# Model loading (runs once at import time; downloads weights on first run).
# ---------------------------------------------------------------------------

# Load SmolLM2-1.7B instruct model for scene-script generation.
print("Loading text generation model...")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"  # accelerate places layers on available devices
)

# Load Stable Video Diffusion (image-to-video) in fp16.
print("Loading video generation model...")
video_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16"
)
if torch.cuda.is_available():
    # BUG FIX: enable_model_cpu_offload() manages device placement itself and
    # conflicts with a prior .to("cuda") (diffusers errors/warns on that
    # combination). Only enable offload — it moves submodules to the GPU on
    # demand. It is also guarded by the CUDA check because offload requires an
    # accelerator; the old code called it unconditionally and would fail on
    # CPU-only hosts.
    video_pipe.enable_model_cpu_offload()
# NOTE(review): enable_vae_slicing may not be exposed on the SVD pipeline
# (SVD uses decode_chunk_size for memory control) — confirm against the
# installed diffusers version.
video_pipe.enable_vae_slicing()

# Load MusicGen for background-music generation.
print("Loading music generation model...")
music_model = MusicGen.get_pretrained('facebook/musicgen-small')
music_model.set_generation_params(duration=8)  # 8 seconds music

def generate_music(prompt: str):
    """Generate background music for *prompt* with MusicGen.

    Returns the path to the written .mp3 file, or None if generation fails.
    """
    try:
        # MusicGen returns a batch of waveforms; we passed a single prompt,
        # so index 0 is our clip.
        waveforms = music_model.generate([prompt], progress=True)
        stem = os.path.join(tempfile.mkdtemp(), "music")
        # audio_write appends the format extension to the stem it is given.
        audio_write(stem, waveforms[0].cpu(), music_model.sample_rate, format="mp3")
        return f"{stem}.mp3"
    except Exception as e:
        print(f"Music generation error: {e}")
        return None

def generate_scenes_with_smol(script, style):
    """Break *script* into 3-5 cinematic scene descriptions using SmolLM2.

    Args:
        script: The user's raw video script text.
        style: Display name of the video style (e.g. "Cinematic").

    Returns:
        A list of at most 5 dicts: {"scene_id": int, "description": str}.
        Always returns at least one scene (falls back to a stub on failure).
    """
    try:
        # ChatML-formatted prompt expected by SmolLM2-Instruct.
        prompt = f"""<|im_start|>system
You are a professional video director. Break down scripts into detailed cinematic scenes.
<|im_end|>
<|im_start|>user
Break this {style.lower()} script into 3-5 cinematic scenes with camera angles, characters, and mood.

Script: {script}

Format each scene as:
Scene X: [Detailed visual description with camera angle, lighting, characters, and action]
<|im_end|>
<|im_start|>assistant"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # BUG FIX: the ChatML markers ("<|im_start|>" etc.) are special tokens,
        # so decoding with skip_special_tokens=True strips them — the old
        # decoded.split("<|im_start|>assistant")[-1] therefore returned the
        # WHOLE text, prompt included, and prompt lines were parsed as scenes.
        # Decode only the tokens generated beyond the prompt instead.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        # Parse scenes: accept lines labelled "Scene" and, as a fallback, any
        # substantial line, so free-form model output still yields scenes.
        scenes = []
        for line in response.split('\n'):
            stripped = line.strip()
            if stripped and ('Scene' in stripped or len(stripped) > 20):
                scenes.append({
                    "scene_id": len(scenes) + 1,
                    "description": stripped
                })

        # Ensure we always return at least one scene.
        if not scenes:
            scenes = [{"scene_id": 1, "description": f"A {style.lower()} scene: {script[:100]}..."}]

        return scenes[:5]  # Limit to 5 scenes max
    except Exception as e:
        print(f"Scene generation error: {e}")
        return [{"scene_id": 1, "description": f"A {style.lower()} scene based on the script"}]

def create_initial_image(prompt, width=1024, height=576):
    """Create a deterministic placeholder image to condition SVD.

    SVD is image-to-video, so it needs a starting frame. Ideally this would
    come from a text-to-image model driven by *prompt*; until one is wired
    in, we synthesize a smooth horizontal gradient. Unlike the previous
    unseeded per-pixel noise, a gradient is reproducible across runs and
    gives the motion model coherent structure to animate.

    *prompt* is currently unused but kept for interface compatibility with a
    future text-to-image backend.
    """
    # Horizontal luminance ramp covering the same 50-200 intensity range the
    # original noise image targeted.
    ramp = np.linspace(50, 200, width).astype(np.uint8)
    gray = np.broadcast_to(ramp, (height, width))
    rgb = np.stack([gray, gray, gray], axis=-1)
    return Image.fromarray(rgb)

def generate_video_with_svd(prompt):
    """Generate a short clip for *prompt* via Stable Video Diffusion.

    Returns the path of the written .mp4 file, or None if generation fails.
    """
    try:
        # SVD is image-to-video: seed it with a synthetic starting frame
        # (a real text-to-image model would be used here in production).
        seed_image = create_initial_image(prompt)

        result = video_pipe(
            image=seed_image,
            decode_chunk_size=2,
            generator=torch.manual_seed(42),
            motion_bucket_id=127,
            noise_aug_strength=0.02,
        )
        frames = result.frames[0]

        output_path = os.path.join(tempfile.mkdtemp(), "scene.mp4")

        # Encode the PIL frames with OpenCV. PIL .size is (width, height);
        # VideoWriter wants (width, height) too, and BGR pixel order.
        width, height = frames[0].size
        writer = cv2.VideoWriter(
            output_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            6,  # SVD typically generates ~6 fps
            (width, height),
        )
        for pil_frame in frames:
            writer.write(cv2.cvtColor(np.array(pil_frame), cv2.COLOR_RGB2BGR))
        writer.release()

        return output_path

    except Exception as e:
        print(f"Video generation error: {e}")
        # Caller treats None as "skip this scene".
        return None

def process_script(script, style, want_music):
    """Run the full pipeline: script -> scenes -> per-scene clips (+ music).

    Returns (video_clips, music_path) where video_clips is a list of
    (description, video_path) tuples and music_path is a file path or None.
    """
    if not script.strip():
        return [], None

    print("Generating scenes...")
    scenes = generate_scenes_with_smol(script, style)

    print("Generating videos...")
    video_clips = []
    total = len(scenes)
    for index, scene in enumerate(scenes, start=1):
        print(f"Processing scene {index}/{total}")
        clip_path = generate_video_with_svd(scene['description'])
        # Failed generations return None and are simply skipped.
        if clip_path:
            video_clips.append((scene['description'], clip_path))

    music_path = None
    if want_music:
        print("Generating music...")
        music_path = generate_music(
            f"Background music for {style.lower()} video: {script[:100]}"
        )

    return video_clips, music_path

# Gradio Interface
# Layout: script textbox + style/music controls on top, video + audio players
# below, and a gallery of scene descriptions at the bottom. All output
# components start hidden and are revealed by the submit callback.
with gr.Blocks(title="Vividly MVP", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎬 Vividly MVP – AI Video Creator")
    gr.Markdown("Transform your script into cinematic scenes with AI-generated videos and music!")
    
    with gr.Row():
        with gr.Column(scale=2):
            # Free-form script input; the pipeline rejects blank scripts.
            script_input = gr.Textbox(
                label="Video Script", 
                lines=6,
                placeholder="Enter your video script here..."
            )
        with gr.Column(scale=1):
            # Style flows into the LLM prompt and the music prompt.
            style_input = gr.Dropdown(
                ["Cinematic", "Vlog", "Explainer", "Documentary"], 
                value="Cinematic", 
                label="Video Style"
            )
            music_toggle = gr.Checkbox(label="Generate background music", value=True)
            submit_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
    
    with gr.Row():
        with gr.Column():
            # Shows only the FIRST generated clip (see wrap_processing below).
            video_outputs = gr.Video(
                label="Generated Video Clip", 
                interactive=False, 
                visible=False
            )
        with gr.Column():
            music_player = gr.Audio(
                label="Generated Background Music", 
                visible=False
            )
            
    # NOTE(review): gr.Gallery is normally an image gallery, but this app
    # feeds it plain description strings — confirm this renders as intended
    # on the deployed Gradio version; a gr.Dataframe/Markdown may fit better.
    scene_gallery = gr.Gallery(
        label="Scene Descriptions",
        visible=False,
        columns=1,
        height="auto"
    )
    
    def wrap_processing(script, style, music):
        # Callback for submit_btn. Returns a 3-tuple of gr.update objects in
        # the exact order of outputs=[video_outputs, music_player,
        # scene_gallery]; keep that ordering in sync.
        if not script.strip():
            # Blank script: keep everything hidden.
            return (
                gr.update(visible=False), 
                gr.update(visible=False),
                gr.update(visible=False)
            )
            
        try:
            # scenes is a list of (description, video_path) tuples.
            scenes, music_path = process_script(script, style, music)
            
            # Show first video if available
            first_video = scenes[0][1] if scenes else None
            
            # Create scene descriptions for gallery
            scene_descriptions = [scene[0] for scene in scenes] if scenes else []
            
            # Each component is shown only when it has content to display.
            return (
                gr.update(value=first_video, visible=bool(first_video)),
                gr.update(value=music_path, visible=bool(music_path)),
                gr.update(value=scene_descriptions, visible=bool(scene_descriptions))
            )
        except Exception as e:
            # Top-level UI boundary: log and hide outputs rather than crash
            # the Gradio event loop.
            print(f"Processing error: {e}")
            return (
                gr.update(visible=False),
                gr.update(visible=False), 
                gr.update(visible=False)
            )
    
    submit_btn.click(
        wrap_processing, 
        inputs=[script_input, style_input, music_toggle], 
        outputs=[video_outputs, music_player, scene_gallery]
    )

# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    print("Starting Vividly MVP...")
    launch_options = {
        "server_name": "0.0.0.0",  # listen on all interfaces
        "server_port": 7860,
        "share": False,
        "debug": True,
    }
    app.launch(**launch_options)