"""Vividly MVP - turn a text script into AI-generated video clips and music.

Pipeline:
  1. SmolLM2-1.7B-Instruct breaks the script into cinematic scene descriptions.
  2. Stable Video Diffusion (img2vid-xt) animates a seed image for each scene.
  3. MusicGen (small) optionally generates an 8-second background track.
A Gradio UI wires the three stages together.
"""

import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image

from audiocraft.data.audio import audio_write
from audiocraft.models import MusicGen
from diffusers import StableVideoDiffusionPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer

# ---------------------------------------------------------------------------
# Model loading (at import time -- this script is meant to run standalone).
# ---------------------------------------------------------------------------

print("Loading text generation model...")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto",
)

print("Loading video generation model...")
video_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
)
if torch.cuda.is_available():
    # BUGFIX: the original both moved the pipeline to CUDA *and* enabled model
    # CPU offload. In diffusers those are mutually exclusive -- offload manages
    # device placement itself, and a prior .to("cuda") defeats (or breaks) it.
    video_pipe.enable_model_cpu_offload()
# BUGFIX: not every pipeline exposes VAE slicing (SVD's temporal-decoder VAE
# may not) -- guard instead of crashing at import.
if hasattr(video_pipe, "enable_vae_slicing"):
    video_pipe.enable_vae_slicing()

print("Loading music generation model...")
music_model = MusicGen.get_pretrained("facebook/musicgen-small")
music_model.set_generation_params(duration=8)  # 8 seconds of music


def generate_music(prompt: str):
    """Generate an 8-second background track from *prompt*.

    Returns the path to the written .mp3, or None on failure (music is an
    optional feature, so errors degrade gracefully instead of raising).
    """
    try:
        wav = music_model.generate([prompt], progress=True)
        tmp_dir = tempfile.mkdtemp()
        out_path = os.path.join(tmp_dir, "music")
        # audio_write appends the format suffix itself, hence the bare stem.
        audio_write(out_path, wav[0].cpu(), music_model.sample_rate, format="mp3")
        return f"{out_path}.mp3"
    except Exception as e:
        print(f"Music generation error: {e}")
        return None


def generate_scenes_with_smol(script, style):
    """Use SmolLM2 to split *script* into up to 5 scene-description dicts.

    Each dict has keys ``scene_id`` (1-based int) and ``description`` (str).
    Falls back to a single generic scene if generation or parsing fails.
    """
    try:
        prompt = f"""<|im_start|>system
You are a professional video director. Break down scripts into detailed cinematic scenes.
<|im_end|>
<|im_start|>user
Break this {style.lower()} script into 3-5 cinematic scenes with camera angles, characters, and mood.

Script: {script}

Format each scene as:
Scene X: [Detailed visual description with camera angle, lighting, characters, and action]
<|im_end|>
<|im_start|>assistant"""

        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
        # BUGFIX: the model was loaded with device_map="auto"; hard-coding
        # "cuda" was wrong when the model lands elsewhere. model.device is the
        # correct entry device.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # BUGFIX: the original decoded with skip_special_tokens=True and then
        # split on "<|im_start|>assistant" -- but that marker is a special
        # token and had already been stripped, so the "response" silently
        # included the whole echoed prompt. Decode only the newly generated
        # tokens instead.
        prompt_len = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_len:], skip_special_tokens=True
        ).strip()

        # Heuristic parse: keep "Scene ..." lines, plus any substantial line
        # in case the model ignored the requested format.
        scenes = []
        for raw_line in response.split("\n"):
            line = raw_line.strip()
            if line and ("Scene" in line or len(line) > 20):
                scenes.append({"scene_id": len(scenes) + 1, "description": line})

        if not scenes:
            scenes = [{"scene_id": 1,
                       "description": f"A {style.lower()} scene: {script[:100]}..."}]
        return scenes[:5]  # cap at 5 scenes
    except Exception as e:
        print(f"Scene generation error: {e}")
        return [{"scene_id": 1,
                 "description": f"A {style.lower()} scene based on the script"}]


def create_initial_image(prompt, width=1024, height=576):
    """Create a placeholder seed image for SVD (which is image-to-video).

    NOTE(review): this is random noise and ignores *prompt*; in a real
    deployment a text-to-image model (e.g. Stable Diffusion) should produce
    the seed frame. 1024x576 matches SVD's expected input resolution.
    """
    pixels = np.random.randint(50, 200, (height, width, 3), dtype=np.uint8)
    return Image.fromarray(pixels)


def generate_video_with_svd(prompt):
    """Generate one short clip for *prompt* via Stable Video Diffusion.

    Returns the path to an .mp4 file, or None on failure.
    """
    try:
        initial_image = create_initial_image(prompt)

        frames = video_pipe(
            image=initial_image,
            decode_chunk_size=2,          # small chunks to limit VRAM use
            generator=torch.manual_seed(42),
            motion_bucket_id=127,
            noise_aug_strength=0.02,
        ).frames[0]

        tmp_dir = tempfile.mkdtemp()
        output_path = os.path.join(tmp_dir, "scene.mp4")

        # Encode the PIL frames with OpenCV. SVD output is nominally ~6 fps.
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        fps = 6
        width, height = frames[0].size  # PIL size is (width, height)
        writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        try:
            for frame in frames:
                # OpenCV expects BGR channel order.
                writer.write(cv2.cvtColor(np.array(frame), cv2.COLOR_RGB2BGR))
        finally:
            # BUGFIX: always release the writer, even if a frame write fails,
            # so the container is finalized and the handle isn't leaked.
            writer.release()

        return output_path
    except Exception as e:
        print(f"Video generation error: {e}")
        return None


def process_script(script, style, want_music):
    """Run the full pipeline: script -> scenes -> clips (+ optional music).

    Returns (clips, music_path) where clips is a list of
    (description, video_path) tuples and music_path is a str or None.
    """
    if not script.strip():
        return [], None

    print("Generating scenes...")
    scenes = generate_scenes_with_smol(script, style)

    print("Generating videos...")
    video_clips = []
    for i, scene in enumerate(scenes):
        print(f"Processing scene {i + 1}/{len(scenes)}")
        video_path = generate_video_with_svd(scene["description"])
        if video_path:
            video_clips.append((scene["description"], video_path))

    music_path = None
    if want_music:
        print("Generating music...")
        music_path = generate_music(
            f"Background music for {style.lower()} video: {script[:100]}"
        )

    return video_clips, music_path


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------

with gr.Blocks(title="Vividly MVP", theme=gr.themes.Soft()) as app:
    gr.Markdown("# 🎬 Vividly MVP – AI Video Creator")
    gr.Markdown("Transform your script into cinematic scenes with AI-generated videos and music!")

    with gr.Row():
        with gr.Column(scale=2):
            script_input = gr.Textbox(
                label="Video Script",
                lines=6,
                placeholder="Enter your video script here...",
            )
        with gr.Column(scale=1):
            style_input = gr.Dropdown(
                ["Cinematic", "Vlog", "Explainer", "Documentary"],
                value="Cinematic",
                label="Video Style",
            )
            music_toggle = gr.Checkbox(label="Generate background music", value=True)

    submit_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")

    with gr.Row():
        with gr.Column():
            video_outputs = gr.Video(
                label="Generated Video Clip",
                interactive=False,
                visible=False,
            )
        with gr.Column():
            music_player = gr.Audio(
                label="Generated Background Music",
                visible=False,
            )

    # BUGFIX: the original used gr.Gallery for scene descriptions, but Gallery
    # expects images -- feeding it plain strings errors at display time. A
    # read-only Textbox is the right component for text.
    scene_gallery = gr.Textbox(
        label="Scene Descriptions",
        visible=False,
        lines=8,
        interactive=False,
    )

    def wrap_processing(script, style, music):
        """Adapt process_script() results into Gradio component updates."""
        hidden = (
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
        )
        if not script.strip():
            return hidden
        try:
            clips, music_path = process_script(script, style, music)

            # Only the first clip is shown in the single Video component.
            first_video = clips[0][1] if clips else None
            descriptions = "\n\n".join(desc for desc, _path in clips)

            return (
                gr.update(value=first_video, visible=bool(first_video)),
                gr.update(value=music_path, visible=bool(music_path)),
                gr.update(value=descriptions, visible=bool(descriptions)),
            )
        except Exception as e:
            print(f"Processing error: {e}")
            return hidden

    submit_btn.click(
        wrap_processing,
        inputs=[script_input, style_input, music_toggle],
        outputs=[video_outputs, music_player, scene_gallery],
    )

if __name__ == "__main__":
    print("Starting Vividly MVP...")
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=True,
    )